intl/icu/source/common/rbbitblb.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (c) 2002-2009, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 */
michael@0 7 //
michael@0 8 // rbbitblb.cpp
michael@0 9 //
michael@0 10
michael@0 11
michael@0 12 #include "unicode/utypes.h"
michael@0 13
michael@0 14 #if !UCONFIG_NO_BREAK_ITERATION
michael@0 15
michael@0 16 #include "unicode/unistr.h"
michael@0 17 #include "rbbitblb.h"
michael@0 18 #include "rbbirb.h"
michael@0 19 #include "rbbisetb.h"
michael@0 20 #include "rbbidata.h"
michael@0 21 #include "cstring.h"
michael@0 22 #include "uassert.h"
michael@0 23 #include "cmemory.h"
michael@0 24
michael@0 25 U_NAMESPACE_BEGIN
michael@0 26
michael@0 27 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
michael@0 28 fTree(*rootNode) {
michael@0 29 fRB = rb;
michael@0 30 fStatus = fRB->fStatus;
michael@0 31 UErrorCode status = U_ZERO_ERROR;
michael@0 32 fDStates = new UVector(status);
michael@0 33 if (U_FAILURE(*fStatus)) {
michael@0 34 return;
michael@0 35 }
michael@0 36 if (U_FAILURE(status)) {
michael@0 37 *fStatus = status;
michael@0 38 return;
michael@0 39 }
michael@0 40 if (fDStates == NULL) {
michael@0 41 *fStatus = U_MEMORY_ALLOCATION_ERROR;;
michael@0 42 }
michael@0 43 }
michael@0 44
michael@0 45
michael@0 46
michael@0 47 RBBITableBuilder::~RBBITableBuilder() {
michael@0 48 int i;
michael@0 49 for (i=0; i<fDStates->size(); i++) {
michael@0 50 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
michael@0 51 }
michael@0 52 delete fDStates;
michael@0 53 }
michael@0 54
michael@0 55
michael@0 56 //-----------------------------------------------------------------------------
michael@0 57 //
michael@0 58 // RBBITableBuilder::build - This is the main function for building the DFA state transtion
michael@0 59 // table from the RBBI rules parse tree.
michael@0 60 //
michael@0 61 //-----------------------------------------------------------------------------
michael@0 62 void RBBITableBuilder::build() {
michael@0 63
michael@0 64 if (U_FAILURE(*fStatus)) {
michael@0 65 return;
michael@0 66 }
michael@0 67
michael@0 68 // If there were no rules, just return. This situation can easily arise
michael@0 69 // for the reverse rules.
michael@0 70 if (fTree==NULL) {
michael@0 71 return;
michael@0 72 }
michael@0 73
michael@0 74 //
michael@0 75 // Walk through the tree, replacing any references to $variables with a copy of the
michael@0 76 // parse tree for the substition expression.
michael@0 77 //
michael@0 78 fTree = fTree->flattenVariables();
michael@0 79 #ifdef RBBI_DEBUG
michael@0 80 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
michael@0 81 RBBIDebugPuts("Parse tree after flattening variable references.");
michael@0 82 fTree->printTree(TRUE);
michael@0 83 }
michael@0 84 #endif
michael@0 85
michael@0 86 //
michael@0 87 // If the rules contained any references to {bof}
michael@0 88 // add a {bof} <cat> <former root of tree> to the
michael@0 89 // tree. Means that all matches must start out with the
michael@0 90 // {bof} fake character.
michael@0 91 //
michael@0 92 if (fRB->fSetBuilder->sawBOF()) {
michael@0 93 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
michael@0 94 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
michael@0 95 // Delete and exit if memory allocation failed.
michael@0 96 if (bofTop == NULL || bofLeaf == NULL) {
michael@0 97 *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 98 delete bofTop;
michael@0 99 delete bofLeaf;
michael@0 100 return;
michael@0 101 }
michael@0 102 bofTop->fLeftChild = bofLeaf;
michael@0 103 bofTop->fRightChild = fTree;
michael@0 104 bofLeaf->fParent = bofTop;
michael@0 105 bofLeaf->fVal = 2; // Reserved value for {bof}.
michael@0 106 fTree = bofTop;
michael@0 107 }
michael@0 108
michael@0 109 //
michael@0 110 // Add a unique right-end marker to the expression.
michael@0 111 // Appears as a cat-node, left child being the original tree,
michael@0 112 // right child being the end marker.
michael@0 113 //
michael@0 114 RBBINode *cn = new RBBINode(RBBINode::opCat);
michael@0 115 // Exit if memory allocation failed.
michael@0 116 if (cn == NULL) {
michael@0 117 *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 118 return;
michael@0 119 }
michael@0 120 cn->fLeftChild = fTree;
michael@0 121 fTree->fParent = cn;
michael@0 122 cn->fRightChild = new RBBINode(RBBINode::endMark);
michael@0 123 // Delete and exit if memory allocation failed.
michael@0 124 if (cn->fRightChild == NULL) {
michael@0 125 *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 126 delete cn;
michael@0 127 return;
michael@0 128 }
michael@0 129 cn->fRightChild->fParent = cn;
michael@0 130 fTree = cn;
michael@0 131
michael@0 132 //
michael@0 133 // Replace all references to UnicodeSets with the tree for the equivalent
michael@0 134 // expression.
michael@0 135 //
michael@0 136 fTree->flattenSets();
michael@0 137 #ifdef RBBI_DEBUG
michael@0 138 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
michael@0 139 RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
michael@0 140 fTree->printTree(TRUE);
michael@0 141 }
michael@0 142 #endif
michael@0 143
michael@0 144
michael@0 145 //
michael@0 146 // calculate the functions nullable, firstpos, lastpos and followpos on
michael@0 147 // nodes in the parse tree.
michael@0 148 // See the alogrithm description in Aho.
michael@0 149 // Understanding how this works by looking at the code alone will be
michael@0 150 // nearly impossible.
michael@0 151 //
michael@0 152 calcNullable(fTree);
michael@0 153 calcFirstPos(fTree);
michael@0 154 calcLastPos(fTree);
michael@0 155 calcFollowPos(fTree);
michael@0 156 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
michael@0 157 RBBIDebugPuts("\n");
michael@0 158 printPosSets(fTree);
michael@0 159 }
michael@0 160
michael@0 161 //
michael@0 162 // For "chained" rules, modify the followPos sets
michael@0 163 //
michael@0 164 if (fRB->fChainRules) {
michael@0 165 calcChainedFollowPos(fTree);
michael@0 166 }
michael@0 167
michael@0 168 //
michael@0 169 // BOF (start of input) test fixup.
michael@0 170 //
michael@0 171 if (fRB->fSetBuilder->sawBOF()) {
michael@0 172 bofFixup();
michael@0 173 }
michael@0 174
michael@0 175 //
michael@0 176 // Build the DFA state transition tables.
michael@0 177 //
michael@0 178 buildStateTable();
michael@0 179 flagAcceptingStates();
michael@0 180 flagLookAheadStates();
michael@0 181 flagTaggedStates();
michael@0 182
michael@0 183 //
michael@0 184 // Update the global table of rule status {tag} values
michael@0 185 // The rule builder has a global vector of status values that are common
michael@0 186 // for all tables. Merge the ones from this table into the global set.
michael@0 187 //
michael@0 188 mergeRuleStatusVals();
michael@0 189
michael@0 190 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
michael@0 191 }
michael@0 192
michael@0 193
michael@0 194
michael@0 195 //-----------------------------------------------------------------------------
michael@0 196 //
michael@0 197 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
michael@0 198 //
michael@0 199 //-----------------------------------------------------------------------------
michael@0 200 void RBBITableBuilder::calcNullable(RBBINode *n) {
michael@0 201 if (n == NULL) {
michael@0 202 return;
michael@0 203 }
michael@0 204 if (n->fType == RBBINode::setRef ||
michael@0 205 n->fType == RBBINode::endMark ) {
michael@0 206 // These are non-empty leaf node types.
michael@0 207 n->fNullable = FALSE;
michael@0 208 return;
michael@0 209 }
michael@0 210
michael@0 211 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
michael@0 212 // Lookahead marker node. It's a leaf, so no recursion on children.
michael@0 213 // It's nullable because it does not match any literal text from the input stream.
michael@0 214 n->fNullable = TRUE;
michael@0 215 return;
michael@0 216 }
michael@0 217
michael@0 218
michael@0 219 // The node is not a leaf.
michael@0 220 // Calculate nullable on its children.
michael@0 221 calcNullable(n->fLeftChild);
michael@0 222 calcNullable(n->fRightChild);
michael@0 223
michael@0 224 // Apply functions from table 3.40 in Aho
michael@0 225 if (n->fType == RBBINode::opOr) {
michael@0 226 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
michael@0 227 }
michael@0 228 else if (n->fType == RBBINode::opCat) {
michael@0 229 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
michael@0 230 }
michael@0 231 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
michael@0 232 n->fNullable = TRUE;
michael@0 233 }
michael@0 234 else {
michael@0 235 n->fNullable = FALSE;
michael@0 236 }
michael@0 237 }
michael@0 238
michael@0 239
michael@0 240
michael@0 241
michael@0 242 //-----------------------------------------------------------------------------
michael@0 243 //
michael@0 244 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
michael@0 245 //
michael@0 246 //-----------------------------------------------------------------------------
michael@0 247 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
michael@0 248 if (n == NULL) {
michael@0 249 return;
michael@0 250 }
michael@0 251 if (n->fType == RBBINode::leafChar ||
michael@0 252 n->fType == RBBINode::endMark ||
michael@0 253 n->fType == RBBINode::lookAhead ||
michael@0 254 n->fType == RBBINode::tag) {
michael@0 255 // These are non-empty leaf node types.
michael@0 256 // Note: In order to maintain the sort invariant on the set,
michael@0 257 // this function should only be called on a node whose set is
michael@0 258 // empty to start with.
michael@0 259 n->fFirstPosSet->addElement(n, *fStatus);
michael@0 260 return;
michael@0 261 }
michael@0 262
michael@0 263 // The node is not a leaf.
michael@0 264 // Calculate firstPos on its children.
michael@0 265 calcFirstPos(n->fLeftChild);
michael@0 266 calcFirstPos(n->fRightChild);
michael@0 267
michael@0 268 // Apply functions from table 3.40 in Aho
michael@0 269 if (n->fType == RBBINode::opOr) {
michael@0 270 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
michael@0 271 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
michael@0 272 }
michael@0 273 else if (n->fType == RBBINode::opCat) {
michael@0 274 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
michael@0 275 if (n->fLeftChild->fNullable) {
michael@0 276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
michael@0 277 }
michael@0 278 }
michael@0 279 else if (n->fType == RBBINode::opStar ||
michael@0 280 n->fType == RBBINode::opQuestion ||
michael@0 281 n->fType == RBBINode::opPlus) {
michael@0 282 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
michael@0 283 }
michael@0 284 }
michael@0 285
michael@0 286
michael@0 287
michael@0 288 //-----------------------------------------------------------------------------
michael@0 289 //
michael@0 290 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
michael@0 291 //
michael@0 292 //-----------------------------------------------------------------------------
michael@0 293 void RBBITableBuilder::calcLastPos(RBBINode *n) {
michael@0 294 if (n == NULL) {
michael@0 295 return;
michael@0 296 }
michael@0 297 if (n->fType == RBBINode::leafChar ||
michael@0 298 n->fType == RBBINode::endMark ||
michael@0 299 n->fType == RBBINode::lookAhead ||
michael@0 300 n->fType == RBBINode::tag) {
michael@0 301 // These are non-empty leaf node types.
michael@0 302 // Note: In order to maintain the sort invariant on the set,
michael@0 303 // this function should only be called on a node whose set is
michael@0 304 // empty to start with.
michael@0 305 n->fLastPosSet->addElement(n, *fStatus);
michael@0 306 return;
michael@0 307 }
michael@0 308
michael@0 309 // The node is not a leaf.
michael@0 310 // Calculate lastPos on its children.
michael@0 311 calcLastPos(n->fLeftChild);
michael@0 312 calcLastPos(n->fRightChild);
michael@0 313
michael@0 314 // Apply functions from table 3.40 in Aho
michael@0 315 if (n->fType == RBBINode::opOr) {
michael@0 316 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
michael@0 317 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
michael@0 318 }
michael@0 319 else if (n->fType == RBBINode::opCat) {
michael@0 320 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
michael@0 321 if (n->fRightChild->fNullable) {
michael@0 322 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
michael@0 323 }
michael@0 324 }
michael@0 325 else if (n->fType == RBBINode::opStar ||
michael@0 326 n->fType == RBBINode::opQuestion ||
michael@0 327 n->fType == RBBINode::opPlus) {
michael@0 328 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
michael@0 329 }
michael@0 330 }
michael@0 331
michael@0 332
michael@0 333
michael@0 334 //-----------------------------------------------------------------------------
michael@0 335 //
michael@0 336 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
michael@0 337 //
michael@0 338 //-----------------------------------------------------------------------------
michael@0 339 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
michael@0 340 if (n == NULL ||
michael@0 341 n->fType == RBBINode::leafChar ||
michael@0 342 n->fType == RBBINode::endMark) {
michael@0 343 return;
michael@0 344 }
michael@0 345
michael@0 346 calcFollowPos(n->fLeftChild);
michael@0 347 calcFollowPos(n->fRightChild);
michael@0 348
michael@0 349 // Aho rule #1
michael@0 350 if (n->fType == RBBINode::opCat) {
michael@0 351 RBBINode *i; // is 'i' in Aho's description
michael@0 352 uint32_t ix;
michael@0 353
michael@0 354 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
michael@0 355
michael@0 356 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
michael@0 357 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
michael@0 358 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
michael@0 359 }
michael@0 360 }
michael@0 361
michael@0 362 // Aho rule #2
michael@0 363 if (n->fType == RBBINode::opStar ||
michael@0 364 n->fType == RBBINode::opPlus) {
michael@0 365 RBBINode *i; // again, n and i are the names from Aho's description.
michael@0 366 uint32_t ix;
michael@0 367
michael@0 368 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
michael@0 369 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
michael@0 370 setAdd(i->fFollowPos, n->fFirstPosSet);
michael@0 371 }
michael@0 372 }
michael@0 373
michael@0 374
michael@0 375
michael@0 376 }
michael@0 377
michael@0 378
michael@0 379 //-----------------------------------------------------------------------------
michael@0 380 //
michael@0 381 // calcChainedFollowPos. Modify the previously calculated followPos sets
michael@0 382 // to implement rule chaining. NOT described by Aho
michael@0 383 //
michael@0 384 //-----------------------------------------------------------------------------
michael@0 385 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
michael@0 386
michael@0 387 UVector endMarkerNodes(*fStatus);
michael@0 388 UVector leafNodes(*fStatus);
michael@0 389 int32_t i;
michael@0 390
michael@0 391 if (U_FAILURE(*fStatus)) {
michael@0 392 return;
michael@0 393 }
michael@0 394
michael@0 395 // get a list of all endmarker nodes.
michael@0 396 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
michael@0 397
michael@0 398 // get a list all leaf nodes
michael@0 399 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
michael@0 400 if (U_FAILURE(*fStatus)) {
michael@0 401 return;
michael@0 402 }
michael@0 403
michael@0 404 // Get all nodes that can be the start a match, which is FirstPosition()
michael@0 405 // of the portion of the tree corresponding to user-written rules.
michael@0 406 // See the tree description in bofFixup().
michael@0 407 RBBINode *userRuleRoot = tree;
michael@0 408 if (fRB->fSetBuilder->sawBOF()) {
michael@0 409 userRuleRoot = tree->fLeftChild->fRightChild;
michael@0 410 }
michael@0 411 U_ASSERT(userRuleRoot != NULL);
michael@0 412 UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
michael@0 413
michael@0 414
michael@0 415 // Iteratate over all leaf nodes,
michael@0 416 //
michael@0 417 int32_t endNodeIx;
michael@0 418 int32_t startNodeIx;
michael@0 419
michael@0 420 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
michael@0 421 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
michael@0 422 RBBINode *endNode = NULL;
michael@0 423
michael@0 424 // Identify leaf nodes that correspond to overall rule match positions.
michael@0 425 // These include an endMarkerNode in their followPos sets.
michael@0 426 for (i=0; i<endMarkerNodes.size(); i++) {
michael@0 427 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
michael@0 428 endNode = tNode;
michael@0 429 break;
michael@0 430 }
michael@0 431 }
michael@0 432 if (endNode == NULL) {
michael@0 433 // node wasn't an end node. Try again with the next.
michael@0 434 continue;
michael@0 435 }
michael@0 436
michael@0 437 // We've got a node that can end a match.
michael@0 438
michael@0 439 // Line Break Specific hack: If this node's val correspond to the $CM char class,
michael@0 440 // don't chain from it.
michael@0 441 // TODO: Add rule syntax for this behavior, get specifics out of here and
michael@0 442 // into the rule file.
michael@0 443 if (fRB->fLBCMNoChain) {
michael@0 444 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
michael@0 445 if (c != -1) {
michael@0 446 // c == -1 occurs with sets containing only the {eof} marker string.
michael@0 447 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
michael@0 448 if (cLBProp == U_LB_COMBINING_MARK) {
michael@0 449 continue;
michael@0 450 }
michael@0 451 }
michael@0 452 }
michael@0 453
michael@0 454
michael@0 455 // Now iterate over the nodes that can start a match, looking for ones
michael@0 456 // with the same char class as our ending node.
michael@0 457 RBBINode *startNode;
michael@0 458 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
michael@0 459 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
michael@0 460 if (startNode->fType != RBBINode::leafChar) {
michael@0 461 continue;
michael@0 462 }
michael@0 463
michael@0 464 if (endNode->fVal == startNode->fVal) {
michael@0 465 // The end val (character class) of one possible match is the
michael@0 466 // same as the start of another.
michael@0 467
michael@0 468 // Add all nodes from the followPos of the start node to the
michael@0 469 // followPos set of the end node, which will have the effect of
michael@0 470 // letting matches transition from a match state at endNode
michael@0 471 // to the second char of a match starting with startNode.
michael@0 472 setAdd(endNode->fFollowPos, startNode->fFollowPos);
michael@0 473 }
michael@0 474 }
michael@0 475 }
michael@0 476 }
michael@0 477
michael@0 478
michael@0 479 //-----------------------------------------------------------------------------
michael@0 480 //
michael@0 481 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
michael@0 482 // Do an swizzle similar to chaining, modifying the followPos set of
michael@0 483 // the bofNode to include the followPos nodes from other {bot} nodes
michael@0 484 // scattered through the tree.
michael@0 485 //
michael@0 486 // This function has much in common with calcChainedFollowPos().
michael@0 487 //
michael@0 488 //-----------------------------------------------------------------------------
michael@0 489 void RBBITableBuilder::bofFixup() {
michael@0 490
michael@0 491 if (U_FAILURE(*fStatus)) {
michael@0 492 return;
michael@0 493 }
michael@0 494
michael@0 495 // The parse tree looks like this ...
michael@0 496 // fTree root ---> <cat>
michael@0 497 // / \ .
michael@0 498 // <cat> <#end node>
michael@0 499 // / \ .
michael@0 500 // <bofNode> rest
michael@0 501 // of tree
michael@0 502 //
michael@0 503 // We will be adding things to the followPos set of the <bofNode>
michael@0 504 //
michael@0 505 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
michael@0 506 U_ASSERT(bofNode->fType == RBBINode::leafChar);
michael@0 507 U_ASSERT(bofNode->fVal == 2);
michael@0 508
michael@0 509 // Get all nodes that can be the start a match of the user-written rules
michael@0 510 // (excluding the fake bofNode)
michael@0 511 // We want the nodes that can start a match in the
michael@0 512 // part labeled "rest of tree"
michael@0 513 //
michael@0 514 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
michael@0 515
michael@0 516 RBBINode *startNode;
michael@0 517 int startNodeIx;
michael@0 518 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
michael@0 519 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
michael@0 520 if (startNode->fType != RBBINode::leafChar) {
michael@0 521 continue;
michael@0 522 }
michael@0 523
michael@0 524 if (startNode->fVal == bofNode->fVal) {
michael@0 525 // We found a leaf node corresponding to a {bof} that was
michael@0 526 // explicitly written into a rule.
michael@0 527 // Add everything from the followPos set of this node to the
michael@0 528 // followPos set of the fake bofNode at the start of the tree.
michael@0 529 //
michael@0 530 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
michael@0 531 }
michael@0 532 }
michael@0 533 }
michael@0 534
michael@0 535 //-----------------------------------------------------------------------------
michael@0 536 //
michael@0 537 // buildStateTable() Determine the set of runtime DFA states and the
michael@0 538 // transition tables for these states, by the algorithm
michael@0 539 // of fig. 3.44 in Aho.
michael@0 540 //
michael@0 541 // Most of the comments are quotes of Aho's psuedo-code.
michael@0 542 //
michael@0 543 //-----------------------------------------------------------------------------
michael@0 544 void RBBITableBuilder::buildStateTable() {
michael@0 545 if (U_FAILURE(*fStatus)) {
michael@0 546 return;
michael@0 547 }
michael@0 548 RBBIStateDescriptor *failState;
michael@0 549 // Set it to NULL to avoid uninitialized warning
michael@0 550 RBBIStateDescriptor *initialState = NULL;
michael@0 551 //
michael@0 552 // Add a dummy state 0 - the stop state. Not from Aho.
michael@0 553 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
michael@0 554 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
michael@0 555 if (failState == NULL) {
michael@0 556 *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 557 goto ExitBuildSTdeleteall;
michael@0 558 }
michael@0 559 failState->fPositions = new UVector(*fStatus);
michael@0 560 if (failState->fPositions == NULL) {
michael@0 561 *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 562 }
michael@0 563 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
michael@0 564 goto ExitBuildSTdeleteall;
michael@0 565 }
michael@0 566 fDStates->addElement(failState, *fStatus);
michael@0 567 if (U_FAILURE(*fStatus)) {
michael@0 568 goto ExitBuildSTdeleteall;
michael@0 569 }
michael@0 570
michael@0 571 // initially, the only unmarked state in Dstates is firstpos(root),
michael@0 572 // where toot is the root of the syntax tree for (r)#;
michael@0 573 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
michael@0 574 if (initialState == NULL) {
michael@0 575 *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 576 }
michael@0 577 if (U_FAILURE(*fStatus)) {
michael@0 578 goto ExitBuildSTdeleteall;
michael@0 579 }
michael@0 580 initialState->fPositions = new UVector(*fStatus);
michael@0 581 if (initialState->fPositions == NULL) {
michael@0 582 *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 583 }
michael@0 584 if (U_FAILURE(*fStatus)) {
michael@0 585 goto ExitBuildSTdeleteall;
michael@0 586 }
michael@0 587 setAdd(initialState->fPositions, fTree->fFirstPosSet);
michael@0 588 fDStates->addElement(initialState, *fStatus);
michael@0 589 if (U_FAILURE(*fStatus)) {
michael@0 590 goto ExitBuildSTdeleteall;
michael@0 591 }
michael@0 592
michael@0 593 // while there is an unmarked state T in Dstates do begin
michael@0 594 for (;;) {
michael@0 595 RBBIStateDescriptor *T = NULL;
michael@0 596 int32_t tx;
michael@0 597 for (tx=1; tx<fDStates->size(); tx++) {
michael@0 598 RBBIStateDescriptor *temp;
michael@0 599 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
michael@0 600 if (temp->fMarked == FALSE) {
michael@0 601 T = temp;
michael@0 602 break;
michael@0 603 }
michael@0 604 }
michael@0 605 if (T == NULL) {
michael@0 606 break;
michael@0 607 }
michael@0 608
michael@0 609 // mark T;
michael@0 610 T->fMarked = TRUE;
michael@0 611
michael@0 612 // for each input symbol a do begin
michael@0 613 int32_t a;
michael@0 614 for (a = 1; a<=lastInputSymbol; a++) {
michael@0 615 // let U be the set of positions that are in followpos(p)
michael@0 616 // for some position p in T
michael@0 617 // such that the symbol at position p is a;
michael@0 618 UVector *U = NULL;
michael@0 619 RBBINode *p;
michael@0 620 int32_t px;
michael@0 621 for (px=0; px<T->fPositions->size(); px++) {
michael@0 622 p = (RBBINode *)T->fPositions->elementAt(px);
michael@0 623 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
michael@0 624 if (U == NULL) {
michael@0 625 U = new UVector(*fStatus);
michael@0 626 if (U == NULL) {
michael@0 627 *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 628 goto ExitBuildSTdeleteall;
michael@0 629 }
michael@0 630 }
michael@0 631 setAdd(U, p->fFollowPos);
michael@0 632 }
michael@0 633 }
michael@0 634
michael@0 635 // if U is not empty and not in DStates then
michael@0 636 int32_t ux = 0;
michael@0 637 UBool UinDstates = FALSE;
michael@0 638 if (U != NULL) {
michael@0 639 U_ASSERT(U->size() > 0);
michael@0 640 int ix;
michael@0 641 for (ix=0; ix<fDStates->size(); ix++) {
michael@0 642 RBBIStateDescriptor *temp2;
michael@0 643 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
michael@0 644 if (setEquals(U, temp2->fPositions)) {
michael@0 645 delete U;
michael@0 646 U = temp2->fPositions;
michael@0 647 ux = ix;
michael@0 648 UinDstates = TRUE;
michael@0 649 break;
michael@0 650 }
michael@0 651 }
michael@0 652
michael@0 653 // Add U as an unmarked state to Dstates
michael@0 654 if (!UinDstates)
michael@0 655 {
michael@0 656 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
michael@0 657 if (newState == NULL) {
michael@0 658 *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 659 }
michael@0 660 if (U_FAILURE(*fStatus)) {
michael@0 661 goto ExitBuildSTdeleteall;
michael@0 662 }
michael@0 663 newState->fPositions = U;
michael@0 664 fDStates->addElement(newState, *fStatus);
michael@0 665 if (U_FAILURE(*fStatus)) {
michael@0 666 return;
michael@0 667 }
michael@0 668 ux = fDStates->size()-1;
michael@0 669 }
michael@0 670
michael@0 671 // Dtran[T, a] := U;
michael@0 672 T->fDtran->setElementAt(ux, a);
michael@0 673 }
michael@0 674 }
michael@0 675 }
michael@0 676 return;
michael@0 677 // delete local pointers only if error occured.
michael@0 678 ExitBuildSTdeleteall:
michael@0 679 delete initialState;
michael@0 680 delete failState;
michael@0 681 }
michael@0 682
michael@0 683
michael@0 684
michael@0 685 //-----------------------------------------------------------------------------
michael@0 686 //
michael@0 687 // flagAcceptingStates Identify accepting states.
michael@0 688 // First get a list of all of the end marker nodes.
michael@0 689 // Then, for each state s,
michael@0 690 // if s contains one of the end marker nodes in its list of tree positions then
michael@0 691 // s is an accepting state.
michael@0 692 //
michael@0 693 //-----------------------------------------------------------------------------
michael@0 694 void RBBITableBuilder::flagAcceptingStates() {
michael@0 695 if (U_FAILURE(*fStatus)) {
michael@0 696 return;
michael@0 697 }
michael@0 698 UVector endMarkerNodes(*fStatus);
michael@0 699 RBBINode *endMarker;
michael@0 700 int32_t i;
michael@0 701 int32_t n;
michael@0 702
michael@0 703 if (U_FAILURE(*fStatus)) {
michael@0 704 return;
michael@0 705 }
michael@0 706
michael@0 707 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
michael@0 708 if (U_FAILURE(*fStatus)) {
michael@0 709 return;
michael@0 710 }
michael@0 711
michael@0 712 for (i=0; i<endMarkerNodes.size(); i++) {
michael@0 713 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
michael@0 714 for (n=0; n<fDStates->size(); n++) {
michael@0 715 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
michael@0 716 if (sd->fPositions->indexOf(endMarker) >= 0) {
michael@0 717 // Any non-zero value for fAccepting means this is an accepting node.
michael@0 718 // The value is what will be returned to the user as the break status.
michael@0 719 // If no other value was specified, force it to -1.
michael@0 720
michael@0 721 if (sd->fAccepting==0) {
michael@0 722 // State hasn't been marked as accepting yet. Do it now.
michael@0 723 sd->fAccepting = endMarker->fVal;
michael@0 724 if (sd->fAccepting == 0) {
michael@0 725 sd->fAccepting = -1;
michael@0 726 }
michael@0 727 }
michael@0 728 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
michael@0 729 // Both lookahead and non-lookahead accepting for this state.
michael@0 730 // Favor the look-ahead. Expedient for line break.
michael@0 731 // TODO: need a more elegant resolution for conflicting rules.
michael@0 732 sd->fAccepting = endMarker->fVal;
michael@0 733 }
michael@0 734 // implicit else:
michael@0 735 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
michael@0 736
michael@0 737 // If the end marker node is from a look-ahead rule, set
michael@0 738 // the fLookAhead field or this state also.
michael@0 739 if (endMarker->fLookAheadEnd) {
michael@0 740 // TODO: don't change value if already set?
michael@0 741 // TODO: allow for more than one active look-ahead rule in engine.
michael@0 742 // Make value here an index to a side array in engine?
michael@0 743 sd->fLookAhead = sd->fAccepting;
michael@0 744 }
michael@0 745 }
michael@0 746 }
michael@0 747 }
michael@0 748 }
michael@0 749
michael@0 750
michael@0 751 //-----------------------------------------------------------------------------
michael@0 752 //
michael@0 753 // flagLookAheadStates Very similar to flagAcceptingStates, above.
michael@0 754 //
michael@0 755 //-----------------------------------------------------------------------------
michael@0 756 void RBBITableBuilder::flagLookAheadStates() {
michael@0 757 if (U_FAILURE(*fStatus)) {
michael@0 758 return;
michael@0 759 }
michael@0 760 UVector lookAheadNodes(*fStatus);
michael@0 761 RBBINode *lookAheadNode;
michael@0 762 int32_t i;
michael@0 763 int32_t n;
michael@0 764
michael@0 765 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
michael@0 766 if (U_FAILURE(*fStatus)) {
michael@0 767 return;
michael@0 768 }
michael@0 769 for (i=0; i<lookAheadNodes.size(); i++) {
michael@0 770 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
michael@0 771
michael@0 772 for (n=0; n<fDStates->size(); n++) {
michael@0 773 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
michael@0 774 if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
michael@0 775 sd->fLookAhead = lookAheadNode->fVal;
michael@0 776 }
michael@0 777 }
michael@0 778 }
michael@0 779 }
michael@0 780
michael@0 781
michael@0 782
michael@0 783
michael@0 784 //-----------------------------------------------------------------------------
michael@0 785 //
michael@0 786 // flagTaggedStates
michael@0 787 //
michael@0 788 //-----------------------------------------------------------------------------
michael@0 789 void RBBITableBuilder::flagTaggedStates() {
michael@0 790 if (U_FAILURE(*fStatus)) {
michael@0 791 return;
michael@0 792 }
michael@0 793 UVector tagNodes(*fStatus);
michael@0 794 RBBINode *tagNode;
michael@0 795 int32_t i;
michael@0 796 int32_t n;
michael@0 797
michael@0 798 if (U_FAILURE(*fStatus)) {
michael@0 799 return;
michael@0 800 }
michael@0 801 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
michael@0 802 if (U_FAILURE(*fStatus)) {
michael@0 803 return;
michael@0 804 }
michael@0 805 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
michael@0 806 tagNode = (RBBINode *)tagNodes.elementAt(i);
michael@0 807
michael@0 808 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
michael@0 809 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
michael@0 810 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
michael@0 811 sortedAdd(&sd->fTagVals, tagNode->fVal);
michael@0 812 }
michael@0 813 }
michael@0 814 }
michael@0 815 }
michael@0 816
michael@0 817
michael@0 818
michael@0 819
michael@0 820 //-----------------------------------------------------------------------------
michael@0 821 //
michael@0 822 // mergeRuleStatusVals
michael@0 823 //
michael@0 824 // Update the global table of rule status {tag} values
michael@0 825 // The rule builder has a global vector of status values that are common
michael@0 826 // for all tables. Merge the ones from this table into the global set.
michael@0 827 //
michael@0 828 //-----------------------------------------------------------------------------
michael@0 829 void RBBITableBuilder::mergeRuleStatusVals() {
michael@0 830 //
michael@0 831 // The basic outline of what happens here is this...
michael@0 832 //
michael@0 833 // for each state in this state table
michael@0 834 // if the status tag list for this state is in the global statuses list
michael@0 835 // record where and
michael@0 836 // continue with the next state
michael@0 837 // else
michael@0 838 // add the tag list for this state to the global list.
michael@0 839 //
michael@0 840 int i;
michael@0 841 int n;
michael@0 842
michael@0 843 // Pre-set a single tag of {0} into the table.
michael@0 844 // We will need this as a default, for rule sets with no explicit tagging.
michael@0 845 if (fRB->fRuleStatusVals->size() == 0) {
michael@0 846 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
michael@0 847 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
michael@0 848 }
michael@0 849
michael@0 850 // For each state
michael@0 851 for (n=0; n<fDStates->size(); n++) {
michael@0 852 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
michael@0 853 UVector *thisStatesTagValues = sd->fTagVals;
michael@0 854 if (thisStatesTagValues == NULL) {
michael@0 855 // No tag values are explicitly associated with this state.
michael@0 856 // Set the default tag value.
michael@0 857 sd->fTagsIdx = 0;
michael@0 858 continue;
michael@0 859 }
michael@0 860
michael@0 861 // There are tag(s) associated with this state.
michael@0 862 // fTagsIdx will be the index into the global tag list for this state's tag values.
michael@0 863 // Initial value of -1 flags that we haven't got it set yet.
michael@0 864 sd->fTagsIdx = -1;
michael@0 865 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
michael@0 866 int32_t nextTagGroupStart = 0;
michael@0 867
michael@0 868 // Loop runs once per group of tags in the global list
michael@0 869 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
michael@0 870 thisTagGroupStart = nextTagGroupStart;
michael@0 871 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
michael@0 872 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
michael@0 873 // The number of tags for this state is different from
michael@0 874 // the number of tags in this group from the global list.
michael@0 875 // Continue with the next group from the global list.
michael@0 876 continue;
michael@0 877 }
michael@0 878 // The lengths match, go ahead and compare the actual tag values
michael@0 879 // between this state and the group from the global list.
michael@0 880 for (i=0; i<thisStatesTagValues->size(); i++) {
michael@0 881 if (thisStatesTagValues->elementAti(i) !=
michael@0 882 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
michael@0 883 // Mismatch.
michael@0 884 break;
michael@0 885 }
michael@0 886 }
michael@0 887
michael@0 888 if (i == thisStatesTagValues->size()) {
michael@0 889 // We found a set of tag values in the global list that match
michael@0 890 // those for this state. Use them.
michael@0 891 sd->fTagsIdx = thisTagGroupStart;
michael@0 892 break;
michael@0 893 }
michael@0 894 }
michael@0 895
michael@0 896 if (sd->fTagsIdx == -1) {
michael@0 897 // No suitable entry in the global tag list already. Add one
michael@0 898 sd->fTagsIdx = fRB->fRuleStatusVals->size();
michael@0 899 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
michael@0 900 for (i=0; i<thisStatesTagValues->size(); i++) {
michael@0 901 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
michael@0 902 }
michael@0 903 }
michael@0 904 }
michael@0 905 }
michael@0 906
michael@0 907
michael@0 908
michael@0 909
michael@0 910
michael@0 911
michael@0 912
michael@0 913 //-----------------------------------------------------------------------------
michael@0 914 //
michael@0 915 // sortedAdd Add a value to a vector of sorted values (ints).
michael@0 916 // Do not replicate entries; if the value is already there, do not
michael@0 917 // add a second one.
michael@0 918 // Lazily create the vector if it does not already exist.
michael@0 919 //
michael@0 920 //-----------------------------------------------------------------------------
michael@0 921 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
michael@0 922 int32_t i;
michael@0 923
michael@0 924 if (*vector == NULL) {
michael@0 925 *vector = new UVector(*fStatus);
michael@0 926 }
michael@0 927 if (*vector == NULL || U_FAILURE(*fStatus)) {
michael@0 928 return;
michael@0 929 }
michael@0 930 UVector *vec = *vector;
michael@0 931 int32_t vSize = vec->size();
michael@0 932 for (i=0; i<vSize; i++) {
michael@0 933 int32_t valAtI = vec->elementAti(i);
michael@0 934 if (valAtI == val) {
michael@0 935 // The value is already in the vector. Don't add it again.
michael@0 936 return;
michael@0 937 }
michael@0 938 if (valAtI > val) {
michael@0 939 break;
michael@0 940 }
michael@0 941 }
michael@0 942 vec->insertElementAt(val, i, *fStatus);
michael@0 943 }
michael@0 944
michael@0 945
michael@0 946
michael@0 947 //-----------------------------------------------------------------------------
michael@0 948 //
michael@0 949 // setAdd Set operation on UVector
michael@0 950 // dest = dest union source
michael@0 951 // Elements may only appear once and must be sorted.
michael@0 952 //
michael@0 953 //-----------------------------------------------------------------------------
michael@0 954 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
michael@0 955 int32_t destOriginalSize = dest->size();
michael@0 956 int32_t sourceSize = source->size();
michael@0 957 int32_t di = 0;
michael@0 958 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
michael@0 959 void **destPtr, **sourcePtr;
michael@0 960 void **destLim, **sourceLim;
michael@0 961
michael@0 962 if (destOriginalSize > destArray.getCapacity()) {
michael@0 963 if (destArray.resize(destOriginalSize) == NULL) {
michael@0 964 return;
michael@0 965 }
michael@0 966 }
michael@0 967 destPtr = destArray.getAlias();
michael@0 968 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
michael@0 969
michael@0 970 if (sourceSize > sourceArray.getCapacity()) {
michael@0 971 if (sourceArray.resize(sourceSize) == NULL) {
michael@0 972 return;
michael@0 973 }
michael@0 974 }
michael@0 975 sourcePtr = sourceArray.getAlias();
michael@0 976 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
michael@0 977
michael@0 978 // Avoid multiple "get element" calls by getting the contents into arrays
michael@0 979 (void) dest->toArray(destPtr);
michael@0 980 (void) source->toArray(sourcePtr);
michael@0 981
michael@0 982 dest->setSize(sourceSize+destOriginalSize, *fStatus);
michael@0 983
michael@0 984 while (sourcePtr < sourceLim && destPtr < destLim) {
michael@0 985 if (*destPtr == *sourcePtr) {
michael@0 986 dest->setElementAt(*sourcePtr++, di++);
michael@0 987 destPtr++;
michael@0 988 }
michael@0 989 // This check is required for machines with segmented memory, like i5/OS.
michael@0 990 // Direct pointer comparison is not recommended.
michael@0 991 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
michael@0 992 dest->setElementAt(*destPtr++, di++);
michael@0 993 }
michael@0 994 else { /* *sourcePtr < *destPtr */
michael@0 995 dest->setElementAt(*sourcePtr++, di++);
michael@0 996 }
michael@0 997 }
michael@0 998
michael@0 999 // At most one of these two cleanup loops will execute
michael@0 1000 while (destPtr < destLim) {
michael@0 1001 dest->setElementAt(*destPtr++, di++);
michael@0 1002 }
michael@0 1003 while (sourcePtr < sourceLim) {
michael@0 1004 dest->setElementAt(*sourcePtr++, di++);
michael@0 1005 }
michael@0 1006
michael@0 1007 dest->setSize(di, *fStatus);
michael@0 1008 }
michael@0 1009
michael@0 1010
michael@0 1011
michael@0 1012 //-----------------------------------------------------------------------------
michael@0 1013 //
michael@0 1014 // setEqual Set operation on UVector.
michael@0 1015 // Compare for equality.
michael@0 1016 // Elements must be sorted.
michael@0 1017 //
michael@0 1018 //-----------------------------------------------------------------------------
michael@0 1019 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
michael@0 1020 return a->equals(*b);
michael@0 1021 }
michael@0 1022
michael@0 1023
michael@0 1024 //-----------------------------------------------------------------------------
michael@0 1025 //
michael@0 1026 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
michael@0 1027 // for each node in the tree.
michael@0 1028 //
michael@0 1029 //-----------------------------------------------------------------------------
michael@0 1030 #ifdef RBBI_DEBUG
michael@0 1031 void RBBITableBuilder::printPosSets(RBBINode *n) {
michael@0 1032 if (n==NULL) {
michael@0 1033 return;
michael@0 1034 }
michael@0 1035 n->printNode();
michael@0 1036 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
michael@0 1037
michael@0 1038 RBBIDebugPrintf(" firstpos: ");
michael@0 1039 printSet(n->fFirstPosSet);
michael@0 1040
michael@0 1041 RBBIDebugPrintf(" lastpos: ");
michael@0 1042 printSet(n->fLastPosSet);
michael@0 1043
michael@0 1044 RBBIDebugPrintf(" followpos: ");
michael@0 1045 printSet(n->fFollowPos);
michael@0 1046
michael@0 1047 printPosSets(n->fLeftChild);
michael@0 1048 printPosSets(n->fRightChild);
michael@0 1049 }
michael@0 1050 #endif
michael@0 1051
michael@0 1052
michael@0 1053
michael@0 1054 //-----------------------------------------------------------------------------
michael@0 1055 //
michael@0 1056 // getTableSize() Calculate the size of the runtime form of this
michael@0 1057 // state transition table.
michael@0 1058 //
michael@0 1059 //-----------------------------------------------------------------------------
michael@0 1060 int32_t RBBITableBuilder::getTableSize() const {
michael@0 1061 int32_t size = 0;
michael@0 1062 int32_t numRows;
michael@0 1063 int32_t numCols;
michael@0 1064 int32_t rowSize;
michael@0 1065
michael@0 1066 if (fTree == NULL) {
michael@0 1067 return 0;
michael@0 1068 }
michael@0 1069
michael@0 1070 size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table.
michael@0 1071
michael@0 1072 numRows = fDStates->size();
michael@0 1073 numCols = fRB->fSetBuilder->getNumCharCategories();
michael@0 1074
michael@0 1075 // Note The declaration of RBBIStateTableRow is for a table of two columns.
michael@0 1076 // Therefore we subtract two from numCols when determining
michael@0 1077 // how much storage to add to a row for the total columns.
michael@0 1078 rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
michael@0 1079 size += numRows * rowSize;
michael@0 1080 return size;
michael@0 1081 }
michael@0 1082
michael@0 1083
michael@0 1084
michael@0 1085 //-----------------------------------------------------------------------------
michael@0 1086 //
michael@0 1087 // exportTable() export the state transition table in the format required
michael@0 1088 // by the runtime engine. getTableSize() bytes of memory
michael@0 1089 // must be available at the output address "where".
michael@0 1090 //
michael@0 1091 //-----------------------------------------------------------------------------
michael@0 1092 void RBBITableBuilder::exportTable(void *where) {
michael@0 1093 RBBIStateTable *table = (RBBIStateTable *)where;
michael@0 1094 uint32_t state;
michael@0 1095 int col;
michael@0 1096
michael@0 1097 if (U_FAILURE(*fStatus) || fTree == NULL) {
michael@0 1098 return;
michael@0 1099 }
michael@0 1100
michael@0 1101 if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
michael@0 1102 fDStates->size() > 0x7fff) {
michael@0 1103 *fStatus = U_BRK_INTERNAL_ERROR;
michael@0 1104 return;
michael@0 1105 }
michael@0 1106
michael@0 1107 table->fRowLen = sizeof(RBBIStateTableRow) +
michael@0 1108 sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
michael@0 1109 table->fNumStates = fDStates->size();
michael@0 1110 table->fFlags = 0;
michael@0 1111 if (fRB->fLookAheadHardBreak) {
michael@0 1112 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
michael@0 1113 }
michael@0 1114 if (fRB->fSetBuilder->sawBOF()) {
michael@0 1115 table->fFlags |= RBBI_BOF_REQUIRED;
michael@0 1116 }
michael@0 1117 table->fReserved = 0;
michael@0 1118
michael@0 1119 for (state=0; state<table->fNumStates; state++) {
michael@0 1120 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
michael@0 1121 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
michael@0 1122 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
michael@0 1123 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
michael@0 1124 row->fAccepting = (int16_t)sd->fAccepting;
michael@0 1125 row->fLookAhead = (int16_t)sd->fLookAhead;
michael@0 1126 row->fTagIdx = (int16_t)sd->fTagsIdx;
michael@0 1127 for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
michael@0 1128 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
michael@0 1129 }
michael@0 1130 }
michael@0 1131 }
michael@0 1132
michael@0 1133
michael@0 1134
michael@0 1135 //-----------------------------------------------------------------------------
michael@0 1136 //
michael@0 1137 // printSet Debug function. Print the contents of a UVector
michael@0 1138 //
michael@0 1139 //-----------------------------------------------------------------------------
michael@0 1140 #ifdef RBBI_DEBUG
michael@0 1141 void RBBITableBuilder::printSet(UVector *s) {
michael@0 1142 int32_t i;
michael@0 1143 for (i=0; i<s->size(); i++) {
michael@0 1144 void *v = s->elementAt(i);
michael@0 1145 RBBIDebugPrintf("%10p", v);
michael@0 1146 }
michael@0 1147 RBBIDebugPrintf("\n");
michael@0 1148 }
michael@0 1149 #endif
michael@0 1150
michael@0 1151
michael@0 1152 //-----------------------------------------------------------------------------
michael@0 1153 //
michael@0 1154 // printStates Debug Function. Dump the fully constructed state transition table.
michael@0 1155 //
michael@0 1156 //-----------------------------------------------------------------------------
michael@0 1157 #ifdef RBBI_DEBUG
michael@0 1158 void RBBITableBuilder::printStates() {
michael@0 1159 int c; // input "character"
michael@0 1160 int n; // state number
michael@0 1161
michael@0 1162 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
michael@0 1163 RBBIDebugPrintf(" | Acc LA Tag");
michael@0 1164 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
michael@0 1165 RBBIDebugPrintf(" %2d", c);
michael@0 1166 }
michael@0 1167 RBBIDebugPrintf("\n");
michael@0 1168 RBBIDebugPrintf(" |---------------");
michael@0 1169 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
michael@0 1170 RBBIDebugPrintf("---");
michael@0 1171 }
michael@0 1172 RBBIDebugPrintf("\n");
michael@0 1173
michael@0 1174 for (n=0; n<fDStates->size(); n++) {
michael@0 1175 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
michael@0 1176 RBBIDebugPrintf(" %3d | " , n);
michael@0 1177 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
michael@0 1178 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
michael@0 1179 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
michael@0 1180 }
michael@0 1181 RBBIDebugPrintf("\n");
michael@0 1182 }
michael@0 1183 RBBIDebugPrintf("\n\n");
michael@0 1184 }
michael@0 1185 #endif
michael@0 1186
michael@0 1187
michael@0 1188
michael@0 1189 //-----------------------------------------------------------------------------
michael@0 1190 //
michael@0 1191 // printRuleStatusTable Debug Function. Dump the common rule status table
michael@0 1192 //
michael@0 1193 //-----------------------------------------------------------------------------
michael@0 1194 #ifdef RBBI_DEBUG
michael@0 1195 void RBBITableBuilder::printRuleStatusTable() {
michael@0 1196 int32_t thisRecord = 0;
michael@0 1197 int32_t nextRecord = 0;
michael@0 1198 int i;
michael@0 1199 UVector *tbl = fRB->fRuleStatusVals;
michael@0 1200
michael@0 1201 RBBIDebugPrintf("index | tags \n");
michael@0 1202 RBBIDebugPrintf("-------------------\n");
michael@0 1203
michael@0 1204 while (nextRecord < tbl->size()) {
michael@0 1205 thisRecord = nextRecord;
michael@0 1206 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
michael@0 1207 RBBIDebugPrintf("%4d ", thisRecord);
michael@0 1208 for (i=thisRecord+1; i<nextRecord; i++) {
michael@0 1209 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
michael@0 1210 }
michael@0 1211 RBBIDebugPrintf("\n");
michael@0 1212 }
michael@0 1213 RBBIDebugPrintf("\n\n");
michael@0 1214 }
michael@0 1215 #endif
michael@0 1216
michael@0 1217
michael@0 1218 //-----------------------------------------------------------------------------
michael@0 1219 //
michael@0 1220 // RBBIStateDescriptor Methods. This is a very struct-like class
michael@0 1221 // Most access is directly to the fields.
michael@0 1222 //
michael@0 1223 //-----------------------------------------------------------------------------
michael@0 1224
michael@0 1225 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
michael@0 1226 fMarked = FALSE;
michael@0 1227 fAccepting = 0;
michael@0 1228 fLookAhead = 0;
michael@0 1229 fTagsIdx = 0;
michael@0 1230 fTagVals = NULL;
michael@0 1231 fPositions = NULL;
michael@0 1232 fDtran = NULL;
michael@0 1233
michael@0 1234 fDtran = new UVector(lastInputSymbol+1, *fStatus);
michael@0 1235 if (U_FAILURE(*fStatus)) {
michael@0 1236 return;
michael@0 1237 }
michael@0 1238 if (fDtran == NULL) {
michael@0 1239 *fStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 1240 return;
michael@0 1241 }
michael@0 1242 fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized.
michael@0 1243 // It is indexed by input symbols, and will
michael@0 1244 // hold the next state number for each
michael@0 1245 // symbol.
michael@0 1246 }
michael@0 1247
michael@0 1248
michael@0 1249 RBBIStateDescriptor::~RBBIStateDescriptor() {
michael@0 1250 delete fPositions;
michael@0 1251 delete fDtran;
michael@0 1252 delete fTagVals;
michael@0 1253 fPositions = NULL;
michael@0 1254 fDtran = NULL;
michael@0 1255 fTagVals = NULL;
michael@0 1256 }
michael@0 1257
michael@0 1258 U_NAMESPACE_END
michael@0 1259
michael@0 1260 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial