The Tor Browser: intl/icu/source/common/rbbiscan.cpp@b8a032363ba2

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     2 //

     3 //  file:  rbbiscan.cpp

     4 //

     5 //  Copyright (C) 2002-2012, International Business Machines Corporation and others.

     6 //  All Rights Reserved.

     7 //

     8 //  This file contains the Rule Based Break Iterator Rule Builder functions for

     9 //   scanning the rules and assembling a parse tree.  This is the first phase

    10 //   of compiling the rules.

    11 //

    12 //  The overall of the rules is managed by class RBBIRuleBuilder, which will

    13 //  create and use an instance of this class as part of the process.

    14 //

    16 #include "unicode/utypes.h"

    18 #if !UCONFIG_NO_BREAK_ITERATION

    20 #include "unicode/unistr.h"

    21 #include "unicode/uniset.h"

    22 #include "unicode/uchar.h"

    23 #include "unicode/uchriter.h"

    24 #include "unicode/parsepos.h"

    25 #include "unicode/parseerr.h"

    26 #include "cmemory.h"

    27 #include "cstring.h"

    29 #include "rbbirpt.h"   // Contains state table for the rbbi rules parser.

    30                        //   generated by a Perl script.

    31 #include "rbbirb.h"

    32 #include "rbbinode.h"

    33 #include "rbbiscan.h"

    34 #include "rbbitblb.h"

    36 #include "uassert.h"

    38 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

    40 //------------------------------------------------------------------------------

    41 //

    42 // Unicode Set init strings for each of the character classes needed for parsing a rule file.

    43 //               (Initialized with hex values for portability to EBCDIC based machines.

    44 //                Really ugly, but there's no good way to avoid it.)

    45 //

    46 //              The sets are referred to by name in the rbbirpt.txt, which is the

    47 //              source form of the state transition table for the RBBI rule parser.

    48 //

    49 //------------------------------------------------------------------------------

    50 static const UChar gRuleSet_rule_char_pattern[]       = {

    51  //   [    ^      [    \     p     {      Z     }     \     u    0      0    2      0

    52     0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,

    53  //   -    \      u    0     0     7      f     ]     -     [    \      p

    54     0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,

    55  //   {     L     }    ]     -     [      \     p     {     N    }      ]     ]

    56     0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};

    58 static const UChar gRuleSet_name_char_pattern[]       = {

    59 //    [    _      \    p     {     L      }     \     p     {    N      }     ]

    60     0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};

    62 static const UChar gRuleSet_digit_char_pattern[] = {

    63 //    [    0      -    9     ]

    64     0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};

    66 static const UChar gRuleSet_name_start_char_pattern[] = {

    67 //    [    _      \    p     {     L      }     ]

    68     0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };

    70 static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00};  // "any"

    73 U_CDECL_BEGIN

    74 static void U_CALLCONV RBBISetTable_deleter(void *p) {

    75     icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p;

    76     delete px->key;

    77     // Note:  px->val is owned by the linked list "fSetsListHead" in scanner.

    78     //        Don't delete the value nodes here.

    79     uprv_free(px);

    80 }

    81 U_CDECL_END

    83 U_NAMESPACE_BEGIN

    85 //------------------------------------------------------------------------------

    86 //

    87 //  Constructor.

    88 //

    89 //------------------------------------------------------------------------------

    90 RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)

    91 {

    92     fRB                 = rb;

    93     fStackPtr           = 0;

    94     fStack[fStackPtr]   = 0;

    95     fNodeStackPtr       = 0;

    96     fRuleNum            = 0;

    97     fNodeStack[0]       = NULL;

    99     fSymbolTable                            = NULL;

   100     fSetTable                               = NULL;

   102     fScanIndex = 0;

   103     fNextIndex = 0;

   105     fReverseRule        = FALSE;

   106     fLookAheadRule      = FALSE;

   108     fLineNum    = 1;

   109     fCharNum    = 0;

   110     fQuoteMode  = FALSE;

   112     // Do not check status until after all critical fields are sufficiently initialized

   113     //   that the destructor can run cleanly.

   114     if (U_FAILURE(*rb->fStatus)) {

   115         return;

   116     }

   118     //

   119     //  Set up the constant Unicode Sets.

   120     //     Note:  These could be made static, lazily initialized, and shared among

   121     //            all instances of RBBIRuleScanners.  BUT this is quite a bit simpler,

   122     //            and the time to build these few sets should be small compared to a

   123     //            full break iterator build.

   124     fRuleSets[kRuleSet_rule_char-128]

   125         = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern),       *rb->fStatus);

   126     // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]

   127     fRuleSets[kRuleSet_white_space-128].

   128         add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);

   129     fRuleSets[kRuleSet_name_char-128]

   130         = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern),       *rb->fStatus);

   131     fRuleSets[kRuleSet_name_start_char-128]

   132         = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus);

   133     fRuleSets[kRuleSet_digit_char-128]

   134         = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern),      *rb->fStatus);

   135     if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {

   136         // This case happens if ICU's data is missing.  UnicodeSet tries to look up property

   137         //   names from the init string, can't find them, and claims an illegal argument.

   138         //   Change the error so that the actual problem will be clearer to users.

   139         *rb->fStatus = U_BRK_INIT_ERROR;

   140     }

   141     if (U_FAILURE(*rb->fStatus)) {

   142         return;

   143     }

   145     fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);

   146     if (fSymbolTable == NULL) {

   147         *rb->fStatus = U_MEMORY_ALLOCATION_ERROR;

   148         return;

   149     }

   150     fSetTable    = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus);

   151     if (U_FAILURE(*rb->fStatus)) {

   152         return;

   153     }

   154     uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);

   155 }

   159 //------------------------------------------------------------------------------

   160 //

   161 //  Destructor

   162 //

   163 //------------------------------------------------------------------------------

   164 RBBIRuleScanner::~RBBIRuleScanner() {

   165     delete fSymbolTable;

   166     if (fSetTable != NULL) {

   167          uhash_close(fSetTable);

   168          fSetTable = NULL;

   170     }

   173     // Node Stack.

   174     //   Normally has one entry, which is the entire parse tree for the rules.

   175     //   If errors occured, there may be additional subtrees left on the stack.

   176     while (fNodeStackPtr > 0) {

   177         delete fNodeStack[fNodeStackPtr];

   178         fNodeStackPtr--;

   179     }

   181 }

   183 //------------------------------------------------------------------------------

   184 //

   185 //  doParseAction        Do some action during rule parsing.

   186 //                       Called by the parse state machine.

   187 //                       Actions build the parse tree and Unicode Sets,

   188 //                       and maintain the parse stack for nested expressions.

   189 //

   190 //                       TODO:  unify EParseAction and RBBI_RuleParseAction enum types.

   191 //                              They represent exactly the same thing.  They're separate

   192 //                              only to work around enum forward declaration restrictions

   193 //                              in some compilers, while at the same time avoiding multiple

   194 //                              definitions problems.  I'm sure that there's a better way.

   195 //

   196 //------------------------------------------------------------------------------

   197 UBool RBBIRuleScanner::doParseActions(int32_t action)

   198 {

   199     RBBINode *n       = NULL;

   201     UBool   returnVal = TRUE;

   203     switch (action) {

   205     case doExprStart:

   206         pushNewNode(RBBINode::opStart);

   207         fRuleNum++;

   208         break;

   211     case doExprOrOperator:

   212         {

   213             fixOpStack(RBBINode::precOpCat);

   214             RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];

   215             RBBINode  *orNode      = pushNewNode(RBBINode::opOr);

   216             orNode->fLeftChild     = operandNode;

   217             operandNode->fParent   = orNode;

   218         }

   219         break;

   221     case doExprCatOperator:

   222         // concatenation operator.

   223         // For the implicit concatenation of adjacent terms in an expression that are

   224         //   not separated by any other operator.  Action is invoked between the

   225         //   actions for the two terms.

   226         {

   227             fixOpStack(RBBINode::precOpCat);

   228             RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];

   229             RBBINode  *catNode     = pushNewNode(RBBINode::opCat);

   230             catNode->fLeftChild    = operandNode;

   231             operandNode->fParent   = catNode;

   232         }

   233         break;

   235     case doLParen:

   236         // Open Paren.

   237         //   The openParen node is a dummy operation type with a low precedence,

   238         //     which has the affect of ensuring that any real binary op that

   239         //     follows within the parens binds more tightly to the operands than

   240         //     stuff outside of the parens.

   241         pushNewNode(RBBINode::opLParen);

   242         break;

   244     case doExprRParen:

   245         fixOpStack(RBBINode::precLParen);

   246         break;

   248     case doNOP:

   249         break;

   251     case doStartAssign:

   252         // We've just scanned "$variable = "

   253         // The top of the node stack has the $variable ref node.

   255         // Save the start position of the RHS text in the StartExpression node

   256         //   that precedes the $variableReference node on the stack.

   257         //   This will eventually be used when saving the full $variable replacement

   258         //   text as a string.

   259         n = fNodeStack[fNodeStackPtr-1];

   260         n->fFirstPos = fNextIndex;              // move past the '='

   262         // Push a new start-of-expression node; needed to keep parse of the

   263         //   RHS expression happy.

   264         pushNewNode(RBBINode::opStart);

   265         break;

   270     case doEndAssign:

   271         {

   272             // We have reached the end of an assignement statement.

   273             //   Current scan char is the ';' that terminates the assignment.

   275             // Terminate expression, leaves expression parse tree rooted in TOS node.

   276             fixOpStack(RBBINode::precStart);

   278             RBBINode *startExprNode  = fNodeStack[fNodeStackPtr-2];

   279             RBBINode *varRefNode     = fNodeStack[fNodeStackPtr-1];

   280             RBBINode *RHSExprNode    = fNodeStack[fNodeStackPtr];

   282             // Save original text of right side of assignment, excluding the terminating ';'

   283             //  in the root of the node for the right-hand-side expression.

   284             RHSExprNode->fFirstPos = startExprNode->fFirstPos;

   285             RHSExprNode->fLastPos  = fScanIndex;

   286             fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);

   288             // Expression parse tree becomes l. child of the $variable reference node.

   289             varRefNode->fLeftChild = RHSExprNode;

   290             RHSExprNode->fParent   = varRefNode;

   292             // Make a symbol table entry for the $variableRef node.

   293             fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);

   294             if (U_FAILURE(*fRB->fStatus)) {

   295                 // This is a round-about way to get the parse position set

   296                 //  so that duplicate symbols error messages include a line number.

   297                 UErrorCode t = *fRB->fStatus;

   298                 *fRB->fStatus = U_ZERO_ERROR;

   299                 error(t);

   300             }

   302             // Clean up the stack.

   303             delete startExprNode;

   304             fNodeStackPtr-=3;

   305             break;

   306         }

   308     case doEndOfRule:

   309         {

   310         fixOpStack(RBBINode::precStart);      // Terminate expression, leaves expression

   311         if (U_FAILURE(*fRB->fStatus)) {       //   parse tree rooted in TOS node.

   312             break;

   313         }

   314 #ifdef RBBI_DEBUG

   315         if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}

   316 #endif

   317         U_ASSERT(fNodeStackPtr == 1);

   319         // If this rule includes a look-ahead '/', add a endMark node to the

   320         //   expression tree.

   321         if (fLookAheadRule) {

   322             RBBINode  *thisRule       = fNodeStack[fNodeStackPtr];

   323             RBBINode  *endNode        = pushNewNode(RBBINode::endMark);

   324             RBBINode  *catNode        = pushNewNode(RBBINode::opCat);

   325             fNodeStackPtr -= 2;

   326             catNode->fLeftChild       = thisRule;

   327             catNode->fRightChild      = endNode;

   328             fNodeStack[fNodeStackPtr] = catNode;

   329             endNode->fVal             = fRuleNum;

   330             endNode->fLookAheadEnd    = TRUE;

   331         }

   333         // All rule expressions are ORed together.

   334         // The ';' that terminates an expression really just functions as a '|' with

   335         //   a low operator prededence.

   336         //

   337         // Each of the four sets of rules are collected separately.

   338         //  (forward, reverse, safe_forward, safe_reverse)

   339         //  OR this rule into the appropriate group of them.

   340         //

   341         RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);

   343         if (*destRules != NULL) {

   344             // This is not the first rule encounted.

   345             // OR previous stuff  (from *destRules)

   346             // with the current rule expression (on the Node Stack)

   347             //  with the resulting OR expression going to *destRules

   348             //

   349             RBBINode  *thisRule    = fNodeStack[fNodeStackPtr];

   350             RBBINode  *prevRules   = *destRules;

   351             RBBINode  *orNode      = pushNewNode(RBBINode::opOr);

   352             orNode->fLeftChild     = prevRules;

   353             prevRules->fParent     = orNode;

   354             orNode->fRightChild    = thisRule;

   355             thisRule->fParent      = orNode;

   356             *destRules             = orNode;

   357         }

   358         else

   359         {

   360             // This is the first rule encountered (for this direction).

   361             // Just move its parse tree from the stack to *destRules.

   362             *destRules = fNodeStack[fNodeStackPtr];

   363         }

   364         fReverseRule   = FALSE;   // in preparation for the next rule.

   365         fLookAheadRule = FALSE;

   366         fNodeStackPtr  = 0;

   367         }

   368         break;

   371     case doRuleError:

   372         error(U_BRK_RULE_SYNTAX);

   373         returnVal = FALSE;

   374         break;

   377     case doVariableNameExpectedErr:

   378         error(U_BRK_RULE_SYNTAX);

   379         break;

   382     //

   383     //  Unary operands  + ? *

   384     //    These all appear after the operand to which they apply.

   385     //    When we hit one, the operand (may be a whole sub expression)

   386     //    will be on the top of the stack.

   387     //    Unary Operator becomes TOS, with the old TOS as its one child.

   388     case doUnaryOpPlus:

   389         {

   390             RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];

   391             RBBINode  *plusNode    = pushNewNode(RBBINode::opPlus);

   392             plusNode->fLeftChild   = operandNode;

   393             operandNode->fParent   = plusNode;

   394         }

   395         break;

   397     case doUnaryOpQuestion:

   398         {

   399             RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];

   400             RBBINode  *qNode       = pushNewNode(RBBINode::opQuestion);

   401             qNode->fLeftChild      = operandNode;

   402             operandNode->fParent   = qNode;

   403         }

   404         break;

   406     case doUnaryOpStar:

   407         {

   408             RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];

   409             RBBINode  *starNode    = pushNewNode(RBBINode::opStar);

   410             starNode->fLeftChild   = operandNode;

   411             operandNode->fParent   = starNode;

   412         }

   413         break;

   415     case doRuleChar:

   416         // A "Rule Character" is any single character that is a literal part

   417         // of the regular expression.  Like a, b and c in the expression "(abc*) | [:L:]"

   418         // These are pretty uncommon in break rules; the terms are more commonly

   419         //  sets.  To keep things uniform, treat these characters like as

   420         // sets that just happen to contain only one character.

   421         {

   422             n = pushNewNode(RBBINode::setRef);

   423             findSetFor(UnicodeString(fC.fChar), n);

   424             n->fFirstPos = fScanIndex;

   425             n->fLastPos  = fNextIndex;

   426             fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

   427             break;

   428         }

   430     case doDotAny:

   431         // scanned a ".", meaning match any single character.

   432         {

   433             n = pushNewNode(RBBINode::setRef);

   434             findSetFor(UnicodeString(TRUE, kAny, 3), n);

   435             n->fFirstPos = fScanIndex;

   436             n->fLastPos  = fNextIndex;

   437             fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

   438             break;

   439         }

   441     case doSlash:

   442         // Scanned a '/', which identifies a look-ahead break position in a rule.

   443         n = pushNewNode(RBBINode::lookAhead);

   444         n->fVal      = fRuleNum;

   445         n->fFirstPos = fScanIndex;

   446         n->fLastPos  = fNextIndex;

   447         fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

   448         fLookAheadRule = TRUE;

   449         break;

   452     case doStartTagValue:

   453         // Scanned a '{', the opening delimiter for a tag value within a rule.

   454         n = pushNewNode(RBBINode::tag);

   455         n->fVal      = 0;

   456         n->fFirstPos = fScanIndex;

   457         n->fLastPos  = fNextIndex;

   458         break;

   460     case doTagDigit:

   461         // Just scanned a decimal digit that's part of a tag value

   462         {

   463             n = fNodeStack[fNodeStackPtr];

   464             uint32_t v = u_charDigitValue(fC.fChar);

   465             U_ASSERT(v < 10);

   466             n->fVal = n->fVal*10 + v;

   467             break;

   468         }

   470     case doTagValue:

   471         n = fNodeStack[fNodeStackPtr];

   472         n->fLastPos = fNextIndex;

   473         fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

   474         break;

   476     case doTagExpectedError:

   477         error(U_BRK_MALFORMED_RULE_TAG);

   478         returnVal = FALSE;

   479         break;

   481     case doOptionStart:

   482         // Scanning a !!option.   At the start of string.

   483         fOptionStart = fScanIndex;

   484         break;

   486     case doOptionEnd:

   487         {

   488             UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);

   489             if (opt == UNICODE_STRING("chain", 5)) {

   490                 fRB->fChainRules = TRUE;

   491             } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {

   492                 fRB->fLBCMNoChain = TRUE;

   493             } else if (opt == UNICODE_STRING("forward", 7)) {

   494                 fRB->fDefaultTree   = &fRB->fForwardTree;

   495             } else if (opt == UNICODE_STRING("reverse", 7)) {

   496                 fRB->fDefaultTree   = &fRB->fReverseTree;

   497             } else if (opt == UNICODE_STRING("safe_forward", 12)) {

   498                 fRB->fDefaultTree   = &fRB->fSafeFwdTree;

   499             } else if (opt == UNICODE_STRING("safe_reverse", 12)) {

   500                 fRB->fDefaultTree   = &fRB->fSafeRevTree;

   501             } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {

   502                 fRB->fLookAheadHardBreak = TRUE;

   503             } else {

   504                 error(U_BRK_UNRECOGNIZED_OPTION);

   505             }

   506         }

   507         break;

   509     case doReverseDir:

   510         fReverseRule = TRUE;

   511         break;

   513     case doStartVariableName:

   514         n = pushNewNode(RBBINode::varRef);

   515         if (U_FAILURE(*fRB->fStatus)) {

   516             break;

   517         }

   518         n->fFirstPos = fScanIndex;

   519         break;

   521     case doEndVariableName:

   522         n = fNodeStack[fNodeStackPtr];

   523         if (n==NULL || n->fType != RBBINode::varRef) {

   524             error(U_BRK_INTERNAL_ERROR);

   525             break;

   526         }

   527         n->fLastPos = fScanIndex;

   528         fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);

   529         // Look the newly scanned name up in the symbol table

   530         //   If there's an entry, set the l. child of the var ref to the replacement expression.

   531         //   (We also pass through here when scanning assignments, but no harm is done, other

   532         //    than a slight wasted effort that seems hard to avoid.  Lookup will be null)

   533         n->fLeftChild = fSymbolTable->lookupNode(n->fText);

   534         break;

   536     case doCheckVarDef:

   537         n = fNodeStack[fNodeStackPtr];

   538         if (n->fLeftChild == NULL) {

   539             error(U_BRK_UNDEFINED_VARIABLE);

   540             returnVal = FALSE;

   541         }

   542         break;

   544     case doExprFinished:

   545         break;

   547     case doRuleErrorAssignExpr:

   548         error(U_BRK_ASSIGN_ERROR);

   549         returnVal = FALSE;

   550         break;

   552     case doExit:

   553         returnVal = FALSE;

   554         break;

   556     case doScanUnicodeSet:

   557         scanSet();

   558         break;

   560     default:

   561         error(U_BRK_INTERNAL_ERROR);

   562         returnVal = FALSE;

   563         break;

   564     }

   565     return returnVal;

   566 }

   571 //------------------------------------------------------------------------------

   572 //

   573 //  Error         Report a rule parse error.

   574 //                Only report it if no previous error has been recorded.

   575 //

   576 //------------------------------------------------------------------------------

   577 void RBBIRuleScanner::error(UErrorCode e) {

   578     if (U_SUCCESS(*fRB->fStatus)) {

   579         *fRB->fStatus = e;

   580         if (fRB->fParseError) {

   581             fRB->fParseError->line  = fLineNum;

   582             fRB->fParseError->offset = fCharNum;

   583             fRB->fParseError->preContext[0] = 0;

   584             fRB->fParseError->preContext[0] = 0;

   585         }

   586     }

   587 }

   592 //------------------------------------------------------------------------------

   593 //

   594 //  fixOpStack   The parse stack holds partially assembled chunks of the parse tree.

   595 //               An entry on the stack may be as small as a single setRef node,

   596 //               or as large as the parse tree

   597 //               for an entire expression (this will be the one item left on the stack

   598 //               when the parsing of an RBBI rule completes.

   599 //

   600 //               This function is called when a binary operator is encountered.

   601 //               It looks back up the stack for operators that are not yet associated

   602 //               with a right operand, and if the precedence of the stacked operator >=

   603 //               the precedence of the current operator, binds the operand left,

   604 //               to the previously encountered operator.

   605 //

   606 //------------------------------------------------------------------------------

   607 void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {

   608     RBBINode *n;

   609     // printNodeStack("entering fixOpStack()");

   610     for (;;) {

   611         n = fNodeStack[fNodeStackPtr-1];   // an operator node

   612         if (n->fPrecedence == 0) {

   613             RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");

   614             error(U_BRK_INTERNAL_ERROR);

   615             return;

   616         }

   618         if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) {

   619             // The most recent operand goes with the current operator,

   620             //   not with the previously stacked one.

   621             break;

   622         }

   623             // Stack operator is a binary op  ( '|' or concatenation)

   624             //   TOS operand becomes right child of this operator.

   625             //   Resulting subexpression becomes the TOS operand.

   626             n->fRightChild = fNodeStack[fNodeStackPtr];

   627             fNodeStack[fNodeStackPtr]->fParent = n;

   628             fNodeStackPtr--;

   629         // printNodeStack("looping in fixOpStack()   ");

   630     }

   632     if (p <= RBBINode::precLParen) {

   633         // Scan is at a right paren or end of expression.

   634         //  The scanned item must match the stack, or else there was an error.

   635         //  Discard the left paren (or start expr) node from the stack,

   636             //  leaving the completed (sub)expression as TOS.

   637             if (n->fPrecedence != p) {

   638                 // Right paren encountered matched start of expression node, or

   639                 // end of expression matched with a left paren node.

   640                 error(U_BRK_MISMATCHED_PAREN);

   641             }

   642             fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];

   643             fNodeStackPtr--;

   644             // Delete the now-discarded LParen or Start node.

   645             delete n;

   646     }

   647     // printNodeStack("leaving fixOpStack()");

   648 }

   653 //------------------------------------------------------------------------------

   654 //

   655 //   findSetFor    given a UnicodeString,

   656 //                  - find the corresponding Unicode Set  (uset node)

   657 //                         (create one if necessary)

   658 //                  - Set fLeftChild of the caller's node (should be a setRef node)

   659 //                         to the uset node

   660 //                 Maintain a hash table of uset nodes, so the same one is always used

   661 //                    for the same string.

   662 //                 If a "to adopt" set is provided and we haven't seen this key before,

   663 //                    add the provided set to the hash table.

   664 //                 If the string is one (32 bit) char in length, the set contains

   665 //                    just one element which is the char in question.

   666 //                 If the string is "any", return a set containing all chars.

   667 //

   668 //------------------------------------------------------------------------------

   669 void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {

   671     RBBISetTableEl   *el;

   673     // First check whether we've already cached a set for this string.

   674     // If so, just use the cached set in the new node.

   675     //   delete any set provided by the caller, since we own it.

   676     el = (RBBISetTableEl *)uhash_get(fSetTable, &s);

   677     if (el != NULL) {

   678         delete setToAdopt;

   679         node->fLeftChild = el->val;

   680         U_ASSERT(node->fLeftChild->fType == RBBINode::uset);

   681         return;

   682     }

   684     // Haven't seen this set before.

   685     // If the caller didn't provide us with a prebuilt set,

   686     //   create a new UnicodeSet now.

   687     if (setToAdopt == NULL) {

   688         if (s.compare(kAny, -1) == 0) {

   689             setToAdopt = new UnicodeSet(0x000000, 0x10ffff);

   690         } else {

   691             UChar32 c;

   692             c = s.char32At(0);

   693             setToAdopt = new UnicodeSet(c, c);

   694         }

   695     }

   697     //

   698     // Make a new uset node to refer to this UnicodeSet

   699     // This new uset node becomes the child of the caller's setReference node.

   700     //

   701     RBBINode *usetNode    = new RBBINode(RBBINode::uset);

   702     if (usetNode == NULL) {

   703         error(U_MEMORY_ALLOCATION_ERROR);

   704         return;

   705     }

   706     usetNode->fInputSet   = setToAdopt;

   707     usetNode->fParent     = node;

   708     node->fLeftChild      = usetNode;

   709     usetNode->fText = s;

   712     //

   713     // Add the new uset node to the list of all uset nodes.

   714     //

   715     fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);

   718     //

   719     // Add the new set to the set hash table.

   720     //

   721     el      = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));

   722     UnicodeString *tkey = new UnicodeString(s);

   723     if (tkey == NULL || el == NULL || setToAdopt == NULL) {

   724         // Delete to avoid memory leak

   725         delete tkey;

   726         tkey = NULL;

   727         uprv_free(el);

   728         el = NULL;

   729         delete setToAdopt;

   730         setToAdopt = NULL;

   732         error(U_MEMORY_ALLOCATION_ERROR);

   733         return;

   734     }

   735     el->key = tkey;

   736     el->val = usetNode;

   737     uhash_put(fSetTable, el->key, el, fRB->fStatus);

   739     return;

   740 }

   744 //

   745 //  Assorted Unicode character constants.

   746 //     Numeric because there is no portable way to enter them as literals.

   747 //     (Think EBCDIC).

   748 //

   749 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.

   750 static const UChar      chLF        = 0x0a;

   751 static const UChar      chNEL       = 0x85;      //    NEL newline variant

   752 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator

   753 static const UChar      chApos      = 0x27;      //  single quote, for quoted chars.

   754 static const UChar      chPound     = 0x23;      // '#', introduces a comment.

   755 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape

   756 static const UChar      chLParen    = 0x28;

   757 static const UChar      chRParen    = 0x29;

   760 //------------------------------------------------------------------------------

   761 //

   762 //  stripRules    Return a rules string without unnecessary

   763 //                characters.

   764 //

   765 //------------------------------------------------------------------------------

   766 UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {

   767     UnicodeString strippedRules;

   768     int rulesLength = rules.length();

   769     for (int idx = 0; idx < rulesLength; ) {

   770         UChar ch = rules[idx++];

   771         if (ch == chPound) {

   772             while (idx < rulesLength

   773                 && ch != chCR && ch != chLF && ch != chNEL)

   774             {

   775                 ch = rules[idx++];

   776             }

   777         }

   778         if (!u_isISOControl(ch)) {

   779             strippedRules.append(ch);

   780         }

   781     }

   782     // strippedRules = strippedRules.unescape();

   783     return strippedRules;

   784 }

   787 //------------------------------------------------------------------------------

   788 //

   789 //  nextCharLL    Low Level Next Char from rule input source.

   790 //                Get a char from the input character iterator,

   791 //                keep track of input position for error reporting.

   792 //

   793 //------------------------------------------------------------------------------

   794 UChar32  RBBIRuleScanner::nextCharLL() {

   795     UChar32  ch;

   797     if (fNextIndex >= fRB->fRules.length()) {

   798         return (UChar32)-1;

   799     }

   800     ch         = fRB->fRules.char32At(fNextIndex);

   801     fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);

   803     if (ch == chCR ||

   804         ch == chNEL ||

   805         ch == chLS   ||

   806         (ch == chLF && fLastChar != chCR)) {

   807         // Character is starting a new line.  Bump up the line number, and

   808         //  reset the column to 0.

   809         fLineNum++;

   810         fCharNum=0;

   811         if (fQuoteMode) {

   812             error(U_BRK_NEW_LINE_IN_QUOTED_STRING);

   813             fQuoteMode = FALSE;

   814         }

   815     }

   816     else {

   817         // Character is not starting a new line.  Except in the case of a

   818         //   LF following a CR, increment the column position.

   819         if (ch != chLF) {

   820             fCharNum++;

   821         }

   822     }

   823     fLastChar = ch;

   824     return ch;

   825 }

   828 //------------------------------------------------------------------------------

   829 //

   830 //   nextChar     for rules scanning.  At this level, we handle stripping

   831 //                out comments and processing backslash character escapes.

   832 //                The rest of the rules grammar is handled at the next level up.

   833 //

   834 //------------------------------------------------------------------------------

   835 void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {

   837     // Unicode Character constants needed for the processing done by nextChar(),

   838     //   in hex because literals wont work on EBCDIC machines.

   840     fScanIndex = fNextIndex;

   841     c.fChar    = nextCharLL();

   842     c.fEscaped = FALSE;

   844     //

   845     //  check for '' sequence.

   846     //  These are recognized in all contexts, whether in quoted text or not.

   847     //

   848     if (c.fChar == chApos) {

   849         if (fRB->fRules.char32At(fNextIndex) == chApos) {

   850             c.fChar    = nextCharLL();        // get nextChar officially so character counts

   851             c.fEscaped = TRUE;                //   stay correct.

   852         }

   853         else

   854         {

   855             // Single quote, by itself.

   856             //   Toggle quoting mode.

   857             //   Return either '('  or ')', because quotes cause a grouping of the quoted text.

   858             fQuoteMode = !fQuoteMode;

   859             if (fQuoteMode == TRUE) {

   860                 c.fChar = chLParen;

   861             } else {

   862                 c.fChar = chRParen;

   863             }

   864             c.fEscaped = FALSE;      // The paren that we return is not escaped.

   865             return;

   866         }

   867     }

   869     if (fQuoteMode) {

   870         c.fEscaped = TRUE;

   871     }

   872     else

   873     {

   874         // We are not in a 'quoted region' of the source.

   875         //

   876         if (c.fChar == chPound) {

   877             // Start of a comment.  Consume the rest of it.

   878             //  The new-line char that terminates the comment is always returned.

   879             //  It will be treated as white-space, and serves to break up anything

   880             //    that might otherwise incorrectly clump together with a comment in

   881             //    the middle (a variable name, for example.)

   882             for (;;) {

   883                 c.fChar = nextCharLL();

   884                 if (c.fChar == (UChar32)-1 ||  // EOF

   885                     c.fChar == chCR     ||

   886                     c.fChar == chLF     ||

   887                     c.fChar == chNEL    ||

   888                     c.fChar == chLS)       {break;}

   889             }

   890         }

   891         if (c.fChar == (UChar32)-1) {

   892             return;

   893         }

   895         //

   896         //  check for backslash escaped characters.

   897         //  Use UnicodeString::unescapeAt() to handle them.

   898         //

   899         if (c.fChar == chBackSlash) {

   900             c.fEscaped = TRUE;

   901             int32_t startX = fNextIndex;

   902             c.fChar = fRB->fRules.unescapeAt(fNextIndex);

   903             if (fNextIndex == startX) {

   904                 error(U_BRK_HEX_DIGITS_EXPECTED);

   905             }

   906             fCharNum += fNextIndex-startX;

   907         }

   908     }

   909     // putc(c.fChar, stdout);

   910 }

   912 //------------------------------------------------------------------------------

   913 //

   914 //  Parse RBBI rules.   The state machine for rules parsing is here.

   915 //                      The state tables are hand-written in the file rbbirpt.txt,

   916 //                      and converted to the form used here by a perl

   917 //                      script rbbicst.pl

   918 //

   919 //------------------------------------------------------------------------------

   920 void RBBIRuleScanner::parse() {

   921     uint16_t                state;

   922     const RBBIRuleTableEl  *tableEl;

   924     if (U_FAILURE(*fRB->fStatus)) {

   925         return;

   926     }

   928     state = 1;

   929     nextChar(fC);

   930     //

   931     // Main loop for the rule parsing state machine.

   932     //   Runs once per state transition.

   933     //   Each time through optionally performs, depending on the state table,

   934     //      - an advance to the the next input char

   935     //      - an action to be performed.

   936     //      - pushing or popping a state to/from the local state return stack.

   937     //

   938     for (;;) {

   939         //  Bail out if anything has gone wrong.

   940         //  RBBI rule file parsing stops on the first error encountered.

   941         if (U_FAILURE(*fRB->fStatus)) {

   942             break;

   943         }

   945         // Quit if state == 0.  This is the normal way to exit the state machine.

   946         //

   947         if (state == 0) {

   948             break;

   949         }

   951         // Find the state table element that matches the input char from the rule, or the

   952         //    class of the input character.  Start with the first table row for this

   953         //    state, then linearly scan forward until we find a row that matches the

   954         //    character.  The last row for each state always matches all characters, so

   955         //    the search will stop there, if not before.

   956         //

   957         tableEl = &gRuleParseStateTable[state];

   958         #ifdef RBBI_DEBUG

   959             if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {

   960                 RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d)    state=%s ",

   961                     fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);

   962             }

   963         #endif

   965         for (;;) {

   966             #ifdef RBBI_DEBUG

   967                 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");}

   968             #endif

   969             if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE &&   tableEl->fCharClass == fC.fChar) {

   970                 // Table row specified an individual character, not a set, and

   971                 //   the input character is not escaped, and

   972                 //   the input character matched it.

   973                 break;

   974             }

   975             if (tableEl->fCharClass == 255) {

   976                 // Table row specified default, match anything character class.

   977                 break;

   978             }

   979             if (tableEl->fCharClass == 254 && fC.fEscaped)  {

   980                 // Table row specified "escaped" and the char was escaped.

   981                 break;

   982             }

   983             if (tableEl->fCharClass == 253 && fC.fEscaped &&

   984                 (fC.fChar == 0x50 || fC.fChar == 0x70 ))  {

   985                 // Table row specified "escaped P" and the char is either 'p' or 'P'.

   986                 break;

   987             }

   988             if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1)  {

   989                 // Table row specified eof and we hit eof on the input.

   990                 break;

   991             }

   993             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&

   994                 fC.fEscaped == FALSE &&                                      //   char is not escaped &&

   995                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF

   996                 U_ASSERT((tableEl->fCharClass-128) < LENGTHOF(fRuleSets));

   997                 if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {

   998                     // Table row specified a character class, or set of characters,

   999                     //   and the current char matches it.

  1000                     break;

  1001                 }

  1002             }

  1004             // No match on this row, advance to the next  row for this state,

  1005             tableEl++;

  1006         }

  1007         if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}

  1009         //

  1010         // We've found the row of the state table that matches the current input

  1011         //   character from the rules string.

  1012         // Perform any action specified  by this row in the state table.

  1013         if (doParseActions((int32_t)tableEl->fAction) == FALSE) {

  1014             // Break out of the state machine loop if the

  1015             //   the action signalled some kind of error, or

  1016             //   the action was to exit, occurs on normal end-of-rules-input.

  1017             break;

  1018         }

  1020         if (tableEl->fPushState != 0) {

  1021             fStackPtr++;

  1022             if (fStackPtr >= kStackSize) {

  1023                 error(U_BRK_INTERNAL_ERROR);

  1024                 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");

  1025                 fStackPtr--;

  1026             }

  1027             fStack[fStackPtr] = tableEl->fPushState;

  1028         }

  1030         if (tableEl->fNextChar) {

  1031             nextChar(fC);

  1032         }

  1034         // Get the next state from the table entry, or from the

  1035         //   state stack if the next state was specified as "pop".

  1036         if (tableEl->fNextState != 255) {

  1037             state = tableEl->fNextState;

  1038         } else {

  1039             state = fStack[fStackPtr];

  1040             fStackPtr--;

  1041             if (fStackPtr < 0) {

  1042                 error(U_BRK_INTERNAL_ERROR);

  1043                 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");

  1044                 fStackPtr++;

  1045             }

  1046         }

  1048     }

  1050     //

  1051     // If there were NO user specified reverse rules, set up the equivalent of ".*;"

  1052     //

  1053     if (fRB->fReverseTree == NULL) {

  1054         fRB->fReverseTree  = pushNewNode(RBBINode::opStar);

  1055         RBBINode  *operand = pushNewNode(RBBINode::setRef);

  1056         findSetFor(UnicodeString(TRUE, kAny, 3), operand);

  1057         fRB->fReverseTree->fLeftChild = operand;

  1058         operand->fParent              = fRB->fReverseTree;

  1059         fNodeStackPtr -= 2;

  1060     }

  1063     //

  1064     // Parsing of the input RBBI rules is complete.

  1065     // We now have a parse tree for the rule expressions

  1066     // and a list of all UnicodeSets that are referenced.

  1067     //

  1068 #ifdef RBBI_DEBUG

  1069     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}

  1070     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))

  1071     {

  1072         RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");

  1073         fRB->fForwardTree->printTree(TRUE);

  1074         RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");

  1075         fRB->fReverseTree->printTree(TRUE);

  1076         RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");

  1077         fRB->fSafeFwdTree->printTree(TRUE);

  1078         RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");

  1079         fRB->fSafeRevTree->printTree(TRUE);

  1080     }

  1081 #endif

  1082 }

  1085 //------------------------------------------------------------------------------

  1086 //

  1087 //  printNodeStack     for debugging...

  1088 //

  1089 //------------------------------------------------------------------------------

  1090 #ifdef RBBI_DEBUG

  1091 void RBBIRuleScanner::printNodeStack(const char *title) {

  1092     int i;

  1093     RBBIDebugPrintf("%s.  Dumping node stack...\n", title);

  1094     for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);}

  1095 }

  1096 #endif

  1101 //------------------------------------------------------------------------------

  1102 //

  1103 //  pushNewNode   create a new RBBINode of the specified type and push it

  1104 //                onto the stack of nodes.

  1105 //

  1106 //------------------------------------------------------------------------------

  1107 RBBINode  *RBBIRuleScanner::pushNewNode(RBBINode::NodeType  t) {

  1108     fNodeStackPtr++;

  1109     if (fNodeStackPtr >= kStackSize) {

  1110         error(U_BRK_INTERNAL_ERROR);

  1111         RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");

  1112         *fRB->fStatus = U_BRK_INTERNAL_ERROR;

  1113         return NULL;

  1114     }

  1115     fNodeStack[fNodeStackPtr] = new RBBINode(t);

  1116     if (fNodeStack[fNodeStackPtr] == NULL) {

  1117         *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;

  1118     }

  1119     return fNodeStack[fNodeStackPtr];

  1120 }

  1124 //------------------------------------------------------------------------------

  1125 //

  1126 //  scanSet    Construct a UnicodeSet from the text at the current scan

  1127 //             position.  Advance the scan position to the first character

  1128 //             after the set.

  1129 //

  1130 //             A new RBBI setref node referring to the set is pushed onto the node

  1131 //             stack.

  1132 //

  1133 //             The scan position is normally under the control of the state machine

  1134 //             that controls rule parsing.  UnicodeSets, however, are parsed by

  1135 //             the UnicodeSet constructor, not by the RBBI rule parser.

  1136 //

  1137 //------------------------------------------------------------------------------

  1138 void RBBIRuleScanner::scanSet() {

  1139     UnicodeSet    *uset;

  1140     ParsePosition  pos;

  1141     int            startPos;

  1142     int            i;

  1144     if (U_FAILURE(*fRB->fStatus)) {

  1145         return;

  1146     }

  1148     pos.setIndex(fScanIndex);

  1149     startPos = fScanIndex;

  1150     UErrorCode localStatus = U_ZERO_ERROR;

  1151     uset = new UnicodeSet();

  1152     if (uset == NULL) {

  1153         localStatus = U_MEMORY_ALLOCATION_ERROR;

  1154     } else {

  1155         uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);

  1156     }

  1157     if (U_FAILURE(localStatus)) {

  1158         //  TODO:  Get more accurate position of the error from UnicodeSet's return info.

  1159         //         UnicodeSet appears to not be reporting correctly at this time.

  1160         #ifdef RBBI_DEBUG

  1161             RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());

  1162         #endif

  1163         error(localStatus);

  1164         delete uset;

  1165         return;

  1166     }

  1168     // Verify that the set contains at least one code point.

  1169     //

  1170     U_ASSERT(uset!=NULL);

  1171     if (uset->isEmpty()) {

  1172         // This set is empty.

  1173         //  Make it an error, because it almost certainly is not what the user wanted.

  1174         //  Also, avoids having to think about corner cases in the tree manipulation code

  1175         //   that occurs later on.

  1176         error(U_BRK_RULE_EMPTY_SET);

  1177         delete uset;

  1178         return;

  1179     }

  1182     // Advance the RBBI parse postion over the UnicodeSet pattern.

  1183     //   Don't just set fScanIndex because the line/char positions maintained

  1184     //   for error reporting would be thrown off.

  1185     i = pos.getIndex();

  1186     for (;;) {

  1187         if (fNextIndex >= i) {

  1188             break;

  1189         }

  1190         nextCharLL();

  1191     }

  1193     if (U_SUCCESS(*fRB->fStatus)) {

  1194         RBBINode         *n;

  1196         n = pushNewNode(RBBINode::setRef);

  1197         n->fFirstPos = startPos;

  1198         n->fLastPos  = fNextIndex;

  1199         fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);

  1200         //  findSetFor() serves several purposes here:

  1201         //     - Adopts storage for the UnicodeSet, will be responsible for deleting.

  1202         //     - Mantains collection of all sets in use, needed later for establishing

  1203         //          character categories for run time engine.

  1204         //     - Eliminates mulitiple instances of the same set.

  1205         //     - Creates a new uset node if necessary (if this isn't a duplicate.)

  1206         findSetFor(n->fText, n, uset);

  1207     }

  1209 }

  1211 U_NAMESPACE_END

  1213 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

The Tor Browser / file revision

intl/icu/source/common/rbbiscan.cpp@b8a032363ba2

intl/icu/source/common/rbbiscan.cpp