michael@0: michael@0: #***************************************************************************** michael@0: # michael@0: # Copyright (C) 2002-2003, International Business Machines Corporation and others. michael@0: # All Rights Reserved. michael@0: # michael@0: #***************************************************************************** michael@0: # michael@0: # file: rbbirpt.txt michael@0: # ICU Break Iterator Rule Parser State Table michael@0: # michael@0: # This state table is used when reading and parsing a set of RBBI rules michael@0: # The rule parser uses a state machine; the data in this file define the michael@0: # state transitions that occur for each input character. michael@0: # michael@0: # *** This file defines the RBBI rule grammar. This is it. michael@0: # *** The determination of what is accepted is here. michael@0: # michael@0: # This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays michael@0: # that are then built with the rule parser. michael@0: # michael@0: michael@0: # michael@0: # Here is the syntax of the state definitions in this file: michael@0: # michael@0: # michael@0: #StateName: michael@0: # input-char n next-state ^push-state action michael@0: # input-char n next-state ^push-state action michael@0: # | | | | | michael@0: # | | | | |--- action to be performed by state machine michael@0: # | | | | See function RBBIRuleScanner::doParseActions() michael@0: # | | | | michael@0: # | | | |--- Push this named state onto the state stack. michael@0: # | | | Later, when next state is specified as "pop", michael@0: # | | | the pushed state will become the current state. michael@0: # | | | michael@0: # | | |--- Transition to this state if the current input character matches the input michael@0: # | | character or char class in the left hand column. "pop" causes the next michael@0: # | | state to be popped from the state stack. 
michael@0: # | | michael@0: # | |--- When making the state transition specified on this line, advance to the next michael@0: # | character from the input only if 'n' appears here. michael@0: # | michael@0: # |--- Character or named character classes to test for. If the current character being scanned michael@0: # matches, perform the actions and go to the state specified on this line. michael@0: # The input character is tested sequentially, in the order written. The characters and michael@0: # character classes tested for do not need to be mutually exclusive. The first match wins. michael@0: # michael@0: michael@0: michael@0: michael@0: michael@0: # michael@0: # start state, scan position is at the beginning of the rules file, or in between two rules. michael@0: # michael@0: start: michael@0: escaped term ^break-rule-end doExprStart michael@0: white_space n start michael@0: '$' scan-var-name ^assign-or-rule doExprStart michael@0: '!' n rev-option michael@0: ';' n start # ignore empty rules. michael@0: eof exit michael@0: default term ^break-rule-end doExprStart michael@0: michael@0: # michael@0: # break-rule-end: Returned from doing a break-rule expression. michael@0: # michael@0: break-rule-end: michael@0: ';' n start doEndOfRule michael@0: white_space n break-rule-end michael@0: default errorDeath doRuleError michael@0: michael@0: michael@0: # michael@0: # ! We've just scanned a '!', indicating either a !!key word flag or a michael@0: # !Reverse rule. michael@0: # michael@0: rev-option: michael@0: '!' 
n option-scan1 michael@0: default reverse-rule ^break-rule-end doReverseDir # anything other than a second '!': handled as a reverse rule michael@0: michael@0: option-scan1: michael@0: name_start_char n option-scan2 doOptionStart # first character of the !!option key word michael@0: default errorDeath doRuleError michael@0: michael@0: option-scan2: michael@0: name_char n option-scan2 michael@0: default option-scan3 doOptionEnd # first non-name character ends the key word; it is not consumed here michael@0: michael@0: option-scan3: michael@0: ';' n start # an option statement is terminated by ';' michael@0: white_space n option-scan3 michael@0: default errorDeath doRuleError michael@0: michael@0: michael@0: reverse-rule: michael@0: default term ^break-rule-end doExprStart # same transition as the default row of start michael@0: michael@0: michael@0: # michael@0: # term. Eat through a single rule character, or a composite thing, which michael@0: # could be a parenthesized expression, a variable name, or a Unicode Set. michael@0: # michael@0: term: michael@0: escaped n expr-mod doRuleChar michael@0: white_space n term michael@0: rule_char n expr-mod doRuleChar michael@0: '[' scan-unicode-set ^expr-mod michael@0: '(' n term ^expr-mod doLParen michael@0: '$' scan-var-name ^term-var-ref michael@0: '.' n expr-mod doDotAny michael@0: default errorDeath doRuleError michael@0: michael@0: michael@0: michael@0: # michael@0: # term-var-ref We've just finished scanning a reference to a $variable. michael@0: # Check that the variable was defined. michael@0: # The variable name scanning is in common with assignment statements, michael@0: # so the check can't be done there. michael@0: term-var-ref: michael@0: default expr-mod doCheckVarDef michael@0: michael@0: michael@0: # michael@0: # expr-mod We've just finished scanning a term, now look for the optional michael@0: # trailing '*', '?', '+' michael@0: # michael@0: expr-mod: michael@0: white_space n expr-mod michael@0: '*' n expr-cont doUnaryOpStar michael@0: '+' n expr-cont doUnaryOpPlus michael@0: '?' n expr-cont doUnaryOpQuestion michael@0: default expr-cont michael@0: michael@0: michael@0: # michael@0: # expr-cont Expression, continuation. 
At a point where additional terms are michael@0: # allowed, but not required. michael@0: # michael@0: expr-cont: michael@0: escaped term doExprCatOperator michael@0: white_space n expr-cont michael@0: rule_char term doExprCatOperator michael@0: '[' term doExprCatOperator michael@0: '(' term doExprCatOperator michael@0: '$' term doExprCatOperator michael@0: '.' term doExprCatOperator michael@0: '/' look-ahead doExprCatOperator michael@0: '{' n tag-open doExprCatOperator michael@0: '|' n term doExprOrOperator michael@0: ')' n pop doExprRParen michael@0: default pop doExprFinished michael@0: michael@0: michael@0: # michael@0: # look-ahead Scanning a '/', which identifies a break point, assuming that the michael@0: # remainder of the expression matches. michael@0: # michael@0: # Generate a parse tree as if this was a special kind of input symbol michael@0: # appearing in an otherwise normal concatenation expression. michael@0: # michael@0: look-ahead: michael@0: '/' n expr-cont-no-slash doSlash michael@0: default errorDeath # not expected: the visible rows that branch here matched '/' without consuming it michael@0: michael@0: michael@0: # michael@0: # expr-cont-no-slash Expression, continuation. At a point where additional terms are michael@0: # allowed, but not required. Just like michael@0: # expr-cont, above, except that no '/' michael@0: # look-ahead symbol is permitted. michael@0: # michael@0: expr-cont-no-slash: michael@0: escaped term doExprCatOperator michael@0: white_space n expr-cont-no-slash # stay in this state: skipping white space must not re-admit '/' or '{' by falling back to expr-cont michael@0: rule_char term doExprCatOperator michael@0: '[' term doExprCatOperator michael@0: '(' term doExprCatOperator michael@0: '$' term doExprCatOperator michael@0: '.' term doExprCatOperator michael@0: '|' n term doExprOrOperator michael@0: ')' n pop doExprRParen michael@0: default pop doExprFinished michael@0: michael@0: michael@0: # michael@0: # tags scanning a '{', the opening delimiter for a tag that identifies michael@0: # the kind of match. 
Scan the whole {dddd} tag, where d=digit michael@0: # michael@0: tag-open: michael@0: white_space n tag-open michael@0: digit_char tag-value doStartTagValue # the first digit is not consumed here; tag-value consumes it michael@0: default errorDeath doTagExpectedError michael@0: michael@0: tag-value: michael@0: white_space n tag-close # white space ends the digits; tag-close then accepts only white space or '}' michael@0: '}' tag-close # '}' is left in the input for tag-close to consume michael@0: digit_char n tag-value doTagDigit michael@0: default errorDeath doTagExpectedError michael@0: michael@0: tag-close: michael@0: white_space n tag-close michael@0: '}' n expr-cont-no-tag doTagValue michael@0: default errorDeath doTagExpectedError michael@0: michael@0: michael@0: michael@0: # michael@0: # expr-cont-no-tag Expression, continuation. At a point where additional terms are michael@0: # allowed, but not required. Just like michael@0: # expr-cont, above, except that no "{ddd}" michael@0: # tagging is permitted. michael@0: # michael@0: expr-cont-no-tag: michael@0: escaped term doExprCatOperator michael@0: white_space n expr-cont-no-tag michael@0: rule_char term doExprCatOperator michael@0: '[' term doExprCatOperator michael@0: '(' term doExprCatOperator michael@0: '$' term doExprCatOperator michael@0: '.' term doExprCatOperator michael@0: '/' look-ahead doExprCatOperator michael@0: '|' n term doExprOrOperator michael@0: ')' n pop doExprRParen michael@0: default pop doExprFinished michael@0: michael@0: michael@0: michael@0: michael@0: # michael@0: # Variable Name Scanning. michael@0: # michael@0: # The state that branched to here must have pushed a return state michael@0: # to go to after completion of the variable name scanning. michael@0: # michael@0: # The current input character must be the $ that introduces the name. michael@0: # The $ is consumed here rather than in the state that first detected it michael@0: # so that the doStartVariableName action only needs to happen in one michael@0: # place (here), and the other states don't need to worry about it. 
michael@0: # michael@0: scan-var-name: michael@0: '$' n scan-var-start doStartVariableName michael@0: default errorDeath # not expected: the visible rows that branch here matched '$' without consuming it michael@0: michael@0: michael@0: scan-var-start: michael@0: name_start_char n scan-var-body michael@0: default errorDeath doVariableNameExpectedErr michael@0: michael@0: scan-var-body: michael@0: name_char n scan-var-body michael@0: default pop doEndVariableName # first non-name character ends the name; return to the state pushed by the caller michael@0: michael@0: michael@0: michael@0: # michael@0: # scan-unicode-set Unicode Sets are parsed by the UnicodeSet class. michael@0: # Within the RBBI parser, after finding the first character michael@0: # of a Unicode Set, we just hand the rule input at that michael@0: # point off to the Unicode Set constructor, then pick michael@0: # up parsing after the close of the set. michael@0: # michael@0: # The action for this state invokes the UnicodeSet parser. michael@0: # michael@0: scan-unicode-set: michael@0: '[' n pop doScanUnicodeSet michael@0: 'p' n pop doScanUnicodeSet # NOTE(review): no visible row branches here on 'p' or 'P' (term only does so on '[') - confirm how these two rows are reached michael@0: 'P' n pop doScanUnicodeSet michael@0: default errorDeath michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: # michael@0: # assign-or-rule. A $variable was encountered at the start of something, could be michael@0: # either an assignment statement or a rule, depending on whether an '=' michael@0: # follows the variable name. We get to this state when the variable name michael@0: # scanning does a return. michael@0: # michael@0: assign-or-rule: michael@0: white_space n assign-or-rule michael@0: '=' n term ^assign-end doStartAssign # variable was target of assignment michael@0: default term-var-ref ^break-rule-end # variable was a term in a rule michael@0: michael@0: michael@0: michael@0: # michael@0: # assign-end This state is entered when the end of the expression on the michael@0: # right hand side of an assignment is found. We get here via michael@0: # a pop; this state is pushed when the '=' in an assignment is found. 
michael@0: # michael@0: # The only thing allowed at this point is a ';'. The RHS of an michael@0: # assignment must look like a rule expression, and we come here michael@0: # when what is being scanned no longer looks like an expression. michael@0: # michael@0: assign-end: michael@0: ';' n start doEndAssign michael@0: default errorDeath doRuleErrorAssignExpr michael@0: michael@0: michael@0: michael@0: # michael@0: # errorDeath. This state is specified as the next state whenever a syntax error michael@0: # in the source rules is detected. Barring bugs, the state machine will never michael@0: # actually get here, but will stop because of the action associated with the error. michael@0: # But, just in case, this state asks the state machine to exit. michael@0: errorDeath: michael@0: default n errorDeath doExit michael@0: michael@0: