michael@0: 
michael@0: #*****************************************************************************
michael@0: #
michael@0: #   Copyright (C) 2002-2003, International Business Machines Corporation and others.
michael@0: #   All Rights Reserved.
michael@0: #
michael@0: #*****************************************************************************
michael@0: #
michael@0: #  file:  rbbirpt.txt
michael@0: #  ICU Break Iterator Rule Parser State Table
michael@0: #
michael@0: #     This state table is used when reading and parsing a set of RBBI rules.
michael@0: #     The rule parser uses a state machine; the data in this file define the
michael@0: #     state transitions that occur for each input character.
michael@0: #
michael@0: #     *** This file defines the RBBI rule grammar.   This is it.
michael@0: #     *** The determination of what is accepted is here.
michael@0: #
michael@0: #     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
michael@0: #     that are then built with the rule parser.
michael@0: #
michael@0: 
michael@0: #
michael@0: # Here is the syntax of the state definitions in this file:
michael@0: #
michael@0: #
michael@0: #StateName:
michael@0: #   input-char           n next-state           ^push-state     action    
michael@0: #   input-char           n next-state           ^push-state     action    
michael@0: #       |                |   |                      |             |
michael@0: #       |                |   |                      |             |--- action to be performed by state machine
michael@0: #       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
michael@0: #       |                |   |                      |
michael@0: #       |                |   |                      |--- Push this named state onto the state stack.
michael@0: #       |                |   |                           Later, when next state is specified as "pop",
michael@0: #       |                |   |                           the pushed state will become the current state.
michael@0: #       |                |   |
michael@0: #       |                |   |--- Transition to this state if the current input character matches the input
michael@0: #       |                |        character or char class in the left hand column.  "pop" causes the next
michael@0: #       |                |        state to be popped from the state stack.
michael@0: #       |                |
michael@0: #       |                |--- When making the state transition specified on this line, advance to the next
michael@0: #       |                     character from the input only if 'n' appears here.
michael@0: #       |
michael@0: #       |--- Character or named character classes to test for.  If the current character being scanned
michael@0: #            matches, perform the actions and go to the state specified on this line.
michael@0: #            The input character is tested sequentially, in the order written.  The characters and
michael@0: #            character classes tested for do not need to be mutually exclusive.  The first match wins.
michael@0: #            
michael@0: 
michael@0: 
michael@0: 
michael@0: 
michael@0: #
michael@0: #  start state, scan position is at the beginning of the rules file, or in between two rules.
michael@0: #
michael@0: start:
michael@0:     escaped                term                  ^break-rule-end    doExprStart                       # a rule begins with an escaped char; not consumed here, term re-tests it.
michael@0:     white_space          n start                                                  # skip white space between rules.
michael@0:     '$'                    scan-var-name         ^assign-or-rule    doExprStart   # leading $variable: assignment or rule, decided later in assign-or-rule.
michael@0:     '!'                  n rev-option                                             # start of a !!option or a !reverse rule.
michael@0:     ';'                  n start                                                  # ignore empty rules.
michael@0:     eof                    exit                                                   # end of input.
michael@0:     default                term                  ^break-rule-end    doExprStart   # anything else begins a rule; break-rule-end is resumed on pop.
michael@0:     
michael@0: #
michael@0: #  break-rule-end:  Returned from doing a break-rule expression.
michael@0: #
michael@0: break-rule-end:
michael@0:     ';'	                 n start                                    doEndOfRule   # ';' terminates the rule; go back to looking for the next one.
michael@0:     white_space          n break-rule-end                                         # skip white space in front of the ';'.
michael@0:     default                errorDeath                               doRuleError   # anything else following the expression is a syntax error.
michael@0:      
michael@0: 
michael@0: #
michael@0: #   !               We've just scanned a '!', indicating either a !!key word flag or a
michael@0: #                   !Reverse rule.
michael@0: #
michael@0: rev-option:
michael@0:     '!'                  n option-scan1                                           # a second '!': this is a !!option.
michael@0:     default                reverse-rule           ^break-rule-end   doReverseDir  # a single '!': a !reverse rule; the current char is not consumed.
michael@0:     
michael@0: #
michael@0: #  option-scan1   "!!" has been scanned (see rev-option).  The first character of the
michael@0: #                 option name must come next.
michael@0: #
michael@0: option-scan1:
michael@0:     name_start_char      n option-scan2                             doOptionStart # first char of the option name.
michael@0:     default                errorDeath                               doRuleError   # "!!" not followed by a name.
michael@0:     
michael@0: #
michael@0: #  option-scan2   Consume the remaining characters of the option name.
michael@0: #
michael@0: option-scan2:
michael@0:     name_char            n option-scan2                                           # still inside the name.
michael@0:     default                option-scan3                             doOptionEnd   # first non-name char ends the name; it is not consumed here.
michael@0:     
michael@0: #
michael@0: #  option-scan3   The option name is complete.  Only white space and the closing ';'
michael@0: #                 are accepted from here.
michael@0: #
michael@0: option-scan3:
michael@0:     ';'                  n start                                                  # end of the option statement.
michael@0:     white_space          n option-scan3                                           # skip white space in front of the ';'.
michael@0:     default                errorDeath                               doRuleError   # anything else is a syntax error.
michael@0:     
michael@0: 
michael@0: #
michael@0: #  reverse-rule   Entered from rev-option after a single '!'.  Whatever follows is
michael@0: #                 handed to term as the start of an expression, without consuming input.
michael@0: #
michael@0: reverse-rule:
michael@0:     default                term                   ^break-rule-end   doExprStart   # break-rule-end is resumed when the expression pops.
michael@0:     
michael@0:     
michael@0: #
michael@0: #  term.  Eat through a single rule character, or a composite thing, which
michael@0: #         could be a parenthesized expression, a variable name, or a Unicode Set.
michael@0: #
michael@0: term:
michael@0:     escaped              n expr-mod                                 doRuleChar    # an escaped char is a complete term.
michael@0:     white_space          n term                                                   # skip white space in front of the term.
michael@0:     rule_char            n expr-mod                                 doRuleChar    # a single rule char is a complete term.
michael@0:     '['                    scan-unicode-set      ^expr-mod                        # '[' is left for scan-unicode-set to consume; expr-mod is resumed on pop.
michael@0:     '('                  n term                  ^expr-mod          doLParen      # parenthesized sub-expression; expr-mod is resumed on pop.
michael@0:     '$'                    scan-var-name         ^term-var-ref                    # '$' is left for scan-var-name to consume; term-var-ref is resumed on pop.
michael@0:     '.'                  n expr-mod                                 doDotAny      # '.' is a complete term.
michael@0:     default                errorDeath                               doRuleError   # a term was required but this char cannot start one.
michael@0:     
michael@0:     
michael@0: 
michael@0: #
michael@0: #  term-var-ref   We've just finished scanning a reference to a $variable.
michael@0: #                 Check that the variable was defined.
michael@0: #                 The variable name scanning is in common with assignment statements,
michael@0: #                 so the check can't be done there.
michael@0: term-var-ref:
michael@0:     default                expr-mod                                 doCheckVarDef # unconditional: run the check, consume nothing, continue as after any term.
michael@0:     
michael@0:     
michael@0: #
michael@0: #   expr-mod      We've just finished scanning a term, now look for the optional
michael@0: #                 trailing '*', '?', '+'
michael@0: #
michael@0: expr-mod:
michael@0:     white_space          n  expr-mod                                              # white space may separate a term from its modifier.
michael@0:     '*'                  n  expr-cont                               doUnaryOpStar
michael@0:     '+'                  n  expr-cont                               doUnaryOpPlus
michael@0:     '?'                  n  expr-cont                               doUnaryOpQuestion
michael@0:     default                 expr-cont                                             # no modifier present; the current char is left for expr-cont.
michael@0:     
michael@0:     
michael@0: #
michael@0: #  expr-cont      Expression, continuation.  At a point where additional terms are
michael@0: #                                            allowed, but not required.
michael@0: #
michael@0: expr-cont:
michael@0:     escaped                 term                                    doExprCatOperator   # another term follows; the char is not consumed, term re-tests it.
michael@0:     white_space          n  expr-cont                                                   # skip white space between terms.
michael@0:     rule_char               term                                    doExprCatOperator   # as for escaped.
michael@0:     '['                     term                                    doExprCatOperator   # as for escaped.
michael@0:     '('                     term                                    doExprCatOperator   # as for escaped.
michael@0:     '$'                     term                                    doExprCatOperator   # as for escaped.
michael@0:     '.'                     term                                    doExprCatOperator   # as for escaped.
michael@0:     '/'                     look-ahead                              doExprCatOperator   # the '/' itself is consumed in look-ahead.
michael@0:     '{'                  n  tag-open                                doExprCatOperator   # start of a {nnn} tag.
michael@0:     '|'                  n  term                                    doExprOrOperator    # alternation; a term must follow.
michael@0:     ')'                  n  pop                                     doExprRParen        # return to the state pushed at the matching '(' (see term).
michael@0:     default                 pop                                     doExprFinished      # not part of the expression; left for the popped state to test.
michael@0:     
michael@0: 
michael@0: #
michael@0: #   look-ahead    Scanning a '/', which identifies a break point, assuming that the
michael@0: #                 remainder of the expression matches.
michael@0: #
michael@0: #                 Generate a parse tree as if this was a special kind of input symbol
michael@0: #                 appearing in an otherwise normal concatenation expression.
michael@0: #
michael@0: look-ahead:
michael@0:     '/'                   n expr-cont-no-slash                      doSlash       # consume the '/'; another '/' may not follow directly.
michael@0:     default                 errorDeath                                            # NOTE(review): no error action here; the transitions into this state visible in this file are all taken on '/', so this row looks unreachable - confirm.
michael@0: 
michael@0: 
michael@0: #
michael@0: #  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
michael@0: #                                            allowed, but not required.  Just like
michael@0: #                                            expr-cont, above, except that no '/'
michael@0: #                                            look-ahead symbol is permitted.
michael@0: #
michael@0: expr-cont-no-slash:
michael@0:     escaped                 term                                    doExprCatOperator   # another term follows; the char is not consumed, term re-tests it.
michael@0:     white_space          n  expr-cont-no-slash                                          # stay in this state, so that white space does not re-enable '/'.
michael@0:     rule_char               term                                    doExprCatOperator   # as for escaped.
michael@0:     '['                     term                                    doExprCatOperator   # as for escaped.
michael@0:     '('                     term                                    doExprCatOperator   # as for escaped.
michael@0:     '$'                     term                                    doExprCatOperator   # as for escaped.
michael@0:     '.'                     term                                    doExprCatOperator   # as for escaped.
michael@0:     '|'                  n  term                                    doExprOrOperator    # alternation; a term must follow.
michael@0:     ')'                  n  pop                                     doExprRParen        # return to the state pushed at the matching '(' (see term).
michael@0:     default                 pop                                     doExprFinished      # includes '/' and '{', which have no row here; left for the popped state to test.
michael@0: 
michael@0: 
michael@0: #
michael@0: #   tags             scanning a '{', the opening delimiter for a tag that identifies
michael@0: #                    the kind of match.  Scan the whole {dddd} tag, where d=digit
michael@0: #
michael@0: tag-open:
michael@0:     white_space          n  tag-open                                                    # skip white space after the '{'.
michael@0:     digit_char              tag-value                               doStartTagValue     # first digit; not consumed here, tag-value takes it.
michael@0:     default                 errorDeath                              doTagExpectedError  # '{' not followed by a digit.
michael@0:     
michael@0: #
michael@0: #  tag-value      Consume the digits of the tag.
michael@0: #
michael@0: tag-value:
michael@0:     white_space          n  tag-close                                                   # white space ends the digits.
michael@0:     '}'                     tag-close                                                   # '}' ends the digits; it is consumed in tag-close.
michael@0:     digit_char           n  tag-value                               doTagDigit          # one more digit.
michael@0:     default                 errorDeath                              doTagExpectedError  # anything else inside the tag is an error.
michael@0:     
michael@0: #
michael@0: #  tag-close      The digits are complete.  Only white space and the closing '}'
michael@0: #                 are accepted from here.
michael@0: #
michael@0: tag-close:
michael@0:     white_space          n  tag-close                                                   # skip white space in front of the '}'.
michael@0:     '}'                  n  expr-cont-no-tag                        doTagValue          # end of tag; a second tag may not follow directly.
michael@0:     default                 errorDeath                              doTagExpectedError  # anything else inside the tag is an error.
michael@0:     
michael@0:     
michael@0:     
michael@0: #
michael@0: #  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
michael@0: #                                            allowed, but not required.  Just like
michael@0: #                                            expr-cont, above, except that no "{ddd}"
michael@0: #                                            tagging is permitted.
michael@0: #
michael@0: expr-cont-no-tag:
michael@0:     escaped                 term                                    doExprCatOperator   # another term follows; the char is not consumed, term re-tests it.
michael@0:     white_space          n  expr-cont-no-tag                                            # skip white space; stays in this state, so '{' remains disallowed.
michael@0:     rule_char               term                                    doExprCatOperator   # as for escaped.
michael@0:     '['                     term                                    doExprCatOperator   # as for escaped.
michael@0:     '('                     term                                    doExprCatOperator   # as for escaped.
michael@0:     '$'                     term                                    doExprCatOperator   # as for escaped.
michael@0:     '.'                     term                                    doExprCatOperator   # as for escaped.
michael@0:     '/'                     look-ahead                              doExprCatOperator   # the '/' itself is consumed in look-ahead.
michael@0:     '|'                  n  term                                    doExprOrOperator    # alternation; a term must follow.
michael@0:     ')'                  n  pop                                     doExprRParen        # return to the state pushed at the matching '(' (see term).
michael@0:     default                 pop                                     doExprFinished      # includes '{', which has no row here; left for the popped state to test.
michael@0:     
michael@0:     
michael@0: 
michael@0: 
michael@0: #
michael@0: #   Variable Name Scanning.
michael@0: #
michael@0: #                    The state that branched to here must have pushed a return state
michael@0: #                    to go to after completion of the variable name scanning.
michael@0: #
michael@0: #                    The current input character must be the $ that introduces the name.
michael@0: #                    The $ is consumed here rather than in the state that first detected it
michael@0: #                    so that the doStartVariableName action only needs to happen in one
michael@0: #                    place (here), and the other states don't need to worry about it.
michael@0: #
michael@0: scan-var-name:
michael@0:    '$'                  n scan-var-start                            doStartVariableName   # consume the '$' that the branching state left in place.
michael@0:    default                errorDeath                                                      # NOTE(review): no error action here; the transitions into this state visible in this file are all taken on '$', so this row looks unreachable - confirm.
michael@0: 
michael@0: 
michael@0: #
michael@0: #  scan-var-start   The '$' has been consumed; the first character of the variable
michael@0: #                   name must come next.
michael@0: #
michael@0: scan-var-start:
michael@0:     name_start_char      n scan-var-body                                                  # first char of the name.
michael@0:     default                errorDeath                               doVariableNameExpectedErr   # '$' not followed by a name.
michael@0:     
michael@0: #
michael@0: #  scan-var-body    Consume the remaining characters of the variable name, then
michael@0: #                   return to the state that was pushed before branching to scan-var-name.
michael@0: #
michael@0: scan-var-body:
michael@0:     name_char            n scan-var-body                                                  # still inside the name.
michael@0:     default                pop                                      doEndVariableName     # first non-name char ends the name; it is left for the popped state.
michael@0:     
michael@0:     
michael@0:     
michael@0: #
michael@0: #  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
michael@0: #                     Within the RBBI parser, after finding the first character
michael@0: #                     of a Unicode Set, we just hand the rule input at that
michael@0: #                     point off to the Unicode Set constructor, then pick
michael@0: #                     up parsing after the close of the set.
michael@0: #
michael@0: #                     The action for this state invokes the UnicodeSet parser.
michael@0: #
michael@0: scan-unicode-set:
michael@0:     '['                   n pop                                      doScanUnicodeSet     # return to the state pushed by the caller once the set has been handled.
michael@0:     'p'                   n pop                                      doScanUnicodeSet     # NOTE(review): the only transition into this state visible in this file is taken on '[' (see term); how 'p' / 'P' can be current here is not shown - confirm.
michael@0:     'P'                   n pop                                      doScanUnicodeSet     # see the note on 'p'.
michael@0:     default		    errorDeath                                                    # no error action is attached to this row.
michael@0:     
michael@0:     
michael@0: 
michael@0: 
michael@0: 
michael@0: 
michael@0: 
michael@0: #
michael@0: #  assign-or-rule.   A $variable was encountered at the start of something, could be
michael@0: #                    either an assignment statement or a rule, depending on whether an '='
michael@0: #                    follows the variable name.  We get to this state when the variable name
michael@0: #                    scanning does a return.
michael@0: #
michael@0: assign-or-rule:
michael@0:     white_space          n assign-or-rule                                                   # skip white space after the variable name.
michael@0:     '='                  n term                  ^assign-end        doStartAssign   # variable was target of assignment
michael@0:     default                term-var-ref          ^break-rule-end                    # variable was a term in a rule
michael@0: 
michael@0: 
michael@0: 
michael@0: #
michael@0: #  assign-end        This state is entered when the end of the expression on the
michael@0: #                    right hand side of an assignment is found.  We get here via
michael@0: #                    a pop; this state is pushed when the '=' in an assignment is found.
michael@0: #
michael@0: #                    The only thing allowed at this point is a ';'.  The RHS of an
michael@0: #                    assignment must look like a rule expression, and we come here
michael@0: #                    when what is being scanned no longer looks like an expression.
michael@0: #
michael@0: assign-end:
michael@0:     ';'                  n start                                    doEndAssign             # end of the assignment statement.
michael@0:     default                errorDeath                               doRuleErrorAssignExpr   # the right hand side was not followed by ';'.
michael@0:     
michael@0:     
michael@0:     
michael@0: #
michael@0: # errorDeath.   This state is specified as the next state whenever a syntax error
michael@0: #               in the source rules is detected.  Barring bugs, the state machine will never
michael@0: #               actually get here, but will stop because of the action associated with the error.
michael@0: #               But, just in case, this state asks the state machine to exit.
michael@0: errorDeath:
michael@0:     default              n errorDeath                               doExit   # matches any input; stays in this state and requests exit.
michael@0: 
michael@0: