intl/icu/source/common/rbbirpt.txt

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/rbbirpt.txt	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,315 @@
     1.4 +
     1.5 +#*****************************************************************************
     1.6 +#
     1.7 +#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
     1.8 +#   All Rights Reserved.
     1.9 +#
    1.10 +#*****************************************************************************
    1.11 +#
    1.12 +#  file:  rbbirpt.txt
    1.13 +#  ICU Break Iterator Rule Parser State Table
    1.14 +#
     1.15 +#     This state table is used when reading and parsing a set of RBBI rules.
    1.16 +#     The rule parser uses a state machine; the data in this file define the
    1.17 +#     state transitions that occur for each input character.
    1.18 +#
    1.19 +#     *** This file defines the RBBI rule grammar.   This is it.
    1.20 +#     *** The determination of what is accepted is here.
    1.21 +#
    1.22 +#     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
    1.23 +#     that are then built with the rule parser.
    1.24 +#
    1.25 +
    1.26 +#
    1.27 +# Here is the syntax of the state definitions in this file:
    1.28 +#
    1.29 +#
    1.30 +#StateName:
    1.31 +#   input-char           n next-state           ^push-state     action    
    1.32 +#   input-char           n next-state           ^push-state     action    
    1.33 +#       |                |   |                      |             |
    1.34 +#       |                |   |                      |             |--- action to be performed by state machine
    1.35 +#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
    1.36 +#       |                |   |                      |
    1.37 +#       |                |   |                      |--- Push this named state onto the state stack.
    1.38 +#       |                |   |                           Later, when next state is specified as "pop",
    1.39 +#       |                |   |                           the pushed state will become the current state.
    1.40 +#       |                |   |
    1.41 +#       |                |   |--- Transition to this state if the current input character matches the input
    1.42 +#       |                |        character or char class in the left hand column.  "pop" causes the next
    1.43 +#       |                |        state to be popped from the state stack.
    1.44 +#       |                |
    1.45 +#       |                |--- When making the state transition specified on this line, advance to the next
    1.46 +#       |                     character from the input only if 'n' appears here.
    1.47 +#       |
    1.48 +#       |--- Character or named character classes to test for.  If the current character being scanned
     1.49 +#            matches, perform the actions and go to the state specified on this line.
     1.50 +#            The input character is tested sequentially, in the order written.  The characters and
    1.51 +#            character classes tested for do not need to be mutually exclusive.  The first match wins.
    1.52 +#            
    1.53 +
    1.54 +
    1.55 +
    1.56 +
    1.57 +#
    1.58 +#  start state, scan position is at the beginning of the rules file, or in between two rules.
    1.59 +#
     1.60 +start:
     1.61 +    escaped                term                  ^break-rule-end    doExprStart      # escaped char begins a rule expression; it is left for term to consume
     1.62 +    white_space          n start                                                     # skip white space between rules
     1.63 +    '$'                    scan-var-name         ^assign-or-rule    doExprStart      # '$' is not consumed here; scan-var-name consumes it
     1.64 +    '!'                  n rev-option                                                # '!' consumed; rev-option tells a !!option from a !reverse rule
     1.65 +    ';'                  n start                                                  # ignore empty rules.
     1.66 +    eof                    exit              
     1.67 +    default                term                  ^break-rule-end    doExprStart      # anything else begins a rule expression; term decides whether it is valid
    1.68 +    
    1.69 +#
    1.70 +#  break-rule-end:  Returned from doing a break-rule expression.
    1.71 +#
     1.72 +break-rule-end:
     1.73 +    ';'	                 n start                                    doEndOfRule      # ';' terminates the rule; go back to the top level
     1.74 +    white_space          n break-rule-end
     1.75 +    default                errorDeath                               doRuleError      # anything other than white space or ';' after the expression is an error
    1.76 +     
    1.77 +
    1.78 +#
    1.79 +#   !               We've just scanned a '!', indicating either a !!key word flag or a
    1.80 +#                   !Reverse rule.
    1.81 +#
     1.82 +rev-option:
     1.83 +    '!'                  n option-scan1                                              # second '!' consumed: a !!option keyword follows
     1.84 +    default                reverse-rule           ^break-rule-end   doReverseDir     # single '!': a reverse rule follows; nothing is consumed here
    1.85 +    
     1.86 +option-scan1:
     1.87 +    name_start_char      n option-scan2                             doOptionStart    # first character of the option name
     1.88 +    default                errorDeath                               doRuleError      # "!!" must be followed by a name start character
    1.89 +    
     1.90 +option-scan2:
     1.91 +    name_char            n option-scan2                                              # consume the remaining characters of the option name
     1.92 +    default                option-scan3                             doOptionEnd      # first non-name character ends the name; it is not consumed here
    1.93 +    
     1.94 +option-scan3:
     1.95 +    ';'                  n start                                                     # option statement terminated; back to the top level
     1.96 +    white_space          n option-scan3 
     1.97 +    default                errorDeath                               doRuleError      # only white space and ';' may follow an option name
    1.98 +    
    1.99 +
    1.100 +reverse-rule:
    1.101 +    default                term                   ^break-rule-end   doExprStart      # NOTE(review): rev-option pushed break-rule-end on the way here, so it is pushed twice per reverse rule - confirm intended
   1.102 +    
   1.103 +    
   1.104 +#
   1.105 +#  term.  Eat through a single rule character, or a composite thing, which
   1.106 +#         could be a parenthesized expression, a variable name, or a Unicode Set.
   1.107 +#
    1.108 +term:
    1.109 +    escaped              n expr-mod                                 doRuleChar
    1.110 +    white_space          n term
    1.111 +    rule_char            n expr-mod                                 doRuleChar
    1.112 +    '['                    scan-unicode-set      ^expr-mod                           # '[' is left for scan-unicode-set; resume at expr-mod after the set
    1.113 +    '('                  n term                  ^expr-mod          doLParen         # nested expression; expr-mod is resumed when a ')' row pops it
    1.114 +    '$'                    scan-var-name         ^term-var-ref                       # '$' is left for scan-var-name; term-var-ref runs after the name
    1.115 +    '.'                  n expr-mod                                 doDotAny
    1.116 +    default                errorDeath                               doRuleError      # a term is required at this point
   1.117 +    
   1.118 +    
   1.119 +
   1.120 +#
   1.121 +#  term-var-ref   We've just finished scanning a reference to a $variable.
   1.122 +#                 Check that the variable was defined.
   1.123 +#                 The variable name scanning is in common with assignment statements,
   1.124 +#                 so the check can't be done there.
    1.125 +term-var-ref:
    1.126 +    default                expr-mod                                 doCheckVarDef    # no input consumed; after the check, continue as after any other term
   1.127 +    
   1.128 +    
   1.129 +#
   1.130 +#   expr-mod      We've just finished scanning a term, now look for the optional
   1.131 +#                 trailing '*', '?', '+'
   1.132 +#
    1.133 +expr-mod:
    1.134 +    white_space          n  expr-mod
    1.135 +    '*'                  n  expr-cont                               doUnaryOpStar
    1.136 +    '+'                  n  expr-cont                               doUnaryOpPlus
    1.137 +    '?'                  n  expr-cont                               doUnaryOpQuestion
    1.138 +    default                 expr-cont                                                # no trailing operator; the character is left for expr-cont
   1.139 +    
   1.140 +    
   1.141 +#
   1.142 +#  expr-cont      Expression, continuation.  At a point where additional terms are
   1.143 +#                                            allowed, but not required.
   1.144 +#
    1.145 +expr-cont:
    1.146 +    escaped                 term                                    doExprCatOperator   # another term follows; the character is left for term to consume
    1.147 +    white_space          n  expr-cont
    1.148 +    rule_char               term                                    doExprCatOperator
    1.149 +    '['                     term                                    doExprCatOperator
    1.150 +    '('                     term                                    doExprCatOperator
    1.151 +    '$'                     term                                    doExprCatOperator
    1.152 +    '.'                     term                                    doExprCatOperator
    1.153 +    '/'                     look-ahead                              doExprCatOperator   # '/' is left for look-ahead to consume
    1.154 +    '{'                  n  tag-open                                doExprCatOperator   # '{' consumed; tag-open scans the tag contents
    1.155 +    '|'                  n  term                                    doExprOrOperator    # '|' consumed; a term must follow (term rejects anything else)
    1.156 +    ')'                  n  pop                                     doExprRParen        # ')' consumed; resume at the state on top of the state stack
    1.157 +    default                 pop                                     doExprFinished      # not part of an expression; return to the pushed state without consuming
   1.158 +    
   1.159 +
   1.160 +#
   1.161 +#   look-ahead    Scanning a '/', which identifies a break point, assuming that the
   1.162 +#                 remainder of the expression matches.
   1.163 +#
   1.164 +#                 Generate a parse tree as if this was a special kind of input symbol
   1.165 +#                 appearing in an otherwise normal concatenation expression.
   1.166 +#
    1.167 +look-ahead:
    1.168 +    '/'                   n expr-cont-no-slash                      doSlash             # consume the '/'; the next state has no row for another '/'
    1.169 +    default                 errorDeath                                                  # the rows that lead here all match '/', so this is a safety net
   1.170 +
   1.171 +
   1.172 +#
   1.173 +#  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
   1.174 +#                                            allowed, but not required.  Just like
   1.175 +#                                            expr-cont, above, except that no '/'
   1.176 +#                                            look-ahead symbol is permitted.
   1.177 +#
   1.178 +expr-cont-no-slash:
   1.179 +    escaped                 term                                    doExprCatOperator
   1.180 +    white_space          n  expr-cont
   1.181 +    rule_char               term                                    doExprCatOperator
   1.182 +    '['                     term                                    doExprCatOperator
   1.183 +    '('                     term                                    doExprCatOperator
   1.184 +    '$'                     term                                    doExprCatOperator
   1.185 +    '.'                     term                                    doExprCatOperator
   1.186 +    '|'                  n  term                                    doExprOrOperator
   1.187 +    ')'                  n  pop                                     doExprRParen
   1.188 +    default                 pop                                     doExprFinished
   1.189 +
   1.190 +
   1.191 +#
   1.192 +#   tags             scanning a '{', the opening delimiter for a tag that identifies
   1.193 +#                    the kind of match.  Scan the whole {dddd} tag, where d=digit
   1.194 +#
    1.195 +tag-open:
    1.196 +    white_space          n  tag-open
    1.197 +    digit_char              tag-value                               doStartTagValue     # first digit is not consumed here; tag-value consumes it
    1.198 +    default                 errorDeath                              doTagExpectedError  # '{' must be followed by a digit
   1.199 +    
    1.200 +tag-value:
    1.201 +    white_space          n  tag-close                                                   # white space ends the digits; tag-close accepts only white space or '}'
    1.202 +    '}'                     tag-close                                                   # '}' is left for tag-close to consume
    1.203 +    digit_char           n  tag-value                               doTagDigit
    1.204 +    default                 errorDeath                              doTagExpectedError
   1.205 +    
    1.206 +tag-close:
    1.207 +    white_space          n  tag-close
    1.208 +    '}'                  n  expr-cont-no-tag                        doTagValue          # tag complete; continue where a second tag is not accepted
    1.209 +    default                 errorDeath                              doTagExpectedError
   1.210 +    
   1.211 +    
   1.212 +    
   1.213 +#
   1.214 +#  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
   1.215 +#                                            allowed, but not required.  Just like
   1.216 +#                                            expr-cont, above, except that no "{ddd}"
   1.217 +#                                            tagging is permitted.
   1.218 +#
    1.219 +expr-cont-no-tag:
    1.220 +    escaped                 term                                    doExprCatOperator   # another term follows; the character is left for term to consume
    1.221 +    white_space          n  expr-cont-no-tag                                            # stay here so that a '{' after white space is still not accepted
    1.222 +    rule_char               term                                    doExprCatOperator
    1.223 +    '['                     term                                    doExprCatOperator
    1.224 +    '('                     term                                    doExprCatOperator
    1.225 +    '$'                     term                                    doExprCatOperator
    1.226 +    '.'                     term                                    doExprCatOperator
    1.227 +    '/'                     look-ahead                              doExprCatOperator   # '/' is left for look-ahead to consume
    1.228 +    '|'                  n  term                                    doExprOrOperator    # '|' consumed; a term must follow (term rejects anything else)
    1.229 +    ')'                  n  pop                                     doExprRParen        # ')' consumed; resume at the state on top of the state stack
    1.230 +    default                 pop                                     doExprFinished      # includes '{': return to the pushed state without consuming
   1.231 +    
   1.232 +    
   1.233 +
   1.234 +
   1.235 +#
   1.236 +#   Variable Name Scanning.
   1.237 +#
   1.238 +#                    The state that branched to here must have pushed a return state
   1.239 +#                    to go to after completion of the variable name scanning.
   1.240 +#
   1.241 +#                    The current input character must be the $ that introduces the name.
    1.242 +#                    The $ is consumed here rather than in the state that first detected it
   1.243 +#                    so that the doStartVariableName action only needs to happen in one
   1.244 +#                    place (here), and the other states don't need to worry about it.
   1.245 +#
    1.246 +scan-var-name:
    1.247 +   '$'                  n scan-var-start                            doStartVariableName   # consume the '$' that the branching state left in place
    1.248 +   default                errorDeath                                                      # the rows that lead here all match '$', so this is a safety net
   1.249 +
   1.250 +
    1.251 +scan-var-start:
    1.252 +    name_start_char      n scan-var-body                                                  # first character of the variable name
    1.253 +    default                errorDeath                               doVariableNameExpectedErr   # '$' must be followed by a name start character
   1.254 +    
    1.255 +scan-var-body:
    1.256 +    name_char            n scan-var-body                                                  # consume the remaining characters of the name
    1.257 +    default                pop                                      doEndVariableName     # first non-name character ends the name; return to the pushed state without consuming it
   1.258 +    
   1.259 +    
   1.260 +    
   1.261 +#
    1.262 +#  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
    1.263 +#                     Within the RBBI parser, after finding the first character
    1.264 +#                     of a Unicode Set, we just hand the rule input at that
    1.265 +#                     point off to the Unicode Set constructor, then pick
   1.266 +#                     up parsing after the close of the set.
   1.267 +#
   1.268 +#                     The action for this state invokes the UnicodeSet parser.
   1.269 +#
    1.270 +scan-unicode-set:
    1.271 +    '['                   n pop                                      doScanUnicodeSet   # return to the state pushed by the branching state
    1.272 +    'p'                   n pop                                      doScanUnicodeSet   # NOTE(review): in this table only term's '[' row leads here, so the 'p'/'P' rows look unreachable - confirm
    1.273 +    'P'                   n pop                                      doScanUnicodeSet
    1.274 +    default		    errorDeath 
   1.275 +    
   1.276 +    
   1.277 +
   1.278 +
   1.279 +
   1.280 +
   1.281 +
   1.282 +#
   1.283 +#  assign-or-rule.   A $variable was encountered at the start of something, could be
   1.284 +#                    either an assignment statement or a rule, depending on whether an '='
   1.285 +#                    follows the variable name.  We get to this state when the variable name
   1.286 +#                    scanning does a return.
   1.287 +#
    1.288 +assign-or-rule:
    1.289 +    white_space          n assign-or-rule                                                 # white space may separate the variable name from what follows
    1.290 +    '='                  n term                  ^assign-end        doStartAssign   # variable was target of assignment
    1.291 +    default                term-var-ref          ^break-rule-end                    # variable was a term in a rule
   1.292 +
   1.293 +
   1.294 +
   1.295 +#
   1.296 +#  assign-end        This state is entered when the end of the expression on the
   1.297 +#                    right hand side of an assignment is found.  We get here via
   1.298 +#                    a pop; this state is pushed when the '=' in an assignment is found.
   1.299 +#
   1.300 +#                    The only thing allowed at this point is a ';'.  The RHS of an
   1.301 +#                    assignment must look like a rule expression, and we come here
   1.302 +#                    when what is being scanned no longer looks like an expression.
   1.303 +#
    1.304 +assign-end:
    1.305 +    ';'                  n start                                    doEndAssign             # assignment complete; back to the top level
    1.306 +    default                errorDeath                               doRuleErrorAssignExpr   # the right-hand side was not followed by ';'
   1.307 +    
   1.308 +    
   1.309 +    
   1.310 +#
   1.311 +# errorDeath.   This state is specified as the next state whenever a syntax error
   1.312 +#               in the source rules is detected.  Barring bugs, the state machine will never
   1.313 +#               actually get here, but will stop because of the action associated with the error.
   1.314 +#               But, just in case, this state asks the state machine to exit.
    1.315 +errorDeath:
    1.316 +    default              n errorDeath                               doExit             # matches any character, stays in this state and runs doExit
   1.317 +
   1.318 +

mercurial