intl/icu/source/i18n/regexcst.txt

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/regexcst.txt	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,467 @@
     1.4 +
     1.5 +#*****************************************************************************
     1.6 +#
     1.7 +#   Copyright (C) 2002-2007, International Business Machines Corporation and others.
     1.8 +#   All Rights Reserved.
     1.9 +#
    1.10 +#*****************************************************************************
    1.11 +#
    1.12 +#  file:  regexcst.txt
    1.13 +#  ICU Regular Expression Parser State Table
    1.14 +#
    1.15 +#     This state table is used when reading and parsing a regular expression pattern
    1.16 +#     The pattern parser uses a state machine; the data in this file define the
    1.17 +#     state transitions that occur for each input character.
    1.18 +#
    1.19 +#     *** This file defines the regex pattern grammar.   This is it.
    1.20 +#     *** The determination of what is accepted is here.
    1.21 +#
    1.22 +#     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
    1.23 +#     that are then built with the rule parser.
    1.24 +#
    1.25 +
    1.26 +#
    1.27 +# Here is the syntax of the state definitions in this file:
    1.28 +#
    1.29 +#
    1.30 +#StateName:
    1.31 +#   input-char           n next-state           ^push-state     action
    1.32 +#   input-char           n next-state           ^push-state     action
    1.33 +#       |                |   |                      |             |
    1.34 +#       |                |   |                      |             |--- action to be performed by state machine
    1.35 +#       |                |   |                      |                  See function RegexCompile::doParseActions()
    1.36 +#       |                |   |                      |
    1.37 +#       |                |   |                      |--- Push this named state onto the state stack.
    1.38 +#       |                |   |                           Later, when next state is specified as "pop",
    1.39 +#       |                |   |                           the pushed state will become the current state.
    1.40 +#       |                |   |
    1.41 +#       |                |   |--- Transition to this state if the current input character matches the input
    1.42 +#       |                |        character or char class in the left hand column.  "pop" causes the next
    1.43 +#       |                |        state to be popped from the state stack.
    1.44 +#       |                |
    1.45 +#       |                |--- When making the state transition specified on this line, advance to the next
    1.46 +#       |                     character from the input only if 'n' appears here.
    1.47 +#       |
    1.48 +#       |--- Character or named character classes to test for.  If the current character being scanned
    1.49 +#            matches, perform the actions and go to the state specified on this line.
    1.50 +#            The input character is tested sequentially, in the order written.  The characters and
    1.51 +#            character classes tested for do not need to be mutually exclusive.  The first match wins.
    1.52 +#
    1.53 +
    1.54 +
    1.55 +
    1.56 +
    1.57 +#
    1.58 +#  start state, scan position is at the beginning of the pattern.
    1.59 +#
    1.60 +start:
    1.61 +   default                 term                                     doPatStart
    1.62 +
    1.63 +
    1.64 +
    1.65 +
    1.66 +#
    1.67 +#  term.  At a position where we can accept the start of most items in a pattern.
    1.68 +#
    1.69 +term:
    1.70 +    quoted               n expr-quant                               doLiteralChar
    1.71 +    rule_char            n expr-quant                               doLiteralChar
    1.72 +    '['                  n set-open       ^set-finish               doSetBegin
    1.73 +    '('                  n open-paren
    1.74 +    '.'                  n expr-quant                               doDotAny
    1.75 +    '^'                  n expr-quant                               doCaret
    1.76 +    '$'                  n expr-quant                               doDollar
    1.77 +    '\'                  n backslash
    1.78 +    '|'                  n  term                                    doOrOperator
    1.79 +    ')'                  n  pop                                     doCloseParen
    1.80 +    eof	                   term                                     doPatFinish
    1.81 +    default                errorDeath                               doRuleError
    1.82 +
    1.83 +
    1.84 +
    1.85 +#
    1.86 +#   expr-quant    We've just finished scanning a term, now look for the optional
    1.87 +#                 trailing quantifier - *, +, ?, *?,  etc.
    1.88 +#
    1.89 +expr-quant:
    1.90 +    '*'                  n  quant-star
    1.91 +    '+'                  n  quant-plus
    1.92 +    '?'                  n  quant-opt
    1.93 +    '{'                  n  interval-open                          doIntervalInit
    1.94 +    '('                  n  open-paren-quant
    1.95 +    default                 expr-cont
    1.96 +
    1.97 +
    1.98 +#
    1.99 +#  expr-cont      Expression, continuation.  At a point where additional terms are
   1.100 +#                                            allowed, but not required.  No Quantifiers
   1.101 +#
   1.102 +expr-cont:
   1.103 +    '|'                  n  term                                    doOrOperator
   1.104 +    ')'                  n  pop                                     doCloseParen
   1.105 +    default                 term
   1.106 +
   1.107 +
   1.108 +#
   1.109 +#   open-paren-quant   Special case handling for comments appearing before a quantifier,
   1.110 +#                        e.g.   x(?#comment )*
   1.111 +#                      Open parens from expr-quant come here; anything but a (?# comment
   1.112 +#                      branches into the normal parenthesis sequence as quickly as possible.
   1.113 +#
   1.114 +open-paren-quant:
   1.115 +    '?'                  n  open-paren-quant2                      doSuppressComments
   1.116 +    default                 open-paren
   1.117 +
   1.118 +open-paren-quant2:
   1.119 +    '#'                  n  paren-comment   ^expr-quant
   1.120 +    default                 open-paren-extended
   1.121 +
   1.122 +
   1.123 +#
   1.124 +#   open-paren    We've got an open paren.  We need to scan further to
   1.125 +#                 determine what kind of group it is - plain (, (?:, (?>, or whatever.
   1.126 +#
   1.127 +open-paren:
   1.128 +    '?'                  n  open-paren-extended                     doSuppressComments
   1.129 +    default                 term            ^expr-quant             doOpenCaptureParen
   1.130 +
   1.131 +open-paren-extended:
   1.132 +    ':'                  n  term            ^expr-quant             doOpenNonCaptureParen  #  (?:
   1.133 +    '>'                  n  term            ^expr-quant             doOpenAtomicParen      #  (?>
   1.134 +    '='                  n  term            ^expr-cont              doOpenLookAhead        #  (?=
   1.135 +    '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!
   1.136 +    '<'                  n  open-paren-lookbehind                                          #  (?<   must be followed by '=' or '!'
   1.137 +    '#'                  n  paren-comment   ^term                                          #  (?#   comment; resumes in term after the ')'
   1.138 +    'i'                     paren-flag                              doBeginMatchMode       #  (?i   flag rows omit 'n', so paren-flag sees the same char
   1.139 +    'd'                     paren-flag                              doBeginMatchMode       #  (?d
   1.140 +    'm'                     paren-flag                              doBeginMatchMode       #  (?m
   1.141 +    's'                     paren-flag                              doBeginMatchMode       #  (?s
   1.142 +    'u'                     paren-flag                              doBeginMatchMode       #  (?u
   1.143 +    'w'                     paren-flag                              doBeginMatchMode       #  (?w
   1.144 +    'x'                     paren-flag                              doBeginMatchMode       #  (?x
   1.145 +    '-'                     paren-flag                              doBeginMatchMode       #  (?-
   1.146 +    '('                  n  errorDeath                              doConditionalExpr      #  (?(   always goes to errorDeath
   1.147 +    '{'                  n  errorDeath                              doPerlInline           #  (?{   always goes to errorDeath
   1.148 +    default                 errorDeath                              doBadOpenParenType     #  (? followed by anything else
   1.149 +
   1.150 +open-paren-lookbehind:
   1.151 +    '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
   1.152 +    '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
   1.153 +    default                 errorDeath                              doBadOpenParenType
   1.154 +
   1.155 +
   1.156 +#
   1.157 +#   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
   1.158 +#
   1.159 +paren-comment:
   1.160 +    ')'                  n  pop
   1.161 +    eof		                errorDeath                              doMismatchedParenErr
   1.162 +    default              n  paren-comment
   1.163 +
   1.164 +#
   1.165 +#  paren-flag    Scanned a (?ismx-ismx  flag setting
   1.166 +#
   1.167 +paren-flag:
   1.168 +    'i'                  n  paren-flag                              doMatchMode
   1.169 +    'd'                  n  paren-flag                              doMatchMode
   1.170 +    'm'                  n  paren-flag                              doMatchMode
   1.171 +    's'                  n  paren-flag                              doMatchMode
   1.172 +    'u'                  n  paren-flag                              doMatchMode
   1.173 +    'w'                  n  paren-flag                              doMatchMode
   1.174 +    'x'                  n  paren-flag                              doMatchMode
   1.175 +    '-'                  n  paren-flag                              doMatchMode
   1.176 +    ')'                  n  term                                    doSetMatchMode         #  (?ismx-ismx)   nothing pushed; continues in term
   1.177 +    ':'                  n  term              ^expr-quant           doMatchModeParen       #  (?ismx-ismx:   pushes expr-quant for the closing ')'
   1.178 +    default                 errorDeath                              doBadModeFlag
   1.179 +
   1.180 +
   1.181 +#
   1.182 +#  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
   1.183 +#                 between plain '*', '*?', '*+'
   1.184 +#
   1.185 +quant-star:
   1.186 +     '?'                 n  expr-cont                               doNGStar               #  *?
   1.187 +     '+'                 n  expr-cont                               doPossessiveStar       #  *+
   1.188 +     default                expr-cont                               doStar                 #  *
   1.189 +
   1.190 +
   1.191 +#
   1.192 +#  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
   1.193 +#                 between plain '+', '+?', '++'
   1.194 +#
   1.195 +quant-plus:
   1.196 +     '?'                 n  expr-cont                               doNGPlus               #  +?
   1.197 +     '+'                 n  expr-cont                               doPossessivePlus       #  ++
   1.198 +     default                expr-cont                               doPlus                 #  +
   1.199 +
   1.200 +
   1.201 +#
   1.202 +#  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
   1.203 +#                  between plain '?', '??', '?+'
   1.204 +#
   1.205 +quant-opt:
   1.206 +     '?'                 n  expr-cont                               doNGOpt                 #  ??
   1.207 +     '+'                 n  expr-cont                               doPossessiveOpt         #  ?+
   1.208 +     default                expr-cont                               doOpt                   #  ?
   1.209 +
   1.210 +
   1.211 +#
   1.212 +#   Interval         scanning a '{', the opening delimiter for an interval specification
   1.213 +#                                   {number} or {min, max} or {min,}
   1.214 +#
   1.215 +interval-open:
   1.216 +    digit_char              interval-lower
   1.217 +    default                 errorDeath                              doIntervalError
   1.218 +
   1.219 +interval-lower:
   1.220 +    digit_char           n  interval-lower                          doIntevalLowerDigit
   1.221 +    ','			         n  interval-upper
   1.222 +    '}'                  n  interval-type                           doIntervalSame             # {n}
   1.223 +    default                 errorDeath                              doIntervalError
   1.224 +
   1.225 +interval-upper:
   1.226 +    digit_char           n  interval-upper                          doIntervalUpperDigit
   1.227 +    '}'                  n  interval-type
   1.228 +    default                 errorDeath                              doIntervalError
   1.229 +
   1.230 +interval-type:
   1.231 +    '?'                  n  expr-cont                               doNGInterval                # {n,m}?
   1.232 +    '+'                  n  expr-cont                               doPossessiveInterval        # {n,m}+
   1.233 +    default                 expr-cont                               doInterval                  # {n,m}
   1.234 +
   1.235 +
   1.236 +#
   1.237 +#  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
   1.238 +#                                  The low level next-char function will have preprocessed
   1.239 +#                                  some of them already; those won't come here.
   1.240 +backslash:
   1.241 +   'A'                   n  term                                    doBackslashA
   1.242 +   'B'                   n  term                                    doBackslashB
   1.243 +   'b'                   n  term                                    doBackslashb
   1.244 +   'd'                   n  expr-quant                              doBackslashd
   1.245 +   'D'                   n  expr-quant                              doBackslashD
   1.246 +   'G'                   n  term                                    doBackslashG
   1.247 +   'N'                      expr-quant                              doNamedChar      #   \N{NAME}  named char
   1.248 +   'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
   1.249 +   'P'                      expr-quant                              doProperty
   1.250 +   'Q'                   n  term                                    doEnterQuoteMode
   1.251 +   'S'                   n  expr-quant                              doBackslashS
   1.252 +   's'                   n  expr-quant                              doBackslashs
   1.253 +   'W'                   n  expr-quant                              doBackslashW
   1.254 +   'w'                   n  expr-quant                              doBackslashw
   1.255 +   'X'                   n  expr-quant                              doBackslashX
   1.256 +   'Z'                   n  term                                    doBackslashZ
   1.257 +   'z'                   n  term                                    doBackslashz
   1.258 +   digit_char            n  expr-quant                              doBackRef         #  Will scan multiple digits
   1.259 +   eof                      errorDeath                              doEscapeError
   1.260 +   default               n  expr-quant                              doEscapedLiteralChar
   1.261 +
   1.262 +
   1.263 +
   1.264 +#
   1.265 +# [set expression] parsing,
   1.266 +#    All states involved in parsing set expressions have names beginning with "set-"
   1.267 +#
   1.268 +
   1.269 +set-open:
   1.270 +   '^'                   n  set-open2                               doSetNegate
   1.271 +   ':'                      set-posix                               doSetPosixProp
   1.272 +   default                  set-open2
   1.273 +
   1.274 +set-open2:
   1.275 +   ']'                   n  set-after-lit                           doSetLiteral
   1.276 +   default                  set-start
   1.277 +
   1.278 +#  set-posix:
   1.279 +#                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
   1.280 +#                  moved the scan to the closing ']'.  If it wasn't a property
   1.281 +#                  expression, the scan will still be at the opening ':', which should
   1.282 +#                  be interpreted as a normal set expression.
   1.283 +set-posix:
   1.284 +    ']'                  n   pop                                    doSetEnd
   1.285 +    ':'                      set-start
   1.286 +    default                  errorDeath                             doRuleError  # should not be possible.
   1.287 +
   1.288 +#
   1.289 +#   set-start   after the [ and special case leading characters (^ and/or ]) but before
   1.290 +#               everything else.   A '-' is literal at this point.
   1.291 +#
   1.292 +set-start:
   1.293 +    ']'                  n  pop                                     doSetEnd
   1.294 +    '['                  n  set-open      ^set-after-set            doSetBeginUnion
   1.295 +    '\'                  n  set-escape
   1.296 +    '-'                  n  set-start-dash
   1.297 +    '&'                  n  set-start-amp
   1.298 +    default              n  set-after-lit                           doSetLiteral
   1.299 +
   1.300 +#    set-start-dash    Turn "[--" into a syntax error.
   1.301 +#                           "[-x" is good, - and x are literals.
   1.302 +#
   1.303 +set-start-dash:
   1.304 +    '-'                     errorDeath                              doRuleError
   1.305 +    default                 set-after-lit                           doSetAddDash
   1.306 +
   1.307 +#    set-start-amp     Turn "[&&" into a syntax error.
   1.308 +#                           "[&x" is good, & and x are literals.
   1.309 +#
   1.310 +set-start-amp:
   1.311 +    '&'                     errorDeath                              doRuleError
   1.312 +    default                 set-after-lit                           doSetAddAmp
   1.313 +
   1.314 +#
   1.315 +#   set-after-lit    The last thing scanned was a literal character within a set.
   1.316 +#                    Can be followed by anything.  Single '-' or '&' are
   1.317 +#                    literals in this context, not operators.
   1.318 +set-after-lit:
   1.319 +    ']'                  n  pop                                     doSetEnd
   1.320 +    '['                  n  set-open      ^set-after-set            doSetBeginUnion
   1.321 +    '-'                  n  set-lit-dash
   1.322 +    '&'                  n  set-lit-amp
   1.323 +    '\'                  n  set-escape
   1.324 +    eof                     errorDeath                              doSetNoCloseError
   1.325 +    default              n  set-after-lit                           doSetLiteral
   1.326 +
   1.327 +set-after-set:
   1.328 +    ']'                  n  pop                                     doSetEnd
   1.329 +    '['                  n  set-open      ^set-after-set            doSetBeginUnion
   1.330 +    '-'                  n  set-set-dash
   1.331 +    '&'                  n  set-set-amp
   1.332 +    '\'                  n  set-escape
   1.333 +    eof                     errorDeath                              doSetNoCloseError
   1.334 +    default              n  set-after-lit                           doSetLiteral
   1.335 +
   1.336 +set-after-range:
   1.337 +    ']'                  n  pop                                     doSetEnd
   1.338 +    '['                  n  set-open      ^set-after-set            doSetBeginUnion
   1.339 +    '-'                  n  set-range-dash
   1.340 +    '&'                  n  set-range-amp
   1.341 +    '\'                  n  set-escape
   1.342 +    eof                     errorDeath                              doSetNoCloseError
   1.343 +    default              n  set-after-lit                           doSetLiteral
   1.344 +    
   1.345 +
   1.346 +# set-after-op
   1.347 +#     After a --  or &&
   1.348 +#     It is an error to close a set at this point.
   1.349 +#
   1.350 +set-after-op:
   1.351 +    '['                  n  set-open         ^set-after-set         doSetBeginUnion
   1.352 +    ']'                     errorDeath                              doSetOpError
   1.353 +    '\'                  n  set-escape
   1.354 +    default              n  set-after-lit                           doSetLiteral
   1.355 +
   1.356 +#
   1.357 +#   set-set-amp
   1.358 +#      Have scanned [[set]&
   1.359 +#      Could be a '&' intersection operator, if a set follows.
   1.360 +#      Could be the start of a '&&' operator.
   1.361 +#      Otherwise it is a literal.
   1.362 +set-set-amp:
   1.363 +    '['                  n  set-open      ^set-after-set           doSetBeginIntersection1
   1.364 +    '&'                  n  set-after-op                           doSetIntersection2
   1.365 +    default                 set-after-lit                          doSetAddAmp
   1.366 +
   1.367 +
   1.368 +# set-lit-amp   Have scanned "[literals&"
   1.369 +#               Could be a start of "&&" operator or a literal
   1.370 +#               In [abc&[def]],   the '&' is a literal
   1.371 +#
   1.372 +set-lit-amp:
   1.373 +    '&'                  n  set-after-op                            doSetIntersection2
   1.374 +    default                 set-after-lit                           doSetAddAmp
   1.375 +
   1.376 +
   1.377 +#
   1.378 +#  set-set-dash
   1.379 +#      Have scanned [set]-
   1.380 +#      Could be a '-' difference operator, if a [set] follows.
   1.381 +#      Could be the start of a '--' operator.
   1.382 +#      Otherwise it is a literal.
   1.383 +set-set-dash:
   1.384 +    '['                  n  set-open      ^set-after-set           doSetBeginDifference1
   1.385 +    '-'                  n  set-after-op                           doSetDifference2
   1.386 +    default                 set-after-lit                          doSetAddDash
   1.387 +
   1.388 +
   1.389 +#
   1.390 +#  set-range-dash
   1.391 +#      scanned  a-b-  or \w-
   1.392 +#         any set or range like item where the trailing single '-' should
   1.393 +#         be literal, not a set difference operation.
   1.394 +#         A trailing "--" is still a difference operator.
   1.395 +set-range-dash:
   1.396 +    '-'                  n  set-after-op                           doSetDifference2
   1.397 +    default                 set-after-lit                          doSetAddDash
   1.398 +
   1.399 +
   1.400 +set-range-amp:
   1.401 +    '&'                  n  set-after-op                           doSetIntersection2
   1.402 +    default                 set-after-lit                          doSetAddAmp
   1.403 +
   1.404 +
   1.405 +#  set-lit-dash
   1.406 +#     Have scanned "[literals-" Could be a range or a -- operator or a literal
   1.407 +#     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
   1.408 +#        [abc-\p{xx}  the '-' is an error
   1.409 +#        [abc-]       the '-' is a literal
   1.410 +#        [ab-xy]      the '-' is a range
   1.411 +#
   1.412 +set-lit-dash:
   1.413 +    '-'                  n  set-after-op                            doSetDifference2
   1.414 +    '['                     set-after-lit                           doSetAddDash
   1.415 +    ']'                     set-after-lit                           doSetAddDash
   1.416 +    '\'                  n  set-lit-dash-escape
   1.417 +    default              n  set-after-range                         doSetRange
   1.418 +
   1.419 +# set-lit-dash-escape
   1.420 +#
   1.421 +#    scanned "[literal-\"
   1.422 +#    Could be a range, if the \ introduces an escaped literal char or a named char.
   1.423 +#    Otherwise it is an error.
   1.424 +#
   1.425 +set-lit-dash-escape:
   1.426 +   's'                      errorDeath                             doSetOpError
   1.427 +   'S'                      errorDeath                             doSetOpError
   1.428 +   'w'                      errorDeath                             doSetOpError
   1.429 +   'W'                      errorDeath                             doSetOpError
   1.430 +   'd'                      errorDeath                             doSetOpError
   1.431 +   'D'                      errorDeath                             doSetOpError
   1.432 +   'N'                      set-after-range                        doSetNamedRange
   1.433 +   default               n  set-after-range                        doSetRange
   1.434 +
   1.435 +   
   1.436 +#
   1.437 +#  set-escape
   1.438 +#       Common back-slash escape processing within set expressions
   1.439 +#
   1.440 +set-escape:
   1.441 +   'p'                      set-after-set                           doSetProp
   1.442 +   'P'                      set-after-set                           doSetProp
   1.443 +   'N'                      set-after-lit                           doSetNamedChar
   1.444 +   's'                   n  set-after-range                         doSetBackslash_s
   1.445 +   'S'                   n  set-after-range                         doSetBackslash_S
   1.446 +   'w'                   n  set-after-range                         doSetBackslash_w
   1.447 +   'W'                   n  set-after-range                         doSetBackslash_W
   1.448 +   'd'                   n  set-after-range                         doSetBackslash_d
   1.449 +   'D'                   n  set-after-range                         doSetBackslash_D
   1.450 +   default               n  set-after-lit                           doSetLiteralEscaped 
   1.451 +
   1.452 +#
   1.453 +# set-finish
   1.454 +#     Have just encountered the final ']' that completes a [set], and
   1.455 +#     arrived here via a pop.  From here, we exit the set parsing world, and go
   1.456 +#     back to generic regular expression parsing.
   1.457 +#
   1.458 +set-finish:
   1.459 +    default                 expr-quant                              doSetFinish
   1.460 +
   1.461 +
   1.462 +#
   1.463 +# errorDeath.   This state is specified as the next state whenever a syntax error
   1.464 +#               in the pattern is detected.  Barring bugs, the state machine will never
   1.465 +#               actually get here, but will stop because of the action associated with the error.
   1.466 +#               But, just in case, this state asks the state machine to exit.
   1.467 +errorDeath:
   1.468 +    default              n errorDeath                               doExit
   1.469 +
   1.470 +

mercurial