1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/regexcst.txt Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,467 @@ 1.4 + 1.5 +#***************************************************************************** 1.6 +# 1.7 +# Copyright (C) 2002-2007, International Business Machines Corporation and others. 1.8 +# All Rights Reserved. 1.9 +# 1.10 +#***************************************************************************** 1.11 +# 1.12 +# file: regexcst.txt 1.13 +# ICU Regular Expression Parser State Table 1.14 +# 1.15 +# This state table is used when reading and parsing a regular expression pattern. 1.16 +# The pattern parser uses a state machine; the data in this file define the 1.17 +# state transitions that occur for each input character. 1.18 +# 1.19 +# *** This file defines the regex pattern grammar. This is it. 1.20 +# *** The determination of what is accepted is here. 1.21 +# 1.22 +# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays 1.23 +# that are then built with the regular expression pattern parser. 1.24 +# 1.25 + 1.26 +# 1.27 +# Here is the syntax of the state definitions in this file: 1.28 +# 1.29 +# 1.30 +#StateName: 1.31 +# input-char n next-state ^push-state action 1.32 +# input-char n next-state ^push-state action 1.33 +# | | | | | 1.34 +# | | | | |--- action to be performed by state machine 1.35 +# | | | | See function RegexCompile::doParseActions() 1.36 +# | | | | 1.37 +# | | | |--- Push this named state onto the state stack. 1.38 +# | | | Later, when next state is specified as "pop", 1.39 +# | | | the pushed state will become the current state. 1.40 +# | | | 1.41 +# | | |--- Transition to this state if the current input character matches the input 1.42 +# | | character or char class in the left hand column. "pop" causes the next 1.43 +# | | state to be popped from the state stack. 
1.44 +# | | 1.45 +# | |--- When making the state transition specified on this line, advance to the next 1.46 +# | character from the input only if 'n' appears here. 1.47 +# | 1.48 +# |--- Character or named character classes to test for. If the current character being scanned 1.49 +# matches, perform the actions and go to the state specified on this line. 1.50 +# The input character is tested sequentially, in the order written. The characters and 1.51 +# character classes tested for do not need to be mutually exclusive. The first match wins. 1.52 +# 1.53 + 1.54 + 1.55 + 1.56 + 1.57 +# 1.58 +# start state, scan position is at the beginning of the pattern. 1.59 +# 1.60 +start: 1.61 + default term doPatStart 1.62 + 1.63 + 1.64 + 1.65 + 1.66 +# 1.67 +# term. At a position where we can accept the start of most items in a pattern. 1.68 +# 1.69 +term: 1.70 + quoted n expr-quant doLiteralChar 1.71 + rule_char n expr-quant doLiteralChar 1.72 + '[' n set-open ^set-finish doSetBegin 1.73 + '(' n open-paren 1.74 + '.' n expr-quant doDotAny 1.75 + '^' n expr-quant doCaret 1.76 + '$' n expr-quant doDollar 1.77 + '\' n backslash 1.78 + '|' n term doOrOperator 1.79 + ')' n pop doCloseParen 1.80 + eof term doPatFinish 1.81 + default errorDeath doRuleError 1.82 + 1.83 + 1.84 + 1.85 +# 1.86 +# expr-quant We've just finished scanning a term, now look for the optional 1.87 +# trailing quantifier - *, +, ?, *?, etc. 1.88 +# 1.89 +expr-quant: 1.90 + '*' n quant-star 1.91 + '+' n quant-plus 1.92 + '?' n quant-opt 1.93 + '{' n interval-open doIntervalInit 1.94 + '(' n open-paren-quant 1.95 + default expr-cont 1.96 + 1.97 + 1.98 +# 1.99 +# expr-cont Expression, continuation. At a point where additional terms are 1.100 +# allowed, but not required. 
No Quantifiers 1.101 +# 1.102 +expr-cont: 1.103 + '|' n term doOrOperator 1.104 + ')' n pop doCloseParen 1.105 + default term 1.106 + 1.107 + 1.108 +# 1.109 +# open-paren-quant Special case handling for comments appearing before a quantifier, 1.110 +# e.g. x(?#comment )* 1.111 +# Open parens from expr-quant come here; anything but a (?# comment 1.112 +# branches into the normal parenthesis sequence as quickly as possible. 1.113 +# 1.114 +open-paren-quant: 1.115 + '?' n open-paren-quant2 doSuppressComments 1.116 + default open-paren 1.117 + 1.118 +open-paren-quant2: 1.119 + '#' n paren-comment ^expr-quant 1.120 + default open-paren-extended 1.121 + 1.122 + 1.123 +# 1.124 +# open-paren We've got an open paren. We need to scan further to 1.125 +# determine what kind of paren it is - plain (, (?:, (?>, or whatever. 1.126 +# 1.127 +open-paren: 1.128 + '?' n open-paren-extended doSuppressComments 1.129 + default term ^expr-quant doOpenCaptureParen 1.130 + 1.131 +open-paren-extended: 1.132 + ':' n term ^expr-quant doOpenNonCaptureParen # (?: 1.133 + '>' n term ^expr-quant doOpenAtomicParen # (?> 1.134 + '=' n term ^expr-cont doOpenLookAhead # (?= 1.135 + '!' n term ^expr-cont doOpenLookAheadNeg # (?! 1.136 + '<' n open-paren-lookbehind # (?< 1.137 + '#' n paren-comment ^term # (?# 1.138 + 'i' paren-flag doBeginMatchMode 1.139 + 'd' paren-flag doBeginMatchMode 1.140 + 'm' paren-flag doBeginMatchMode 1.141 + 's' paren-flag doBeginMatchMode 1.142 + 'u' paren-flag doBeginMatchMode 1.143 + 'w' paren-flag doBeginMatchMode 1.144 + 'x' paren-flag doBeginMatchMode 1.145 + '-' paren-flag doBeginMatchMode 1.146 + '(' n errorDeath doConditionalExpr 1.147 + '{' n errorDeath doPerlInline 1.148 + default errorDeath doBadOpenParenType 1.149 + 1.150 +open-paren-lookbehind: 1.151 + '=' n term ^expr-cont doOpenLookBehind # (?<= 1.152 + '!' n term ^expr-cont doOpenLookBehindNeg # (?<! 1.153 + default errorDeath doBadOpenParenType 1.154 + 1.155 + 1.156 +# 1.157 +# paren-comment We've got a (?# ... 
) style comment. Eat pattern text till we get to the ')' 1.158 +# 1.159 +paren-comment: 1.160 + ')' n pop 1.161 + eof errorDeath doMismatchedParenErr 1.162 + default n paren-comment 1.163 + 1.164 +# 1.165 +# paren-flag Scanned a (?ismx-ismx flag setting 1.166 +# 1.167 +paren-flag: 1.168 + 'i' n paren-flag doMatchMode 1.169 + 'd' n paren-flag doMatchMode 1.170 + 'm' n paren-flag doMatchMode 1.171 + 's' n paren-flag doMatchMode 1.172 + 'u' n paren-flag doMatchMode 1.173 + 'w' n paren-flag doMatchMode 1.174 + 'x' n paren-flag doMatchMode 1.175 + '-' n paren-flag doMatchMode 1.176 + ')' n term doSetMatchMode 1.177 + ':' n term ^expr-quant doMatchModeParen 1.178 + default errorDeath doBadModeFlag 1.179 + 1.180 + 1.181 +# 1.182 +# quant-star Scanning a '*' quantifier. Need to look ahead to decide 1.183 +# between plain '*', '*?', '*+' 1.184 +# 1.185 +quant-star: 1.186 + '?' n expr-cont doNGStar # *? 1.187 + '+' n expr-cont doPossessiveStar # *+ 1.188 + default expr-cont doStar # * 1.189 + 1.190 + 1.191 +# 1.192 +# quant-plus Scanning a '+' quantifier. Need to look ahead to decide 1.193 +# between plain '+', '+?', '++' 1.194 +# 1.195 +quant-plus: 1.196 + '?' n expr-cont doNGPlus # +? 1.197 + '+' n expr-cont doPossessivePlus # ++ 1.198 + default expr-cont doPlus # + 1.199 + 1.200 + 1.201 +# 1.202 +# quant-opt Scanning a '?' quantifier. Need to look ahead to decide 1.203 +# between plain '?', '??', '?+' 1.204 +# 1.205 +quant-opt: 1.206 + '?' n expr-cont doNGOpt # ?? 1.207 + '+' n expr-cont doPossessiveOpt # ?+ 1.208 + default expr-cont doOpt # ? 
1.209 + 1.210 + 1.211 +# 1.212 +# Interval scanning a '{', the opening delimiter for an interval specification 1.213 +# {number} or {min, max} or {min,} 1.214 +# 1.215 +interval-open: 1.216 + digit_char interval-lower 1.217 + default errorDeath doIntervalError 1.218 + 1.219 +interval-lower: 1.220 + digit_char n interval-lower doIntevalLowerDigit # NOTE(review): "Inteval" spelling presumably matches the action name in the C++ code; confirm before renaming 1.221 + ',' n interval-upper 1.222 + '}' n interval-type doIntervalSame # {n} 1.223 + default errorDeath doIntervalError 1.224 + 1.225 +interval-upper: 1.226 + digit_char n interval-upper doIntervalUpperDigit 1.227 + '}' n interval-type 1.228 + default errorDeath doIntervalError 1.229 + 1.230 +interval-type: 1.231 + '?' n expr-cont doNGInterval # {n,m}? 1.232 + '+' n expr-cont doPossessiveInterval # {n,m}+ 1.233 + default expr-cont doInterval # {n,m} 1.234 + 1.235 + 1.236 +# 1.237 +# backslash # Backslash. Figure out which of the \thingies we have encountered. 1.238 +# The low level next-char function will have preprocessed 1.239 +# some of them already; those won't come here. 
1.240 +backslash: 1.241 + 'A' n term doBackslashA 1.242 + 'B' n term doBackslashB 1.243 + 'b' n term doBackslashb 1.244 + 'd' n expr-quant doBackslashd 1.245 + 'D' n expr-quant doBackslashD 1.246 + 'G' n term doBackslashG 1.247 + 'N' expr-quant doNamedChar # \N{NAME} named char 1.248 + 'p' expr-quant doProperty # \p{Lu} style property 1.249 + 'P' expr-quant doProperty # \P{Lu} style property 1.250 + 'Q' n term doEnterQuoteMode 1.251 + 'S' n expr-quant doBackslashS 1.252 + 's' n expr-quant doBackslashs 1.253 + 'W' n expr-quant doBackslashW 1.254 + 'w' n expr-quant doBackslashw 1.255 + 'X' n expr-quant doBackslashX 1.256 + 'Z' n term doBackslashZ 1.257 + 'z' n term doBackslashz 1.258 + digit_char n expr-quant doBackRef # Will scan multiple digits 1.259 + eof errorDeath doEscapeError 1.260 + default n expr-quant doEscapedLiteralChar 1.261 + 1.262 + 1.263 + 1.264 +# 1.265 +# [set expression] parsing, 1.266 +# All states involved in parsing set expressions have names beginning with "set-" 1.267 +# 1.268 +# set-open Just after an opening '['. Check for a leading '^' or ':'. 1.269 +set-open: 1.270 + '^' n set-open2 doSetNegate 1.271 + ':' set-posix doSetPosixProp 1.272 + default set-open2 1.273 +# set-open2 After the '[' and any leading '^'. A ']' here is taken as a literal, not as the end of the set. 1.274 +set-open2: 1.275 + ']' n set-after-lit doSetLiteral 1.276 + default set-start 1.277 + 1.278 +# set-posix: 1.279 +# scanned a '[:' If it really is a [:property:], doSetPosixProp will have 1.280 +# moved the scan to the closing ']'. If it wasn't a property 1.281 +# expression, the scan will still be at the opening ':', which should 1.282 +# be interpreted as a normal set expression. 1.283 +set-posix: 1.284 + ']' n pop doSetEnd 1.285 + ':' set-start 1.286 + default errorDeath doRuleError # should not be possible. 1.287 + 1.288 +# 1.289 +# set-start after the [ and special case leading characters (^ and/or ]) but before 1.290 +# everything else. A '-' is literal at this point. 
1.291 +# 1.292 +set-start: 1.293 + ']' n pop doSetEnd 1.294 + '[' n set-open ^set-after-set doSetBeginUnion 1.295 + '\' n set-escape 1.296 + '-' n set-start-dash 1.297 + '&' n set-start-amp 1.298 + default n set-after-lit doSetLiteral 1.299 + 1.300 +# set-start-dash Turn "[--" into a syntax error. 1.301 +# "[-x" is good, - and x are literals. 1.302 +# 1.303 +set-start-dash: 1.304 + '-' errorDeath doRuleError 1.305 + default set-after-lit doSetAddDash 1.306 + 1.307 +# set-start-amp Turn "[&&" into a syntax error. 1.308 +# "[&x" is good, & and x are literals. 1.309 +# 1.310 +set-start-amp: 1.311 + '&' errorDeath doRuleError 1.312 + default set-after-lit doSetAddAmp 1.313 + 1.314 +# 1.315 +# set-after-lit The last thing scanned was a literal character within a set. 1.316 +# Can be followed by anything. Single '-' or '&' are 1.317 +# literals in this context, not operators. 1.318 +set-after-lit: 1.319 + ']' n pop doSetEnd 1.320 + '[' n set-open ^set-after-set doSetBeginUnion 1.321 + '-' n set-lit-dash 1.322 + '&' n set-lit-amp 1.323 + '\' n set-escape 1.324 + eof errorDeath doSetNoCloseError 1.325 + default n set-after-lit doSetLiteral 1.326 +# set-after-set The last thing scanned was a nested [set] or a \p / \P property. A following '-' or '&' is examined by set-set-dash / set-set-amp. 1.327 +set-after-set: 1.328 + ']' n pop doSetEnd 1.329 + '[' n set-open ^set-after-set doSetBeginUnion 1.330 + '-' n set-set-dash 1.331 + '&' n set-set-amp 1.332 + '\' n set-escape 1.333 + eof errorDeath doSetNoCloseError 1.334 + default n set-after-lit doSetLiteral 1.335 +# set-after-range The last thing scanned was a range such as a-b, or a \s \S \w \W \d \D escape. A following '-' or '&' is examined by set-range-dash / set-range-amp. 1.336 +set-after-range: 1.337 + ']' n pop doSetEnd 1.338 + '[' n set-open ^set-after-set doSetBeginUnion 1.339 + '-' n set-range-dash 1.340 + '&' n set-range-amp 1.341 + '\' n set-escape 1.342 + eof errorDeath doSetNoCloseError 1.343 + default n set-after-lit doSetLiteral 1.344 + 1.345 + 1.346 +# set-after-op 1.347 +# After a -- or && 1.348 +# It is an error to close a set at this point. 
1.349 +# 1.350 +set-after-op: 1.351 + '[' n set-open ^set-after-set doSetBeginUnion 1.352 + ']' errorDeath doSetOpError 1.353 + '\' n set-escape 1.354 + default n set-after-lit doSetLiteral 1.355 + 1.356 +# 1.357 +# set-set-amp 1.358 +# Have scanned [[set]& 1.359 +# Could be a '&' intersection operator, if a set follows. 1.360 +# Could be the start of a '&&' operator. 1.361 +# Otherwise is a literal. 1.362 +set-set-amp: 1.363 + '[' n set-open ^set-after-set doSetBeginIntersection1 1.364 + '&' n set-after-op doSetIntersection2 1.365 + default set-after-lit doSetAddAmp 1.366 + 1.367 + 1.368 +# set-lit-amp Have scanned "[literals&" 1.369 +# Could be a start of "&&" operator or a literal 1.370 +# In [abc&[def]], the '&' is a literal 1.371 +# 1.372 +set-lit-amp: 1.373 + '&' n set-after-op doSetIntersection2 1.374 + default set-after-lit doSetAddAmp 1.375 + 1.376 + 1.377 +# 1.378 +# set-set-dash 1.379 +# Have scanned [set]- 1.380 +# Could be a '-' difference operator, if a [set] follows. 1.381 +# Could be the start of a '--' operator. 1.382 +# Otherwise is a literal. 1.383 +set-set-dash: 1.384 + '[' n set-open ^set-after-set doSetBeginDifference1 1.385 + '-' n set-after-op doSetDifference2 1.386 + default set-after-lit doSetAddDash 1.387 + 1.388 + 1.389 +# 1.390 +# set-range-dash 1.391 +# scanned a-b- or \w- 1.392 +# any set or range like item where the trailing single '-' should 1.393 +# be literal, not a set difference operation. 1.394 +# A trailing "--" is still a difference operator. 
1.395 +set-range-dash: 1.396 + '-' n set-after-op doSetDifference2 1.397 + default set-after-lit doSetAddDash 1.398 + 1.399 +# set-range-amp Reached from set-after-range on a '&'. A second '&' goes on to set-after-op; anything else goes back to set-after-lit. 1.400 +set-range-amp: 1.401 + '&' n set-after-op doSetIntersection2 1.402 + default set-after-lit doSetAddAmp 1.403 + 1.404 + 1.405 +# set-lit-dash 1.406 +# Have scanned "[literals-" Could be a range or a -- operator or a literal 1.407 +# In [abc-[def]], the '-' is a literal (confirmed with a Java test) 1.408 +# [abc-\p{xx} the '-' is an error 1.409 +# [abc-] the '-' is a literal 1.410 +# [ab-xy] the '-' is a range 1.411 +# 1.412 +set-lit-dash: 1.413 + '-' n set-after-op doSetDifference2 1.414 + '[' set-after-lit doSetAddDash 1.415 + ']' set-after-lit doSetAddDash 1.416 + '\' n set-lit-dash-escape 1.417 + default n set-after-range doSetRange 1.418 + 1.419 +# set-lit-dash-escape 1.420 +# 1.421 +# scanned "[literal-\" 1.422 +# Could be a range, if the \ introduces an escaped literal char or a named char. 1.423 +# Otherwise it is an error. 1.424 +# NOTE(review): there are no 'p' / 'P' rows below, although [abc-\p{xx} is listed as an error under set-lit-dash; confirm where that case is rejected. 1.425 +set-lit-dash-escape: 1.426 + 's' errorDeath doSetOpError 1.427 + 'S' errorDeath doSetOpError 1.428 + 'w' errorDeath doSetOpError 1.429 + 'W' errorDeath doSetOpError 1.430 + 'd' errorDeath doSetOpError 1.431 + 'D' errorDeath doSetOpError 1.432 + 'N' set-after-range doSetNamedRange 1.433 + default n set-after-range doSetRange 1.434 + 1.435 + 1.436 +# 1.437 +# set-escape 1.438 +# Common back-slash escape processing within set expressions 1.439 +# 1.440 +set-escape: 1.441 + 'p' set-after-set doSetProp 1.442 + 'P' set-after-set doSetProp 1.443 + 'N' set-after-lit doSetNamedChar 1.444 + 's' n set-after-range doSetBackslash_s 1.445 + 'S' n set-after-range doSetBackslash_S 1.446 + 'w' n set-after-range doSetBackslash_w 1.447 + 'W' n set-after-range doSetBackslash_W 1.448 + 'd' n set-after-range doSetBackslash_d 1.449 + 'D' n set-after-range doSetBackslash_D 1.450 + default n set-after-lit doSetLiteralEscaped 1.451 + 1.452 +# 1.453 +# set-finish 1.454 +# Have just encountered the final ']' that 
completes a [set], and 1.455 +# arrived here via a pop. From here, we exit the set parsing world, and go 1.456 +# back to generic regular expression parsing. 1.457 +# 1.458 +set-finish: 1.459 + default expr-quant doSetFinish 1.460 + 1.461 + 1.462 +# 1.463 +# errorDeath. This state is specified as the next state whenever a syntax error 1.464 +# in the pattern is detected. Barring bugs, the state machine will never 1.465 +# actually get here, but will stop because of the action associated with the error. 1.466 +# But, just in case, this state asks the state machine to exit. 1.467 +errorDeath: 1.468 + default n errorDeath doExit 1.469 + 1.470 +