|
1 |
|
2 #***************************************************************************** |
|
3 # |
|
4 # Copyright (C) 2002-2007, International Business Machines Corporation and others. |
|
5 # All Rights Reserved. |
|
6 # |
|
7 #***************************************************************************** |
|
8 # |
|
9 # file: regexcst.txt |
|
10 # ICU Regular Expression Parser State Table |
|
11 # |
|
12 # This state table is used when reading and parsing a regular expression pattern |
|
13 # The pattern parser uses a state machine; the data in this file define the |
|
14 # state transitions that occur for each input character. |
|
15 # |
|
16 # *** This file defines the regex pattern grammar. This is it. |
|
17 # *** The determination of what is accepted is here. |
|
18 # |
|
19 # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays |
|
20 # that are then built with the rule parser. |
|
21 # |
|
22 |
|
23 # |
|
24 # Here is the syntax of the state definitions in this file: |
|
25 # |
|
26 # |
|
27 #StateName: |
|
28 # input-char n next-state ^push-state action |
|
29 # input-char n next-state ^push-state action |
|
30 # | | | | | |
|
31 # | | | | |--- action to be performed by state machine |
|
32 # | | | | See function RegexCompile::doParseActions() |
|
33 # | | | | |
|
34 # | | | |--- Push this named state onto the state stack. |
|
35 # | | | Later, when next state is specified as "pop", |
|
36 # | | | the pushed state will become the current state. |
|
37 # | | | |
|
38 # | | |--- Transition to this state if the current input character matches the input |
|
39 # | | character or char class in the left hand column. "pop" causes the next |
|
40 # | | state to be popped from the state stack. |
|
41 # | | |
|
42 # | |--- When making the state transition specified on this line, advance to the next |
|
43 # | character from the input only if 'n' appears here. |
|
44 # | |
|
45 # |--- Character or named character classes to test for. If the current character being scanned |
|
46 # matches, perform the actions and go to the state specified on this line. |
|
47 # The input character is tested sequentially, in the order written. The characters and |
|
48 # character classes tested for do not need to be mutually exclusive. The first match wins. |
|
49 # |
|
50 |
|
51 |
|
52 |
|
53 |
|
54 # |
|
55 # start state, scan position is at the beginning of the pattern. |
|
56 # |
|
# Entry point of the table. The single default row fires for any first character, leaves it
# unconsumed (no 'n'), runs doPatStart, and hands over to term.
57 start: |
|
58 default term doPatStart |
|
59 |
|
60 |
|
61 |
|
62 |
|
63 # |
|
64 # term. At a position where we can accept the start of most items in a pattern. |
|
65 # |
|
# Dispatch on the first character of a pattern item. Rows are tried top to bottom; the first
# matching row wins.
66 term: |
|
67 quoted n expr-quant doLiteralChar |
|
68 rule_char n expr-quant doLiteralChar |
|
# set-finish is pushed so that the pop performed at the set's closing ']' resumes there.
69 '[' n set-open ^set-finish doSetBegin |
|
70 '(' n open-paren |
|
71 '.' n expr-quant doDotAny |
|
72 '^' n expr-quant doCaret |
|
73 '$' n expr-quant doDollar |
|
74 '\' n backslash |
|
75 '|' n term doOrOperator |
|
# pop: the next state is taken from the state stack (the open-paren states push expr-quant or expr-cont).
76 ')' n pop doCloseParen |
|
# eof is not consumed and the next state stays term; presumably doPatFinish ends the parse - TODO confirm.
77 eof term doPatFinish |
|
# Anything not matched above cannot begin a term.
78 default errorDeath doRuleError |
|
79 |
|
80 |
|
81 |
|
82 # |
|
83 # expr-quant We've just finished scanning a term, now look for the optional |
|
84 # trailing quantifier - *, +, ?, *?, etc. |
|
85 # |
|
# Each quantifier character is consumed here; the matching quant-* / interval-* state then looks
# at the following character to pick the greedy, non-greedy or possessive form.
86 expr-quant: |
|
87 '*' n quant-star |
|
88 '+' n quant-plus |
|
89 '?' n quant-opt |
|
90 '{' n interval-open doIntervalInit |
|
# A '(' goes through open-paren-quant so that a (?# comment ) may sit between a term and its quantifier.
91 '(' n open-paren-quant |
|
# No quantifier present: nothing is consumed, expr-cont re-examines the same character.
92 default expr-cont |
|
93 |
|
94 |
|
95 # |
|
96 # expr-cont Expression, continuation. At a point where additional terms are |
|
97 # allowed, but not required. No Quantifiers |
|
98 # |
|
# Handles '|' and ')' directly; every other character is left unconsumed and re-examined by term.
99 expr-cont: |
|
100 '|' n term doOrOperator |
|
# pop: the next state is taken from the state stack.
101 ')' n pop doCloseParen |
|
102 default term |
|
103 |
|
104 |
|
105 # |
|
106 # open-paren-quant Special case handling for comments appearing before a quantifier, |
|
107 # e.g. x(?#comment )* |
|
108 # Open parens from expr-quant come here; anything but a (?# comment |
|
109 # branches into the normal parenthesis sequence as quickly as possible. |
|
110 # |
|
# Mirrors open-paren for the '?' case, but continues in open-paren-quant2 so that "(?#" can be
# recognised; any other character is left unconsumed for open-paren.
111 open-paren-quant: |
|
112 '?' n open-paren-quant2 doSuppressComments |
|
113 default open-paren |
|
114 |
|
# Have scanned "(?" immediately after a term.
115 open-paren-quant2: |
|
# "(?#": expr-quant is pushed, so when paren-comment pops at ')' a quantifier can still follow.
116 '#' n paren-comment ^expr-quant |
|
# Any other "(?x" form: leave the character unconsumed for open-paren-extended.
117 default open-paren-extended |
|
118 |
|
119 |
|
120 # |
|
121 # open-paren We've got an open paren. We need to scan further to |
|
122 # determine what kind of group it is - plain (, (?:, (?>, or whatever. |
|
123 # |
|
# Have scanned '('. A following '?' selects one of the extended forms; anything else makes this a
# capturing group.
124 open-paren: |
|
125 '?' n open-paren-extended doSuppressComments |
|
# The character is not consumed. expr-quant is pushed, so the state popped at the closing ')' accepts a quantifier.
126 default term ^expr-quant doOpenCaptureParen |
|
127 |
|
# Have scanned "(?". The next character selects the kind of extended group.
128 open-paren-extended: |
|
129 ':' n term ^expr-quant doOpenNonCaptureParen # (?: |
|
130 '>' n term ^expr-quant doOpenAtomicParen # (?> |
|
# Look-ahead groups push expr-cont rather than expr-quant; expr-cont has no quantifier rows.
131 '=' n term ^expr-cont doOpenLookAhead # (?= |
|
132 '!' n term ^expr-cont doOpenLookAheadNeg # (?! |
|
133 '<' n open-paren-lookbehind |
|
# "(?#" comment: term is pushed, so parsing resumes in term once paren-comment pops.
134 '#' n paren-comment ^term |
|
# A flag letter or '-' is left unconsumed (no 'n'); paren-flag re-reads it after doBeginMatchMode.
135 'i' paren-flag doBeginMatchMode |
|
136 'd' paren-flag doBeginMatchMode |
|
137 'm' paren-flag doBeginMatchMode |
|
138 's' paren-flag doBeginMatchMode |
|
139 'u' paren-flag doBeginMatchMode |
|
140 'w' paren-flag doBeginMatchMode |
|
141 'x' paren-flag doBeginMatchMode |
|
142 '-' paren-flag doBeginMatchMode |
|
# "(?(" and "(?{" are consumed, but both rows lead to errorDeath, i.e. these forms are rejected.
143 '(' n errorDeath doConditionalExpr |
|
144 '{' n errorDeath doPerlInline |
|
145 default errorDeath doBadOpenParenType |
|
146 |
|
# Have scanned "(?<". Only '=' and '!' are accepted; like the look-ahead forms, both push expr-cont.
147 open-paren-lookbehind: |
|
148 '=' n term ^expr-cont doOpenLookBehind # (?<= |
|
149 '!' n term ^expr-cont doOpenLookBehindNeg # (?<! |
|
150 default errorDeath doBadOpenParenType |
|
151 |
|
152 |
|
153 # |
|
154 # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' |
|
155 # |
|
156 paren-comment: |
|
# End of comment: pop resumes whichever state was pushed on entry (term or expr-quant).
157 ')' n pop |
|
# Pattern ended inside the comment.
158 eof errorDeath doMismatchedParenErr |
|
# Every other character is consumed with no action and the state loops on itself.
159 default n paren-comment |
|
160 |
|
161 # |
|
162 # paren-flag Scanned a (?ismx-ismx flag setting |
|
163 # |
|
# Each flag character (and '-') is consumed with doMatchMode and the state loops until ')' or ':'.
164 paren-flag: |
|
165 'i' n paren-flag doMatchMode |
|
166 'd' n paren-flag doMatchMode |
|
167 'm' n paren-flag doMatchMode |
|
168 's' n paren-flag doMatchMode |
|
169 'u' n paren-flag doMatchMode |
|
170 'w' n paren-flag doMatchMode |
|
171 'x' n paren-flag doMatchMode |
|
172 '-' n paren-flag doMatchMode |
|
# "(?flags)" form: nothing is pushed, parsing simply continues in term.
173 ')' n term doSetMatchMode |
|
# "(?flags:" form: opens a group; expr-quant is pushed for the pop at its closing ')'.
174 ':' n term ^expr-quant doMatchModeParen |
|
175 default errorDeath doBadModeFlag |
|
176 |
|
177 |
|
178 # |
|
179 # quant-star Scanning a '*' quantifier. Need to look ahead to decide |
|
180 # between plain '*', '*?', '*+' |
|
181 # |
|
182 quant-star: |
|
183 '?' n expr-cont doNGStar # *? |
|
184 '+' n expr-cont doPossessiveStar # *+ |
|
# Plain '*': the look-ahead character is not consumed; expr-cont re-examines it.
185 default expr-cont doStar |
|
186 |
|
187 |
|
188 # |
|
189 # quant-plus Scanning a '+' quantifier. Need to look ahead to decide |
|
190 # between plain '+', '+?', '++' |
|
191 # |
|
192 quant-plus: |
|
193 '?' n expr-cont doNGPlus # +? |
|
194 '+' n expr-cont doPossessivePlus # ++ |
|
# Plain '+': the look-ahead character is not consumed; expr-cont re-examines it.
195 default expr-cont doPlus |
|
196 |
|
197 |
|
198 # |
|
199 # quant-opt Scanning a '?' quantifier. Need to look ahead to decide |
|
200 # between plain '?', '??', '?+' |
|
201 # |
|
# Same shape as quant-star and quant-plus: the default row does not consume the look-ahead character.
202 quant-opt: |
|
203 '?' n expr-cont doNGOpt # ?? |
|
204 '+' n expr-cont doPossessiveOpt # ?+ |
|
205 default expr-cont doOpt # ? |
|
206 |
|
207 |
|
208 # |
|
209 # Interval scanning a '{', the opening delimiter for an interval specification |
|
210 # {number} or {min, max} or {min,} |
|
211 # |
|
# The '{' has already been consumed by expr-quant. A digit must come next; it is left unconsumed
# here (no 'n') so that interval-lower processes it.
212 interval-open: |
|
213 digit_char interval-lower |
|
214 default errorDeath doIntervalError |
|
215 |
|
# Scanning the first number of an interval. Each digit is consumed with doIntevalLowerDigit
# (the action name is spelled this way in the table) and the state loops.
216 interval-lower: |
|
217 digit_char n interval-lower doIntevalLowerDigit |
|
# ',' switches to scanning the upper bound.
218 ',' n interval-upper |
|
219 '}' n interval-type doIntervalSame # {n} |
|
220 default errorDeath doIntervalError |
|
221 |
|
# Scanning the number after the ','. The '}' row is reachable with zero digits, which is the
# {min,} form; it has no action of its own.
222 interval-upper: |
|
223 digit_char n interval-upper doIntervalUpperDigit |
|
224 '}' n interval-type |
|
225 default errorDeath doIntervalError |
|
226 |
|
# Have scanned the closing '}' of an interval. As with the other quantifiers, a trailing '?' or '+'
# selects the non-greedy or possessive action; otherwise the character is left unconsumed.
227 interval-type: |
|
228 '?' n expr-cont doNGInterval # {n,m}? |
|
229 '+' n expr-cont doPossessiveInterval # {n,m}+ |
|
230 default expr-cont doInterval # {n,m} |
|
231 |
|
232 |
|
233 # |
|
234 # backslash Backslash. Figure out which of the \thingies we have encountered. |
|
235 # The low level next-char function will have preprocessed |
|
236 # some of them already; those won't come here. |
|
# Note the two different next states: \A \B \b \G \Q \Z \z continue in term, while every other
# escape continues in expr-quant, where a quantifier may follow.
237 backslash: |
|
238 'A' n term doBackslashA |
|
239 'B' n term doBackslashB |
|
240 'b' n term doBackslashb |
|
241 'd' n expr-quant doBackslashd |
|
242 'D' n expr-quant doBackslashD |
|
243 'G' n term doBackslashG |
|
# 'N', 'p' and 'P' are not consumed here (no 'n'); presumably the action scans the whole
# construct itself - TODO confirm in the action code.
244 'N' expr-quant doNamedChar # \N{NAME} named char |
|
245 'p' expr-quant doProperty # \p{Lu} style property |
|
246 'P' expr-quant doProperty |
|
247 'Q' n term doEnterQuoteMode |
|
248 'S' n expr-quant doBackslashS |
|
249 's' n expr-quant doBackslashs |
|
250 'W' n expr-quant doBackslashW |
|
251 'w' n expr-quant doBackslashw |
|
252 'X' n expr-quant doBackslashX |
|
253 'Z' n term doBackslashZ |
|
254 'z' n term doBackslashz |
|
255 digit_char n expr-quant doBackRef # Will scan multiple digits |
|
# A backslash as the very last character of the pattern.
256 eof errorDeath doEscapeError |
|
# Any other escaped character is consumed and handled by doEscapedLiteralChar.
257 default n expr-quant doEscapedLiteralChar |
|
258 |
|
259 |
|
260 |
|
261 # |
|
262 # [set expression] parsing, |
|
263 # All states involved in parsing set expressions have names beginning with "set-" |
|
264 # |
|
265 |
|
# Have just scanned the '[' that opens a set (entered from term or from a nested '[' inside a set).
266 set-open: |
|
267 '^' n set-open2 doSetNegate |
|
# Possible "[:": the ':' is not consumed; doSetPosixProp runs and set-posix decides what was found.
268 ':' set-posix doSetPosixProp |
|
269 default set-open2 |
|
270 |
|
# After '[' or '[^'. A ']' in this position is added as a literal member (doSetLiteral) and does
# not close the set; anything else is left unconsumed for set-start.
271 set-open2: |
|
272 ']' n set-after-lit doSetLiteral |
|
273 default set-start |
|
274 |
|
275 # set-posix: |
|
276 # scanned a '[:' If it really is a [:property:], doSetPosixProp will have |
|
277 # moved the scan to the closing ']'. If it wasn't a property |
|
278 # expression, the scan will still be at the opening ':', which should |
|
279 # be interpreted as a normal set expression. |
|
280 set-posix: |
|
# Scan is at the closing ']' of a [:property:]: close the set and pop.
281 ']' n pop doSetEnd |
|
# Scan is still at the ':': not a property, so leave the ':' unconsumed and parse it as ordinary set content.
282 ':' set-start |
|
283 default errorDeath doRuleError # should not be possible. |
|
284 |
|
285 # |
|
286 # set-start after the [ and special case leading characters (^ and/or ]) but before |
|
287 # everything else. A '-' is literal at this point. |
|
288 # |
|
# NOTE(review): there is no explicit eof row here (set-after-lit has one); end of pattern would be
# tested against the default row - confirm that this is handled.
289 set-start: |
|
290 ']' n pop doSetEnd |
|
# A nested set: set-after-set is pushed so the pop at the nested set's ']' resumes there.
291 '[' n set-open ^set-after-set doSetBeginUnion |
|
292 '\' n set-escape |
|
# '-' and '&' are consumed with no action; the follow-on state either adds them as literals or rejects "--" / "&&".
293 '-' n set-start-dash |
|
294 '&' n set-start-amp |
|
295 default n set-after-lit doSetLiteral |
|
296 |
|
297 # set-start-dash Turn "[--" into a syntax error. |
|
298 # "[-x" is good, - and x are literals. |
|
299 # |
|
300 set-start-dash: |
|
301 '-' errorDeath doRuleError |
|
# The '-' scanned earlier is added by doSetAddDash; the current character is left unconsumed for set-after-lit.
302 default set-after-lit doSetAddDash |
|
303 |
|
304 # set-start-amp Turn "[&&" into a syntax error. |
|
305 # "[&x" is good, & and x are literals. |
|
306 # |
|
307 set-start-amp: |
|
308 '&' errorDeath doRuleError |
|
# The '&' scanned earlier is added by doSetAddAmp; the current character is left unconsumed for set-after-lit.
309 default set-after-lit doSetAddAmp |
|
310 |
|
311 # |
|
312 # set-after-lit The last thing scanned was a literal character within a set. |
|
313 # Can be followed by anything. Single '-' or '&' are |
|
314 # literals in this context, not operators. |
|
315 set-after-lit: |
|
316 ']' n pop doSetEnd |
|
317 '[' n set-open ^set-after-set doSetBeginUnion |
|
# '-' and '&' are consumed with no action; set-lit-dash / set-lit-amp decide between range, operator and literal.
318 '-' n set-lit-dash |
|
319 '&' n set-lit-amp |
|
320 '\' n set-escape |
|
# Pattern ended before the set was closed.
321 eof errorDeath doSetNoCloseError |
|
322 default n set-after-lit doSetLiteral |
|
323 |
|
# Reached by the pop at the end of a nested [set] (this state is what the '[' rows push), and from
# set-escape after \p / \P. Same rows as set-after-lit except that '-' and '&' go to set-set-dash /
# set-set-amp, where a following '[' makes them operators.
324 set-after-set: |
|
325 ']' n pop doSetEnd |
|
326 '[' n set-open ^set-after-set doSetBeginUnion |
|
327 '-' n set-set-dash |
|
328 '&' n set-set-amp |
|
329 '\' n set-escape |
|
330 eof errorDeath doSetNoCloseError |
|
331 default n set-after-lit doSetLiteral |
|
332 |
|
# Reached after a range (doSetRange / doSetNamedRange) or after \s \S \w \W \d \D inside a set.
# Same rows as set-after-lit except that '-' and '&' go to set-range-dash / set-range-amp.
333 set-after-range: |
|
334 ']' n pop doSetEnd |
|
335 '[' n set-open ^set-after-set doSetBeginUnion |
|
336 '-' n set-range-dash |
|
337 '&' n set-range-amp |
|
338 '\' n set-escape |
|
339 eof errorDeath doSetNoCloseError |
|
340 default n set-after-lit doSetLiteral |
|
341 |
|
342 |
|
343 # set-after-op |
|
344 # After a -- or && |
|
345 # It is an error to close a set at this point. |
|
346 # |
|
# NOTE(review): there is no explicit eof row here; end of pattern would be tested against the
# default row - confirm that this is handled.
347 set-after-op: |
|
348 '[' n set-open ^set-after-set doSetBeginUnion |
|
# An operator needs a right-hand operand, so closing the set here is an error.
349 ']' errorDeath doSetOpError |
|
350 '\' n set-escape |
|
351 default n set-after-lit doSetLiteral |
|
352 |
|
353 # |
|
354 # set-set-amp |
|
355 # Have scanned [[set]& |
|
356 # Could be a '&' intersection operator, if a set follows. |
|
357 # Could be the start of a '&&' operator. |
|
358 # Otherwise is a literal. |
|
359 set-set-amp: |
|
# "[set]&[": single-'&' intersection with the nested set that starts here.
360 '[' n set-open ^set-after-set doSetBeginIntersection1 |
|
# "&&" operator.
361 '&' n set-after-op doSetIntersection2 |
|
# Literal '&': add it, and leave the current character unconsumed for set-after-lit.
362 default set-after-lit doSetAddAmp |
|
363 |
|
364 |
|
365 # set-lit-amp Have scanned "[literals&" |
|
366 # Could be a start of "&&" operator or a literal |
|
367 # In [abc&[def]], the '&' is a literal |
|
368 # |
|
369 set-lit-amp: |
|
370 '&' n set-after-op doSetIntersection2 |
|
# Literal '&' (this includes a following '['): add it, and leave the current character unconsumed.
371 default set-after-lit doSetAddAmp |
|
372 |
|
373 |
|
374 # |
|
375 # set-set-dash |
|
376 # Have scanned [set]- |
|
377 # Could be a '-' difference operator, if a [set] follows. |
|
378 # Could be the start of a '--' operator. |
|
379 # Otherwise is a literal. |
|
380 set-set-dash: |
|
# "[set]-[": single-'-' difference with the nested set that starts here.
381 '[' n set-open ^set-after-set doSetBeginDifference1 |
|
# "--" operator.
382 '-' n set-after-op doSetDifference2 |
|
# Literal '-': add it, and leave the current character unconsumed for set-after-lit.
383 default set-after-lit doSetAddDash |
|
384 |
|
385 |
|
386 # |
|
387 # set-range-dash |
|
388 # scanned a-b- or \w- |
|
389 # any set or range like item where the trailing single '-' should |
|
390 # be literal, not a set difference operation. |
|
391 # A trailing "--" is still a difference operator. |
|
392 set-range-dash: |
|
393 '-' n set-after-op doSetDifference2 |
|
# Literal '-': add it, and leave the current character unconsumed for set-after-lit.
394 default set-after-lit doSetAddDash |
|
395 |
|
396 |
|
# Have scanned a '&' after a range or a \w-style class (entered from set-after-range).
# "&&" is the intersection operator; a single '&' is added as a literal.
397 set-range-amp: |
|
398 '&' n set-after-op doSetIntersection2 |
|
399 default set-after-lit doSetAddAmp |
|
400 |
|
401 |
|
402 # set-lit-dash |
|
403 # Have scanned "[literals-" Could be a range or a -- operator or a literal |
|
404 # In [abc-[def]], the '-' is a literal (confirmed with a Java test) |
|
405 # [abc-\p{xx}] the '-' is an error |
|
406 # [abc-] the '-' is a literal |
|
407 # [ab-xy] the '-' is a range |
|
408 # |
|
409 set-lit-dash: |
|
# "--" operator.
410 '-' n set-after-op doSetDifference2 |
|
# Before '[' or ']' the '-' is a literal: add it and leave the bracket unconsumed for set-after-lit.
411 '[' set-after-lit doSetAddDash |
|
412 ']' set-after-lit doSetAddDash |
|
413 '\' n set-lit-dash-escape |
|
# Any other character is consumed as the end point of a range.
414 default n set-after-range doSetRange |
|
415 |
|
416 # set-lit-dash-escape |
|
417 # |
|
418 # scanned "[literal-\" |
|
419 # Could be a range, if the \ introduces an escaped literal char or a named char. |
|
420 # Otherwise it is an error. |
|
421 # |
|
422 set-lit-dash-escape: |
|
# A character-class escape cannot be the end point of a range.
423 's' errorDeath doSetOpError |
|
424 'S' errorDeath doSetOpError |
|
425 'w' errorDeath doSetOpError |
|
426 'W' errorDeath doSetOpError |
|
427 'd' errorDeath doSetOpError |
|
428 'D' errorDeath doSetOpError |
|
# The 'N' is not consumed here (no 'n').
429 'N' set-after-range doSetNamedRange |
|
# NOTE(review): 'p' and 'P' have no row of their own, so "\p" reaches this default row, although the
# set-lit-dash header describes [abc-\p{xx} as an error - confirm how doSetRange treats it.
430 default n set-after-range doSetRange |
|
431 |
|
432 |
|
433 # |
|
434 # set-escape |
|
435 # Common back-slash escape processing within set expressions |
|
436 # |
|
# The next state records what kind of item was scanned: set-after-set for \p / \P, set-after-range
# for the \s \S \w \W \d \D classes, set-after-lit for a named or escaped literal character.
437 set-escape: |
|
# 'p', 'P' and 'N' are not consumed here (no 'n'); presumably the action scans the whole
# construct itself - TODO confirm in the action code.
438 'p' set-after-set doSetProp |
|
439 'P' set-after-set doSetProp |
|
440 'N' set-after-lit doSetNamedChar |
|
441 's' n set-after-range doSetBackslash_s |
|
442 'S' n set-after-range doSetBackslash_S |
|
443 'w' n set-after-range doSetBackslash_w |
|
444 'W' n set-after-range doSetBackslash_W |
|
445 'd' n set-after-range doSetBackslash_d |
|
446 'D' n set-after-range doSetBackslash_D |
|
447 default n set-after-lit doSetLiteralEscaped |
|
448 |
|
449 # |
|
450 # set-finish |
|
451 # Have just encountered the final ']' that completes a [set], and |
|
452 # arrived here via a pop. From here, we exit the set parsing world, and go |
|
453 # back to generic regular expression parsing. |
|
454 # |
|
455 set-finish: |
|
# Nothing is consumed: run doSetFinish and continue in expr-quant, so the set can take a quantifier.
456 default expr-quant doSetFinish |
|
457 |
|
458 |
|
459 # |
|
460 # errorDeath. This state is specified as the next state whenever a syntax error |
|
461 # in the source rules is detected. Barring bugs, the state machine will never |
|
462 # actually get here, but will stop because of the action associated with the error. |
|
463 # But, just in case, this state asks the state machine to exit. |
|
464 errorDeath: |
|
# Matches any character, consumes it, stays in errorDeath and runs doExit.
465 default n errorDeath doExit |
|
466 |
|
467 |