"""
Iterator based sre token scanner
"""
import re
from re import VERBOSE, MULTILINE, DOTALL
import sre_parse
import sre_compile
import sre_constants
from sre_constants import BRANCH, SUBPATTERN

__all__ = ['Scanner', 'pattern']

# Default flags used both for the individual token patterns and for the
# combined scanner pattern.
FLAGS = (VERBOSE | MULTILINE | DOTALL)


def _compile_legacy(phrases, flags):
    """Build the combined scanner with the old ``sre_parse.Pattern`` API."""
    state = sre_parse.Pattern()
    state.flags = flags
    branches = []
    for idx, phrase in enumerate(phrases):
        # Group ``idx + 1`` wraps the idx-th phrase so that ``lastindex``
        # of a match identifies which token fired.
        branches.append(sre_parse.SubPattern(state, [
            (SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags))),
        ]))
    # NOTE(guido): Added to make SRE validation work
    state.groups = len(branches) + 1
    combined = sre_parse.SubPattern(state, [(BRANCH, (None, branches))])
    return sre_compile.compile(combined)


def _compile_modern(phrases, flags):
    """Build the combined scanner with the ``sre_parse.State`` API.

    Follows the construction used by the standard library's ``re.Scanner``:
    groups are registered through ``opengroup``/``closegroup`` (``groups``
    is a read-only property there) and SUBPATTERN nodes carry
    ``(group, add_flags, del_flags, pattern)``.
    """
    flags = int(flags)  # accept re.RegexFlag as well as plain ints
    state = sre_parse.State()
    state.flags = flags
    branches = []
    for phrase in phrases:
        # opengroup() hands out 1, 2, 3, ... in order, matching the
        # ``idx + 1`` numbering of the legacy path.
        gid = state.opengroup()
        branch = sre_parse.SubPattern(state, [
            (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
        ])
        state.closegroup(gid, branch)
        branches.append(branch)
    combined = sre_parse.SubPattern(state, [(BRANCH, (None, branches))])
    return sre_compile.compile(combined)


class Scanner(object):
    """Token scanner that tries a list of token patterns at each position.

    ``lexicon`` is an iterable of callables, each carrying a ``pattern``
    attribute (see the ``pattern`` decorator in this module).  All patterns
    are merged into a single alternation in which the N-th token is wrapped
    in capturing group N, so the ``lastindex`` of a match selects the action.
    """

    def __init__(self, lexicon, flags=FLAGS):
        """Compile ``lexicon`` into one compound pattern.

        ``self.actions[i]`` is the token callable for group ``i``; index 0
        is a ``None`` placeholder because group numbers start at 1.

        Raises ``sre_constants.error`` (``re.error``) if a token pattern
        cannot be parsed.
        """
        tokens = list(lexicon)
        self.actions = [None] + tokens
        phrases = [token.pattern for token in tokens]
        # ``sre_parse.Pattern`` was replaced by ``sre_parse.State`` in newer
        # Pythons; pick the construction that matches the running interpreter.
        if hasattr(sre_parse, 'State'):
            self.scanner = _compile_modern(phrases, flags)
        else:
            self.scanner = _compile_legacy(phrases, flags)

    def iterscan(self, string, idx=0, context=None):
        """
        Yield match, end_idx for each match

        Scanning starts at ``idx``.  Each matching token's action is called
        as ``action(m, context)`` and must return ``(rval, next_pos)``;
        ``(rval, end_idx)`` is then yielded.  If ``next_pos`` is not None
        and differs from the end of the match, scanning resumes from
        ``next_pos`` instead.  Iteration stops when nothing matches at the
        current position or when a match ends where the previous one ended.
        """
        match = self.scanner.scanner(string, idx).match
        actions = self.actions
        lastend = idx
        while True:
            m = match()
            if m is None:
                break
            matchend = m.end()
            if lastend == matchend:
                # No progress was made; stop rather than loop forever.
                break
            action = actions[m.lastindex]
            if action is not None:
                rval, next_pos = action(m, context)
                if next_pos is not None and next_pos != matchend:
                    # "fast forward" the scanner
                    matchend = next_pos
                    match = self.scanner.scanner(string, matchend).match
                yield rval, matchend
            lastend = matchend


def pattern(pattern, flags=FLAGS):
    """Decorator factory that tags a token callable with its regex.

    The decorated function is returned unchanged apart from two new
    attributes: ``pattern`` (the source string, read by ``Scanner``) and
    ``regex`` (``re.compile(pattern, flags)``).
    """
    def decorator(fn):
        fn.pattern = pattern
        fn.regex = re.compile(pattern, flags)
        return fn
    return decorator