# -*- coding: utf-8 -*- ############################################################################### # Name: arpeggio.py # Purpose: PEG parser interpreter # Author: Igor R. Dejanović # Copyright: (c) 2009-2019 Igor R. Dejanović # License: MIT License # # This is an implementation of packrat parser interpreter based on PEG # grammars. Grammars are defined using Python language constructs or the PEG # textual notation. ############################################################################### from __future__ import print_function, unicode_literals import sys from collections import OrderedDict import codecs import re import bisect from arpeggio.utils import isstr import types __version__ = "2.0.2" if sys.version < '3': text = unicode else: text = str DEFAULT_WS = '\t\n\r ' NOMATCH_MARKER = 0 class ArpeggioError(Exception): """ Base class for arpeggio errors. """ def __init__(self, message): self.message = message def __str__(self): return repr(self.message) class GrammarError(ArpeggioError): """ Error raised during parser building phase used to indicate error in the grammar definition. """ class SemanticError(ArpeggioError): """ Error raised during the phase of semantic analysis used to indicate semantic error. """ class NoMatch(Exception): """ Exception raised by the Match classes during parsing to indicate that the match is not successful. Args: rules (list of ParsingExpression): Rules that are tried at the position of the exception. position (int): A position in the input stream where exception occurred. parser (Parser): An instance of a parser. """ def __init__(self, rules, position, parser): self.rules = rules self.position = position self.parser = parser def eval_attrs(self): """ Call this to evaluate `message`, `context`, `line` and `col`. Called by __str__. 
""" def rule_to_exp_str(rule): if hasattr(rule, '_exp_str'): # Rule may override expected report string return rule._exp_str elif rule.root: return rule.rule_name elif isinstance(rule, Match) and \ not isinstance(rule, EndOfFile): return "'{}'".format(rule.to_match.replace('\n', '\\n')) else: return rule.name if not self.rules: self.message = "Not expected input" else: what_is_expected = OrderedDict.fromkeys( ["{}".format(rule_to_exp_str(r)) for r in self.rules]) what_str = " or ".join(what_is_expected) self.message = "Expected {}".format(what_str) self.context = self.parser.context(position=self.position) self.line, self.col = self.parser.pos_to_linecol(self.position) def __str__(self): self.eval_attrs() return "{} at position {}{} => '{}'."\ .format(self.message, "{}:".format(self.parser.file_name) if self.parser.file_name else "", (self.line, self.col), self.context) def __unicode__(self): return self.__str__() def flatten(_iterable): '''Flattening of python iterables.''' result = [] for e in _iterable: if hasattr(e, "__iter__") and not type(e) in [text, NonTerminal]: result.extend(flatten(e)) else: result.append(e) return result class DebugPrinter(object): """ Mixin class for adding debug print support. Attributes: debug (bool): If true debugging messages will be printed. _current_indent(int): Current indentation level for prints. """ def __init__(self, **kwargs): self.debug = kwargs.pop("debug", False) self.file = kwargs.pop("file", sys.stdout) self._current_indent = 0 super(DebugPrinter, self).__init__(**kwargs) def dprint(self, message, indent_change=0): """ Handle debug message. Print to the stream specified by the 'file' keyword argument at the current indentation level. Default stream is stdout. 
        """
        # Negative indent change is applied BEFORE printing so that a
        # closing/dedent message lines up with its new (shallower) level...
        if indent_change < 0:
            self._current_indent += indent_change

        print(("%s%s" % (" " * self._current_indent, message)),
              file=self.file)

        # ...while a positive change is applied AFTER, so the opening
        # message itself stays at the outer level.
        if indent_change > 0:
            self._current_indent += indent_change


# ---------------------------------------------------------
# Parser Model (PEG Abstract Semantic Graph) elements


class ParsingExpression(object):
    """
    An abstract class for all parsing expressions.
    Represents the node of the Parser Model.

    Attributes:
        elements: A list (or other python object) used as a staging structure
            for python based grammar definition. Used in _from_python for
            building nodes list of child parser expressions.
        rule_name (str): The name of the parser rule if this is the root rule.
        root (bool): Does this parser expression represents the
            root of the parser rule? The root parser rule will create
            non-terminal node of the parse tree during parsing.
        nodes (list of ParsingExpression): A list of child parser expressions.
        suppress (bool): If this is set to True than no ParseTreeNode will be
            created for this ParsingExpression. Default False.
    """
    suppress = False

    def __init__(self, *elements, **kwargs):

        # A single positional element is unwrapped from the varargs tuple.
        if len(elements) == 1:
            elements = elements[0]
        self.elements = elements

        self.rule_name = kwargs.get('rule_name', '')
        self.root = kwargs.get('root', False)

        # Normalize `nodes` to a list even if a single node is given.
        nodes = kwargs.get('nodes', [])
        if not hasattr(nodes, '__iter__'):
            nodes = [nodes]
        self.nodes = nodes

        if 'suppress' in kwargs:
            self.suppress = kwargs['suppress']

        # Memoization. Every node cache the parsing results for the given
        # input positions.
        self._result_cache = {}  # position -> parse tree at the position

    @property
    def desc(self):
        # Suppressed expressions are marked with a trailing "-".
        return "{}{}".format(self.name, "-" if self.suppress else "")

    @property
    def name(self):
        if self.root:
            return "%s=%s" % (self.rule_name, self.__class__.__name__)
        else:
            return self.__class__.__name__

    @property
    def id(self):
        # Root rules are identified by name; anonymous expressions by
        # their object identity.
        if self.root:
            return self.rule_name
        else:
            return id(self)

    def _clear_cache(self, processed=None):
        """
        Clears memoization cache.
        Should be called on input change and end of parsing.

        Args:
            processed (set): Set of processed nodes to prevent infinite loops.
        """
        self._result_cache = {}
        if not processed:
            processed = set()

        # Recursively clear children; `processed` guards against cycles in
        # (possibly recursive) grammars.
        for node in self.nodes:
            if node not in processed:
                processed.add(node)
                node._clear_cache(processed)

    def parse(self, parser):
        """
        Packrat parsing driver: handles debug tracing, memoization lookup,
        backtracking on NoMatch and parse-tree (NonTerminal) construction,
        delegating the actual matching to self._parse().
        """
        if parser.debug:
            name = self.name
            if name.startswith('__asgn'):
                name = "{}[{}]".format(self.name, self._attr_name)
            parser.dprint(">> Matching rule {}{} at position {} => {}"
                          .format(name,
                                  " in {}".format(parser.in_rule)
                                  if parser.in_rule else "",
                                  parser.position,
                                  parser.context()), 1)

        # Current position could change in recursive calls
        # so save it.
        c_pos = parser.position

        # Memoization.
        # If this position is already parsed by this parser expression use
        # the result
        if parser.memoization:
            try:
                # Cache maps position -> (result, new position).
                result, new_pos = self._result_cache[c_pos]
                parser.position = new_pos
                parser.cache_hits += 1
                if parser.debug:
                    parser.dprint(
                        "** Cache hit for [{}, {}] = '{}' : new_pos={}"
                        .format(name, c_pos, text(result), text(new_pos)))
                    parser.dprint(
                        "<<+ Matched rule {} at position {}"
                        .format(name, new_pos), -1)

                # If NoMatch is recorded at this position raise.
                if result is NOMATCH_MARKER:
                    raise parser.nm

                # else return cached result
                return result

            except KeyError:
                parser.cache_misses += 1

        # Remember last parsing expression and set this as
        # the new last.
        last_pexpression = parser.last_pexpression
        parser.last_pexpression = self
        if self.rule_name:
            # If we are entering root rule
            # remember previous root rule name and set
            # this one on the parser to be available for
            # debugging messages
            previous_root_rule_name = parser.in_rule
            parser.in_rule = self.rule_name

        try:
            result = self._parse(parser)
            # A suppressed expression, or a list whose first element is
            # None, yields no parse tree node.
            if self.suppress or (type(result) is list and
                                 result and result[0] is None):
                result = None

        except NoMatch:
            parser.position = c_pos  # Backtracking
            # Memoize NoMatch at this position for this rule
            if parser.memoization:
                self._result_cache[c_pos] = (NOMATCH_MARKER, c_pos)
            raise

        finally:
            # Recover last parsing expression.
            parser.last_pexpression = last_pexpression

            if parser.debug:
                parser.dprint("<<{} rule {}{} at position {} => {}"
                              .format("- Not matched"
                                      if parser.position is c_pos
                                      else "+ Matched",
                                      name,
                                      " in {}".format(parser.in_rule)
                                      if parser.in_rule else "",
                                      parser.position,
                                      parser.context()), -1)

            # If leaving root rule restore previous root rule name.
            if self.rule_name:
                parser.in_rule = previous_root_rule_name

        # For root rules flatten non-terminal/list
        if self.root and result and not isinstance(result, Terminal):
            if not isinstance(result, NonTerminal):
                result = flatten(result)

                # Tree reduction will eliminate Non-terminal with single
                # child.
                if parser.reduce_tree and len(result) == 1:
                    result = result[0]

                # If the result is not parse tree node it must be a plain
                # list so create a new NonTerminal.
                if not isinstance(result, ParseTreeNode):
                    result = NonTerminal(self, result)

        # Result caching for use by memoization.
        if parser.memoization:
            self._result_cache[c_pos] = (result, parser.position)

        return result


class Sequence(ParsingExpression):
    """
    Will match sequence of parser expressions in exact order they are
    defined.
    """
    def __init__(self, *elements, **kwargs):
        super(Sequence, self).__init__(*elements, **kwargs)
        # Optional per-sequence overrides of the parser's whitespace
        # characters and whitespace-skipping flag.
        self.ws = kwargs.pop('ws', None)
        self.skipws = kwargs.pop('skipws', None)

    def _parse(self, parser):
        results = []
        c_pos = parser.position

        # Temporarily install per-sequence ws/skipws overrides.
        if self.ws is not None:
            old_ws = parser.ws
            parser.ws = self.ws

        if self.skipws is not None:
            old_skipws = parser.skipws
            parser.skipws = self.skipws

        # Prefetching
        append = results.append

        try:
            # All children must match, in order; any NoMatch aborts the
            # whole sequence.
            for e in self.nodes:
                result = e.parse(parser)
                if result:
                    append(result)

        except NoMatch:
            parser.position = c_pos  # Backtracking
            raise

        finally:
            # Always restore parser-level ws/skipws settings.
            if self.ws is not None:
                parser.ws = old_ws
            if self.skipws is not None:
                parser.skipws = old_skipws

        if results:
            return results


class OrderedChoice(Sequence):
    """
    Will match one of the parser expressions specified. Parser will try to
    match expressions in the order they are defined.
    """
    def _parse(self, parser):
        result = None
        match = False
        c_pos = parser.position

        if self.ws is not None:
            old_ws = parser.ws
            parser.ws = self.ws

        if self.skipws is not None:
            old_skipws = parser.skipws
            parser.skipws = self.skipws

        try:
            # First alternative that matches wins.
            for e in self.nodes:
                try:
                    result = e.parse(parser)
                    if result is not None:
                        match = True
                        result = [result]
                        break
                except NoMatch:
                    parser.position = c_pos  # Backtracking
        finally:
            if self.ws is not None:
                parser.ws = old_ws
            if self.skipws is not None:
                parser.skipws = old_skipws

        if not match:
            parser._nm_raise(self, c_pos, parser)

        return result


class Repetition(ParsingExpression):
    """
    Base class for all repetition-like parser expressions (?,*,+)

    Args:
        eolterm(bool): Flag that indicates that end of line should
            terminate repetition match.
    """
    def __init__(self, *elements, **kwargs):
        super(Repetition, self).__init__(*elements, **kwargs)
        self.eolterm = kwargs.get('eolterm', False)
        # Optional separator expression matched between repeated elements.
        self.sep = kwargs.get('sep', None)


class Optional(Repetition):
    """
    Optional will try to match parser expression specified and will not fail
    in case match is not successful.
    """
    def _parse(self, parser):
        result = None
        c_pos = parser.position

        try:
            result = [self.nodes[0].parse(parser)]
        except NoMatch:
            # Optional never fails; just backtrack and return None.
            parser.position = c_pos  # Backtracking

        return result


class ZeroOrMore(Repetition):
    """
    ZeroOrMore will try to match parser expression specified zero or more
    times. It will never fail.
    """
    def _parse(self, parser):
        results = []

        if self.eolterm:
            # Remember current eolterm and set eolterm of
            # this repetition
            old_eolterm = parser.eolterm
            parser.eolterm = self.eolterm

        # Prefetching
        append = results.append
        p = self.nodes[0].parse
        sep = self.sep.parse if self.sep else None
        result = None

        while True:
            try:
                c_pos = parser.position
                # Separator is matched only between elements (i.e. after
                # at least one successful element match).
                if sep and result:
                    sep_result = sep(parser)
                    if sep_result:
                        append(sep_result)
                result = p(parser)
                if not result:
                    break
                append(result)
            except NoMatch:
                parser.position = c_pos  # Backtracking
                break

        if self.eolterm:
            # Restore previous eolterm
            parser.eolterm = old_eolterm

        return results


class OneOrMore(Repetition):
    """
    OneOrMore will try to match parser expression specified one or more
    times.
    """
    def _parse(self, parser):
        results = []
        first = True

        if self.eolterm:
            # Remember current eolterm and set eolterm of
            # this repetition
            old_eolterm = parser.eolterm
            parser.eolterm = self.eolterm

        # Prefetching
        append = results.append
        p = self.nodes[0].parse
        sep = self.sep.parse if self.sep else None
        result = None

        try:
            while True:
                try:
                    c_pos = parser.position
                    if sep and result:
                        sep_result = sep(parser)
                        if sep_result:
                            append(sep_result)
                    result = p(parser)
                    if not result:
                        break
                    append(result)
                    first = False
                except NoMatch:
                    parser.position = c_pos  # Backtracking
                    # Unlike ZeroOrMore, failing on the very first element
                    # fails the whole repetition.
                    if first:
                        raise
                    break
        finally:
            if self.eolterm:
                # Restore previous eolterm
                parser.eolterm = old_eolterm

        return results


class UnorderedGroup(Repetition):
    """
    Will try to match all of the parsing expression in any order.
    """
    def _parse(self, parser):
        results = []
        c_pos = parser.position

        if self.eolterm:
            # Remember current eolterm and set eolterm of
            # this repetition
            old_eolterm = parser.eolterm
            parser.eolterm = self.eolterm

        # Prefetching
        append = results.append
        nodes_to_try = list(self.nodes)
        sep = self.sep.parse if self.sep else None
        result = None
        sep_result = None
        first = True

        # Keep matching remaining sub-expressions (each at most once, in
        # any order) until none of them matches.
        while nodes_to_try:
            sep_exc = None

            # Separator
            c_loc_pos_sep = parser.position
            if sep and not first:
                try:
                    sep_result = sep(parser)
                except NoMatch as e:
                    parser.position = c_loc_pos_sep  # Backtracking

                    # This still might be valid if all remaining
                    # subexpressions are optional and none of them will
                    # match
                    sep_exc = e

            c_loc_pos = parser.position
            match = True
            all_optionals_fail = True
            for e in list(nodes_to_try):
                try:
                    result = e.parse(parser)
                    if result:
                        # A real match after a failed separator is an
                        # error: re-raise the separator's NoMatch.
                        if sep_exc:
                            raise sep_exc
                        if sep_result:
                            append(sep_result)
                        first = False

                        match = True
                        all_optionals_fail = False
                        append(result)
                        nodes_to_try.remove(e)
                        break

                except NoMatch:
                    match = False
                    parser.position = c_loc_pos  # local backtracking

            if not match or all_optionals_fail:
                # If sep is matched backtrack it
                parser.position = c_loc_pos_sep
                break

        if self.eolterm:
            # Restore previous eolterm
            parser.eolterm = old_eolterm

        if not match:
            # Unsuccessful match of the whole PE - full backtracking
            parser.position = c_pos
            parser._nm_raise(self, c_pos, parser)

        if results:
            return results


class SyntaxPredicate(ParsingExpression):
    """
    Base class for all syntax predicates (and, not, empty).
    Predicates are parser expressions that will do the match but will not
    consume any input.
    """


class And(SyntaxPredicate):
    """
    This predicate will succeed if the specified expression matches current
    input.
    """
    def _parse(self, parser):
        c_pos = parser.position
        for e in self.nodes:
            try:
                e.parse(parser)
            except NoMatch:
                parser.position = c_pos
                raise
        # Predicate: position is always restored, no input is consumed.
        parser.position = c_pos


class Not(SyntaxPredicate):
    """
    This predicate will succeed if the specified expression doesn't match
    current input.
    """
    def _parse(self, parser):
        c_pos = parser.position
        # Flag used by the parser for better error reporting inside Not.
        old_in_not = parser.in_not
        parser.in_not = True
        try:
            for e in self.nodes:
                try:
                    e.parse(parser)
                except NoMatch:
                    # Inner expression failed => Not succeeds, consuming
                    # nothing.
                    parser.position = c_pos
                    return
            # Inner expression matched => Not fails.
            parser.position = c_pos
            parser._nm_raise(self, c_pos, parser)
        finally:
            parser.in_not = old_in_not


class Empty(SyntaxPredicate):
    """
    This predicate will always succeed without consuming input.
    """
    def _parse(self, parser):
        pass


class Decorator(ParsingExpression):
    """
    Decorator are special kind of parsing expression used to mark
    a containing pexpression and give it some special semantics.
    For example, decorators are used to mark pexpression as lexical
    rules (see :class:Lex).
    """


class Combine(Decorator):
    """
    This decorator defines pexpression that represents a lexeme rule.
    This rules will always return a Terminal parse tree node.
    Whitespaces will be preserved. Comments will not be matched.
    """
    def _parse(self, parser):
        results = []

        # Entering a lexical rule: disables whitespace skipping in Match.
        oldin_lex_rule = parser.in_lex_rule
        parser.in_lex_rule = True
        c_pos = parser.position
        try:
            for parser_model_node in self.nodes:
                results.append(parser_model_node.parse(parser))

            results = flatten(results)

            # Create terminal from result
            return Terminal(self, c_pos,
                            "".join([x.flat_str() for x in results]))
        except NoMatch:
            parser.position = c_pos  # Backtracking
            raise
        finally:
            parser.in_lex_rule = oldin_lex_rule


class Match(ParsingExpression):
    """
    Base class for all classes that will try to match something from the
    input.
    """
    def __init__(self, rule_name, root=False, **kwargs):
        super(Match, self).__init__(rule_name=rule_name, root=root, **kwargs)

    @property
    def name(self):
        if self.root:
            return "%s=%s(%s)" % (self.rule_name, self.__class__.__name__,
                                  self.to_match)
        else:
            return "%s(%s)" % (self.__class__.__name__, self.to_match)

    def _parse_comments(self, parser):
        """Parse comments."""

        try:
            parser.in_parse_comments = True
            if parser.comments_model:
                try:
                    while True:
                        # TODO: Consumed whitespaces and comments should be
                        # attached to the first match ahead.
                        parser.comments.append(
                            parser.comments_model.parse(parser))
                        if parser.skipws:
                            # Whitespace skipping
                            pos = parser.position
                            ws = parser.ws
                            i = parser.input
                            length = len(i)
                            while pos < length and i[pos] in ws:
                                pos += 1
                            parser.position = pos
                except NoMatch:
                    # NoMatch in comment matching is perfectly
                    # legal and no action should be taken.
                    pass
        finally:
            parser.in_parse_comments = False

    def parse(self, parser):
        if parser.skipws and not parser.in_lex_rule:
            # Whitespace skipping
            pos = parser.position
            ws = parser.ws
            i = parser.input
            length = len(i)
            while pos < length and i[pos] in ws:
                pos += 1
            parser.position = pos

        if parser.debug:
            parser.dprint(
                "?? Try match rule {}{} at position {} => {}"
                .format(self.name,
                        " in {}".format(parser.in_rule)
                        if parser.in_rule else "",
                        parser.position,
                        parser.context()))

        if parser.skipws and parser.position in parser.comment_positions:
            # Skip comments if already parsed.
            parser.position = parser.comment_positions[parser.position]
        else:
            if not parser.in_parse_comments and not parser.in_lex_rule:
                comment_start = parser.position
                self._parse_comments(parser)
                # Remember where the comment run ended so re-parsing at the
                # same position can skip it directly.
                parser.comment_positions[comment_start] = parser.position

        result = self._parse(parser)
        if not self.suppress:
            return result


class RegExMatch(Match):
    '''
    This Match class will perform input matching based on Regular
    Expressions.

    Args:
        to_match (regex string): A regular expression string to match.
            It will be used to create regular expression using re.compile.
        ignore_case(bool): If case insensitive match is needed.
            Default is None to support propagation from global parser
            setting.
        multiline(bool): allow regex to works on multiple lines
            (re.DOTALL flag). Default is None to support propagation from
            global parser setting.
        str_repr(str): A string that is used to represent this regex.
        re_flags: flags parameter for re.compile if neither ignore_case
            or multiple are set.
    '''
    def __init__(self, to_match, rule_name='', root=False, ignore_case=None,
                 multiline=None, str_repr=None, re_flags=re.MULTILINE,
                 **kwargs):
        super(RegExMatch, self).__init__(rule_name, root, **kwargs)
        self.to_match_regex = to_match
        self.ignore_case = ignore_case
        self.multiline = multiline
        self.explicit_flags = re_flags

        # What is reported/displayed for this match; defaults to the regex
        # source itself.
        self.to_match = str_repr if str_repr is not None else to_match

    def compile(self):
        """
        Compile the regex, layering the tri-state ignore_case/multiline
        settings (True adds the flag, False removes it, None leaves the
        explicit flags untouched).
        """
        flags = self.explicit_flags
        if self.multiline is True:
            flags |= re.DOTALL
        if self.multiline is False and flags & re.DOTALL:
            flags -= re.DOTALL
        if self.ignore_case is True:
            flags |= re.IGNORECASE
        if self.ignore_case is False and flags & re.IGNORECASE:
            flags -= re.IGNORECASE
        self.regex = re.compile(self.to_match_regex, flags)

    def __str__(self):
        return self.to_match

    def __unicode__(self):
        return self.__str__()

    def _parse(self, parser):
        c_pos = parser.position
        m = self.regex.match(parser.input, c_pos)
        if m:
            matched = m.group()
            if parser.debug:
                parser.dprint(
                    "++ Match '%s' at %d => '%s'"
                    % (matched, c_pos, parser.context(len(matched))))
            parser.position += len(matched)
            # An empty regex match advances nothing and yields no Terminal.
            if matched:
                return Terminal(self, c_pos, matched, extra_info=m)
        else:
            if parser.debug:
                parser.dprint("-- NoMatch at {}".format(c_pos))
            parser._nm_raise(self, c_pos, parser)


class StrMatch(Match):
    """
    This Match class will perform input matching by a string comparison.

    Args:
        to_match (str): A string to match.
        ignore_case(bool): If case insensitive match is needed.
            Default is None to support propagation from global parser
            setting.
    """
    def __init__(self, to_match, rule_name='', root=False, ignore_case=None,
                 **kwargs):
        super(StrMatch, self).__init__(rule_name, root, **kwargs)
        self.to_match = to_match
        self.ignore_case = ignore_case

    def _parse(self, parser):
        c_pos = parser.position
        # Compare against the input slice of the same length.
        input_frag = parser.input[c_pos:c_pos+len(self.to_match)]
        if self.ignore_case:
            match = input_frag.lower() == self.to_match.lower()
        else:
            match = input_frag == self.to_match
        if match:
            if parser.debug:
                parser.dprint(
                    "++ Match '{}' at {} => '{}'"
                    .format(self.to_match, c_pos,
                            parser.context(len(self.to_match))))
            parser.position += len(self.to_match)

            # If this match is inside sequence than mark for suppression
            suppress = type(parser.last_pexpression) is Sequence
            return Terminal(self, c_pos, self.to_match, suppress=suppress)
        else:
            if parser.debug:
                parser.dprint(
                    "-- No match '{}' at {} => '{}'"
                    .format(self.to_match, c_pos,
                            parser.context(len(self.to_match))))
            parser._nm_raise(self, c_pos, parser)

    def __str__(self):
        return self.to_match

    def __unicode__(self):
        return self.__str__()

    def __eq__(self, other):
        return self.to_match == text(other)

    def __hash__(self):
        return hash(self.to_match)


# HACK: Kwd class is a bit hackish. Need to find a better way to
# introduce different classes of string tokens.
class Kwd(StrMatch):
    """
    A specialization of StrMatch to specify keywords of the language.
    """
    def __init__(self, to_match):
        super(Kwd, self).__init__(to_match)
        self.to_match = to_match
        self.root = True
        self.rule_name = 'keyword'


class EndOfFile(Match):
    """
    The Match class that will succeed in case end of input is reached.
    """
    def __init__(self):
        super(EndOfFile, self).__init__("EOF")

    @property
    def name(self):
        return "EOF"

    def _parse(self, parser):
        c_pos = parser.position
        if len(parser.input) == c_pos:
            return Terminal(EOF(), c_pos, '', suppress=True)
        else:
            if parser.debug:
                parser.dprint("!! EOF not matched.")
            parser._nm_raise(self, c_pos, parser)


def EOF():
    # Factory used in grammar definitions.
    return EndOfFile()

# ---------------------------------------------------------


# ---------------------------------------------------
# Parse Tree node classes


class ParseTreeNode(object):
    """
    Abstract base class representing node of the Parse Tree.
    The node can be terminal(the leaf of the parse tree) or non-terminal.

    Attributes:
        rule (ParsingExpression): The rule that created this node.
        rule_name (str): The name of the rule that created this node if
            root rule or empty string otherwise.
        position (int): A position in the input stream where the match
            occurred.
        position_end (int, read-only): A position in the input stream where
            the node ends.
            This position is one char behind the last char contained in this
            node. Thus, position_end - position = length of the node.
        error (bool): Is this a false parse tree node created during error
            recovery.
        comments : A parse tree of comment(s) attached to this node.
    """
    def __init__(self, rule, position, error):
        assert rule
        assert rule.rule_name is not None
        self.rule = rule
        self.rule_name = rule.rule_name
        self.position = position
        self.error = error
        self.comments = None

    @property
    def name(self):
        return "%s [%s]" % (self.rule_name, self.position)

    @property
    def position_end(self):
        "Must be implemented in subclasses."
        raise NotImplementedError

    def visit(self, visitor):
        """
        Visitor pattern implementation.

        Args:
            visitor(PTNodeVisitor): The visitor object.
        """
        if visitor.debug:
            visitor.dprint("Visiting {} type:{} str:{}"
                           .format(self.name, type(self).__name__,
                                   text(self)))

        children = SemanticActionResults()
        # Depth-first: visit children before dispatching on this node.
        if isinstance(self, NonTerminal):
            for node in self:
                child = node.visit(visitor)
                # If visit returns None suppress that child node
                if child is not None:
                    children.append_result(node.rule_name, child)

        visit_name = "visit_%s" % self.rule_name
        if hasattr(visitor, visit_name):
            # Call visit method.
result = getattr(visitor, visit_name)(self, children) # If there is a method with 'second' prefix save # the result of visit for post-processing if hasattr(visitor, "second_%s" % self.rule_name): visitor.for_second_pass.append((self.rule_name, result)) return result elif visitor.defaults: # If default actions are enabled return visitor.visit__default__(self, children) def tree_str(self, indent=0): return '{}{} [{}-{}]'.format(' ' * indent, self.rule.name, self.position, self.position_end) class Terminal(ParseTreeNode): """ Leaf node of the Parse Tree. Represents matched string. Attributes: rule (ParsingExpression): The rule that created this terminal. position (int): A position in the input stream where match occurred. value (str): Matched string at the given position or missing token name in the case of an error node. suppress(bool): If True this terminal can be ignored in semantic analysis. extra_info(object): additional information (e.g. the re matcher object) """ __slots__ = ['rule', 'rule_name', 'position', 'error', 'comments', 'value', 'suppress', 'extra_info'] def __init__(self, rule, position, value, error=False, suppress=False, extra_info=None): super(Terminal, self).__init__(rule, position, error) self.value = value self.suppress = suppress self.extra_info = extra_info @property def desc(self): if self.value: return "%s '%s' [%s]" % (self.rule_name, self.value, self.position) else: return "%s [%s]" % (self.rule_name, self.position) @property def position_end(self): return self.position + len(self.value) def flat_str(self): return self.value def __str__(self): return self.value def __unicode__(self): return self.__str__() def __repr__(self): return self.desc def tree_str(self, indent=0): return '{}: {}'.format(super(Terminal, self).tree_str(indent), self.value) def __eq__(self, other): return text(self) == text(other) class NonTerminal(ParseTreeNode, list): """ Non-leaf node of the Parse Tree. Represents language syntax construction. 
    At the same time used in ParseTreeNode navigation expressions.
    See test_ptnode_navigation_expressions.py for examples of navigation
    expressions.

    Attributes:
        nodes (list of ParseTreeNode): Children parse tree nodes.
        _filtered (bool): Is this NT a dynamically created filtered NT.
            This is used internally.
    """

    __slots__ = ['rule', 'rule_name', 'position', 'error', 'comments',
                 '_filtered', '_expr_cache']

    def __init__(self, rule, nodes, error=False, _filtered=False):
        # Inherit position from the first child node
        position = nodes[0].position if nodes else 0
        super(NonTerminal, self).__init__(rule, position, error)
        self.extend(flatten([nodes]))
        self._filtered = _filtered

    @property
    def value(self):
        """Terminal protocol."""
        return text(self)

    @property
    def desc(self):
        return self.name

    @property
    def position_end(self):
        return self[-1].position_end if self else self.position

    def flat_str(self):
        """
        Return flatten string representation.
        """
        return "".join([x.flat_str() for x in self])

    def __str__(self):
        return " | ".join([text(x) for x in self])

    def __unicode__(self):
        return self.__str__()

    def __repr__(self):
        return "[ %s ]" % ", ".join([repr(x) for x in self])

    def tree_str(self, indent=0):
        return '{}\n{}'.format(super(NonTerminal, self).tree_str(indent),
                               '\n'.join([c.tree_str(indent + 1)
                                          for c in self]))

    def __getattr__(self, rule_name):
        """
        Find a child (non)terminal by the rule name.

        Args:
            rule_name(str): The name of the rule that is referenced from
                this node rule.
        """
        # Prevent infinite recursion
        if rule_name in ['_expr_cache', '_filtered', 'rule', 'rule_name',
                         'position', 'append', 'extend']:
            raise AttributeError

        try:
            # First check the cache
            if rule_name in self._expr_cache:
                return self._expr_cache[rule_name]
        except AttributeError:
            # Navigation expression cache. Used for lookup by rule name.
            self._expr_cache = {}

        # If result is not found in the cache collect all nodes
        # with the given rule name and create new NonTerminal
        # and cache it for later access.
        nodes = []
        rule = None
        for n in self:
            if self._filtered:
                # For filtered NT rule_name is a rule on
                # each of its children
                for m in n:
                    if m.rule_name == rule_name:
                        nodes.append(m)
                        rule = m.rule
            else:
                if n.rule_name == rule_name:
                    nodes.append(n)
                    rule = n.rule

        if rule is None:
            # If rule is not found resort to default behavior
            return self.__getattribute__(rule_name)

        result = NonTerminal(rule=rule, nodes=nodes, _filtered=True)
        self._expr_cache[rule_name] = result
        return result


# ----------------------------------------------------
# Semantic Actions
#


class PTNodeVisitor(DebugPrinter):
    """
    Base class for all parse tree visitors.
    """
    def __init__(self, defaults=True, **kwargs):
        """
        Args:
            defaults(bool): If the default visit method should be applied in
                case no method is defined.
        """
        self.for_second_pass = []
        self.defaults = defaults

        super(PTNodeVisitor, self).__init__(**kwargs)

    def visit__default__(self, node, children):
        """
        Called if no visit method is defined for the node.

        Args:
            node(ParseTreeNode):
            children(processed children ParseTreeNode-s):
        """
        if isinstance(node, Terminal):
            # Default for Terminal is to convert to string unless suppress
            # flag is set in which case it is suppressed by setting to None.
            retval = text(node) if not node.suppress else None
        else:
            retval = node
            # Special case. If only one child exist return it.
            if len(children) == 1:
                retval = children[0]
            else:
                # If there is only one non-string child return
                # that by default. This will support e.g. bracket
                # removals.
                last_non_str = None
                for c in children:
                    if not isstr(c):
                        if last_non_str is None:
                            last_non_str = c
                        else:
                            # If there is multiple non-string objects
                            # by default convert non-terminal to string
                            if self.debug:
                                self.dprint("*** Warning: Multiple "
                                            "non-string objects found in "
                                            "default visit. Converting non-"
                                            "terminal to a string.")
                            retval = text(node)
                            break
                else:
                    # for-else: loop completed without break =>
                    # Return the only non-string child
                    retval = last_non_str

        return retval


def visit_parse_tree(parse_tree, visitor):
    """
    Applies visitor to parse_tree and runs the second pass
    afterwards.

    Args:
        parse_tree(ParseTreeNode):
        visitor(PTNodeVisitor):
    """
    if not parse_tree:
        raise Exception(
            "Parse tree is empty. You did call parse(), didn't you?")

    if visitor.debug:
        visitor.dprint("ASG: First pass")

    # Visit tree.
    result = parse_tree.visit(visitor)

    # Second pass
    if visitor.debug:
        visitor.dprint("ASG: Second pass")
    for sa_name, asg_node in visitor.for_second_pass:
        getattr(visitor, "second_%s" % sa_name)(asg_node)

    return result


class SemanticAction(object):
    """
    Semantic actions are executed during semantic analysis. They are in
    charge of producing Abstract Semantic Graph (ASG) out of the parse tree.

    Every non-terminal and terminal can have semantic action defined which
    will be triggered during semantic analysis.
    Semantic action triggering is separated in two passes. first_pass method
    is required and the method called second_pass is optional and will be
    called if exists after the first pass. Second pass can be used for
    forward referencing, e.g. linking to the declaration registered in the
    first pass stage.
    """
    def first_pass(self, parser, node, nodes):
        """
        Called in the first pass of tree walk.
        This is the default implementation used if no semantic action is
        defined.
        """
        if isinstance(node, Terminal):
            # Default for Terminal is to convert to string unless suppress
            # flag is set in which case it is suppressed by setting to None.
            retval = text(node) if not node.suppress else None
        else:
            retval = node
            # Special case. If only one child exist return it.
            if len(nodes) == 1:
                retval = nodes[0]
            else:
                # If there is only one non-string child return
                # that by default. This will support e.g. bracket
                # removals.
                last_non_str = None
                for c in nodes:
                    if not isstr(c):
                        if last_non_str is None:
                            last_non_str = c
                        else:
                            # If there is multiple non-string objects
                            # by default convert non-terminal to string
                            if parser.debug:
                                parser.dprint(
                                    "*** Warning: Multiple non-"
                                    "string objects found in applying "
                                    "default semantic action. Converting "
                                    "non-terminal to string.")
                            retval = text(node)
                            break
                else:
                    # for-else: loop completed without break =>
                    # Return the only non-string child
                    retval = last_non_str

        return retval


class SemanticActionResults(list):
    """
    Used in visitor methods call to supply results of semantic analysis
    of children parse tree nodes.
    Enables dot access by the name of the rule similar to NonTerminal
    tree navigation.
    Enables index access as well as iteration.
    """
    def __init__(self):
        # Maps rule name -> list of results produced for that rule.
        self.results = {}

    def append_result(self, name, result):
        if name:
            if name not in self.results:
                self.results[name] = []
            self.results[name].append(result)

        self.append(result)

    def __getattr__(self, attr_name):
        if attr_name == 'results':
            # Prevent infinite recursion before `results` is set.
            raise AttributeError

        return self.results.get(attr_name, [])


# Common semantic actions
class SemanticActionSingleChild(SemanticAction):
    def first_pass(self, parser, node, children):
        return children[0]


class SemanticActionBodyWithBraces(SemanticAction):
    def first_pass(self, parser, node, children):
        # Strip the first and last child (the braces).
        return children[1:-1]


class SemanticActionToString(SemanticAction):
    def first_pass(self, parser, node, children):
        return text(node)


# ----------------------------------------------------
# Parsers


class Parser(DebugPrinter):
    """
    Abstract base class for all parsers.

    Attributes:
        comments_model: parser model for comments.
        comments(list): A list of ParseTreeNode for matched comments.
        sem_actions(dict): A dictionary of semantic actions keyed by the
            rule name.
        parse_tree(NonTerminal): The parse tree consisting of NonTerminal
            and Terminal instances.
        in_rule (str): Current rule name.
        in_parse_comments (bool): True if parsing comments.
        in_lex_rule (bool): True if in lexical rule.
            Currently used in Combine decorator to convert match
            to a single Terminal.
        in_not (bool): True if in Not parsing expression. Used for better
            error reporting.
        last_pexpression (ParsingExpression): Last parsing expression
            traversed.
    """

    # Not marker for NoMatch rules list. Used if the first unsuccessful rule
    # match is Not.
    FIRST_NOT = Not()

    def __init__(self, skipws=True, ws=None, reduce_tree=False, autokwd=False,
                 ignore_case=False, memoization=False, **kwargs):
        """
        Args:
            skipws (bool): Should the whitespace skipping be done. Default is
                True.
            ws (str): A string consisting of whitespace characters.
                Defaults to DEFAULT_WS when not given.
            reduce_tree (bool): If true non-terminals with single child will be
                eliminated from the parse tree. Default is False.
            autokwd(bool): If keyword-like StrMatches are matched on word
                boundaries. Default is False.
            ignore_case(bool): If case is ignored (default=False)
            memoization(bool): If memoization should be used
                (a.k.a. packrat parsing)
            **kwargs: Forwarded to DebugPrinter (accepts `debug` and `file`).
        """
        super(Parser, self).__init__(**kwargs)

        # Used to indicate state in which parser should not
        # treat newlines as whitespaces.
        # Must be set before `self.ws = ...` below: the ws setter reads
        # the eolterm property.
        self._eolterm = False

        self.skipws = skipws
        if ws is not None:
            self.ws = ws
        else:
            self.ws = DEFAULT_WS
        self.reduce_tree = reduce_tree
        self.autokwd = autokwd
        self.ignore_case = ignore_case
        self.memoization = memoization
        self.comments_model = None
        self.comments = []
        self.comment_positions = {}
        self.sem_actions = {}
        self.parse_tree = None

        # Create regex used for autokwd matching
        # (an identifier-like token: a word char that is not a digit,
        # followed by word chars).
        flags = 0
        if ignore_case:
            flags = re.IGNORECASE
        self.keyword_regex = re.compile(r'[^\d\W]\w*', flags)

        # Keep track of root rule we are currently in.
        # Used for debugging purposes
        self.in_rule = ''

        self.in_parse_comments = False

        # Are we in lexical rule? If so do not
        # skip whitespaces.
        self.in_lex_rule = False

        # Are we in Not parsing expression?
        self.in_not = False

        # Last parsing expression traversed
        self.last_pexpression = None

    @property
    def ws(self):
        # The effective whitespace set (newlines removed while eolterm is on).
        return self._ws

    @ws.setter
    def ws(self, new_value):
        # Remember the caller-supplied set so eolterm can be toggled off
        # later and restore it.
        self._real_ws = new_value
        self._ws = new_value
        if self.eolterm:
            self._ws = self._ws.replace('\n', '').replace('\r', '')

    @property
    def eolterm(self):
        return self._eolterm

    @eolterm.setter
    def eolterm(self, new_value):
        # Toggle newline char in ws on eolterm property set.
        # During eolterm state parser should not treat
        # newline as a whitespace.
        self._eolterm = new_value
        if self._eolterm:
            self._ws = self._ws.replace('\n', '').replace('\r', '')
        else:
            self._ws = self._real_ws

    def parse(self, _input, file_name=None):
        """
        Parses input and produces parse tree.

        Args:
            _input(str): An input string to parse.
            file_name(str): If input is loaded from file this can be set to
                file name. It is used in error messages.

        Returns:
            ParseTreeNode: The root of the parse tree.

        Raises:
            NoMatch: If the input cannot be parsed; re-raised with line/col
                filled in from the failure position.
        """
        self.position = 0  # Input position
        self.nm = None  # Last NoMatch exception
        self.line_ends = []
        self.input = _input
        self.file_name = file_name
        self.comment_positions = {}
        self.cache_hits = 0
        self.cache_misses = 0
        try:
            self.parse_tree = self._parse()
        except NoMatch as e:
            # Remove Not marker
            if e.rules[0] is Parser.FIRST_NOT:
                del e.rules[0]
            # Get line and column from position
            e.line, e.col = self.pos_to_linecol(e.position)
            raise
        finally:
            # At end of parsing clear all memoization caches.
            # Do this here to free memory.
            if self.memoization:
                self._clear_caches()

        # In debug mode export parse tree to dot file for
        # visualization
        if self.debug and self.parse_tree:
            from arpeggio.export import PTDOTExporter
            root_rule_name = self.parse_tree.rule_name
            PTDOTExporter().exportFile(
                self.parse_tree, "{}_parse_tree.dot".format(root_rule_name))

        return self.parse_tree

    def parse_file(self, file_name):
        """
        Parses content from the given file.
        Args:
            file_name(str): A file name.
        """
        # Read as UTF-8; the parser works on unicode text.
        with codecs.open(file_name, 'r', 'utf-8') as f:
            content = f.read()

        return self.parse(content, file_name=file_name)

    def getASG(self, sem_actions=None, defaults=True):
        """
        Creates Abstract Semantic Graph (ASG) from the parse tree.

        Args:
            sem_actions (dict): The semantic actions dictionary to use for
                semantic analysis. Rule names are the keys and semantic action
                objects are values.
            defaults (bool): If True a default semantic action will be
                applied in case no action is defined for the node.

        Returns:
            The ASG root produced by reducing the parse tree bottom-up.

        Raises:
            Exception: If there is no parse tree or no semantic actions.
        """
        # NOTE(review): like visit_parse_tree, this rejects any falsy tree,
        # not only None — confirm that is intended.
        if not self.parse_tree:
            raise Exception(
                "Parse tree is empty. You did call parse(), didn't you?")

        if sem_actions is None:
            if not self.sem_actions:
                raise Exception("Semantic actions not defined.")
            else:
                sem_actions = self.sem_actions

        # NOTE(review): exact-type check; dict subclasses (OrderedDict etc.)
        # are rejected — isinstance may be the friendlier test.
        if type(sem_actions) is not dict:
            raise Exception("Semantic actions parameter must be a dictionary.")

        # (rule_name, first_pass_result) pairs whose action also defines
        # second_pass; processed after the full tree walk.
        for_second_pass = []

        def tree_walk(node):
            """
            Walking the parse tree and calling first_pass for every registered
            semantic actions and creating list of object that needs to be
            called in the second pass.
            """
            if self.debug:
                self.dprint(
                    "Walking down %s type: %s str: %s" %
                    (node.name, type(node).__name__, text(node)))

            # Reduce children first (post-order); suppressed (None) results
            # are dropped.
            children = SemanticActionResults()
            if isinstance(node, NonTerminal):
                for n in node:
                    child = tree_walk(n)
                    if child is not None:
                        children.append_result(n.rule_name, child)

            if self.debug:
                self.dprint("Processing %s = '%s' type:%s len:%d" %
                            (node.name, text(node), type(node).__name__,
                             len(node) if isinstance(node, list) else 0))
                for i, a in enumerate(children):
                    self.dprint(" %d:%s type:%s" %
                                (i+1, text(a), type(a).__name__))

            if node.rule_name in sem_actions:
                sem_action = sem_actions[node.rule_name]
                # Actions may be plain functions or SemanticAction instances.
                if isinstance(sem_action, types.FunctionType):
                    retval = sem_action(self, node, children)
                else:
                    retval = sem_action.first_pass(self, node, children)

                if hasattr(sem_action, "second_pass"):
                    for_second_pass.append((node.rule_name, retval))

                if self.debug:
                    action_name = sem_action.__name__ \
                        if hasattr(sem_action, '__name__') \
                        else sem_action.__class__.__name__
                    self.dprint(" Applying semantic action %s" % action_name)

            else:
                if defaults:
                    # If no rule is present use some sane defaults
                    if self.debug:
                        self.dprint(" Applying default semantic action.")

                    retval = SemanticAction().first_pass(self, node, children)

                else:
                    retval = node

            if self.debug:
                if retval is None:
                    self.dprint(" Suppressed.")
                else:
                    self.dprint(" Resolved to = %s type:%s" %
                                (text(retval), type(retval).__name__))
            return retval

        if self.debug:
            self.dprint("ASG: First pass")
        asg = tree_walk(self.parse_tree)

        # Second pass
        if self.debug:
            self.dprint("ASG: Second pass")
        for sa_name, asg_node in for_second_pass:
            sem_actions[sa_name].second_pass(self, asg_node)

        return asg

    def pos_to_linecol(self, pos):
        """
        Calculate (line, column) tuple for the given position in the stream.

        Both line and column are 1-based. Line-end offsets are computed
        lazily on first call and cached in self.line_ends.
        """
        if not self.line_ends:
            try:
                # TODO: Check this implementation on Windows.
                self.line_ends.append(self.input.index("\n"))
                while True:
                    try:
                        self.line_ends.append(
                            self.input.index("\n", self.line_ends[-1] + 1))
                    except ValueError:
                        break
            except ValueError:
                # No newline at all: single-line input, line_ends stays empty.
                pass

        # Number of line ends strictly before pos == zero-based line number.
        line = bisect.bisect_left(self.line_ends, pos)
        col = pos
        if line > 0:
            # Column relative to the previous line end, minus the newline
            # character itself.
            col -= self.line_ends[line - 1]
            if self.input[self.line_ends[line - 1]] in '\n\r':
                col -= 1
        return line + 1, col + 1

    def context(self, length=None, position=None):
        """
        Returns current context substring, i.e. the substring around current
        position.
        Args:
            length(int): If given used to mark with asterisk
                a length chars from the current position.
            position(int): The position in the input stream.
        """
        # NOTE(review): `not position` treats position=0 (start of input) as
        # "not given" and falls back to self.position — `position is None`
        # would distinguish the two; confirm intended behavior.
        if not position:
            position = self.position
        if length:
            # Mark `length` chars after position between asterisks, with up
            # to 10 chars of surrounding context on each side.
            retval = "{}*{}*{}".format(
                text(self.input[max(position - 10, 0):position]),
                text(self.input[position:position + length]),
                text(self.input[position + length:position + 10]))
        else:
            retval = "{}*{}".format(
                text(self.input[max(position - 10, 0):position]),
                text(self.input[position:position + 10]))

        # Keep the context on a single line for error messages.
        return retval.replace('\n', ' ').replace('\r', '')

    def _nm_raise(self, *args):
        """
        Register new NoMatch object if the input is consumed from the last
        NoMatch and raise last NoMatch.

        Only the failure farthest into the input is kept; additional Match
        failures at the same position are appended to its rules list (for
        "expected X or Y" reporting), unless inside a Not expression.

        Args:
            args: A (rule, position, parser) triple describing the failure.
        """
        rule, position, parser = args
        if self.nm is None or not parser.in_parse_comments:
            if self.nm is None or position > self.nm.position:
                if self.in_not:
                    self.nm = NoMatch([Parser.FIRST_NOT], position, parser)
                else:
                    self.nm = NoMatch([rule], position, parser)
            elif position == self.nm.position and isinstance(rule, Match) \
                    and not self.in_not:
                self.nm.rules.append(rule)

        raise self.nm

    def _clear_caches(self):
        """
        Clear memoization caches if packrat parser is used.
        """
        self.parser_model._clear_cache()
        if self.comments_model:
            self.comments_model._clear_cache()


class CrossRef(object):
    '''
    Used for rule reference resolving.
    '''
    def __init__(self, target_rule_name, position=-1):
        # Name of the rule this placeholder will be replaced with.
        self.target_rule_name = target_rule_name
        # Position in the input, -1 when unknown.
        self.position = position


class ParserPython(Parser):
    """
    Parser whose grammar is given as Python functions/expressions
    (the canonical Arpeggio "Python notation" front-end).
    """

    def __init__(self, language_def, comment_def=None, syntax_classes=None,
                 *args, **kwargs):
        """
        Constructs parser from python statements and expressions.

        Args:
            language_def (python function): A python function that defines
                the root rule of the grammar.
            comment_def (python function): A python function that defines
                the root rule of the comments grammar.
            syntax_classes (dict): Overrides of special syntax parser
                expression classes (StrMatch, Sequence, OrderedChoice).
        """
        super(ParserPython, self).__init__(*args, **kwargs)

        self.syntax_classes = syntax_classes if syntax_classes else {}

        # PEG Abstract Syntax Graph
        self.parser_model = self._from_python(language_def)
        self.comments_model = None
        if comment_def:
            self.comments_model = self._from_python(comment_def)
            self.comments_model.root = True
            self.comments_model.rule_name = comment_def.__name__

        # In debug mode export parser model to dot for
        # visualization
        if self.debug:
            from arpeggio.export import PMDOTExporter
            root_rule = language_def.__name__
            PMDOTExporter().exportFile(
                self.parser_model, "{}_parser_model.dot".format(root_rule))

    def _parse(self):
        # Template method used by Parser.parse(): delegate to the model root.
        return self.parser_model.parse(self)

    def _from_python(self, expression):
        """
        Create parser model from the definition given in the form of python
        functions returning lists, tuples, callables, strings and
        ParsingExpression objects.

        Returns:
            Parser Model (PEG Abstract Semantic Graph)
        """
        # Rule name -> built ParsingExpression (or CrossRef placeholder
        # while the rule is still being built, to support recursion).
        __rule_cache = {"EndOfFile": EndOfFile()}
        __for_resolving = []  # Expressions that need cross-ref resolving
        # Balance counter: incremented per CrossRef handed out, decremented
        # per CrossRef resolved; must end at zero.
        self.__cross_refs = 0

        # Allow user-supplied replacements for the special syntax classes.
        _StrMatch = self.syntax_classes.get('StrMatch', StrMatch)
        _OrderedChoice = self.syntax_classes.get('OrderedChoice',
                                                 OrderedChoice)
        _Sequence = self.syntax_classes.get('Sequence', Sequence)

        def inner_from_python(expression):
            """Recursively translate one grammar element to a model node."""
            retval = None
            if isinstance(expression, types.FunctionType):
                # If this expression is a parser rule
                rule_name = expression.__name__

                if rule_name in __rule_cache:
                    c_rule = __rule_cache.get(rule_name)
                    if self.debug:
                        self.dprint("Rule {} founded in cache."
                                    .format(rule_name))
                    if isinstance(c_rule, CrossRef):
                        self.__cross_refs += 1
                        if self.debug:
                            self.dprint("CrossRef usage: {}"
                                        .format(c_rule.target_rule_name))
                    return c_rule

                # Semantic action for the rule
                if hasattr(expression, "sem"):
                    self.sem_actions[rule_name] = expression.sem

                # Register rule cross-ref to support recursion
                __rule_cache[rule_name] = CrossRef(rule_name)

                curr_expr = expression
                while isinstance(curr_expr, types.FunctionType):
                    # If a function directly returns another function,
                    # keep calling until a non-function is returned.
                    curr_expr = curr_expr()
                retval = inner_from_python(curr_expr)
                retval.rule_name = rule_name
                retval.root = True

                # Update cache
                __rule_cache[rule_name] = retval
                if self.debug:
                    self.dprint("New rule: {} -> {}"
                                .format(rule_name, retval.__class__.__name__))

            elif type(expression) is text or isinstance(expression, _StrMatch):
                if type(expression) is text:
                    # Bare string literal -> implicit StrMatch.
                    retval = _StrMatch(expression,
                                       ignore_case=self.ignore_case)
                else:
                    retval = expression
                    if expression.ignore_case is None:
                        expression.ignore_case = self.ignore_case

                if self.autokwd:
                    # Keyword-like matches get a word-boundary regex so that
                    # e.g. 'in' does not match the prefix of 'integer'.
                    to_match = retval.to_match
                    match = self.keyword_regex.match(to_match)
                    if match and match.span() == (0, len(to_match)):
                        retval = RegExMatch(r'{}\b'.format(to_match),
                                            ignore_case=self.ignore_case,
                                            str_repr=to_match)
                        retval.compile()

            elif isinstance(expression, RegExMatch):
                # Regular expression are not compiled yet
                # to support global settings propagation from
                # parser.
                if expression.ignore_case is None:
                    expression.ignore_case = self.ignore_case
                expression.compile()

                retval = expression

            elif isinstance(expression, Match):
                retval = expression

            elif isinstance(expression, UnorderedGroup):
                retval = expression
                for n in retval.elements:
                    retval.nodes.append(inner_from_python(n))
                if any((isinstance(x, CrossRef) for x in retval.nodes)):
                    __for_resolving.append(retval)

            elif isinstance(expression, _Sequence) or \
                    isinstance(expression, Repetition) or \
                    isinstance(expression, SyntaxPredicate) or \
                    isinstance(expression, Decorator):
                retval = expression
                retval.nodes.append(inner_from_python(retval.elements))
                if any((isinstance(x, CrossRef) for x in retval.nodes)):
                    __for_resolving.append(retval)

            elif type(expression) in [list, tuple]:
                # Python notation: list -> ordered choice, tuple -> sequence.
                if type(expression) is list:
                    retval = _OrderedChoice(expression)
                else:
                    retval = _Sequence(expression)

                retval.nodes = [inner_from_python(e) for e in expression]
                if any((isinstance(x, CrossRef) for x in retval.nodes)):
                    __for_resolving.append(retval)

            else:
                raise GrammarError("Unrecognized grammar element '%s'."
                                   % text(expression))

            # Translate separator expression.
            if isinstance(expression, Repetition) and expression.sep:
                expression.sep = inner_from_python(expression.sep)

            return retval

        # Cross-ref resolving: swap every CrossRef placeholder for the real
        # rule model now that all rules are built.
        def resolve():
            for e in __for_resolving:
                for i, node in enumerate(e.nodes):
                    if isinstance(node, CrossRef):
                        self.__cross_refs -= 1
                        e.nodes[i] = __rule_cache[node.target_rule_name]

        parser_model = inner_from_python(expression)
        resolve()
        assert self.__cross_refs == 0, "Not all crossrefs are resolved!"
        return parser_model

    def errors(self):
        # Intentionally a no-op placeholder; error data is carried by the
        # NoMatch exception raised from parse().
        pass