Commit 641fa08b authored by di68kap

compile.py: added optional attribute visitors

parent c3da5507
......@@ -58,6 +58,7 @@ __all__ = ('CompilerError',
'ResultTuple',
'compile_source',
'visitor_name',
'attr_visitor_name',
'TreeProcessor',
'process_tree')
......@@ -74,15 +75,27 @@ class CompilerError(Exception):
def visitor_name(node_name: str) -> str:
"""
Returns the method name for `node_name`, e.g.::
Returns the visitor_method name for `node_name`, e.g.::
>>> visitor_name('expression')
'on_expression'
"""
# assert re.match(r'\w+$', node_name)
return 'on_' + node_name
def attr_visitor_name(attr_name: str) -> str:
"""
Returns the visitor_method name for `attr_name`, e.g.::
>>> attr_visitor_name('class')
'attr_class'
"""
# assert re.match(r'\w+$', attr_name)
return 'attr_' + attr_name
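# A minimal sketch (hypothetical names): a subclass whose methods follow the
# two naming conventions above and will therefore be found by the Compiler's
# dispatch machinery:

class JsonCompiler(Compiler):
    def on_number(self, node):           # == visitor_name('number')
        return self.fallback_compiler(node)

    def attr_class(self, node, value):   # == attr_visitor_name('class')
        return node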
ROOTNODE_PLACEHOLDER = RootNode()
......@@ -106,28 +119,35 @@ class Compiler:
method which will pick the right `on_XXX`-method. It is not
recommended to call the `on_XXX`-methods directly.
Attributes:
source: The source text of the AST to be compiled. This needs to be
:ivar source: The source text of the AST to be compiled. This needs to be
assigned by the user of the Compiler object - as is done
by function `compile_source()`
context: A list of parent nodes that ends with the currently
:ivar context: A list of parent nodes that ends with the currently
compiled node.
tree: The root of the abstract syntax tree.
finalizers: A stack of tuples (function, parameters) that will be
:ivar tree: The root of the abstract syntax tree.
:ivar finalizers: A stack of tuples (function, parameters) that will be
called in reverse order after compilation.
_dirty_flag: A flag indicating that the compiler has already been
:ivar has_attribute_visitors: A flag indicating that the class has
attribute-visitor-methods which are named 'attr_ATTRIBUTENAME'
and will be called if the currently processed node has one
or more attributes for which such visitors exist.
:ivar _dirty_flag: A flag indicating that the compiler has already been
called at least once and that therefore all compilation
variables must be reset when it is called again.
_debug: A flag indicating that debugging is turned on the value
:ivar _debug: A flag indicating that debugging is turned on. The value
for this flag is read before each call of the configuration
(see debugging section in DHParser.configuration)
(see debugging section in DHParser.configuration).
If debugging is turned on, the compiler class raises an
error if a node is attempted to be compiled twice.
_debug_already_compiled: A set of nodes that have already been compiled.
error if there is an attempt to compile one and the same
node a second time.
:ivar _debug_already_compiled: A set of nodes that have already been compiled.
"""
def __init__(self):
# scan the class once for attribute-visitors (methods named 'attr_...'),
# so that compilation does not need to inspect dir(self) for every node
self.has_attribute_visitors = any(field[0:5] == 'attr_' and callable(getattr(self, field))
for field in dir(self))
self.reset()
def reset(self):
......@@ -176,36 +196,7 @@ class Compiler:
self.finalize()
return result
# def OBSOLETE_fallback_compiler(self, node: Node) -> Any:
# """This is a generic compiler function which will be called on
# all those node types for which no compiler method `on_XXX` has
# been defined.
#
# OBSOLETE, because it does not allow manipulation or parent tree
# during transformation
# """
# result = []
# if node.children:
# for child in node.children:
# nd = self.compile(child)
# if nd is not None:
# try:
# if nd.tag_name != EMPTY_PTYPE:
# result.append(nd)
# except AttributeError:
# pass
# if not isinstance(nd, Node):
# tn = node.tag_name
# raise TypeError(
# 'Fallback compiler for Node `%s` received a value of type '
# '`%s` from child `%s` instead of the required return type `Node`. '
# 'Override `DHParser.compile.Compiler.fallback_compiler()` or add '
# 'method `on_%s(self, node)` in class `%s` to avoid this error!'
# % (tn, str(type(nd)), child.tag_name, tn, self.__class__.__name__))
# node.result = tuple(result)
# return node
def fallback_compiler(self, node: Node) -> Any:
def fallback_compiler(self, node: Node, block_attribute_visitors: bool=False) -> Any:
"""This is a generic compiler function which will be called on
all those node types for which no compiler method `on_XXX` has
been defined."""
......@@ -232,6 +223,13 @@ class Compiler:
if nd is not None and nd.tag_name != EMPTY_PTYPE:
result.append(nd)
node.result = tuple(result)
if self.has_attribute_visitors and not block_attribute_visitors and node.has_attr():
for attribute, value in node.attr.items():
try:
attribute_visitor = self.__getattribute__(attr_visitor_name(attribute))
# an attribute-visitor may return a substitute node; a return
# value of None keeps the current node
node = attribute_visitor(node, value) or node
except AttributeError:
pass
return node
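# A minimal sketch (hypothetical names, not part of the commit): an
# attribute-visitor reacting to a 'class'-attribute. Because of the
# `or node` above, returning None keeps the current node:

class HTMLCompiler(Compiler):
    def attr_class(self, node, value):
        # called by fallback_compiler for every node that carries a
        # 'class'-attribute; `value` is that attribute's value
        if 'deprecated' in value:
            self.tree.new_error(node, 'deprecated css-class used: ' + value)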
def compile(self, node: Node) -> Any:
......@@ -253,7 +251,7 @@ class Compiler:
elem = node.tag_name
if elem[:1] == ':':
elem = elem[1:]
elem = elem[1:] + '__'
try:
compiler = self.__getattribute__(visitor_name(elem))
except AttributeError:
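# Note: with the appended '__', visitors for anonymous nodes (tag names
# starting with ':') are now looked up under a trailing double underscore,
# e.g. an anonymous ':Text' node dispatches to 'on_Text__' instead of
# 'on_Text'. A hypothetical subclass would accordingly write:
#
#     class MyCompiler(Compiler):
#         def on_Text__(self, node):       # visits anonymous ':Text' nodes
#             return node
#         def on_expression(self, node):   # visits the named symbol 'expression'
#             return self.fallback_compiler(node)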
......@@ -323,19 +321,19 @@ def compile_source(source: str,
no fatal errors occurred in any of the earlier stages of the processing
pipeline.
:param source (str): The input text for compilation or the name of a
:param source: The input text for compilation or the name of a
file containing the input text.
:param preprocessor (function): text -> text. A preprocessor function
:param preprocessor: text -> text. A preprocessor function
or None, if no preprocessor is needed.
:param parser (function): A parsing function or grammar class
:param transformer (function): A transformation function that takes
:param parser: A parsing function or grammar class
:param transformer: A transformation function that takes
the root-node of the concrete syntax tree as an argument and
transforms it (in place) into an abstract syntax tree.
:param compiler (function): A compiler function or compiler class
:param compiler: A compiler function or compiler class
instance
:param preserve_AST (bool): Preserves the AST-tree.
:param preserve_AST: Preserves the AST-tree.
:return: The result of the compilation as a 3-tuple
:returns: The result of the compilation as a 3-tuple
(result, errors, abstract syntax tree). In detail:
1. The result as returned by the compiler or ``None`` in case of failure
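A hedged usage sketch of the calling convention described above; the objects
``json_parser``, ``json_transformer`` and ``json_compiler`` are hypothetical
and not part of this commit::

    result, errors, ast = compile_source(
        source_text,         # the input text, or the name of a file containing it
        None,                # no preprocessor needed
        json_parser,         # a parsing function or grammar class instance
        json_transformer,    # transforms the CST in place into an AST
        json_compiler,       # a compiler function or Compiler instance
        preserve_AST=True)   # keep the AST as the third element of the result
    if errors:
        for error in errors:
            print(error)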
......@@ -431,7 +429,7 @@ def compile_source(source: str,
class TreeProcessor(Compiler):
"""A special kind of Compiler class that take a tree as input (just like
"""A special kind of Compiler class that takes a tree as input (just like
`Compiler`) but always yields a tree as result.
The intended use cases for TreeProcessor are digital-humanities applications
......@@ -448,13 +446,13 @@ class TreeProcessor(Compiler):
def __call__(self, root: RootNode) -> RootNode:
assert isinstance(root, RootNode)
result = super().__call__(root)
assert isinstance(result, RootNode), result.as_sxpr()
assert isinstance(result, RootNode), str(result)
return cast(RootNode, result)
def process_tree(tp: TreeProcessor, tree: RootNode) -> Tuple[RootNode, List[Error]]:
"""Process a tree with the tree-processor `tp` only if no fatal error
has occurred so far. Catch any Python exceptions in case
has occurred so far. Catch any Python-exceptions in case
any normal errors have occurred earlier in the processing pipeline.
Don't catch Python-exceptions if no errors have occurred earlier.
......@@ -465,13 +463,12 @@ def process_tree(tp: TreeProcessor, tree: RootNode) -> Tuple[RootNode, List[Erro
error. Processing stages should be written with possible errors
occurring in earlier stages in mind, though. However, because it could
be difficult to provide for all possible kinds of badly structured
trees resulting from errors, exceptions occurring on code processing
trees resulting from errors, exceptions occurring when processing
potentially faulty trees will be dealt with gracefully.
Although process_tree returns the root-node of the processed tree,
tree processing should generally be assumed to change the tree
in place, even if a different root-node is returned than was passed
to the tree. If the input tree shall be preserved, it is necessary to
in place. If the input tree shall be preserved, it is necessary to
make a deep copy of the input tree, before calling process_tree.
"""
assert isinstance(tp, TreeProcessor)
......@@ -517,4 +514,4 @@ def process_tree(tp: TreeProcessor, tree: RootNode) -> Tuple[RootNode, List[Erro
# TODO: Verify compiler against grammar,
# i.e. make sure that for all on_X()-methods, `X` is the name of a parser
# TODO: AST validation against an ASDSL-Specification
# TODO: AST validation against an ASDL-Specification
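# A minimal usage sketch (hypothetical names) following the advice in the
# process_tree docstring above: deep-copy the input if it must be preserved,
# since processing changes the tree in place:
#
#     import copy
#     tp = MyTreeProcessor()                 # some TreeProcessor subclass
#     backup = copy.deepcopy(tree)           # preserve the unprocessed tree
#     tree, errors = process_tree(tp, tree)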
......@@ -302,9 +302,6 @@ CONFIG_PRESET['left_recursion'] = True
# 'indented' - compact tree output, i.e. children are represented on
# indented lines with no opening or closing tags, brackets
# etc.
# 'smart' - serialize as S-expression if the S-expression fits on
# one line (see 'flatten_sxpr_threshold'), otherwise
# serialize as compact tree output
# 'json' - output in JSON-format. This is probably the least
# readable representation, but useful for serialization, for
# example, to return syntax trees from remote procedure calls.
......
......@@ -92,7 +92,7 @@ The structure of a JSON file can easily be described in EBNF::
'member = string §":" ~ _element \\n'\
'array = "[" ~ ( _element ( "," ~ _element )* )? §"]" ~ \\n'\
'string = `"` §_CHARS `"` ~ \\n'\
' _CHARS = /[^"\\]+/ | /\\[\/bnrt\\]/ \\n'\
' _CHARS = /[^"\\\\\]+/ | /\\\\\\\[\/bnrt\\\\\]/ \\n'\
'number = _INT _FRAC? _EXP? ~ \\n'\
' _INT = `-` /[1-9][0-9]+/ | /[0-9]/ \\n'\
' _FRAC = `.` /[0-9]+/ \\n'\
......@@ -100,13 +100,53 @@ The structure of a JSON file can easily be described in EBNF::
'bool = "true" ~ | "false" ~ \\n'\
'null = "null" ~ \\n'
Let's try this on our test-string. In order to compile this grammar into
executable Python-code, we use the high-level-function
This is a rather common EBNF-grammar. A few peculiarities are noteworthy, though:

First of all, you might notice that some elements have names with a leading
underscore "_". It is a convention to mark with an underscore "_" those
elements in which we are not interested for their own sake. When moving from the
concrete syntax-tree to a more abstract syntax-tree, these elements can be
substituted by their content to simplify the tree.
Secondly, some elements carry a name written in capital letters. This is also
a convention, marking those elements which, with other parser-generators, would
represent tokens delivered by a lexical scanner. DHParser is a "scanner-less"
parser, which means that the breaking down of the string into meaningful tokens
is done in place with regular expressions (like in the definition of "_EOF")
or simple combinations of regular expressions (see the definition of "_INT" above).
There is no sharp distinction between tokens and other symbols in DHParser,
but we keep it as a loose convention. Regular expressions are enclosed in forward
slashes and follow the standard syntax of Perl-style regular expressions that is
also used by the "re"-module of the Python standard library. (Don't worry about
the number of backslashes in the line defining "_CHARS" for now!)
Finally, it is another helpful convention to indent the definitions of symbols
that have only been introduced to simplify an otherwise unnecessarily
complicated definition (e.g. the definition of "number", above) or to make
it more understandable by giving names to its components (like "_EOF").
Let's try this grammar on our test-string. In order to compile
this grammar into executable Python-code, we use the high-level function
:py:func:`create_parser()` from the :py:mod:`DHParser.dsl` module.
>>> from DHParser.dsl import create_parser
>>> # from DHParser.dsl import compileEBNF
>>> # print(compileEBNF(grammar))
>>> parser = create_parser(grammar, branding="JSON")
>>> contrete_syntax_tree = parser(grammar)
>>> syntax_tree = parser(testdata)
>>> syntax_tree.content
'{"list": [1, 2, "string"], "int": 3, "bool": false}'
As expected, serializing the content of the resulting syntax-tree yields exactly
the input-string of the parsing process. What we cannot see here is that the
parser has structured the string into the individual elements described in the
grammar. Since the concrete syntax-tree that the parser yields is rather
verbose, it would not make sense to print it out. We'll just look at a small
part of it, to see what it looks like. Let's just pick the sub-tree that
captures the first json-string within the syntax-tree::
>>> print(syntax_tree.pick('string').as_sxpr())
(string (:Text '"') (_CHARS "list") (:Text '"'))
"""
......@@ -123,7 +163,7 @@ from DHParser.error import Error, AMBIGUOUS_ERROR_HANDLING, WARNING, REDECLARED_
REDEFINED_DIRECTIVE, UNUSED_ERROR_HANDLING_WARNING, INAPPROPRIATE_SYMBOL_FOR_DIRECTIVE, \
DIRECTIVE_FOR_NONEXISTANT_SYMBOL, UNDEFINED_SYMBOL_IN_TRANSTABLE_WARNING, \
UNCONNECTED_SYMBOL_WARNING, REORDERING_OF_ALTERNATIVES_REQUIRED, BAD_ORDER_OF_ALTERNATIVES, \
EMPTY_GRAMMAR_ERROR
EMPTY_GRAMMAR_ERROR, MALFORMED_REGULAR_EXPRESSION
from DHParser.parse import Parser, Grammar, mixin_comment, mixin_nonempty, Forward, RegExp, \
Drop, Lookahead, NegativeLookahead, Alternative, Series, Option, ZeroOrMore, OneOrMore, \
Text, Capture, Retrieve, Pop, optional_last_value, GrammarError, Whitespace, Always, Never, \
......@@ -1065,10 +1105,11 @@ WHITESPACE_TYPES = {'horizontal': r'[\t ]*', # default: horizontal
'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
'vertical': r'\s*'}
DROP_STRINGS = 'strings'
DROP_WSPC = 'whitespace'
DROP_REGEXP = 'regexps'
DROP_VALUES = {DROP_STRINGS, DROP_WSPC, DROP_REGEXP}
DROP_STRINGS = 'strings'
DROP_BACKTICKED = 'backticked'
DROP_WSPC = 'whitespace'
DROP_REGEXP = 'regexps'
DROP_VALUES = {DROP_STRINGS, DROP_BACKTICKED, DROP_WSPC, DROP_REGEXP}
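# A minimal sketch (mirrors the test added further below in this commit):
# with the new DROP_BACKTICKED value, backticked literals can be removed
# from the concrete syntax tree via the "@ drop"-directive:

EXAMPLE_GRAMMAR = r'''
    @ drop = backticked, whitespace
    @ literalws = right
    doc = "*" word `*`
    word = /\w+/
'''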
# Representation of Python code or, rather, something that will be output as Python code
ReprType = Union[str, unrepr]
......@@ -1489,7 +1530,7 @@ class EBNFCompiler(Compiler):
re.compile(rx)
except Exception as re_error:
self.tree.new_error(node, "malformed regular expression %s: %s" %
(repr(rx), str(re_error)))
(repr(rx), str(re_error)), MALFORMED_REGULAR_EXPRESSION)
return rx
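# For illustration (plain `re` example, not DHParser-specific): the kind of
# failure that this check intercepts and now reports under the dedicated
# MALFORMED_REGULAR_EXPRESSION error code:

import re

rx = r'[abc'    # unbalanced character set
try:
    re.compile(rx)
except re.error as re_error:
    print("malformed regular expression %s: %s" % (repr(rx), str(re_error)))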
......@@ -1961,7 +2002,8 @@ class EBNFCompiler(Compiler):
self.tree.new_error(node, "Grammar does not contain any rules!", EMPTY_GRAMMAR_ERROR)
python_src = self.assemble_parser(definitions)
if get_config_value('static_analysis') == 'early':
# if a regular expression was already found to be malformed, skip the early
# static analysis, since compiling the grammar object would fail anyway
if get_config_value('static_analysis') == 'early' and not \
any(e.code == MALFORMED_REGULAR_EXPRESSION for e in self.tree.errors):
errors = []
try:
grammar_class = compile_python_object(
......@@ -2613,7 +2655,7 @@ class EBNFCompiler(Compiler):
tk = rpl + tk[1:-1] + rpl
else:
tk = rpl + tk.replace('"', '\\"')[1:-1] + rpl
return self.TEXT_PARSER(tk, self.drop_on(DROP_STRINGS))
return self.TEXT_PARSER(tk, self.drop_on(DROP_BACKTICKED))
def on_regexp(self, node: Node) -> str:
......
......@@ -89,6 +89,7 @@ __all__ = ('ErrorCode',
'DUPLICATE_PARSERS_IN_ALTERNATIVE',
'BAD_ORDER_OF_ALTERNATIVES',
'BAD_REPETITION_COUNT',
'MALFORMED_REGULAR_EXPRESSION',
'EMPTY_GRAMMAR_ERROR',
'TREE_PROCESSING_CRASH',
'COMPILER_CRASH',
......@@ -153,6 +154,7 @@ BAD_MANDATORY_SETUP = ErrorCode(1550)
DUPLICATE_PARSERS_IN_ALTERNATIVE = ErrorCode(1560)
BAD_ORDER_OF_ALTERNATIVES = ErrorCode(1570)
BAD_REPETITION_COUNT = ErrorCode(1580)
MALFORMED_REGULAR_EXPRESSION = ErrorCode(1585)
EMPTY_GRAMMAR_ERROR = ErrorCode(1590)
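# Sanity sketch (assuming ErrorCode values compare like ints, as their use
# elsewhere in this commit suggests): the new code slots between its neighbours
assert BAD_REPETITION_COUNT < MALFORMED_REGULAR_EXPRESSION < EMPTY_GRAMMAR_ERROR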
# fatal errors
......
......@@ -452,7 +452,7 @@ class Parser:
This is the closest parser with a pname that contains this parser."""
if not self._symbol:
try:
self._symbol = self.grammar.associated_symbol(self).pname
self._symbol = self.grammar.associated_symbol__(self).pname
except AttributeError:
# return an empty string, if parser is not connected to grammar,
# but be sure not to save the empty string in self._symbol
......@@ -502,7 +502,7 @@ class Parser:
# catching up with parsing after an error occurred
gap = len(text) - len(pe.rest)
rules = grammar.resume_rules__.get(
self.pname or grammar.associated_symbol(self).pname, [])
self.pname or grammar.associated_symbol__(self).pname, [])
rest = pe.rest[len(pe.node):]
i = reentry_point(rest, rules, grammar.comment_rx__,
grammar.reentry_search_window__)
......@@ -1034,7 +1034,7 @@ class Grammar:
(resulting in a maximum recursion depth reached error) when
the grammar definition contains left recursions.
associated_symbol_cache__: A cache for the associated_symbol()-method.
associated_symbol_cache__: A cache for the associated_symbol__()-method.
# mirrored class attributes:
......@@ -1244,7 +1244,7 @@ class Grammar:
and (static_analysis
or (static_analysis is None
and get_config_value('static_analysis') in {'early', 'late'}))):
result = self.static_analysis()
result = self.static_analysis__()
# clears any stored errors without overwriting the pointer
while self.static_analysis_errors__:
self.static_analysis_errors__.pop()
......@@ -1527,7 +1527,7 @@ class Grammar:
else -2 # (self.document__.__len__() + 1)
def as_ebnf(self) -> str:
def as_ebnf__(self) -> str:
"""
Serializes the Grammar object as a grammar-description in the
Extended Backus-Naur-Form. Does not serialize directives and
......@@ -1543,7 +1543,7 @@ class Grammar:
return '\n'.join(ebnf)
def associated_symbol(self, parser: Parser) -> Parser:
def associated_symbol__(self, parser: Parser) -> Parser:
r"""Returns the closest named parser that contains `parser`.
If `parser` is a named parser itself, `parser` is returned.
If `parser` is not connected to any symbol in the Grammar,
......@@ -1553,7 +1553,7 @@ class Grammar:
>>> word.pname = 'word'
>>> gr = Grammar(word)
>>> anonymous_re = gr['word'].parsers[0]
>>> gr.associated_symbol(anonymous_re).pname
>>> gr.associated_symbol__(anonymous_re).pname
'word'
"""
symbol = self.associated_symbol_cache__.get(parser, None) # type: Optional[Parser]
......@@ -1581,7 +1581,7 @@ class Grammar:
return symbol
def static_analysis(self) -> List[AnalysisError]:
def static_analysis__(self) -> List[AnalysisError]:
"""
Checks the parser tree statically for possible errors.
......@@ -2364,7 +2364,7 @@ class MandatoryNary(NaryParser):
`self.grammar.skip_rules__`. If no reentry-point was found or the
skip-list is empty, -1 is returned.
"""
skip = self.grammar.skip_rules__.get(self.grammar.associated_symbol(self).pname, [])
skip = self.grammar.skip_rules__.get(self.grammar.associated_symbol__(self).pname, [])
if skip:
gr = self._grammar
return reentry_point(text_, skip, gr.comment_rx__, gr.reentry_search_window__)
......@@ -2402,7 +2402,7 @@ class MandatoryNary(NaryParser):
location = grammar.document_length__ - len(text_)
err_node = Node(ZOMBIE_TAG, text_[:i]).with_pos(location)
found = text_[:10].replace('\n', '\\n ') + '...'
sym = self.grammar.associated_symbol(self).pname
sym = self.grammar.associated_symbol__(self).pname
err_msgs = self.grammar.error_messages__.get(sym, [])
for search, message in err_msgs:
is_func = callable(search) # search rule is a function: StringView -> bool
......@@ -2441,7 +2441,7 @@ class MandatoryNary(NaryParser):
errors = super().static_analysis()
msg = []
length = len(self.parsers)
sym = self.grammar.associated_symbol(self).pname
sym = self.grammar.associated_symbol__(self).pname
# if self.mandatory == NO_MANDATORY and sym in self.grammar.error_messages__:
# msg.append('Custom error messages require that parameter "mandatory" is set!')
# elif self.mandatory == NO_MANDATORY and sym in self.grammar.skip_rules__:
......
......@@ -16,6 +16,9 @@
# permissions and limitations under the License.
"""
Module ``syntaxtree`` encapsulates the functionality for creating
and handling syntax-trees. This includes serialization and
......@@ -1987,7 +1990,7 @@ class Node: # (collections.abc.Sized): Base class omitted for cython-compatibil
# flatten_threshold = get_config_value('flatten_sxpr_threshold')
compact_threshold = get_config_value('compact_sxpr_threshold')
if switch in ('s-expression', 'sxpr'):
if switch in ('S-expression', 'S-Expression', 's-expression', 'sxpr'):
return self.as_sxpr(flatten_threshold=get_config_value('flatten_sxpr_threshold'),
compact=exceeds_compact_threshold(self, compact_threshold))
elif switch == 'xml':
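# Note: assuming `switch` is derived from the argument of Node.serialize()
# (which is how this code path is reached), all of the following spellings
# now select the S-expression serialization:
#
#     for how in ('sxpr', 's-expression', 'S-expression', 'S-Expression'):
#         print(node.serialize(how))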
......@@ -2667,7 +2670,7 @@ class RootNode(Node):
"""
Adds an error to this tree, locating it at a specific node.
:param node: the node where the error occurred
:param message: a string with the error message
:param code: an error code to identify the type of the error
"""
......
......@@ -67,7 +67,7 @@ def trace_history(self: Parser, text: StringView) -> Tuple[Optional[Node], Strin
if mre.first_throw:
origin = mre.node.tag_name
if origin[:1] == ':':
origin = grammar.associated_symbol(mre.parser).tag_name + '->' + origin
origin = grammar.associated_symbol__(mre.parser).tag_name + '->' + origin
notice = Error( # resume notice
'Resuming from parser "{}" at position {}:{} with parser "{}": {}'
.format(origin, *lc, resumer, repr(target)),
......
......@@ -117,6 +117,18 @@ class TestDirectives:
st = parser('')
assert not st.errors and str(st) == ''
def test_drop(self):
lang = r"""
@ drop = backticked, whitespace
@ literalws = right
doc = "*" word `*`
word = /\w+/
"""
parser = create_parser(lang)
st = parser('* Hund*')
# the double-quoted "*" is kept; the backticked `*` and the whitespace
# before 'Hund' have been dropped from the tree
assert str(st) == "*Hund"
class TestReservedSymbols:
def test_comment_usage(self):
......
......@@ -107,7 +107,7 @@ class TestParserClass:
root__ = word
gr = MyGrammar()
regex = gr['word'].parsers[-1].parser
result = gr.associated_symbol(regex).symbol
result = gr.associated_symbol__(regex).symbol
assert result == 'word', result
......@@ -303,11 +303,11 @@ class TestInfiLoopsAndRecursion:
# snippet = " "
# syntax_tree = parser(snippet)
# assert any(e.code == INFINITE_LOOP for e in syntax_tree.errors)
# res = parser.static_analysis()
# res = parser.static_analysis__()
# assert res and res[0][2].code == INFINITE_LOOP
# minilang = """not_forever = { / / } \n"""
# parser = grammar_provider(minilang)()
# res = parser.static_analysis()
# res = parser.static_analysis__()
# assert not res
# set_config_value('static_analysis', save)
......@@ -593,7 +593,7 @@ class TestSeries:
def test_ebnf_serialization(self):
ebnf_grammar = get_ebnf_grammar()
# TODO: Add test here
ebnf = ebnf_grammar.as_ebnf()
ebnf = ebnf_grammar.as_ebnf__()
# print(ebnf)
......@@ -1576,7 +1576,7 @@ class TestStaticAnalysis:
def test_cyclical_ebnf_error(self):
doc = Text('proper'); doc.pname = "doc"
grammar = Grammar(doc)
# grammar.static_analysis()
# grammar.static_analysis__()
lang = "doc = 'proper' # this works!"
lang1 = "doc = { doc } # this parser never reaches a leaf parser."
lang2 = """doc = word | sentence # a more convoluted example
......