Commit 3b9b93c9 authored by di68kap's avatar di68kap

Merge remote-tracking branch 'origin/master'

parents 60e20356 87779e68
......@@ -17,4 +17,5 @@ DEBUG*
DHParser Version 0.6.0 (23.4.2017)
first public release
......@@ -18,9 +18,14 @@ implied. See the License for the specific language governing
permissions and limitations under the License.
import os
# Flat namespace for the DHParser Package. Is this a good idea...?
from .toolkit import *
from .syntaxtree import *
from .parsers import *
from .ebnf import *
from .dsl import *
from .versionnumber import __version__
__version__ = '0.5.4' + '_dev' + str(os.stat(__file__).st_mtime)
__all__ = ['toolkit', 'syntaxtree', 'parsercombinators', 'EBNFcompiler', 'DSLsupport']
__author__ = "Eckhart Arnold <>"
__author__ = "Eckhart Arnold <>"
__copyright__ = ""
__all__ = ['toolkit', 'syntaxtree', 'parsers', 'ebnf', 'dsl']
""" - Support for domain specific notations for DHParser
Copyright 2016 by Eckhart Arnold (
......@@ -39,7 +37,7 @@ __all__ = ['GrammarError',
......@@ -98,10 +96,10 @@ from DHParser.parsers import GrammarBase, CompilerBase, nil_scanner, \\
Lookbehind, Lookahead, Alternative, Pop, Required, Token, \\
Optional, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Sequence, RE, Capture, \\
ZeroOrMore, Forward, NegativeLookahead, mixin_comment, full_compilation
from DHParser.syntaxtree import Node, remove_enclosing_delimiters, remove_children_if, \\
reduce_single_child, replace_by_single_child, remove_whitespace, TOKEN_KEYWORD, \\
no_operation, remove_expendables, remove_tokens, flatten, WHITESPACE_KEYWORD, \\
is_whitespace, is_expendable
from DHParser.syntaxtree import Node, traverse, remove_enclosing_delimiters, \\
remove_children_if, reduce_single_child, replace_by_single_child, remove_whitespace, \\
no_operation, remove_expendables, remove_tokens, flatten, is_whitespace, is_expendable, \\
......@@ -154,8 +152,7 @@ def get_grammar_instance(grammar):
return parser_root, grammar_src
def compileDSL(text_or_file, dsl_grammar, ast_transformation, compiler,
def compileDSL(text_or_file, scanner, dsl_grammar, ast_transformation, compiler):
"""Compiles a text in a domain specific language (DSL) with an
EBNF-specified grammar. Returns the compiled text or raises a
compilation error.
......@@ -196,7 +193,7 @@ def compileEBNF(ebnf_src, ebnf_grammar_obj=None, source_only=False):
which conforms to the language defined by ``ebnf_src``.
grammar = ebnf_grammar_obj or EBNFGrammar()
grammar_src = compileDSL(ebnf_src, grammar, EBNFTransform, EBNFCompiler())
grammar_src = compileDSL(ebnf_src, nil_scanner, grammar, EBNFTransform, EBNFCompiler())
return grammar_src if source_only else \
compile_python_object(DHPARSER_IMPORTS + grammar_src, '\w*Grammar$')
......@@ -216,7 +213,7 @@ def load_compiler_suite(compiler_suite):
raise AssertionError('File "' + compiler_suite + '" seems to be corrupted. '
'Please delete or repair file manually.')
scanner = compile_python_object(imports + scanner_py, '\w*Scanner$')
ast = compile_python_object(imports + ast_py, '\w*Pipeline$')
ast = compile_python_object(imports + ast_py, '\w*Transform$')
compiler = compile_python_object(imports + compiler_py, '\w*Compiler$')
# assume source is an ebnf grammar
......@@ -226,8 +223,8 @@ def load_compiler_suite(compiler_suite):
raise GrammarError('\n\n'.join(errors), source)
scanner = nil_scanner
ast = EBNFTransform
compiler = EBNFCompiler()
parser = compile_python_object(DHPARSER_IMPORTS + parser_py, '\w*Grammar$')()
compiler = EBNFCompiler
parser = compile_python_object(DHPARSER_IMPORTS + parser_py, '\w*Grammar$')
return scanner, parser, ast, compiler
......@@ -241,7 +238,7 @@ def suite_outdated(compiler_suite, grammar_source):
with whitespace in order to trigger their recreation. Note: Do not
delete or overwrite the section marker itself.
compiler_suite: the parser class representing the grammar
or the file name of a compiler suite containing the grammar
grammar_source: File name or string representation of the
......@@ -257,7 +254,30 @@ def suite_outdated(compiler_suite, grammar_source):
return True
def run_compiler(source_file, compiler_suite="", extension=".xml"):
def run_compiler(text_or_file, compiler_suite):
"""Compiles a source with a given compiler suite.
text_or_file (str): Either the file name of the source code or
the source code directly. (Which is determined by
heuristics. If ``text_or_file`` contains at least one
linefeed then it is always assumed to be a source text and
not a file name.)
compiler_suite(str): File name of the compiler suite to be
The result of the compilation, the form and type of which
depends entirely on the compiler.
scanner, parser, ast, compiler = load_compiler_suite(compiler_suite)
return compileDSL(text_or_file, scanner, parser(), ast, compiler())
def compile_on_disk(source_file, compiler_suite="", extension=".xml"):
"""Compiles a source file with a given compiler and writes the
result to a file.
......@@ -267,9 +287,26 @@ def run_compiler(source_file, compiler_suite="", extension=".xml"):
skeletons for a scanner, AST transformation table, and compiler.
If the Python script already exists only the parser name in the
script will be updated. (For this to work, the different names
need to be delimited section marker blocks.). `run_compiler()`
need to be delimited by section marker blocks.) `compile_on_disk()`
returns a list of error messages or an empty list if no errors
source_file(str): The file name of the source text to be
compiler_suite(str): The file name of the compiler suite
(usually ending with ''), with which the source
file shall be compiled. If this is left empty, the source
file is assumed to be an EBNF-Grammar that will be compiled
with the internal EBNF-Compiler.
extension(str): The result of the compilation (if successful)
is written to a file with the same name but a different
extension than the source file. This parameter sets the
A list of error messages or an empty list if there were no
filepath = os.path.normpath(source_file)
# with open(source_file, encoding="utf-8") as f:
......@@ -277,7 +314,8 @@ def run_compiler(source_file, compiler_suite="", extension=".xml"):
rootname = os.path.splitext(filepath)[0]
compiler_name = os.path.basename(rootname)
if compiler_suite:
scanner, parser, trans, cclass = load_compiler_suite(compiler_suite)
scanner, pclass, trans, cclass = load_compiler_suite(compiler_suite)
parser = pclass()
compiler1 = cclass()
scanner = nil_scanner
......@@ -289,10 +327,9 @@ def run_compiler(source_file, compiler_suite="", extension=".xml"):
return errors
elif trans == EBNFTransform: # either an EBNF- or no compiler suite given
f = None
f = None
f = open(rootname + '', 'r', encoding="utf-8")
source =
......@@ -304,7 +341,9 @@ def run_compiler(source_file, compiler_suite="", extension=".xml"):
raise ValueError('File "' + rootname + '" seems to be corrupted. '
'Please delete or repair file manually!')
if f: f.close()
if f:
f = None
if RX_WHITESPACE.fullmatch(intro):
intro = '#!/usr/bin/python'
""" - EBNF -> Python-Parser compilation for DHParser
Copyright 2016 by Eckhart Arnold (
......@@ -18,16 +16,14 @@ implied. See the License for the specific language governing
permissions and limitations under the License.
# import collections
import keyword
from functools import partial
import keyword
import os
import regex as re
except ImportError:
import re
from .__init__ import __version__
from .toolkit import load_if_file, escape_re, md5, sane_parser_name
from .parsers import GrammarBase, mixin_comment, Forward, RE, NegativeLookahead, \
Alternative, Sequence, Optional, Required, OneOrMore, ZeroOrMore, Token, CompilerBase, \
......@@ -35,13 +31,14 @@ from .parsers import GrammarBase, mixin_comment, Forward, RE, NegativeLookahead,
from .syntaxtree import Node, traverse, remove_enclosing_delimiters, reduce_single_child, \
replace_by_single_child, TOKEN_KEYWORD, remove_expendables, remove_tokens, flatten, \
forbid, assert_content, WHITESPACE_KEYWORD
from .versionnumber import __version__
__all__ = ['EBNFGrammar',
# 'Scanner',
class EBNFGrammar(GrammarBase):
......@@ -187,11 +184,9 @@ class EBNFCompiler(CompilerBase):
'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
'vertical': r'\s*'}
def __init__(self, grammar_name="", source_text=""):
def __init__(self, grammar_name="", grammar_source=""):
super(EBNFCompiler, self).__init__()
assert grammar_name == "" or re.match('\w+\Z', grammar_name)
self.grammar_name = grammar_name
self.source_text = load_if_file(source_text)
self.set_grammar_name(grammar_name, grammar_source)
def _reset(self):
......@@ -207,6 +202,13 @@ class EBNFCompiler(CompilerBase):
'tokens': set(), # alt. 'scanner_tokens'
'counterpart': set()} # alt. 'retrieve_counterpart'
def set_grammar_name(self, grammar_name, grammar_source):
assert grammar_name == "" or re.match('\w+\Z', grammar_name)
if not grammar_name and re.fullmatch(r'[\w/:\\]+', grammar_source):
grammar_name = os.path.splitext(os.path.basename(grammar_source))[0]
self.grammar_name = grammar_name
self.grammar_source = load_if_file(grammar_source)
def gen_scanner_skeleton(self):
name = self.grammar_name + "Scanner"
return "def %s(text):\n return text\n" % name
......@@ -240,11 +242,12 @@ class EBNFCompiler(CompilerBase):
'Compiler, self).__init__()',
" assert re.match('\w+\Z', grammar_name)", '']
for name in self.definition_names:
method_name = CompilerBase.derive_method_name(name)
if name == self.root:
compiler += [' def ' + name + '(self, node):',
compiler += [' def ' + method_name + '(self, node):',
' return node', '']
compiler += [' def ' + name + '(self, node):',
compiler += [' def ' + method_name + '(self, node):',
' pass', '']
return '\n'.join(compiler)
......@@ -273,13 +276,13 @@ class EBNFCompiler(CompilerBase):
'r"""Parser for ' + article + self.grammar_name +
' source file' +
(', with this grammar:' if self.source_text else '.')]
(', with this grammar:' if self.grammar_source else '.')]
definitions.append(('parser_initialization__', '"upon instatiation"'))
if self.source_text:
if self.grammar_source:
'"%s"' % md5(self.source_text, __version__)))
'"%s"' % md5(self.grammar_source, __version__)))
declarations += [line for line in self.source_text.split('\n')]
declarations += [line for line in self.grammar_source.split('\n')]
while declarations[-1].strip() == '':
declarations = declarations[:-1]
......@@ -310,7 +313,7 @@ class EBNFCompiler(CompilerBase):
return '\n '.join(declarations)
def syntax(self, node):
def on_syntax(self, node):
definitions = []
......@@ -322,14 +325,14 @@ class EBNFCompiler(CompilerBase):
# compile definitions and directives and collect definitions
for nd in node.result:
if == "definition":
assert == "directive", nd.as_sexpr()
return self.gen_parser(definitions)
def definition(self, node):
def on_definition(self, node):
rule = node.result[0].result
if rule in self.rules:
node.add_error('A rule with name "%s" has already been defined.' % rule)
......@@ -346,7 +349,7 @@ class EBNFCompiler(CompilerBase):
% rule + '(This may change in the furute.)')
defn = self.compile__(node.result[1])
defn = self._compile(node.result[1])
if rule in self.variables:
defn = 'Capture(%s)' % defn
......@@ -370,7 +373,7 @@ class EBNFCompiler(CompilerBase):
(repr(rx), str(re_error)))
return rx
def directive(self, node):
def on_directive(self, node):
key = node.result[0].result.lower()
assert key not in self.directives['tokens']
if key in {'comment', 'whitespace'}:
......@@ -378,7 +381,7 @@ class EBNFCompiler(CompilerBase):
if len(node.result[1].result) != 1:
node.add_error('Directive "%s" must have one, but not %i values.' %
(key, len(node.result[1])))
value = self.compile__(node.result[1]).pop()
value = self._compile(node.result[1]).pop()
if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
value = EBNFCompiler.WHITESPACE[value] # replace whitespace-name by regex
......@@ -398,7 +401,7 @@ class EBNFCompiler(CompilerBase):
self.directives[key] = value
elif key == 'literalws':
value = {item.lower() for item in self.compile__(node.result[1])}
value = {item.lower() for item in self._compile(node.result[1])}
if (len(value - {'left', 'right', 'both', 'none'}) > 0
or ('none' in value and len(value) > 1)):
node.add_error('Directive "literalws" allows the values '
......@@ -409,10 +412,10 @@ class EBNFCompiler(CompilerBase):
self.directives[key] = list(ws)
elif key in {'tokens', 'scanner_tokens'}:
self.directives['tokens'] |= self.compile__(node.result[1])
self.directives['tokens'] |= self._compile(node.result[1])
elif key in {'counterpart', 'retrieve_counterpart'}:
self.directives['counterpart'] |= self.compile__(node.result[1])
self.directives['counterpart'] |= self._compile(node.result[1])
node.add_error('Unknown directive %s ! (Known ones are %s .)' %
......@@ -424,16 +427,16 @@ class EBNFCompiler(CompilerBase):
"""Compiles any non-terminal, where `parser_class` indicates the Parser class
name for the particular non-terminal.
arguments = [self.compile__(r) for r in node.result] + custom_args
arguments = [self._compile(r) for r in node.result] + custom_args
return parser_class + '(' + ', '.join(arguments) + ')'
def expression(self, node):
def on_expression(self, node):
return self.non_terminal(node, 'Alternative')
def term(self, node):
def on_term(self, node):
return self.non_terminal(node, 'Sequence')
def factor(self, node):
def on_factor(self, node):
assert isinstance(node.parser, Sequence), node.as_sexpr() # these assert statements can be removed
assert node.children
assert len(node.result) >= 2, node.as_sexpr()
......@@ -467,23 +470,23 @@ class EBNFCompiler(CompilerBase):
except KeyError:
node.add_error('Unknown prefix "%s".' % prefix)
def option(self, node):
def on_option(self, node):
return self.non_terminal(node, 'Optional')
def repetition(self, node):
def on_repetition(self, node):
return self.non_terminal(node, 'ZeroOrMore')
def oneormore(self, node):
def on_oneormore(self, node):
return self.non_terminal(node, 'OneOrMore')
def regexchain(self, node):
def on_regexchain(self, node):
raise EBNFCompilerError("Not yet implemented!")
def group(self, node):
def on_group(self, node):
raise EBNFCompilerError("Group nodes should have been eliminated by "
"AST transformation!")
def symbol(self, node):
def on_symbol(self, node):
if node.result in self.directives['tokens']:
return 'ScannerToken("' + node.result + '")'
......@@ -492,10 +495,10 @@ class EBNFCompiler(CompilerBase):
return node.result
def literal(self, node):
def on_literal(self, node):
return 'Token(' + node.result.replace('\\', r'\\') + ')' # return 'Token(' + ', '.join([node.result]) + ')' ?
def regexp(self, node):
def on_regexp(self, node):
rx = node.result
name = []
if rx[:2] == '~/':
......@@ -519,7 +522,7 @@ class EBNFCompiler(CompilerBase):
return '"' + errmsg + '"'
return 'RE(' + ', '.join([arg] + name) + ')'
def list_(self, node):
def on_list_(self, node):
assert node.children
return set(item.result.strip() for item in node.result)
""" - parser combinators for DHParser
Copyright 2016 by Eckhart Arnold (
......@@ -337,6 +335,7 @@ class GrammarBase:
Node: The root node of the parse tree.
assert isinstance(document, str)
if self.root__ is None:
raise NotImplementedError()
if self.dirty_flag:
......@@ -391,7 +390,6 @@ class GrammarBase:
assert self.history
if not log_file_name:
name = self.__class__.__name__
log_file_name = name[:-7] if name.lower().endswith('grammar') else name
......@@ -460,6 +458,16 @@ def nil_scanner(text):
class ScannerToken(Parser):
Parses tokens that have been inserted by a Scanner.
Scanners can generate Tokens with the ``make_token``-function.
These tokens start and end with magic characters that can only be
matched by the ScannerToken Parser. Scanner tokens can be used to
insert BEGIN - END delimiters at the beginning or ending of an
indented block. Otherwise indented blocks are difficult to handle
with parsing expression grammars.
def __init__(self, scanner_token):
assert isinstance(scanner_token, str) and scanner_token and \
......@@ -493,7 +501,8 @@ class ScannerToken(Parser):
class RegExp(Parser):
"""Regular expression parser.
Regular expression parser.
The RegExp-parser parses text that matches a regular expression.
RegExp can also be considered as the "atomic parser", because all
......@@ -948,16 +957,53 @@ class Forward(Parser):
class CompilerBase:
def compile__(self, node):
comp, cls =, node.parser.__class__.__name__
elem = comp or cls
def __init__(self):
self.dirty_flag = False
def _reset(self):
def compile_all(self, node):
"""Compiles the abstract syntax tree with the root ``node``.
It's called ``compile_all`` to avoid confusion with the
``_compile`` that is called from within the local node
compiler methods.
if self.dirty_flag:
self.dirty_flag = True
return self._compile(node)
def derive_method_name(node_name):
"""Returns the method name for ``node_name``, e.g.
>>> CompilerBase.derive_method_name('expression')
'on_expression'
return 'on_' + node_name
def _compile(self, node):
"""Calls the compilation method for the given node and returns
the result of the compilation.
The method's name is derived from either the node's parser
name or, if the parser is anonymous, the node's parser's class
name by adding the prefix 'on_'.
Note that ``_compile`` does not call any compilation functions
for the parsers of the sub nodes by itself. Rather, this should
be done within the compilation methods.
elem = or node.parser.__class__.__name__
if not sane_parser_name(elem):
node.add_error("Must not use reserved name '%s' as parser "
node.add_error("Reserved name '%s' not allowed as parser "
"name! " % elem + "(Any name starting with "
"'_' or '__' or ending with '__' is reserved.)")
"'_' or '__' or ending with '__' is reserved.)")
return None
compiler = self.__getattribute__(elem) # TODO Add support for python keyword attributes
compiler = self.__getattribute__(self.derive_method_name(elem))
result = compiler(node)
for child in node.children:
node.error_flag |= child.error_flag
......@@ -976,7 +1022,7 @@ def full_compilation(source, scanner, parser, transform, compiler):
source (str): The input text for compilation or the name of a
file containing the input text.
scanner (funciton): text -> text. A scanner function or None,
scanner (function): text -> text. A scanner function or None,
if no scanner is needed.
parser (GrammarBase): The GrammarBase object
transform (function): A transformation function that takes
......@@ -991,8 +1037,7 @@ def full_compilation(source, scanner, parser, transform, compiler):
(result, errors, abstract syntax tree). In detail:
1. The result as returned by the compiler or ``None`` in case
of failure,
2. A list of error messages, each of which is a tuple
(position: int, error: str)
2. A list of error messages
3. The root-node of the abstract syntax tree
assert isinstance(compiler, CompilerBase)
......@@ -1017,7 +1062,7 @@ def full_compilation(source, scanner, parser, transform, compiler):
syntax_tree.log(log_file_name, ext='.ast')
errors = syntax_tree.collect_errors()
if not errors:
result = compiler.compile__(syntax_tree)
result = compiler.compile_all(syntax_tree)
errors = syntax_tree.collect_errors()
messages = error_messages(source_text, errors)
return result, messages, syntax_tree
""" - syntax tree classes and transformation functions for
converting the concrete into the abstract syntax tree for DHParser
......@@ -22,6 +20,8 @@ permissions and limitations under the License.
import itertools
import os
from functools import partial
import regex as re
except ImportError:
......@@ -36,7 +36,7 @@ __all__ = ['WHITESPACE_KEYWORD',
......@@ -56,7 +56,25 @@ __all__ = ['WHITESPACE_KEYWORD',
class ZombieParser:
class MockParser:
MockParser objects can be used to reconstruct syntax trees from a
serialized form like S-expressions or XML. Mock objects are needed,
because Node objects require a parser object for instantiation.
Mock objects have just enough properties to serve that purpose.
Mock objects should not be used for anything other than
syntax tree (re-)construction. In all other cases where a parser
object substitute is needed, choose the singleton ZOMBIE_PARSER.