
Commit 4b26772d authored by Eckhart Arnold

- split ParserCombinators.py into different modules: version, logging, syntaxtree, parser, EBNFcompiler, DSLsupport, dhparser
parent 96853fb9
@@ -14,6 +14,8 @@ testdata/*.pdf
*~
*.old
DEBUG*
LOGS*
LOGS/
external_resources/
tmp/
#!/usr/bin/python3
"""DSLsupport.py - Support for domain specific notations for DHParser
Copyright 2016 by Eckhart Arnold (arnold@badw.de)
Bavarian Academy of Sciences and Humanities (badw.de)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
Module ``DSLsupport`` contains various functions to support the
compilation of domain specific languages based on an EBNF-grammar.
"""
import os
try:
import regex as re
except ImportError:
import re
from EBNFcompiler import EBNFGrammar, EBNFCompiler, EBNFTransTable, load_if_file, md5
from logging import LOGGING
from parser import PARSER_SYMBOLS, COMPILER_SYMBOLS, GrammarBase, CompilerBase, \
full_compilation, nil_scanner
from syntaxtree import AST_SYMBOLS, Node
from version import __version__
SECTION_MARKER = """\n
#######################################################################
#
# {marker}
#
#######################################################################
\n"""
RX_SECTION_MARKER = re.compile(SECTION_MARKER.format(marker=r'.*?SECTION.*?'))
SYMBOLS_SECTION = "SYMBOLS SECTION - Can be edited. Changes will be preserved."
SCANNER_SECTION = "SCANNER SECTION - Can be edited. Changes will be preserved."
PARSER_SECTION = "PARSER SECTION - Don't edit! CHANGES WILL BE OVERWRITTEN!"
AST_SECTION = "AST SECTION - Can be edited. Changes will be preserved."
COMPILER_SECTION = "COMPILER SECTION - Can be edited. Changes will be preserved."
END_SECTIONS_MARKER = "END OF PYDSL-SECTIONS"
# DELIMITER = "\n\n### DON'T EDIT OR REMOVE THIS LINE ###\n\n"
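# Layout of a generated compiler script (sketch): run_compiler() below writes
# the sections in this order, each preceded by a SECTION_MARKER block:
#
#     <intro>
#     SYMBOLS SECTION   - import block for the required parser/AST symbols
#     SCANNER SECTION   - scanner skeleton (editable, preserved on re-run)
#     PARSER SECTION    - generated grammar class (overwritten on re-run!)
#     AST SECTION       - AST transformation table skeleton (preserved)
#     COMPILER SECTION  - compiler class skeleton (preserved)
#     END OF PYDSL-SECTIONS
#     <outro>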
def is_python_code(text_or_file):
"""Checks whether 'text_or_file' is python code or the name of a file that
contains python code.
"""
if text_or_file.find('\n') < 0:
return text_or_file[-3:].lower() == '.py'
try:
compile(text_or_file, '<string>', 'exec')
return True
except (SyntaxError, ValueError, OverflowError):
pass
return False
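# Usage sketch (hypothetical file names): single-line arguments are treated
# as file names and judged by their extension; multi-line arguments are
# test-compiled as Python:
#
#     is_python_code('my_compiler.py')        # True  (file name ending in .py)
#     is_python_code('grammar.ebnf')          # False (file name, no .py ending)
#     is_python_code('x = 1\ny = x + 1\n')    # True  (compiles as Python)
#     is_python_code('term = { factor }+\n')  # False (raises SyntaxError)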
class GrammarError(Exception):
    """Raised when the grammar of a domain specific language (DSL)
    itself contains errors, i.e. when compilation fails already at
    the grammar stage.
    """
def __init__(self, error_messages, grammar_src):
self.error_messages = error_messages
self.grammar_src = grammar_src
class CompilationError(Exception):
"""Raised when a string or file in a domain specific language (DSL)
contains errors.
"""
def __init__(self, error_messages, dsl_text, dsl_grammar, AST):
self.error_messages = error_messages
self.dsl_text = dsl_text
self.dsl_grammar = dsl_grammar
self.AST = AST
def __str__(self):
return self.error_messages
def compile_python_object(python_src, obj_name_ending="Grammar"):
"""Compiles the python source code and returns the object the name of which
ends with `obj_name_ending`.
"""
code = compile(python_src, '<string>', 'exec')
module_vars = globals()
allowed_symbols = PARSER_SYMBOLS | AST_SYMBOLS | COMPILER_SYMBOLS
namespace = {k: module_vars[k] for k in allowed_symbols}
exec(code, namespace) # safety risk?
for key in namespace.keys():
if key.endswith(obj_name_ending):
parser = namespace[key]
break
else:
parser = None
return parser
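# Usage sketch (the class name 'ArithmeticGrammar' is hypothetical, and it is
# assumed that GrammarBase is listed in PARSER_SYMBOLS; the compiled source
# may only use names from the PARSER/AST/COMPILER symbol sets plus builtins):
#
#     src = "class ArithmeticGrammar(GrammarBase):\n    root__ = None\n"
#     grammar_class = compile_python_object(src, 'Grammar')
#     assert grammar_class.__name__ == 'ArithmeticGrammar'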
def get_grammar_instance(grammar):
"""Returns a grammar object and the source code of the grammar, from
the given `grammar`-data which can be either a file name, ebnf-code,
python-code, a GrammarBase-derived grammar class or an instance of
such a class (i.e. a grammar object already).
"""
if isinstance(grammar, str):
# read grammar
grammar_src = load_if_file(grammar)
if is_python_code(grammar):
parser_py, errors, AST = grammar_src, '', None
else:
parser_py, errors, AST = full_compilation(grammar_src,
EBNFGrammar(), EBNFTransTable, EBNFCompiler())
if errors:
raise GrammarError(errors, grammar_src)
parser_root = compile_python_object(parser_py, 'Grammar')()
    else:
        # assume that `grammar` is a grammar object or a grammar class
grammar_src = ''
if isinstance(grammar, GrammarBase):
parser_root = grammar
else:
# assume `grammar` is a grammar class and get the root object
parser_root = grammar()
return parser_root, grammar_src
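# Usage sketch (file and class names hypothetical) - all of the following
# return a parser root object plus the grammar source (where available):
#
#     parser_root, src = get_grammar_instance("arithmetic.ebnf")    # EBNF file
#     parser_root, src = get_grammar_instance(ArithmeticGrammar)    # grammar class
#     parser_root, src = get_grammar_instance(ArithmeticGrammar())  # grammar object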
def load_compiler_suite(compiler_suite):
    """Extracts a scanner function, a parser object, an AST
    transformation table and a compiler class from a compiler suite,
    which may be given either as a Python script (divided by section
    marker blocks) or as an EBNF grammar.
    """
    global RX_SECTION_MARKER
    assert isinstance(compiler_suite, str)
source = load_if_file(compiler_suite)
if is_python_code(compiler_suite):
try:
intro, syms, scanner_py, parser_py, ast_py, compiler_py, outro = \
RX_SECTION_MARKER.split(source)
except ValueError as error:
raise ValueError('File "' + compiler_suite + '" seems to be corrupted. '
'Please delete or repair file manually.')
scanner = compile_python_object(scanner_py, 'Scanner')
ast = compile_python_object(ast_py, 'TransTable')
compiler = compile_python_object(compiler_py, 'Compiler')
else:
# assume source is an ebnf grammar
parser_py, errors, AST = full_compilation(
source, EBNFGrammar(), EBNFTransTable, EBNFCompiler())
if errors:
raise GrammarError(errors, source)
scanner = nil_scanner
ast = EBNFTransTable
        compiler = EBNFCompiler  # the class, consistent with the Python-suite branch above
parser = compile_python_object(parser_py, 'Grammar')()
return scanner, parser, ast, compiler
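# Usage sketch ("arithmetic_compiler.py" is a hypothetical compiler suite
# previously generated by run_compiler()):
#
#     scanner, parser, trans, cclass = load_compiler_suite("arithmetic_compiler.py")
#     result = compileDSL("2 + 3 * 4", parser, trans, cclass(), scanner)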
def compileDSL(text_or_file, dsl_grammar, trans_table, compiler,
scanner=nil_scanner):
"""Compiles a text in a domain specific language (DSL) with an
EBNF-specified grammar. Returns the compiled text.
"""
assert isinstance(text_or_file, str)
assert isinstance(compiler, CompilerBase)
assert isinstance(trans_table, dict)
parser_root, grammar_src = get_grammar_instance(dsl_grammar)
src = scanner(load_if_file(text_or_file))
result, errors, AST = full_compilation(src, parser_root, trans_table,
compiler)
if errors: raise CompilationError(errors, src, grammar_src, AST)
return result
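# Usage sketch (grammar file, transformation table and compiler class are
# hypothetical; the scanner argument defaults to nil_scanner):
#
#     result = compileDSL("2 + 3 * 4",           # DSL text or file name
#                         "arithmetic.ebnf",     # grammar: file, EBNF or Python code
#                         ArithmeticTransTable,  # AST transformation table (dict)
#                         ArithmeticCompiler())  # compiler instance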
def run_compiler(source_file, compiler_suite="", extension=".xml"):
    """Compiles a source file with a given compiler suite and writes
    the result to a file.

    If no ``compiler_suite`` is given, it is assumed that the source
    file is an EBNF grammar. In this case the result will be a Python
    script containing a parser for that grammar as well as skeletons
    for a scanner, an AST transformation table, and a compiler. If the
    Python script already exists, only the parser section of the
    script will be overwritten, while the other sections are preserved.
    (For this to work, the different sections need to be delimited by
    section marker blocks.) `run_compiler()` returns a list of error
    messages or an empty list if no errors occurred.
    """
    def import_block(module, symbols):
        """Generates a Python ``import`` statement that imports all
        symbols in ``symbols`` (a set or other container) from
        module ``module``."""
        symlist = list(symbols)
        grouped = [symlist[i:i + 4] for i in range(0, len(symlist), 4)]
        return ("\nfrom " + module + " import "
                + ', \\\n    '.join(', '.join(g) for g in grouped) + '\n\n')
filepath = os.path.normpath(source_file)
with open(source_file, encoding="utf-8") as f:
source = f.read()
rootname = os.path.splitext(filepath)[0]
if compiler_suite:
scanner, parser, trans, cclass = load_compiler_suite(compiler_suite)
compiler = cclass()
else:
scanner = nil_scanner
parser = EBNFGrammar()
trans = EBNFTransTable
compiler = EBNFCompiler(os.path.basename(rootname), source)
result, errors, ast = full_compilation(scanner(source), parser,
trans, compiler)
if errors:
return errors
elif trans == EBNFTransTable: # either an EBNF- or no compiler suite given
f = None
global SECTION_MARKER, RX_SECTION_MARKER, SCANNER_SECTION, PARSER_SECTION, \
AST_SECTION, COMPILER_SECTION, END_SECTIONS_MARKER
try:
f = open(rootname + '_compiler.py', 'r', encoding="utf-8")
source = f.read()
intro, syms, scanner, parser, ast, compiler, outro = RX_SECTION_MARKER.split(source)
except (PermissionError, FileNotFoundError, IOError) as error:
intro, outro = '', ''
syms = import_block("PyDSL", PARSER_SYMBOLS | AST_SYMBOLS | {'CompilerBase'})
scanner = compiler.gen_scanner_skeleton()
ast = compiler.gen_AST_skeleton()
compiler = compiler.gen_compiler_skeleton()
except ValueError as error:
raise ValueError('File "' + rootname + '_compiler.py" seems to be corrupted. '
'Please delete or repair file manually!')
finally:
if f: f.close()
try:
f = open(rootname + '_compiler.py', 'w', encoding="utf-8")
f.write(intro)
f.write(SECTION_MARKER.format(marker=SYMBOLS_SECTION))
f.write(syms)
f.write(SECTION_MARKER.format(marker=SCANNER_SECTION))
f.write(scanner)
f.write(SECTION_MARKER.format(marker=PARSER_SECTION))
f.write(result)
f.write(SECTION_MARKER.format(marker=AST_SECTION))
f.write(ast)
f.write(SECTION_MARKER.format(marker=COMPILER_SECTION))
f.write(compiler)
f.write(SECTION_MARKER.format(marker=END_SECTIONS_MARKER))
f.write(outro)
except (PermissionError, FileNotFoundError, IOError) as error:
            print('# Could not write file "' + rootname + '_compiler.py" because of: '
                  + "\n# ".join(str(error).split('\n')))
print(result)
finally:
if f: f.close()
    else:
        f = None
        try:
            f = open(rootname + extension, 'w', encoding="utf-8")
if isinstance(result, Node):
f.write(result.as_xml())
else:
f.write(result)
        except (PermissionError, FileNotFoundError, IOError) as error:
            print('# Could not write file "' + rootname + extension + '" because of: '
                  + "\n# ".join(str(error).split('\n')))
print(result)
finally:
if f: f.close()
if LOGGING:
print(ast)
return []
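# Typical two-step usage (sketch, file names hypothetical): first generate a
# compiler suite from an EBNF grammar, then compile DSL sources with it:
#
#     errors = run_compiler("arithmetic.ebnf")   # writes arithmetic_compiler.py
#     if not errors:
#         errors = run_compiler("source.dsl", "arithmetic_compiler.py")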
def source_changed(grammar_source, grammar_class):
"""Returns `True` if `grammar_class` does not reflect the latest
changes of `grammar_source`
Parameters:
grammar_source: File name or string representation of the
grammar source
grammar_class: the parser class representing the grammar
or the file name of a compiler suite containing the grammar
Returns (bool):
True, if the source text of the grammar is different from the
source from which the grammar class was generated
"""
grammar = load_if_file(grammar_source)
chksum = md5(grammar, __version__)
if isinstance(grammar_class, str):
# grammar_class = load_compiler_suite(grammar_class)[1]
with open(grammar_class, 'r', encoding='utf8') as f:
pycode = f.read()
        m = re.search(r'class \w*\(GrammarBase\)', pycode)
if m:
m = re.search(' source_hash__ *= *"([a-z0-9]*)"',
pycode[m.span()[1]:])
return not (m and m.groups() and m.groups()[-1] == chksum)
else:
return True
else:
return chksum != grammar_class.source_hash__
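# Usage sketch (file names hypothetical): regenerate the compiler suite only
# if the grammar's source has changed since the last compilation:
#
#     if source_changed("arithmetic.ebnf", "arithmetic_compiler.py"):
#         run_compiler("arithmetic.ebnf")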
#!/usr/bin/python3
"""EBNFcompiler.py - EBNF -> Python-Parser compilation for DHParser
Copyright 2016 by Eckhart Arnold (arnold@badw.de)
Bavarian Academy of Sciences and Humanities (badw.de)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
"""
import collections
import hashlib
import keyword
from functools import partial
try:
import regex as re
except ImportError:
import re
from parser import mixin_comment, RE, Token, Required, NegativeLookahead, Optional, ZeroOrMore, \
Sequence, Alternative, Forward, OneOrMore, GrammarBase, CompilerBase, escape_re, \
sane_parser_name
from syntaxtree import replace_by_single_child, reduce_single_child, remove_expendables, \
flatten, remove_tokens, remove_brackets, TOKEN_KEYWORD, WHITESPACE_KEYWORD, Node
from version import __version__
########################################################################
#
# EBNF-Grammar-Compiler
#
########################################################################
class EBNFGrammar(GrammarBase):
r"""Parser for an EBNF source file, with this grammar:
# EBNF-Grammar in EBNF
@ comment = /#.*(?:\n|$)/ # comments start with '#' and eat all chars up to and including '\n'
@ whitespace = /\s*/ # whitespace includes linefeed
@ literalws = right # trailing whitespace of literals will be ignored tacitly
syntax = [~//] { definition | directive } §EOF
definition = symbol §"=" expression
directive = "@" §symbol §"=" ( regexp | literal | list_ )
expression = term { "|" term }
term = { factor }+
factor = [flowmarker] [retrieveop] symbol !"=" # negative lookahead to be sure it's not a definition
| [flowmarker] literal
| [flowmarker] regexp
| [flowmarker] group
| [flowmarker] oneormore
| repetition
| option
flowmarker = "!" | "&" | "§" | # '!' negative lookahead, '&' positive lookahead, '§' required
"-!" | "-&" # '-' negative lookbehind, '-&' positive lookbehind
retrieveop = "::" | ":" # '::' pop, ':' retrieve
group = "(" expression §")"
option = "[" expression §"]"
oneormore = "{" expression "}+"
repetition = "{" expression §"}"
symbol = /(?!\d)\w+/~ # e.g. expression, factor, parameter_list
literal = /"(?:[^"]|\\")*?"/~ # e.g. "(", '+', 'while'
| /'(?:[^']|\\')*?'/~ # whitespace following literals will be ignored tacitly.
regexp = /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
# '~' is a whitespace-marker, if present leading or trailing
# whitespace of a regular expression will be ignored tacitly.
list_ = /\w+\s*(?:,\s*\w+\s*)*/~ # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
    # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
EOF = !/./
"""
expression = Forward()
source_hash__ = "1065c2e43262a5cb3aa438ec4d347c32"
    parser_initialization__ = "upon instantiation"
wsp__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
wspL__ = ''
wspR__ = wsp__
EOF = NegativeLookahead(RE('.', wR=''))
list_ = RE('\\w+\\s*(?:,\\s*\\w+\\s*)*')
regexp = RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
symbol = RE('(?!\\d)\\w+')
repetition = Sequence(Token("{"), expression, Required(Token("}")))
oneormore = Sequence(Token("{"), expression, Token("}+"))
option = Sequence(Token("["), expression, Required(Token("]")))
group = Sequence(Token("("), expression, Required(Token(")")))
retrieveop = Alternative(Token("::"), Token(":"))
flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
factor = Alternative(Sequence(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
Sequence(Optional(flowmarker), literal), Sequence(Optional(flowmarker), regexp),
Sequence(Optional(flowmarker), group), Sequence(Optional(flowmarker), oneormore), repetition,
option)
term = OneOrMore(factor)
expression.set(Sequence(term, ZeroOrMore(Sequence(Token("|"), term))))
directive = Sequence(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
definition = Sequence(symbol, Required(Token("=")), expression)
syntax = Sequence(Optional(RE('', wR='', wL=wsp__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
root__ = syntax
EBNFTransTable = {
# AST Transformations for EBNF-grammar
"syntax":
remove_expendables,
"directive, definition":
partial(remove_tokens, tokens={'@', '='}),
"expression, chain":
[replace_by_single_child, flatten,
partial(remove_tokens, tokens={'|', '--'})],
"term":
[replace_by_single_child, flatten], # supports both idioms: "{ factor }+" and "factor { factor }"
"factor, flowmarker, retrieveop":
replace_by_single_child,
"group":
[remove_brackets, replace_by_single_child],
"oneormore, repetition, option":
[reduce_single_child, remove_brackets],
"symbol, literal, regexp, list_":
[remove_expendables, reduce_single_child],
(TOKEN_KEYWORD, WHITESPACE_KEYWORD):
[remove_expendables, reduce_single_child],
"":
[remove_expendables, replace_by_single_child]
}
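# Note: each key of a transformation table names one or more (comma
# separated) node types; the associated value is a single transformation or
# a list of transformations that appear to be applied in order to matching
# nodes of the concrete syntax tree during full_compilation(). The empty
# key "" appears to act as the default for node types not listed explicitly.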
def load_if_file(text_or_file):
"""Reads and returns content of a file if parameter `text_or_file` is a
file name (i.e. a single line string), otherwise (i.e. if `text_or_file` is
a multiline string) returns the content of `text_or_file`.
"""
if text_or_file and text_or_file.find('\n') < 0:
with open(text_or_file, encoding="utf-8") as f:
content = f.read()
return content
else:
return text_or_file
class EBNFCompilerError(Exception):
    """Error raised by the `EBNFCompiler` class. (Not a compilation error
    in the strict sense; for those see `CompilationError` in module
    ``DSLsupport``.)"""
    pass
Scanner = collections.namedtuple('Scanner',
'symbol instantiation_call cls_name cls')
def md5(*txt):
"""Returns the md5-checksum for `txt`. This can be used to test if
some piece of text, for example a grammar source file, has changed.
"""
md5_hash = hashlib.md5()
for t in txt:
md5_hash.update(t.encode('utf8'))
return md5_hash.hexdigest()
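# Usage sketch: md5() fingerprints a grammar source together with the
# DHParser version, so that generated parsers can detect stale grammar
# sources (see source_changed() in module DSLsupport):
#
#     checksum = md5(grammar_source, __version__)  # 32 hex digits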
class EBNFCompiler(CompilerBase):
"""Generates a Parser from an abstract syntax tree of a grammar specified
in EBNF-Notation.
"""
COMMENT_KEYWORD = "COMMENT__"
RESERVED_SYMBOLS = {TOKEN_KEYWORD, WHITESPACE_KEYWORD, COMMENT_KEYWORD}
KNOWN_DIRECTIVES = {'comment', 'whitespace', 'tokens', 'literalws'}
VOWELS = {'A', 'E', 'I', 'O', 'U'} # what about cases like 'hour', 'universe' etc.?
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneous AST transformation."
PREFIX_TABLE = [('§', 'Required'), ('&', 'Lookahead'),
('!', 'NegativeLookahead'), ('-&', 'Lookbehind'),
('-!', 'NegativeLookbehind'), ('::', 'Pop'),
(':', 'Retrieve')]
def __init__(self, grammar_name="", source_text=""):
super(EBNFCompiler, self).__init__()
        assert grammar_name == "" or re.match(r'\w+\Z', grammar_name)
self.grammar_name = grammar_name
self.source_text = load_if_file(source_text)
self._reset()
def _reset(self):
self.rules = set()
self.symbols = set()
self.variables = set()
self.scanner_tokens = set()
self.definition_names = []
self.recursive = set()
self.root = ""
        self.directives = {'whitespace': r'\s*',
'comment': '',
'literalws': ['wR=' + WHITESPACE_KEYWORD]}
    def gen_scanner_skeleton(self):
        name = self.grammar_name + "Scanner"
        return "def %s(text):\n    return text\n" % name
def gen_AST_skeleton(self):
if not self.definition_names:
raise EBNFCompilerError('Compiler has not been run before calling '
'"gen_AST_Skeleton()"!')
        transtable = [self.grammar_name + 'TransTable = {',
                      '    # AST Transformations for the ' +
                      self.grammar_name + '-grammar']
        for name in self.definition_names:
            transtable.append('    "' + name + '": no_transformation,')
        transtable += ['    "": no_transformation', '}', '']
        return '\n'.join(transtable)
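    # For a grammar named "Arithmetic" with the definitions 'expression' and
    # 'term', gen_AST_skeleton() returns (sketch):
    #
    #     ArithmeticTransTable = {
    #         # AST Transformations for the Arithmetic-grammar
    #         "expression": no_transformation,
    #         "term": no_transformation,
    #         "": no_transformation
    #     }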
def gen_compiler_skeleton(self):
if not self.definition_names:
raise EBNFCompilerError('Compiler has not been run before calling '
'"gen_Compiler_Skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(CompilerBase):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '"):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__()',
                    "        assert re.match(r'\\w+\\Z', grammar_name)", '']
for name in self.definition_names:
if name == self.root:
compiler += [' def ' + name + '(self, node):',
' return node', '']
else:
compiler += [' def ' + name + '(self, node):',
' pass', '']
return '\n'.join(compiler + [''])
def gen_parser(self, definitions):
# fix capture of variables that have been defined before usage [sic!]
if self.variables:
for i in range(len(definitions)):
if definitions[i][0] in self.variables:
                    definitions[i] = (definitions[i][0], 'Capture(%s, "%s")' %
                                      (definitions[i][1], definitions[i][0]))
self.definition_names = [defn[0] for defn in definitions]
definitions.append(('wspR__', WHITESPACE_KEYWORD \
if 'right' in self.directives['literalws'] else "''"))
definitions.append(('wspL__', WHITESPACE_KEYWORD \
if 'left' in self.directives['literalws'] else "''"))
definitions.append((WHITESPACE_KEYWORD,
("mixin_comment(whitespace="
"r'{whitespace}', comment=r'{comment}')").
format(**self.directives)))
definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))
# prepare parser class header and docstring and
# add EBNF grammar to the doc string of the parser class
article = 'an ' if self.grammar_name[0:1].upper() \
in EBNFCompiler.VOWELS else 'a '
declarations = ['class ' + self.grammar_name +
'Grammar(GrammarBase):',
'r"""Parser for ' + article + self.grammar_name +
' source file' +
(', with this grammar:' if self.source_text else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.source_text:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.source_text, __version__)))
            declarations.append('')
            declarations += [line for line in self.source_text.split('\n')]
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')
# turn definitions into declarations in reverse order
self.root = definitions[0][0] if definitions else ""
definitions.reverse()
declarations += [symbol + ' = Forward()'
for symbol in sorted(list(self.recursive))]
for symbol, statement in definitions:
if symbol in self.recursive:
declarations += [symbol + '.set(' + statement + ')']
else:
declarations += [symbol + ' = ' + statement]
for nd in self.symbols:
if nd.result not in self.rules:
nd.add_error("Missing production for symbol '%s'" % nd.result)
if self.root and 'root__' not in self.symbols:
declarations.append('root__ = ' + self.root)
declarations.append('')
        return '\n    '.join(declarations)
def syntax(self, node):
self._reset()
definitions = []
# drop the wrapping sequence node
if isinstance(node.parser, Sequence) and \
isinstance(node.result[0].parser, ZeroOrMore):
node = node.result[0]