Commit 4b26772d authored by Eckhart Arnold


- split ParserCombinators.py into different modules: version, logging, syntaxtree, parser, EBNFcompiler, DSLsupport, dhparser
parent 96853fb9
@@ -14,6 +14,8 @@ testdata/*.pdf
*~
*.old
DEBUG*
LOGS*
LOGS/
external_resources/
tmp/
#!/usr/bin/python3
"""DSLsupport.py - Support for domain specific notations for DHParser
Copyright 2016 by Eckhart Arnold (arnold@badw.de)
Bavarian Academy of Sciences and Humanities (badw.de)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
Module ``DSLsupport`` contains various functions to support the
compilation of domain specific languages based on an EBNF-grammar.
"""
import os
try:
import regex as re
except ImportError:
import re
from EBNFcompiler import EBNFGrammar, EBNFCompiler, EBNFTransTable, load_if_file, md5
from logging import LOGGING
from parser import PARSER_SYMBOLS, COMPILER_SYMBOLS, GrammarBase, CompilerBase, \
full_compilation, nil_scanner
from syntaxtree import AST_SYMBOLS, Node
from version import __version__
SECTION_MARKER = """\n
#######################################################################
#
# {marker}
#
#######################################################################
\n"""
RX_SECTION_MARKER = re.compile(SECTION_MARKER.format(marker=r'.*?SECTION.*?'))
SYMBOLS_SECTION = "SYMBOLS SECTION - Can be edited. Changes will be preserved."
SCANNER_SECTION = "SCANNER SECTION - Can be edited. Changes will be preserved."
PARSER_SECTION = "PARSER SECTION - Don't edit! CHANGES WILL BE OVERWRITTEN!"
AST_SECTION = "AST SECTION - Can be edited. Changes will be preserved."
COMPILER_SECTION = "COMPILER SECTION - Can be edited. Changes will be preserved."
END_SECTIONS_MARKER = "END OF PYDSL-SECTIONS"
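# For illustration: SECTION_MARKER.format(marker=SYMBOLS_SECTION) renders a
# banner of the form shown below, and RX_SECTION_MARKER splits a compiler
# suite at any such banner:
#
# #######################################################################
# #
# # SYMBOLS SECTION - Can be edited. Changes will be preserved.
# #
# #######################################################################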
# DELIMITER = "\n\n### DON'T EDIT OR REMOVE THIS LINE ###\n\n"
def is_python_code(text_or_file):
"""Checks whether 'text_or_file' is python code or the name of a file that
contains python code.
"""
if text_or_file.find('\n') < 0:
return text_or_file[-3:].lower() == '.py'
try:
compile(text_or_file, '<string>', 'exec')
return True
except (SyntaxError, ValueError, OverflowError):
pass
return False
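# Behavior sketch (doctest-style; illustrative, not part of the module):
# >>> is_python_code('MyDSL_compiler.py')   # no newline: judged by extension
# True
# >>> is_python_code('answer = 42\n')       # compilable Python source
# True
# >>> is_python_code('word = /\w+/~\n')     # EBNF, not Python
# False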
class GrammarError(Exception):
"""Raised when (already) the grammar of a domain specific language (DSL)
contains errors.
"""
def __init__(self, error_messages, grammar_src):
self.error_messages = error_messages
self.grammar_src = grammar_src
class CompilationError(Exception):
"""Raised when a string or file in a domain specific language (DSL)
contains errors.
"""
def __init__(self, error_messages, dsl_text, dsl_grammar, AST):
self.error_messages = error_messages
self.dsl_text = dsl_text
self.dsl_grammar = dsl_grammar
self.AST = AST
def __str__(self):
return '\n'.join(self.error_messages)
def compile_python_object(python_src, obj_name_ending="Grammar"):
"""Compiles the python source code and returns the object the name of which
ends with `obj_name_ending`.
"""
code = compile(python_src, '<string>', 'exec')
module_vars = globals()
allowed_symbols = PARSER_SYMBOLS | AST_SYMBOLS | COMPILER_SYMBOLS
namespace = {k: module_vars[k] for k in allowed_symbols}
exec(code, namespace) # safety risk?
for key in namespace.keys():
if key.endswith(obj_name_ending):
parser = namespace[key]
break
else:
parser = None
return parser
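# Usage sketch (mirrors the call in get_grammar_instance() below;
# `parser_py` stands for generated parser source code):
# grammar_class = compile_python_object(parser_py, 'Grammar')
# parser_root = grammar_class() if grammar_class else None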
def get_grammar_instance(grammar):
"""Returns a grammar object and the source code of the grammar, from
the given `grammar`-data which can be either a file name, ebnf-code,
python-code, a GrammarBase-derived grammar class or an instance of
such a class (i.e. a grammar object already).
"""
if isinstance(grammar, str):
# read grammar
grammar_src = load_if_file(grammar)
if is_python_code(grammar):
parser_py, errors, AST = grammar_src, '', None
else:
parser_py, errors, AST = full_compilation(grammar_src,
EBNFGrammar(), EBNFTransTable, EBNFCompiler())
if errors:
raise GrammarError(errors, grammar_src)
parser_root = compile_python_object(parser_py, 'Grammar')()
else:
# assume that `grammar` is a GrammarBase object or a grammar class
grammar_src = ''
if isinstance(grammar, GrammarBase):
parser_root = grammar
else:
# assume `grammar` is a grammar class and get the root object
parser_root = grammar()
return parser_root, grammar_src
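# Accepted inputs, as handled above: the name of a grammar file
# ('MyDSL.ebnf' or 'MyDSL.py' - hypothetical names), EBNF or Python
# source text, a GrammarBase subclass, or a grammar object.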
def load_compiler_suite(compiler_suite):
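"""Extracts a scanner, parser, AST transformation table, and compiler
from a compiler suite (a Python file or Python source such as one
generated by ``run_compiler()``), or generates all four from
``compiler_suite`` if it contains an EBNF grammar instead of Python
code. Returns the tuple (scanner, parser, ast, compiler).
"""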
global RX_SECTION_MARKER
assert isinstance(compiler_suite, str)
source = load_if_file(compiler_suite)
if is_python_code(compiler_suite):
try:
intro, syms, scanner_py, parser_py, ast_py, compiler_py, outro = \
RX_SECTION_MARKER.split(source)
except ValueError as error:
raise ValueError('File "' + compiler_suite + '" seems to be corrupted. '
'Please delete or repair file manually.') from error
scanner = compile_python_object(scanner_py, 'Scanner')
ast = compile_python_object(ast_py, 'TransTable')
compiler = compile_python_object(compiler_py, 'Compiler')
else:
# assume source is an ebnf grammar
parser_py, errors, AST = full_compilation(
source, EBNFGrammar(), EBNFTransTable, EBNFCompiler())
if errors:
raise GrammarError(errors, source)
scanner = nil_scanner
ast = EBNFTransTable
compiler = EBNFCompiler()
parser = compile_python_object(parser_py, 'Grammar')()
return scanner, parser, ast, compiler
def compileDSL(text_or_file, dsl_grammar, trans_table, compiler,
scanner=nil_scanner):
"""Compiles a text in a domain specific language (DSL) with an
EBNF-specified grammar. Returns the compiled text.
"""
assert isinstance(text_or_file, str)
assert isinstance(compiler, CompilerBase)
assert isinstance(trans_table, dict)
parser_root, grammar_src = get_grammar_instance(dsl_grammar)
src = scanner(load_if_file(text_or_file))
result, errors, AST = full_compilation(src, parser_root, trans_table,
compiler)
if errors: raise CompilationError(errors, src, grammar_src, AST)
return result
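# Minimal usage sketch (file and class names are hypothetical):
# from MyDSL_compiler import MyDSLTransTable, MyDSLCompiler
# xml = compileDSL('document.dsl', 'MyDSL.ebnf',
#                  MyDSLTransTable, MyDSLCompiler())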
def run_compiler(source_file, compiler_suite="", extension=".xml"):
"""Compiles the a source file with a given compiler and writes the
result to a file.
If no ``compiler_suite`` is given it is assumed that the source
file is an EBNF grammar. In this case the result will be a Python
script containing a parser for that grammar as well as the
skeletons for a scanner, AST transformation table, and compiler.
If the Python script already exists only the parser name in the
script will be updated. (For this to work, the different names
need to be delimited section marker blocks.). `run_compiler()`
returns a list of error messages or an empty list if no errors
occurred.
"""
def import_block(module, symbols):
"""Generates an Python-``import`` statement that imports all
alls symbols in ``symbols`` (set or other container) from
module ``module``."""
symlist = list(symbols)
grouped = [symlist[i:i + 4] for i in range(0, len(symlist), 4)]
return ("\nfrom " + module + " import "
+ ', \\\n '.join(', '.join(g) for g in grouped) + '\n\n')
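# For example, import_block("PyDSL", {'a', 'b', 'c', 'd', 'e'}) yields
# (modulo set ordering):
# from PyDSL import a, b, c, d, \
#     e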
filepath = os.path.normpath(source_file)
with open(source_file, encoding="utf-8") as f:
source = f.read()
rootname = os.path.splitext(filepath)[0]
if compiler_suite:
scanner, parser, trans, cclass = load_compiler_suite(compiler_suite)
compiler = cclass()
else:
scanner = nil_scanner
parser = EBNFGrammar()
trans = EBNFTransTable
compiler = EBNFCompiler(os.path.basename(rootname), source)
result, errors, ast = full_compilation(scanner(source), parser,
trans, compiler)
if errors:
return errors
elif trans == EBNFTransTable: # either an EBNF- or no compiler suite given
f = None
global SECTION_MARKER, RX_SECTION_MARKER, SCANNER_SECTION, PARSER_SECTION, \
AST_SECTION, COMPILER_SECTION, END_SECTIONS_MARKER
try:
f = open(rootname + '_compiler.py', 'r', encoding="utf-8")
source = f.read()
intro, syms, scanner, parser, ast, compiler, outro = RX_SECTION_MARKER.split(source)
except (PermissionError, FileNotFoundError, IOError) as error:
intro, outro = '', ''
syms = import_block("PyDSL", PARSER_SYMBOLS | AST_SYMBOLS | {'CompilerBase'})
scanner = compiler.gen_scanner_skeleton()
ast = compiler.gen_AST_skeleton()
compiler = compiler.gen_compiler_skeleton()
except ValueError as error:
raise ValueError('File "' + rootname + '_compiler.py" seems to be corrupted. '
'Please delete or repair file manually!') from error
finally:
if f: f.close()
try:
f = open(rootname + '_compiler.py', 'w', encoding="utf-8")
f.write(intro)
f.write(SECTION_MARKER.format(marker=SYMBOLS_SECTION))
f.write(syms)
f.write(SECTION_MARKER.format(marker=SCANNER_SECTION))
f.write(scanner)
f.write(SECTION_MARKER.format(marker=PARSER_SECTION))
f.write(result)
f.write(SECTION_MARKER.format(marker=AST_SECTION))
f.write(ast)
f.write(SECTION_MARKER.format(marker=COMPILER_SECTION))
f.write(compiler)
f.write(SECTION_MARKER.format(marker=END_SECTIONS_MARKER))
f.write(outro)
except (PermissionError, FileNotFoundError, IOError) as error:
print('# Could not write file "' + rootname + '_compiler.py" because of: '
+ "\n# ".join(str(error).split('\n')))
print(result)
finally:
if f: f.close()
else:
f = None
try:
f = open(rootname + extension, 'w', encoding="utf-8")
if isinstance(result, Node):
f.write(result.as_xml())
else:
f.write(result)
except (PermissionError, FileNotFoundError, IOError) as error:
print('# Could not write file "' + rootname + extension + '" because of: '
+ "\n# ".join(str(error).split('\n')))
print(result)
finally:
if f: f.close()
if LOGGING:
print(ast)
return []
def source_changed(grammar_source, grammar_class):
"""Returns `True` if `grammar_class` does not reflect the latest
changes of `grammar_source`
Parameters:
grammar_source: File name or string representation of the
grammar source
grammar_class: the parser class representing the grammar
or the file name of a compiler suite containing the grammar
Returns (bool):
True, if the source text of the grammar is different from the
source from which the grammar class was generated
"""
grammar = load_if_file(grammar_source)
chksum = md5(grammar, __version__)
if isinstance(grammar_class, str):
# grammar_class = load_compiler_suite(grammar_class)[1]
with open(grammar_class, 'r', encoding='utf8') as f:
pycode = f.read()
m = re.search(r'class \w*\(GrammarBase\)', pycode)
if m:
m = re.search(' source_hash__ *= *"([a-z0-9]*)"',
pycode[m.span()[1]:])
return not (m and m.groups() and m.groups()[-1] == chksum)
else:
return True
else:
return chksum != grammar_class.source_hash__
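# Usage sketch: recompile only when the grammar source has changed
# (hypothetical names):
# if source_changed('MyDSL.ebnf', MyDSLGrammar):
#     run_compiler('MyDSL.ebnf')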
@@ -56,11 +56,13 @@ https://bitbucket.org/apalala/grako
import collections
import copy
from functools import partial
import hashlib
import keyword
import os
from functools import partial
from typing import NamedTuple
try:
import regex as re
except ImportError:
@@ -70,67 +72,31 @@ import sys
__version__ = '0.5.3' + '_dev' + str(os.stat(__file__).st_mtime)
LOGGING = "LOGS"
DEBUG = "DEBUG"
def DEBUG_DIR():
"""Returns a path of a directory where debug files will be stored.
Usually, this is just a sub-directory named 'DEBUG'. The directory
def LOGS_DIR():
"""Returns a path of a directory where log files will be stored.
Usually, this is just a sub-directory named 'LOGS'. The directory
will be created if it does not exist.
"""
global DEBUG
if not DEBUG:
raise AssertionError("Cannot use DEBUG_DIR() if debugging is turned off!")
dirname = DEBUG
if os.path.exists(DEBUG):
if not os.path.isdir(DEBUG):
raise IOError('"' + DEBUG + '" cannot be used as debug directory, '
global LOGGING
if not LOGGING:
raise AssertionError("Cannot use LOGGING_DIR() if LOGGINGging is turned off!")
dirname = LOGGING
if os.path.exists(LOGGING):
if not os.path.isdir(LOGGING):
raise IOError('"' + LOGGING + '" cannot be used as log directory, '
'because it is not a directory!')
else:
os.mkdir(DEBUG)
os.mkdir(LOGGING)
return dirname
def DEBUG_FILE_NAME(grammar_base):
"""Returns a file name without extension based on the class name of
the ``grammar_base``-object.
"""
name = grammar_base.__class__.__name__
return name[:-7] if name.endswith('Grammar') else name
########################################################################
#
# Scanner / Preprocessor support
#
########################################################################
RX_SCANNER_TOKEN = re.compile('\w+')
BEGIN_SCANNER_TOKEN = '\x1b'
END_SCANNER_TOKEN = '\x1c'
def make_token(token, argument=''):
"""Turns the ``token`` and ``argument`` into a special token that
will be caught by the `ScannerToken`-parser.
"""
assert RX_SCANNER_TOKEN.match(token)
assert argument.find(BEGIN_SCANNER_TOKEN) < 0
assert argument.find(END_SCANNER_TOKEN) < 0
return BEGIN_SCANNER_TOKEN + token + argument + END_SCANNER_TOKEN
nil_scanner = lambda text: text
########################################################################
#
# Parser tree
# syntax tree
#
########################################################################
@@ -223,7 +189,7 @@ class Node:
self.error_flag = any(r.error_flag for r in self.result) if self.children else False
self._len = len(self.result) if not self.children else \
sum(child._len for child in self.children)
# self.pos = 0
# self.pos = 0 # continuous updating of pos values
self._pos = -1
def __str__(self):
@@ -386,6 +352,13 @@ class Node:
return errors
return []
def log(self, log_file_name, ext):
global LOGGING
if LOGGING:
st_file_name = log_file_name + ext
with open(os.path.join(LOGS_DIR(), st_file_name), "w", encoding="utf-8") as f:
f.write(self.as_sexpr())
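# Illustration: node.log('MyDSL', '.cst') - 'MyDSL' is a hypothetical
# name - writes the node's S-expression to LOGS/MyDSL.cst, provided
# LOGGING is set.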
def navigate(self, path):
"""EXPERIMENTAL! NOT YET TESTED!!!
Returns the first descendant element matched by `path`, e.g.
@@ -433,19 +406,11 @@ def error_messages(text, errors):
########################################################################
#
# Abstract syntax tree support
# syntax tree transformation
#
########################################################################
def DEBUG_DUMP_SYNTAX_TREE(grammar_base, syntax_tree, ext):
global DEBUG
if DEBUG:
st_file_name = DEBUG_FILE_NAME(grammar_base) + ext
with open(os.path.join(DEBUG_DIR(), st_file_name), "w", encoding="utf-8") as f:
f.write(syntax_tree.as_sexpr())
def expand_table(compact_table):
"""Expands a table by separating keywords that are tuples or strings
containing comma separated words into single keyword entries with
@@ -637,8 +602,8 @@ LEFT_RECURSION_DEPTH = 10 # because of Python's recursion depth limit, this
# value ought not to be set too high
MAX_DROPOUTS = 25 # stop trying to recover parsing after so many errors
WHITESPACE_KEYWORD = 'wsp__'
TOKEN_KEYWORD = 'token__'
WHITESPACE_KEYWORD = 'WSP__'
TOKEN_KEYWORD = 'TOKEN__'
class HistoryRecord:
@@ -816,7 +781,9 @@ class GrammarBase:
def __init__(self):
self.all_parsers = set()
self.dirty_flag = False
self.track_history = DEBUG
self.track_history = LOGGING
name = self.__class__.__name__
self.log_file_name = name[:-7] if name.lower().endswith('grammar') else name
self._reset()
self._assign_parser_names()
self.root__ = copy.deepcopy(self.__class__.root__)
@@ -834,6 +801,7 @@ class GrammarBase:
def _reset(self):
self.variables = dict() # support for Pop and Retrieve operators
self.document = "" # source document
self.last_node = None
self.call_stack = [] # support for call stack tracing
self.history = [] # snapshots of call stacks
@@ -863,6 +831,7 @@ class GrammarBase:
parser.reset()
else:
self.dirty_flag = True
self.document = document
parser = self.root__
result = ""
stitches = []
@@ -889,34 +858,39 @@ class GrammarBase:
result.pos = 0 # calculate all positions
return result
def log_parsing_history(self):
"""Writes a log of the parsing history of the most recently parsed
document.
"""
def prepare_line(record):
excerpt = self.document.__getitem__(slice(*record.extent))[:25].replace('\n', '\\n')
excerpt = "'%s'" % excerpt if len(excerpt) < 25 else "'%s...'" % excerpt
return (record.stack, record.status, excerpt)
def write_log(history, log_name):
path = os.path.join(LOGS_DIR(), self.log_file_name + log_name + "_parser.log")
if history:
with open(path, "w", encoding="utf-8") as f:
f.write("\n".join(history))
elif os.path.exists(path):
os.remove(path)
global LOGGING
if LOGGING:
assert self.history
full_history, match_history, errors_only = [], [], []
for record in self.history:
line = "; ".join(prepare_line(record))
full_history.append(line)
if record.node and record.node.parser.name != WHITESPACE_KEYWORD:
match_history.append(line)
if record.node.errors:
errors_only.append(line)
write_log(full_history, '_full')
write_log(match_history, '_match')
write_log(errors_only, '_errors')
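# For a grammar class named 'ArithmeticGrammar' (hypothetical), this
# writes LOGS/Arithmetic_full_parser.log, LOGS/Arithmetic_match_parser.log
# and LOGS/Arithmetic_errors_parser.log.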
def DEBUG_DUMP_PARSING_HISTORY(grammar_base, document):
def prepare_line(record):
excerpt = document.__getitem__(slice(*record.extent))[:25].replace('\n', '\\n')
excerpt = "'%s'" % excerpt if len(excerpt) < 25 else "'%s...'" % excerpt
return (record.stack, record.status, excerpt)
def write_log(history, log_name):
path = os.path.join(DEBUG_DIR(), DEBUG_FILE_NAME(grammar_base) + log_name + "_parser.log")
if history:
with open(path, "w", encoding="utf-8") as f:
f.write("\n".join(history))
elif os.path.exists(path):
os.remove(path)
global DEBUG
if DEBUG:
full_history, match_history, errors_only = [], [], []
for record in grammar_base.history:
line = "; ".join(prepare_line(record))
full_history.append(line)
if record.node and record.node.parser.name != WHITESPACE_KEYWORD:
match_history.append(line)
if record.node.errors:
errors_only.append(line)
write_log(full_history, '_full')
write_log(match_history, '_match')
write_log(errors_only, '_errors')
########################################################################
@@ -926,6 +900,29 @@ def DEBUG_DUMP_PARSING_HISTORY(grammar_base, document):
########################################################################
RX_SCANNER_TOKEN = re.compile(r'\w+')
BEGIN_SCANNER_TOKEN = '\x1b'
END_SCANNER_TOKEN = '\x1c'
def make_token(token, argument=''):
"""Turns the ``token`` and ``argument`` into a special token that
will be caught by the `ScannerToken`-parser.
This function is a support function that should be used by scanners
to inject scanner tokens into the source text.
"""
assert RX_SCANNER_TOKEN.match(token)
assert argument.find(BEGIN_SCANNER_TOKEN) < 0
assert argument.find(END_SCANNER_TOKEN) < 0
return BEGIN_SCANNER_TOKEN + token + argument + END_SCANNER_TOKEN
nil_scanner = lambda text: text
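# Behavior sketch: make_token('REGION', 'head') returns the string
# '\x1bREGIONhead\x1c', which the ScannerToken parser below recognizes;
# nil_scanner passes the source text through unchanged.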
class ScannerToken(Parser):
def __init__(self, scanner_token):
assert isinstance(scanner_token, str) and scanner_token and \
@@ -1139,8 +1136,6 @@ class Sequence(NaryOperator):
def __init__(self, *parsers, name=None):
super(Sequence, self).__init__(*parsers, name=name)
assert len(self.parsers) >= 1
# commented, because sequences can be empty:
# assert not all(isinstance(p, Optional) for p in self.parsers)
def __call__(self, text):
results = ()
@@ -1410,8 +1405,8 @@ def full_compilation(source, grammar_base, AST_transformations, compiler):
assert isinstance(compiler, CompilerBase)
syntax_tree = grammar_base.parse(source)
DEBUG_DUMP_SYNTAX_TREE(grammar_base, syntax_tree, ext='.cst')
DEBUG_DUMP_PARSING_HISTORY(grammar_base, source)
syntax_tree.log(grammar_base.log_file_name, ext='.cst')
grammar_base.log_parsing_history()
assert syntax_tree.error_flag or str(syntax_tree) == source, str(syntax_tree)
# only compile if there were no syntax errors, for otherwise it is
@@ -1420,7 +1415,7 @@ def full_compilation(source, grammar_base, AST_transformations, compiler):
result = None
else: