Commit 4b26772d authored by Eckhart Arnold

- split ParserCombinators.py into different modules: version, logging, syntaxtree, parser, EBNFcompiler, DSLsupport, dhparser

parent 96853fb9
@@ -14,6 +14,8 @@ testdata/*.pdf
 *~
 *.old
 DEBUG*
+LOGS*
+LOGS/
 external_resources/
 tmp/
#!/usr/bin/python3

"""DSLsupport.py - Support for domain specific notations for DHParser

Copyright 2016 by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.

Module ``DSLsupport`` contains various functions to support the
compilation of domain specific languages based on an EBNF-grammar.
"""
import os

try:
    import regex as re
except ImportError:
    import re

from EBNFcompiler import EBNFGrammar, EBNFCompiler, EBNFTransTable, load_if_file, md5
from logging import LOGGING
from parser import PARSER_SYMBOLS, COMPILER_SYMBOLS, GrammarBase, CompilerBase, \
    full_compilation, nil_scanner
from syntaxtree import AST_SYMBOLS, Node
from version import __version__
SECTION_MARKER = """\n
#######################################################################
#
# {marker}
#
#######################################################################
\n"""
RX_SECTION_MARKER = re.compile(SECTION_MARKER.format(marker=r'.*?SECTION.*?'))
SYMBOLS_SECTION = "SYMBOLS SECTION - Can be edited. Changes will be preserved."
SCANNER_SECTION = "SCANNER SECTION - Can be edited. Changes will be preserved."
PARSER_SECTION = "PARSER SECTION - Don't edit! CHANGES WILL BE OVERWRITTEN!"
AST_SECTION = "AST SECTION - Can be edited. Changes will be preserved."
COMPILER_SECTION = "COMPILER SECTION - Can be edited. Changes will be preserved."
END_SECTIONS_MARKER = "END OF PYDSL-SECTIONS"
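
# A compiler suite generated by `run_compiler()` (defined below) is divided
# by SECTION_MARKER blocks into five sections in this order: symbols,
# scanner, parser, AST, compiler, followed by the end marker. Sketch of the
# resulting file layout (content abridged):
#
#   <intro>
#   ### SYMBOLS SECTION - Can be edited. Changes will be preserved. ###
#   <import block>
#   ### SCANNER SECTION - Can be edited. Changes will be preserved. ###
#   <scanner skeleton>
#   ### PARSER SECTION - Don't edit! CHANGES WILL BE OVERWRITTEN! ###
#   <generated parser - regenerated on each recompilation>
#   ### AST SECTION - Can be edited. Changes will be preserved. ###
#   <AST transformation table skeleton>
#   ### COMPILER SECTION - Can be edited. Changes will be preserved. ###
#   <compiler skeleton>
#   ### END OF PYDSL-SECTIONS ###
#   <outro>
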
# DELIMITER = "\n\n### DON'T EDIT OR REMOVE THIS LINE ###\n\n"
def is_python_code(text_or_file):
    """Checks whether 'text_or_file' is python code or the name of a file that
    contains python code.
    """
    if text_or_file.find('\n') < 0:
        return text_or_file[-3:].lower() == '.py'
    try:
        compile(text_or_file, '<string>', 'exec')
        return True
    except (SyntaxError, ValueError, OverflowError):
        pass
    return False
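
# Usage sketch (file names hypothetical):
#
#   is_python_code('arithmetic_compiler.py')   # True: single line ending in '.py'
#   is_python_code('arithmetic.ebnf')          # False: single line, no '.py' suffix
#   is_python_code('x = 1\ny = x + 1\n')       # True: compiles as python source
#   is_python_code('term = factor\nterm = { "*" factor }\n')  # False: SyntaxError
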
class GrammarError(Exception):
    """Raised when the grammar of a domain specific language (DSL)
    itself contains errors, i.e. already before any document is compiled.
    """

    def __init__(self, error_messages, grammar_src):
        self.error_messages = error_messages
        self.grammar_src = grammar_src


class CompilationError(Exception):
    """Raised when a string or file in a domain specific language (DSL)
    contains errors.
    """

    def __init__(self, error_messages, dsl_text, dsl_grammar, AST):
        self.error_messages = error_messages
        self.dsl_text = dsl_text
        self.dsl_grammar = dsl_grammar
        self.AST = AST

    def __str__(self):
        return self.error_messages
def compile_python_object(python_src, obj_name_ending="Grammar"):
    """Compiles the python source code and returns the object the name of
    which ends with `obj_name_ending`.
    """
    code = compile(python_src, '<string>', 'exec')
    module_vars = globals()
    allowed_symbols = PARSER_SYMBOLS | AST_SYMBOLS | COMPILER_SYMBOLS
    namespace = {k: module_vars[k] for k in allowed_symbols}
    exec(code, namespace)  # safety risk?
    for key in namespace.keys():
        if key.endswith(obj_name_ending):
            parser = namespace[key]
            break
    else:
        parser = None
    return parser
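
# Usage sketch: given generated parser source code `parser_py` (e.g. as
# produced by the EBNF compiler), retrieve and instantiate the grammar
# class that it defines:
#
#   grammar_class = compile_python_object(parser_py, 'Grammar')
#   parser_root = grammar_class()
#
# The first name (in definition order) ending in `obj_name_ending` wins;
# if no name in the compiled namespace matches, None is returned.
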
def get_grammar_instance(grammar):
    """Returns a grammar object and the source code of the grammar, from
    the given `grammar`-data which can be either a file name, ebnf-code,
    python-code, a GrammarBase-derived grammar class or an instance of
    such a class (i.e. a grammar object already).
    """
    if isinstance(grammar, str):
        # read grammar
        grammar_src = load_if_file(grammar)
        if is_python_code(grammar):
            parser_py, errors, AST = grammar_src, '', None
        else:
            parser_py, errors, AST = full_compilation(grammar_src,
                EBNFGrammar(), EBNFTransTable, EBNFCompiler())
        if errors:
            raise GrammarError(errors, grammar_src)
        parser_root = compile_python_object(parser_py, 'Grammar')()
    else:
        # assume that dsl_grammar is a ParserHQ-object or Grammar class
        grammar_src = ''
        if isinstance(grammar, GrammarBase):
            parser_root = grammar
        else:
            # assume `grammar` is a grammar class and get the root object
            parser_root = grammar()
    return parser_root, grammar_src
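
# The accepted forms of the `grammar` argument, sketched with a hypothetical
# `ArithmeticGrammar` class (python source defining a grammar class is also
# accepted via the is_python_code() branch):
#
#   get_grammar_instance('arithmetic.ebnf')    # file containing EBNF code
#   get_grammar_instance(ebnf_source_string)   # EBNF code as a string
#   get_grammar_instance(ArithmeticGrammar)    # a GrammarBase subclass
#   get_grammar_instance(ArithmeticGrammar())  # a grammar object
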
def load_compiler_suite(compiler_suite):
    """Extracts a scanner, parser, AST transformation table and compiler
    from a compiler suite, which can be given as a python file or as an
    EBNF grammar (file or string).
    """
    global RX_SECTION_MARKER
    assert isinstance(compiler_suite, str)
    source = load_if_file(compiler_suite)
    if is_python_code(compiler_suite):
        try:
            intro, syms, scanner_py, parser_py, ast_py, compiler_py, outro = \
                RX_SECTION_MARKER.split(source)
        except ValueError as error:
            raise ValueError('File "' + compiler_suite + '" seems to be corrupted. '
                             'Please delete or repair file manually.') from error
        scanner = compile_python_object(scanner_py, 'Scanner')
        ast = compile_python_object(ast_py, 'TransTable')
        compiler = compile_python_object(compiler_py, 'Compiler')
    else:
        # assume source is an ebnf grammar
        parser_py, errors, AST = full_compilation(
            source, EBNFGrammar(), EBNFTransTable, EBNFCompiler())
        if errors:
            raise GrammarError(errors, source)
        scanner = nil_scanner
        ast = EBNFTransTable
        compiler = EBNFCompiler()
    parser = compile_python_object(parser_py, 'Grammar')()
    return scanner, parser, ast, compiler
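
# Usage sketch (file name hypothetical): the returned components can be fed
# directly to compileDSL(), defined below:
#
#   scanner, parser, trans, compiler = load_compiler_suite('arithmetic_compiler.py')
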
def compileDSL(text_or_file, dsl_grammar, trans_table, compiler,
               scanner=nil_scanner):
    """Compiles a text in a domain specific language (DSL) with an
    EBNF-specified grammar. Returns the compiled text.
    """
    assert isinstance(text_or_file, str)
    assert isinstance(compiler, CompilerBase)
    assert isinstance(trans_table, dict)
    parser_root, grammar_src = get_grammar_instance(dsl_grammar)
    src = scanner(load_if_file(text_or_file))
    result, errors, AST = full_compilation(src, parser_root, trans_table,
                                           compiler)
    if errors:
        raise CompilationError(errors, src, grammar_src, AST)
    return result
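
# Usage sketch (file names hypothetical), tying the pieces together:
#
#   # compile an EBNF grammar to python parser source code
#   parser_py = compileDSL(grammar_src, EBNFGrammar(), EBNFTransTable,
#                          EBNFCompiler())
#
#   # compile a DSL document with a previously generated compiler suite
#   scanner, parser, trans, cclass = load_compiler_suite('arithmetic_compiler.py')
#   result = compileDSL('document.dsl', parser, trans, cclass(), scanner)
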
def run_compiler(source_file, compiler_suite="", extension=".xml"):
    """Compiles a source file with a given compiler and writes the
    result to a file.

    If no ``compiler_suite`` is given it is assumed that the source
    file is an EBNF grammar. In this case the result will be a Python
    script containing a parser for that grammar as well as the
    skeletons for a scanner, AST transformation table, and compiler.
    If the Python script already exists, only the parser section of the
    script will be updated. (For this to work, the different parts need
    to be delimited by section marker blocks.) `run_compiler()` returns
    a list of error messages or an empty list if no errors occurred.
    """
    def import_block(module, symbols):
        """Generates a Python ``import`` statement that imports all
        symbols in ``symbols`` (set or other container) from the
        module ``module``."""
        symlist = list(symbols)
        grouped = [symlist[i:i + 4] for i in range(0, len(symlist), 4)]
        return ("\nfrom " + module + " import "
                + ', \\\n    '.join(', '.join(g) for g in grouped) + '\n\n')
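
    # For a hypothetical call import_block("PyDSL", {'a', 'b', 'c', 'd', 'e'})
    # this yields (symbol order depends on set iteration):
    #
    #   from PyDSL import a, b, c, d, \
    #       e
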
    filepath = os.path.normpath(source_file)
    with open(source_file, encoding="utf-8") as f:
        source = f.read()
    rootname = os.path.splitext(filepath)[0]
    if compiler_suite:
        scanner, parser, trans, cclass = load_compiler_suite(compiler_suite)
        compiler = cclass()
    else:
        scanner = nil_scanner
        parser = EBNFGrammar()
        trans = EBNFTransTable
        compiler = EBNFCompiler(os.path.basename(rootname), source)
    result, errors, ast = full_compilation(scanner(source), parser,
                                           trans, compiler)
    if errors:
        return errors

    elif trans == EBNFTransTable:  # either an EBNF- or no compiler suite given
        f = None
        global SECTION_MARKER, RX_SECTION_MARKER, SCANNER_SECTION, PARSER_SECTION, \
            AST_SECTION, COMPILER_SECTION, END_SECTIONS_MARKER
        try:
            f = open(rootname + '_compiler.py', 'r', encoding="utf-8")
            source = f.read()
            intro, syms, scanner, parser, ast, compiler, outro = RX_SECTION_MARKER.split(source)
        except (PermissionError, FileNotFoundError, IOError) as error:
            intro, outro = '', ''
            syms = import_block("PyDSL", PARSER_SYMBOLS | AST_SYMBOLS | {'CompilerBase'})
            scanner = compiler.gen_scanner_skeleton()
            ast = compiler.gen_AST_skeleton()
            compiler = compiler.gen_compiler_skeleton()
        except ValueError as error:
            raise ValueError('File "' + rootname + '_compiler.py" seems to be corrupted. '
                             'Please delete or repair file manually!') from error
        finally:
            if f:
                f.close()
        try:
            f = open(rootname + '_compiler.py', 'w', encoding="utf-8")
            f.write(intro)
            f.write(SECTION_MARKER.format(marker=SYMBOLS_SECTION))
            f.write(syms)
            f.write(SECTION_MARKER.format(marker=SCANNER_SECTION))
            f.write(scanner)
            f.write(SECTION_MARKER.format(marker=PARSER_SECTION))
            f.write(result)
            f.write(SECTION_MARKER.format(marker=AST_SECTION))
            f.write(ast)
            f.write(SECTION_MARKER.format(marker=COMPILER_SECTION))
            f.write(compiler)
            f.write(SECTION_MARKER.format(marker=END_SECTIONS_MARKER))
            f.write(outro)
        except (PermissionError, FileNotFoundError, IOError) as error:
            print('# Could not write file "' + rootname + '_compiler.py" because of: '
                  + "\n# ".join(str(error).split('\n')))
            print(result)
        finally:
            if f:
                f.close()
    else:
        f = None
        try:
            f = open(rootname + extension, 'w', encoding="utf-8")
            if isinstance(result, Node):
                f.write(result.as_xml())
            else:
                f.write(result)
        except (PermissionError, FileNotFoundError, IOError) as error:
            print('# Could not write file "' + rootname + extension + '" because of: '
                  + "\n# ".join(str(error).split('\n')))
            print(result)
        finally:
            if f:
                f.close()
    if LOGGING:
        print(ast)
    return []
def source_changed(grammar_source, grammar_class):
    """Returns `True` if `grammar_class` does not reflect the latest
    changes of `grammar_source`.

    Parameters:
        grammar_source: File name or string representation of the
            grammar source
        grammar_class: the parser class representing the grammar
            or the file name of a compiler suite containing the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
        m = re.search(r'class \w*\(GrammarBase\)', pycode)
        if m:
            m = re.search(r' source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            return True
    else:
        return chksum != grammar_class.source_hash__
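
# Usage sketch (file names hypothetical): regenerate the compiler suite only
# when the grammar has changed since the last compilation run:
#
#   if source_changed('arithmetic.ebnf', 'arithmetic_compiler.py'):
#       errors = run_compiler('arithmetic.ebnf')
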
@@ -56,11 +56,13 @@ https://bitbucket.org/apalala/grako
 import collections
 import copy
-from functools import partial
 import hashlib
 import keyword
 import os
+from functools import partial
 from typing import NamedTuple
 try:
     import regex as re
 except ImportError:
@@ -70,67 +72,31 @@ import sys
 __version__ = '0.5.3' + '_dev' + str(os.stat(__file__).st_mtime)

-DEBUG = "DEBUG"
+LOGGING = "LOGS"

-def DEBUG_DIR():
-    """Returns a path of a directory where debug files will be stored.
-    Usually, this is just a sub-directory named 'DEBUG'. The directory
+def LOGS_DIR():
+    """Returns a path of a directory where log files will be stored.
+    Usually, this is just a sub-directory named 'LOGS'. The directory
     will be created if it does not exist.
     """
-    global DEBUG
-    if not DEBUG:
-        raise AssertionError("Cannot use DEBUG_DIR() if debugging is turned off!")
-    dirname = DEBUG
-    if os.path.exists(DEBUG):
-        if not os.path.isdir(DEBUG):
-            raise IOError('"' + DEBUG + '" cannot be used as debug directory, '
-                          'because it is not a directory!')
+    global LOGGING
+    if not LOGGING:
+        raise AssertionError("Cannot use LOGS_DIR() if logging is turned off!")
+    dirname = LOGGING
+    if os.path.exists(LOGGING):
+        if not os.path.isdir(LOGGING):
+            raise IOError('"' + LOGGING + '" cannot be used as log directory, '
+                          'because it is not a directory!')
     else:
-        os.mkdir(DEBUG)
+        os.mkdir(LOGGING)
     return dirname
-def DEBUG_FILE_NAME(grammar_base):
-    """Returns a file name without extension based on the class name of
-    the ``grammar_base``-object.
-    """
-    name = grammar_base.__class__.__name__
-    return name[:-7] if name.endswith('Grammar') else name
-
-########################################################################
-#
-# Scanner / Preprocessor support
-#
-########################################################################
-
-RX_SCANNER_TOKEN = re.compile('\w+')
-BEGIN_SCANNER_TOKEN = '\x1b'
-END_SCANNER_TOKEN = '\x1c'
-
-def make_token(token, argument=''):
-    """Turns the ``token`` and ``argument`` into a special token that
-    will be caught by the `ScannerToken`-parser.
-    """
-    assert RX_SCANNER_TOKEN.match(token)
-    assert argument.find(BEGIN_SCANNER_TOKEN) < 0
-    assert argument.find(END_SCANNER_TOKEN) < 0
-    return BEGIN_SCANNER_TOKEN + token + argument + END_SCANNER_TOKEN
-
-nil_scanner = lambda text: text
 ########################################################################
 #
-# Parser tree
+# syntax tree
 #
 ########################################################################
@@ -223,7 +189,7 @@ class Node:
         self.error_flag = any(r.error_flag for r in self.result) if self.children else False
         self._len = len(self.result) if not self.children else \
             sum(child._len for child in self.children)
-        # self.pos = 0
+        # self.pos = 0  # continuous updating of pos values
         self._pos = -1

     def __str__(self):
@@ -386,6 +352,13 @@ class Node:
             return errors
         return []

+    def log(self, log_file_name, ext):
+        global LOGGING
+        if LOGGING:
+            st_file_name = log_file_name + ext
+            with open(os.path.join(LOGS_DIR(), st_file_name), "w", encoding="utf-8") as f:
+                f.write(self.as_sexpr())
+
     def navigate(self, path):
         """EXPERIMENTAL! NOT YET TESTED!!!
         Returns the first descendant element matched by `path`, e.g.
@@ -433,19 +406,11 @@ def error_messages(text, errors):
 ########################################################################
 #
-# Abstract syntax tree support
+# syntax tree transformation
 #
 ########################################################################

-def DEBUG_DUMP_SYNTAX_TREE(grammar_base, syntax_tree, ext):
-    global DEBUG
-    if DEBUG:
-        st_file_name = DEBUG_FILE_NAME(grammar_base) + ext
-        with open(os.path.join(DEBUG_DIR(), st_file_name), "w", encoding="utf-8") as f:
-            f.write(syntax_tree.as_sexpr())

 def expand_table(compact_table):
     """Expands a table by separating keywords that are tuples or strings
     containing comma separated words into single keyword entries with
@@ -637,8 +602,8 @@ LEFT_RECURSION_DEPTH = 10  # because of pythons recursion depth limit, this
                            # value ought not to be set too high
 MAX_DROPOUTS = 25  # stop trying to recover parsing after so many errors

-WHITESPACE_KEYWORD = 'wsp__'
-TOKEN_KEYWORD = 'token__'
+WHITESPACE_KEYWORD = 'WSP__'
+TOKEN_KEYWORD = 'TOKEN__'

 class HistoryRecord:
@@ -816,7 +781,9 @@ class GrammarBase:
     def __init__(self):
         self.all_parsers = set()
         self.dirty_flag = False
-        self.track_history = DEBUG
+        self.track_history = LOGGING
+        name = self.__class__.__name__
+        self.log_file_name = name[:-7] if name.lower().endswith('grammar') else name
         self._reset()
         self._assign_parser_names()
         self.root__ = copy.deepcopy(self.__class__.root__)
@@ -834,6 +801,7 @@ class GrammarBase:
     def _reset(self):
         self.variables = dict()  # support for Pop and Retrieve operators
+        self.document = ""       # source document
         self.last_node = None
         self.call_stack = []     # support for call stack tracing
         self.history = []        # snapshots of call stacks
@@ -863,6 +831,7 @@ class GrammarBase:
                 parser.reset()
         else:
             self.dirty_flag = True
+        self.document = document
         parser = self.root__
         result = ""
         stitches = []
@@ -889,34 +858,39 @@ class GrammarBase:
             result.pos = 0  # calculate all positions
         return result

+    def log_parsing_history(self):
+        """Writes a log of the parsing history of the most recently parsed
+        document.
+        """
+        def prepare_line(record):
+            excerpt = self.document.__getitem__(slice(*record.extent))[:25].replace('\n', '\\n')
+            excerpt = "'%s'" % excerpt if len(excerpt) < 25 else "'%s...'" % excerpt
+            return (record.stack, record.status, excerpt)
+
+        def write_log(history, log_name):
+            path = os.path.join(LOGS_DIR(), self.log_file_name + log_name + "_parser.log")
+            if history:
+                with open(path, "w", encoding="utf-8") as f:
+                    f.write("\n".join(history))
+            elif os.path.exists(path):
+                os.remove(path)
+
+        global LOGGING
+        if LOGGING:
+            assert self.history
+            full_history, match_history, errors_only = [], [], []
+            for record in self.history:
+                line = "; ".join(prepare_line(record))
+                full_history.append(line)
+                if record.node and record.node.parser.name != WHITESPACE_KEYWORD:
+                    match_history.append(line)
+                    if record.node.errors:
+                        errors_only.append(line)
+            write_log(full_history, '_full')
+            write_log(match_history, '_match')
+            write_log(errors_only, '_errors')
-def DEBUG_DUMP_PARSING_HISTORY(grammar_base, document):
-    def prepare_line(record):
-        excerpt = document.__getitem__(slice(*record.extent))[:25].replace('\n', '\\n')
-        excerpt = "'%s'" % excerpt if len(excerpt) < 25 else "'%s...'" % excerpt
-        return (record.stack, record.status, excerpt)
-
-    def write_log(history, log_name):
-        path = os.path.join(DEBUG_DIR(), DEBUG_FILE_NAME(grammar_base) + log_name + "_parser.log")
-        if history:
-            with open(path, "w", encoding="utf-8") as f:
-                f.write("\n".join(history))
-        elif os.path.exists(path):
-            os.remove(path)
-
-    global DEBUG
-    if DEBUG:
-        full_history, match_history, errors_only = [], [], []
-        for record in grammar_base.history:
-            line = "; ".join(prepare_line(record))
-            full_history.append(line)
-            if record.node and record.node.parser.name != WHITESPACE_KEYWORD:
-                match_history.append(line)