Commit b5fd9558 authored by Eckhart Arnold

- rudimentary support for semantic validation in syntaxtree.py; more unit tests; bug fixes

parent 84e7061a
@@ -32,7 +32,7 @@ from .parsercombinators import GrammarBase, mixin_comment, Forward, RE, Negative
Alternative, Sequence, Optional, Required, OneOrMore, ZeroOrMore, Token, CompilerBase
from .syntaxtree import Node, remove_enclosing_delimiters, reduce_single_child, \
replace_by_single_child, TOKEN_KEYWORD, remove_expendables, remove_tokens, flatten, \
WHITESPACE_KEYWORD
forbid, assert_content, WHITESPACE_KEYWORD
__all__ = ['EBNFGrammar',
@@ -61,6 +61,7 @@ class EBNFGrammar(GrammarBase):
| [flowmarker] literal
| [flowmarker] regexp
| [flowmarker] group
| [flowmarker] regexchain
| [flowmarker] oneormore
| repetition
| option
@@ -70,9 +71,12 @@ class EBNFGrammar(GrammarBase):
retrieveop = "::" | ":" # '::' pop, ':' retrieve
group = "(" expression §")"
option = "[" expression §"]"
regexchain = "<" expression §">" # compiles "expression" into a singular regular expression
oneormore = "{" expression "}+"
repetition = "{" expression §"}"
option = "[" expression §"]"
link = regexp | symbol | literal # semantic restriction: symbol must evaluate to a regexp or chain
symbol = /(?!\d)\w+/~ # e.g. expression, factor, parameter_list
literal = /"(?:[^"]|\\")*?"/~ # e.g. "(", '+', 'while'
@@ -80,36 +84,39 @@ class EBNFGrammar(GrammarBase):
regexp = /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
# '~' is a whitespace-marker; if present, leading or trailing
# whitespace of a regular expression will be ignored tacitly.
list_ = /\w+\s*(?:,\s*\w+\s*)*/~ # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
list_ = /\w+/~ { "," /\w+/~ } # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
# BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
EOF = !/./
"""
expression = Forward()
source_hash__ = "1065c2e43262a5cb3aa438ec4d347c32"
source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
parser_initialization__ = "upon instantiation"
wsp__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
COMMENT__ = r'#.*(?:\n|$)'
WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
wspL__ = ''
wspR__ = wsp__
wspR__ = WSP__
EOF = NegativeLookahead(RE('.', wR=''))
list_ = RE('\\w+\\s*(?:,\\s*\\w+\\s*)*')
list_ = Sequence(RE('\\w+'), ZeroOrMore(Sequence(Token(","), RE('\\w+'))))
regexp = RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
symbol = RE('(?!\\d)\\w+')
link = Alternative(regexp, symbol, literal)
option = Sequence(Token("["), expression, Required(Token("]")))
repetition = Sequence(Token("{"), expression, Required(Token("}")))
oneormore = Sequence(Token("{"), expression, Token("}+"))
option = Sequence(Token("["), expression, Required(Token("]")))
regexchain = Sequence(Token("<"), expression, Required(Token(">")))
group = Sequence(Token("("), expression, Required(Token(")")))
retrieveop = Alternative(Token("::"), Token(":"))
flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
factor = Alternative(Sequence(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
Sequence(Optional(flowmarker), literal), Sequence(Optional(flowmarker), regexp),
Sequence(Optional(flowmarker), group), Sequence(Optional(flowmarker), oneormore), repetition,
option)
Sequence(Optional(flowmarker), group), Sequence(Optional(flowmarker), regexchain),
Sequence(Optional(flowmarker), oneormore), repetition, option)
term = OneOrMore(factor)
expression.set(Sequence(term, ZeroOrMore(Sequence(Token("|"), term))))
directive = Sequence(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
definition = Sequence(symbol, Required(Token("=")), expression)
syntax = Sequence(Optional(RE('', wR='', wL=wsp__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
syntax = Sequence(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
root__ = syntax
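
The main grammar addition above is the regexchain construct ("<" expression ">"), meant to be fused into a single regular expression at compile time. A minimal sketch of a grammar that uses it, fed to the regenerated parser (the grammar text and the assertion are illustrative only, not part of the commit):

    from DHParser.EBNFcompiler import EBNFGrammar

    mini_ebnf = '''
        word   = <letter { letter }>   # regexchain: one regex at compile time
        letter = /[A-Za-z]/
        '''
    st = EBNFGrammar().parse(mini_ebnf)
    assert not st.collect_errors()     # parses; regexchain compilation is pending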
@@ -140,8 +147,14 @@ EBNF_ASTTransform = {
[remove_expendables, replace_by_single_child]
}
EBNF_semantic_validation = {
# Semantic validation on the AST
"repetition, option, oneormore":
[partial(forbid, child_tags=['repetition', 'option', 'oneormore']),
partial(assert_content, regex=r'(?!§)')],
}
EBNF_ASTPipeline = [EBNF_ASTTransform]
EBNF_ASTPipeline = [EBNF_ASTTransform, EBNF_semantic_validation]
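
A validation table is an ordinary processing table, so the same traverse() machinery applies it after the AST transformations. A hedged sketch of what entries like the ones above amount to (toy table, not the committed one):

    from functools import partial
    from DHParser.syntaxtree import traverse, forbid, assert_content

    toy_validation = {
        "option": [partial(forbid, child_tags=['option', 'repetition']),
                   partial(assert_content, regex=r'(?!§)')],
    }
    # traverse(ast, toy_validation) attaches an error to every "option" node
    # that nests an option/repetition or whose content starts with '§'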
class EBNFCompilerError(Exception):
@@ -59,7 +59,7 @@ try:
except ImportError:
import re
from .toolkit import IS_LOGGING, LOGS_DIR, escape_re, sane_parser_name, sequence
from .toolkit import IS_LOGGING, LOGS_DIR, escape_re, sane_parser_name, smart_list
from .syntaxtree import WHITESPACE_KEYWORD, TOKEN_KEYWORD, ZOMBIE_PARSER, Node, \
traverse
from DHParser.toolkit import error_messages
@@ -460,6 +460,13 @@ class ScannerToken(Parser):
class RegExp(Parser):
"""Regular expression parser.
The RegExp-parser parses text that matches a regular expression.
RegExp can also be considered the "atomic parser", because all
other parsers delegate part of their parsing job to other parsers,
but do not match text directly themselves.
"""
def __init__(self, regexp, name=None):
super(RegExp, self).__init__(name)
self.regexp = re.compile(regexp) if isinstance(regexp, str) else regexp
@@ -482,9 +489,33 @@ class RegExp(Parser):
class RE(Parser):
"""Regular Expressions with optional leading or trailing whitespace.
The RE-parser parses pieces of text that match a given regular
expression. Unlike the ``RegExp``-parser, it can also skip
"implicit whitespace" before or after the matched text.
The whitespace is in turn defined by a regular expression. Make
sure that this expression also matches the empty string, e.g.
use r'\s*' or r'[\t ]*', but not r'\s+'. If the respective
parameters in the constructor are set to ``None`` the default
whitespace expression from the Grammar object will be used.
"""
def __init__(self, regexp, wL=None, wR=None, name=None):
"""Constructor for class RE.
Args:
regexp (str or regex object): The regular expression to be
used for parsing.
wL (str or regexp): Left whitespace regular expression,
i.e. either ``None``, the empty string or a regular
expression (e.g. "\s*") that defines whitespace. An
empty string means no whitespace will be skipped,
``None`` means that the default whitespace will be
used.
wR (str or regexp): Right whitespace regular expression.
See above.
name: The optional name of the parser.
"""
super(RE, self).__init__(name)
# assert wR or regexp == '.' or isinstance(self, Token)
self.wL = wL
@@ -520,6 +551,7 @@ class RE(Parser):
def _grammar_assigned_notifier(self):
if self.grammar:
# use default whitespace parsers if not otherwise specified
if self.wL is None:
self.wspLeft = self.grammar.wsp_left_parser__
if self.wR is None:
@@ -535,11 +567,24 @@ class RE(Parser):
def Token(token, wL=None, wR=None, name=None):
"""Returns an RE-parser that matches plain strings that are
considered 'tokens'.
If the ``name``-parameter is empty, the parser's name will be set
to the TOKEN_KEYWORD, making it easy to identify tokens in the
abstract syntax tree transformation and compilation stage.
"""
return RE(escape_re(token), wL, wR, name or TOKEN_KEYWORD)
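
For instance (illustrative only), Token("{") yields an RE-parser for the literal brace whose name defaults to TOKEN_KEYWORD, which is what later AST transformations such as remove_tokens key on:

    opening = Token("{")                 # name == TOKEN_KEYWORD
    named   = Token("{", name="lbrace")  # an explicit name overrides the default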
def mixin_comment(whitespace, comment):
"""Mixes comment-regexp into whitespace regexp.
"""Returns a regular expression that merges comment and whitespace
regexps. Thus comments cann occur whereever whitespace is allowed
and will be skipped just as implicit whitespace.
Note, that because this works on the level of regular expressions,
nesting comments is not possible. It also makes it much harder to
use directives inside comments (which isn't recommended, anyway).
"""
wspc = '(?:' + whitespace + '(?:' + comment + whitespace + ')*)'
return wspc
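
Concretely, with the whitespace and comment regexps used by EBNFGrammar above, the merged expression behaves as sketched here (REPL-checkable, illustrative only):

    import re

    wsp = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
    # wsp == r'(?:\s*(?:#.*(?:\n|$)\s*)*)'
    m = re.match(wsp, "  # a comment\n  x")
    assert m.group(0) == "  # a comment\n  "   # comment skipped like whitespace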
@@ -868,7 +913,10 @@ class CompilerBase:
return None
else:
compiler = self.__getattribute__(elem) # TODO Add support for python keyword attributes
return compiler(node)
result = compiler(node)
for child in node.children:
node.error_flag |= child.error_flag
return result
def full_compilation(source, grammar_base, AST_pipeline, compiler):
@@ -879,7 +927,7 @@ def full_compilation(source, grammar_base, AST_pipeline, compiler):
The compilation stage is only invoked if no errors occurred in
either of the two previous stages.
Paraemters:
Args:
source (str): The input text for compilation
grammar_base (GrammarBase): The GrammarBase object
AST_pipeline (dict or list of dicts): A syntax-tree processing
@@ -912,12 +960,15 @@ def full_compilation(source, grammar_base, AST_pipeline, compiler):
# likely that error list gets littered with compile error messages
if syntax_tree.error_flag:
result = None
errors = syntax_tree.collect_errors()
else:
for processing_table in sequence(AST_pipeline):
for processing_table in smart_list(AST_pipeline):
traverse(syntax_tree, processing_table)
syntax_tree.log(log_file_name, ext='.ast')
result = compiler.compile__(syntax_tree)
errors = syntax_tree.collect_errors()
errors = syntax_tree.collect_errors()
if not errors:
result = compiler.compile__(syntax_tree)
errors = syntax_tree.collect_errors()
messages = error_messages(source, errors)
return result, messages, syntax_tree
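
A hedged sketch of calling the revised full_compilation (the source string and compiler name are illustrative; the committed call sites are in the tests further below):

    from DHParser.parsercombinators import full_compilation
    from DHParser.EBNFcompiler import EBNFGrammar, EBNF_ASTPipeline, EBNFCompiler

    result, messages, st = full_compilation(
        'word = /\\w+/~ \n',            # illustrative EBNF source
        EBNFGrammar(), EBNF_ASTPipeline, EBNFCompiler('Sketch'))
    if messages:   # parse, AST-transform and validation errors all land here
        print(messages)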
@@ -28,7 +28,7 @@ except ImportError:
import re
from typing import NamedTuple
from .toolkit import IS_LOGGING, LOGS_DIR, expand_table, line_col, sequence
from .toolkit import IS_LOGGING, LOGS_DIR, expand_table, line_col, smart_list
__all__ = ['WHITESPACE_KEYWORD',
@@ -50,7 +50,10 @@ __all__ = ['WHITESPACE_KEYWORD',
'remove_expendables',
'remove_tokens',
'flatten',
'remove_enclosing_delimiters']
'remove_enclosing_delimiters',
'forbid',
'require',
'assert_content']
class ZombieParser:
@@ -297,6 +300,7 @@ class Node:
of a set of tuples (position, error_message), where position
is always relative to this node.
"""
errors = []
if self.error_flag:
errors = self.errors
if clear_errors:
@@ -305,8 +309,7 @@
if self.children:
for child in self.result:
errors.extend(child.collect_errors(clear_errors))
return errors
return []
return errors
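
The rewrite above fixes a genuine bug: a node whose own error_flag was unset used to return [] even when its children carried errors. A sketch of the fixed behavior (from_sexpr as used in the syntax-tree tests below, hypothetical here):

    tree = from_sexpr('(a (b c))')
    tree.result[0].add_error('error deep inside the tree')
    assert tree.collect_errors()   # child errors now surface; formerly []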
def log(self, log_file_name, ext):
if IS_LOGGING():
@@ -393,13 +396,14 @@ def traverse(root_node, processing_table):
"""
# normalize processing_table entries by turning single values into lists
# with a single value
table = {name: sequence(call) for name, call in list(processing_table.items())}
table = {name: smart_list(call) for name, call in list(processing_table.items())}
table = expand_table(table)
def traverse_recursive(node):
if node.children:
for child in node.result:
traverse_recursive(child)
node.error_flag |= child.error_flag # propagate error flag
sequence = table.get(node.parser.name,
table.get('~', [])) + table.get('*', [])
for call in sequence:
@@ -531,3 +535,29 @@ def remove_enclosing_delimiters(node):
node.result = node.result[1:-1]
########################################################################
#
# syntax tree validation functions
#
########################################################################
def require(node, child_tags):
for child in node.children:
if child.tag_name not in child_tags:
node.add_error('Element "%s" is not allowed inside "%s".' %
(child.tag_name, node.tag_name))
def forbid(node, child_tags):
for child in node.children:
if child.tag_name in child_tags:
node.add_error('Element "%s" cannot be nested inside "%s".' %
(child.tag_name, node.tag_name))
def assert_content(node, regex):
content = str(node)
if not re.match(regex, content):
node.add_error('Element "%s" violates %s on %s' %
(node.tag_name, str(regex), content))
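
require() is the whitelist counterpart of forbid(). A toy sketch of wiring it up via a processing table (tag names hypothetical):

    from functools import partial

    table = {"definition": [partial(require, child_tags=['symbol', 'expression'])]}
    # traverse(tree, table) flags every child of a "definition" node whose
    # tag_name is not in the whitelist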
@@ -52,7 +52,7 @@ __all__ = ['logging_on',
'is_python_code',
'md5',
'expand_table',
'sequence',
'smart_list',
'sane_parser_name']
@@ -189,6 +189,47 @@ def md5(*txt):
return md5_hash.hexdigest()
def smart_list(arg):
"""Returns the argument as list, depending on its type and content.
If the argument is a string, it will be interpreted as a list of
delimiter-separated values, trying ';', ',', ' ' as possible
delimiters in this order, e.g.
>>> smart_list("1; 2, 3; 4")
['1', '2, 3', '4']
>>> smart_list("2, 3")
['2', '3']
>>> smart_list("a b cd")
['a', 'b', 'cd']
If the argument is a collection other than a string, it will be
returned as is, e.g.
>>> smart_list((1, 2, 3))
(1, 2, 3)
>>> smart_list({1, 2, 3})
{1, 2, 3}
If the argument is an iterable other than a collection, it will
be converted into a list, e.g.
>>> smart_list(i for i in {1,2,3})
[1, 2, 3]
Finally, if none of the above is true, the argument will be
wrapped in a list and returned, e.g.
>>> smart_list(125)
[125]
"""
if isinstance(arg, str):
for delimiter in (';', ','):
lst = arg.split(delimiter)
if len(lst) > 1:
return [s.strip() for s in lst]
return [s.strip() for s in arg.strip().split(' ')]
elif isinstance(arg, collections.abc.Collection):
return arg
elif isinstance(arg, collections.abc.Iterable):
return list(arg)
else:
return [arg]
def expand_table(compact_table):
"""Expands a table by separating keywords that are tuples or strings
containing comma separated words into single keyword entries with
@@ -201,22 +242,11 @@ def expand_table(compact_table):
keys = list(compact_table.keys())
for key in keys:
value = compact_table[key]
if isinstance(key, str):
parts = (s.strip() for s in key.split(','))
else:
assert isinstance(key, collections.abc.Iterable)
parts = key
for p in parts:
expanded_table[p] = value
for k in smart_list(key):
expanded_table[k] = value
return expanded_table
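
With smart_list in place, expand_table splits string keys on the best-fitting delimiter, so a key like the one in EBNF_semantic_validation expands as sketched here (illustrative values):

    from DHParser.toolkit import expand_table

    table = expand_table({"repetition, option, oneormore": ['some_transform']})
    # -> {'repetition': ['some_transform'], 'option': ['some_transform'],
    #     'oneormore': ['some_transform']}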
def sequence(arg):
"""Returns the argument if it is a sequence, otherwise returns a
list containing the argument as sole item."""
return arg if isinstance(arg, collections.abc.Sequence) else [arg]
def sane_parser_name(name):
"""Checks whether given name is an acceptable parser name. Parser names
must not be preceded or succeeded by a double underscore '__'!
@@ -23,6 +23,9 @@ limitations under the License.
import os
import sys
sys.path.append(os.path.abspath('../../'))
from DHParser.syntaxtree import traverse
from DHParser.parsercombinators import full_compilation
from DHParser.EBNFcompiler import EBNFGrammar, EBNF_ASTPipeline, EBNFCompiler
from DHParser.DSLsupport import compileEBNF
@@ -59,6 +62,7 @@ class TestDirectives:
syntax_tree = parser.parse("3 + 4 \n * 12")
assert syntax_tree.collect_errors()
class TestPopRetrieve:
mini_language = """
document = { text | codeblock }
@@ -78,6 +82,9 @@ class TestPopRetrieve:
teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
syntax_tree = self.minilang_parser.parse(teststr)
assert not syntax_tree.collect_errors()
delim = str(next(syntax_tree.find(lambda node: node.tag_name == "delimiter")))
pop = str(next(syntax_tree.find(lambda node: node.tag_name == "Pop")))
assert delim == pop
if WRITE_LOGS:
syntax_tree.log("test_PopRetrieve_single_line", '.cst')
# self.minilang_parser.log_parsing_history("test_PopRetrieve_single_line")
@@ -93,11 +100,45 @@ class TestPopRetrieve:
"""
syntax_tree = self.minilang_parser.parse(teststr)
assert not syntax_tree.collect_errors()
delim = str(next(syntax_tree.find(lambda node: node.tag_name == "delimiter")))
pop = str(next(syntax_tree.find(lambda node: node.tag_name == "Pop")))
assert delim == pop
if WRITE_LOGS:
syntax_tree.log("test_PopRetrieve_multi_line", '.cst')
# self.minilang_parser.log_parsing_history("test_PopRetrieve_multi_line")
class TestSemanticValidation:
def check(self, minilang, bool_filter=lambda x: x):
grammar = EBNFGrammar()
st = grammar.parse(minilang)
assert not st.collect_errors()
for table in EBNF_ASTPipeline:
traverse(st, table)
assert bool_filter(st.collect_errors())
def test_illegal_nesting(self):
self.check('impossible = { [ "an optional requirement" ] }')
def test_illegal_nesting_option_required(self):
self.check('impossible = [ §"an optional requirement" ]')
def test_illegal_nesting_oneormore_option(self):
self.check('impossible = { [ "no use"] }+')
def test_legal_nesting(self):
self.check('possible = { [ "+" ] "1" }', lambda x: not x)
class TestCompilerErrors:
def test_error_propagation(self):
ebnf = "@ literalws = wrongvalue # testing error propagation"
result, messages, st = full_compilation(ebnf, EBNFGrammar(), EBNF_ASTPipeline,
EBNFCompiler('ErrorPropagationTest'))
assert messages
if __name__ == "__main__":
from run import run_tests
run_tests("TestDirectives TestPopRetrieve", globals())
run_tests("TestCompilerErrors", globals())
@@ -25,7 +25,8 @@ import re
import sys
sys.path.append(os.path.abspath('../../'))
from DHParser.toolkit import compact_sexpr
from DHParser.syntaxtree import Node
from DHParser.syntaxtree import Node, traverse
class DummyParser:
def __init__(self, name=''):
@@ -115,6 +116,21 @@ class TestNode:
assert found[0].result == 'x' and found[1].result == 'y'
class TestErrorHandling:
def test_error_propagations(self):
tree = from_sexpr('(a (b c) (d (e (f (g h)))))')
def find_h(node):
if node.result == "h":
node.add_error("an error deep inside the syntax tree")
assert not tree.error_flag
traverse(tree, {"*": find_h})
assert tree.error_flag
if __name__ == "__main__":
from run import run_tests
run_tests("TestSExpr TestNode", globals())
run_tests("TestErrorHandling", globals())