Commit 25d0fd78 authored by eckhart

ebnf.py: ebnf-parser now resumes after syntax errors

parent 573dec31
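
Illustrative sketch (not part of this commit): with the resume rules introduced below, a syntax error inside one EBNF definition no longer makes the parser give up on the rest of the grammar source, so errors in later definitions can still be reported. compile_ebnf is the helper used by the test suite near the end of this diff; its definition is not shown here and is assumed to return the usual (result, errors, ast) triple of compile_source.

    # hypothetical EBNF source with a deliberately broken second definition
    lang = """
        first = "A" "B"
        second = | "C"      # '|' cannot start an expression -> syntax error
        third = "D" "E"
        """
    result, messages, ast = compile_ebnf(lang)   # helper from the test suite below
    for msg in messages:
        print(msg)   # the error in 'second' is reported, and the resume rules let the
                     # parser try to continue instead of aborting at the first error
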
@@ -34,12 +34,13 @@ from DHParser.compile import CompilerError, Compiler, ResultTuple, compile_source
from DHParser.configuration import access_thread_locals, get_config_value
from DHParser.error import Error
from DHParser.parse import Grammar, mixin_comment, mixin_nonempty, Forward, RegExp, Drop, \
NegativeLookahead, Alternative, Series, Option, ZeroOrMore, Token, Capture, Retrieve, Pop, \
optional_last_value, GrammarError, Whitespace, INFINITE
NegativeLookahead, Alternative, Series, Option, ZeroOrMore, OneOrMore, Token, \
Capture, Retrieve, Pop, optional_last_value, GrammarError, Whitespace, INFINITE
from DHParser.preprocess import nil_preprocessor, PreprocessorFunc
from DHParser.syntaxtree import Node, WHITESPACE_PTYPE, TOKEN_PTYPE
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name, re, expand_table, \
unrepr, compile_python_object, DHPARSER_PARENTDIR, RX_NEVER_MATCH
from DHParser.toolkit import load_if_file, escape_re, escape_control_characters, md5, \
sane_parser_name, re, expand_table, unrepr, compile_python_object, DHPARSER_PARENTDIR, \
RX_NEVER_MATCH
from DHParser.transform import TransformationFunc, traverse, remove_brackets, \
reduce_single_child, replace_by_single_child, remove_empty, remove_children, \
remove_tokens, flatten, forbid, assert_content, apply_unless, has_parent
@@ -208,71 +209,55 @@ class EBNFGrammar(Grammar):
AND = `,` | ``
ENDL = `;` | ``
"""
AND = Forward()
DEF = Forward()
ENDL = Forward()
OR = Forward()
element = Forward()
expression = Forward()
source_hash__ = "7d0821ca4b634b6da341a614570d47f5"
anonymous__ = re.compile('pure_elem$')
source_hash__ = "c1e15d681796de8731d634be579585ab"
anonymous__ = re.compile('pure_elem$|EOF$')
static_analysis_pending__ = [] # type: List[bool]
parser_initialization__ = ["upon instantiation"]
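# error_messages__ maps a parser name to [regex, message] pairs: when a mandatory
# (§) violation occurs inside that parser and the regex matches at the error
# location, the customized message is emitted (see err_msgs=... on 'definition'
# further down). resume_rules__ maps a parser name to regular expressions at
# which parsing is resumed after such an error - here, the start of the next
# directive or definition.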
error_messages__ = {'definition': [[re.compile(r','), 'Delimiter "," not expected in definition. Either this was meant to be a directive and the directive symbol @ is missing or the error is due to inconsistent use of the comma as a delimiter for the elements of a sequence.']]}
resume_rules__ = {'definition': [re.compile(r'\n\s*(?=@|\w+\w*=)')],
'directive': [re.compile(r'\n\s*(?=@|\w+\w*=)')]}
COMMENT__ = r'#.*(?:\n|$)'
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = Drop(Whitespace(WSP_RE__))
ENDL = Capture(Alternative(Token(";"), Token("")))
AND = Capture(Alternative(Token(","), Token("")))
OR = Capture(Token("|"))
DEF = Capture(Alternative(Token("="), Token(":="), Token("::=")))
EOF = Series(NegativeLookahead(RegExp('.')),
Option(Pop(DEF, match_func=optional_last_value)),
Option(Pop(OR, match_func=optional_last_value)),
Option(Pop(AND, match_func=optional_last_value)),
Option(Pop(ENDL, match_func=optional_last_value)))
EOF = Drop(Drop(Series(Drop(NegativeLookahead(RegExp('.'))), Drop(Option(Drop(Pop(DEF, match_func=optional_last_value)))), Drop(Option(Drop(Pop(OR, match_func=optional_last_value)))), Drop(Option(Drop(Pop(AND, match_func=optional_last_value)))), Drop(Option(Drop(Pop(ENDL, match_func=optional_last_value)))))))
ENDL.set(Capture(Alternative(Token(";"), Token(""))))
AND.set(Capture(Alternative(Token(","), Token(""))))
OR.set(Capture(Token("|")))
DEF.set(Capture(Alternative(Token("="), Token(":="), Token("::="))))
whitespace = Series(RegExp('~'), dwsp__)
regexp = Series(RegExp('/(?:(?<!\\\\)\\\\(?:/)|[^/])*?/'), dwsp__)
plaintext = Series(RegExp('`(?:(?<!\\\\)\\\\`|[^`])*?`'), dwsp__)
literal = Alternative(Series(RegExp('"(?:(?<!\\\\)\\\\"|[^"])*?"'), dwsp__),
Series(RegExp("'(?:(?<!\\\\)\\\\'|[^'])*?'"), dwsp__))
literal = Alternative(Series(RegExp('"(?:(?<!\\\\)\\\\"|[^"])*?"'), dwsp__), Series(RegExp("'(?:(?<!\\\\)\\\\'|[^'])*?'"), dwsp__))
literals = OneOrMore(literal)
symbol = Series(RegExp('(?!\\d)\\w+'), dwsp__)
option = Alternative(Series(Series(Token("["), dwsp__), expression,
Series(Token("]"), dwsp__), mandatory=1),
Series(element, Series(Token("?"), dwsp__)))
repetition = Alternative(Series(Series(Token("{"), dwsp__), expression,
Series(Token("}"), dwsp__), mandatory=1),
Series(element, Series(Token("*"), dwsp__)))
oneormore = Alternative(Series(Series(Token("{"), dwsp__), expression,
Series(Token("}+"), dwsp__)),
Series(element, Series(Token("+"), dwsp__)))
group = Series(Series(Token("("), dwsp__), expression,
Series(Token(")"), dwsp__), mandatory=1)
retrieveop = Alternative(Series(Token("::"), dwsp__),
Series(Token(":?"), dwsp__),
Series(Token(":"), dwsp__))
flowmarker = Alternative(Series(Token("!"), dwsp__), Series(Token("&"), dwsp__),
Series(Token("<-!"), dwsp__), Series(Token("<-&"), dwsp__))
element.set(Alternative(Series(Option(retrieveop), symbol, NegativeLookahead(DEF)),
literal, plaintext, regexp, whitespace, group))
option = Alternative(Series(Series(Token("["), dwsp__), expression, Series(Token("]"), dwsp__), mandatory=1), Series(element, Series(Token("?"), dwsp__)))
repetition = Alternative(Series(Series(Token("{"), dwsp__), expression, Series(Token("}"), dwsp__), mandatory=1), Series(element, Series(Token("*"), dwsp__)))
oneormore = Alternative(Series(Series(Token("{"), dwsp__), expression, Series(Token("}+"), dwsp__)), Series(element, Series(Token("+"), dwsp__)))
group = Series(Series(Token("("), dwsp__), expression, Series(Token(")"), dwsp__), mandatory=1)
retrieveop = Alternative(Series(Token("::"), dwsp__), Series(Token(":?"), dwsp__), Series(Token(":"), dwsp__))
flowmarker = Alternative(Series(Token("!"), dwsp__), Series(Token("&"), dwsp__), Series(Token("<-!"), dwsp__), Series(Token("<-&"), dwsp__))
element.set(Alternative(Series(Option(retrieveop), symbol, NegativeLookahead(DEF)), literal, plaintext, regexp, whitespace, group))
pure_elem = Series(element, NegativeLookahead(RegExp('[?*+]')), mandatory=1)
term = Alternative(oneormore, repetition, option, pure_elem)
lookaround = Series(flowmarker, Alternative(oneormore, pure_elem))
interleave = Series(term, ZeroOrMore(Series(Series(Token("°"), dwsp__),
Option(Series(Token("§"), dwsp__)), term)))
sequence = Series(Option(Series(Token("§"), dwsp__)), Alternative(interleave, lookaround),
ZeroOrMore(Series(Retrieve(AND), dwsp__, Option(Series(Token("§"), dwsp__)),
Alternative(interleave, lookaround))))
difference = Series(term, Option(Series(Series(Token("-"), dwsp__), Alternative(oneormore, pure_elem), mandatory=1)))
lookaround = Series(flowmarker, Alternative(oneormore, pure_elem), mandatory=1)
interleave = Series(difference, ZeroOrMore(Series(Series(Token("°"), dwsp__), Option(Series(Token("§"), dwsp__)), difference)))
sequence = Series(Option(Series(Token("§"), dwsp__)), Alternative(interleave, lookaround), ZeroOrMore(Series(Retrieve(AND), dwsp__, Option(Series(Token("§"), dwsp__)), Alternative(interleave, lookaround))))
expression.set(Series(sequence, ZeroOrMore(Series(Retrieve(OR), dwsp__, sequence))))
directive = Series(Series(Token("@"), dwsp__), symbol, Series(Token("="), dwsp__),
Alternative(regexp, literal, symbol),
ZeroOrMore(Series(Series(Token(","), dwsp__),
Alternative(regexp, literal, symbol))), mandatory=1)
definition = Series(symbol, Retrieve(DEF), dwsp__,
expression, Retrieve(ENDL), dwsp__, mandatory=1)
syntax = Series(Option(Series(dwsp__, RegExp(''))),
ZeroOrMore(Alternative(definition, directive)), EOF, mandatory=2)
directive = Series(Series(Token("@"), dwsp__), symbol, Series(Token("="), dwsp__), Alternative(regexp, literals, symbol), ZeroOrMore(Series(Series(Token(","), dwsp__), Alternative(regexp, literals, symbol))), mandatory=1)
definition = Series(symbol, Retrieve(DEF), dwsp__, expression, Retrieve(ENDL), dwsp__, mandatory=1, err_msgs=error_messages__["definition"])
syntax = Series(Option(Series(dwsp__, RegExp(''))), ZeroOrMore(Alternative(definition, directive)), EOF, mandatory=2)
root__ = syntax
def grammar_changed(grammar_class, grammar_source: str) -> bool:
"""
Returns ``True`` if ``grammar_class`` does not reflect the latest
@@ -360,6 +345,8 @@ EBNF_AST_transformation_table = {
forbid('repetition', 'option', 'oneormore'), assert_content(r'(?!§)(?:.|\n)*')],
"symbol, literal, regexp":
[reduce_single_child],
"literals":
[replace_by_single_child],
(TOKEN_PTYPE, WHITESPACE_PTYPE):
[reduce_single_child],
"EOF, DEF, OR, AND, ENDL":
@@ -568,6 +555,11 @@ class EBNFCompilerError(CompilerError):
pass
# def escape_backslash(s: str) -> str:
# """Replaces backslashes by double backslash and newline by r'\n'."""
# return s.replace('\\', r'\\').replace('\n', r'\n')
class EBNFCompiler(Compiler):
"""
Generates a Parser from an abstract syntax tree of a grammar specified
@@ -1230,8 +1222,25 @@ class EBNFCompiler(Compiler):
self.drop_flag = False
return rule, defn
@staticmethod
def join_literals(nd):
assert nd.tag_name == "literals"
parts = [nd.children[0].content[:-1]]
for child in nd.children[1:-1]:
parts.append(child.content[1:-1])
parts.append(nd.children[-1].content[1:])
nd.result = "".join(parts)
nd.tag_name = "literal"
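# Example (illustration only): a "literals" node with two "literal" children whose
# contents are '"spreads over "' and '"two strings"' is collapsed by join_literals
# into a single "literal" node with the content '"spreads over two strings"'.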
def on_directive(self, node: Node) -> str:
for child in node.children:
if child.tag_name == "literal":
child.result = escape_control_characters(child.content)
elif child.tag_name == "literals":
self.join_literals(child)
child.result = escape_control_characters(child.content)
key = node.children[0].content
assert key not in self.directives.tokens
@@ -1621,8 +1630,9 @@ class EBNFCompiler(Compiler):
return 'dwsp__'
return 'wsp__'
def on_literal(self, node: Node) -> str:
center = self.TOKEN_PARSER(node.content.replace('\\', r'\\'))
center = self.TOKEN_PARSER(escape_control_characters(node.content))
force = DROP_TOKEN in self.directives.drop
left = self.WSPC_PARSER(force) if 'left' in self.directives.literalws else ''
right = self.WSPC_PARSER(force) if 'right' in self.directives.literalws else ''
@@ -1632,7 +1642,7 @@ class EBNFCompiler(Compiler):
def on_plaintext(self, node: Node) -> str:
tk = node.content.replace('\\', r'\\')
tk = escape_control_characters(node.content)
rpl = '"' if tk.find('"') < 0 else "'" if tk.find("'") < 0 else ''
if rpl:
tk = rpl + tk[1:-1] + rpl
@@ -1664,7 +1664,8 @@ class RegExp(Parser):
return 'whitespace__'
except (AttributeError, NameError):
pass
return '/' + escape_control_characters('%s' % abbreviate_middle(pattern, 118)) + '/'
return '/' + escape_control_characters('%s' % abbreviate_middle(pattern, 118))\
.replace('/', '\\/') + '/'
def DropToken(text: str) -> Token:
@@ -169,10 +169,16 @@ def escape_re(strg: str) -> str:
def escape_control_characters(strg: str) -> str:
r"""
Replace all control characters (e.g. `\n` `\t`) as well as the
forward slash `/` in a string by their backslashed representation.
"""
return repr(strg).replace('\\\\', '\\').replace('/', '\\/')[1:-1]
Replace all control characters (e.g. `\n`, `\t`) in a string
by their backslashed representation and replace each backslash
by a double backslash.
"""
s = repr(strg.replace('\\', r'\\')).replace('\\\\', '\\')[1:-1]
if s.startswith(r"\'") and s.endswith((r"\'")):
return ''.join(["'", s[2:-2], "'"])
elif s.startswith(r'\"') and s.endswith((r'\"')):
return ''.join(['"', s[2:-2], '"'])
return s
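# Illustration (not part of the commit): with the new implementation above, a tab
# character becomes the two characters backslash + 't', and a single backslash is
# doubled:
#     escape_control_characters('tab\there')   == r'tab\there'
#     escape_control_characters('back\\slash') == r'back\\slash'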
def lstrip_docstring(docstring: str) -> str:
@@ -13,15 +13,18 @@
# specialized error messages for certain cases
@ definition_error = /,/, 'Delimiter "," not expected in definition. Either this was meant to be a directive and the directive symbol @ is missing or the error is due to inconsistent use of the comma as a delimiter for the elements of a sequence.'
@ definition_error = /,/, 'Delimiter "," not expected in definition. Either this was meant to '
'be a directive and the directive symbol @ is missing or the error is '
'due to inconsistent use of the comma as a delimiter for the elements '
'of a sequence.'
#: top-level
syntax = [~//] { definition | directive } §EOF
definition = symbol §:DEF~ expression :ENDL~
directive = "@" §symbol "="
(regexp | literal | symbol)
{ "," (regexp | literal | symbol) }
(regexp | literals | symbol)
{ "," (regexp | literals | symbol) }
#: components
@@ -59,6 +62,7 @@ option = "[" §expression "]" | element "?"
#: leaf-elements
symbol = /(?!\d)\w+/~ # e.g. expression, term, parameter_list
literals = { literal }+ # string chaining, only allowed in directives!
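# (see the @ definition_error directive at the top of this grammar for an example of
#  such a chain of strings; the EBNF compiler joins them into a single literal)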
literal = /"(?:(?<!\\)\\"|[^"])*?"/~ # e.g. "(", '+', 'while'
| /'(?:(?<!\\)\\'|[^'])*?'/~ # whitespace following literals will be ignored tacitly.
plaintext = /`(?:(?<!\\)\\`|[^`])*?`/~ # like literal but does not eat whitespace
@@ -29,18 +29,18 @@ except ImportError:
from DHParser import start_logging, suspend_logging, resume_logging, is_filename, load_if_file, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, Whitespace, Drop, \
Lookbehind, Lookahead, Alternative, Pop, Token, Synonym, Interleave, \
Unordered, Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, Capture, \
Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, Capture, \
ZeroOrMore, Forward, NegativeLookahead, Required, mixin_comment, compile_source, \
grammar_changed, last_value, matching_bracket, PreprocessorFunc, is_empty, remove_if, \
Node, TransformationFunc, TransformationDict, transformation_factory, traverse, \
remove_children_if, move_adjacent, normalize_whitespace, is_anonymous, matches_re, \
reduce_single_child, replace_by_single_child, replace_or_reduce, remove_whitespace, \
replace_by_children, remove_empty, remove_tokens, flatten, is_insignificant_whitespace, \
merge_adjacent, collapse, collapse_children_if, replace_content, WHITESPACE_PTYPE, \
merge_adjacent, collapse, collapse_children_if, WHITESPACE_PTYPE, \
TOKEN_PTYPE, remove_children, remove_content, remove_brackets, change_tag_name, \
remove_anonymous_tokens, keep_children, is_one_of, not_one_of, has_content, apply_if, peek, \
remove_anonymous_empty, keep_nodes, traverse_locally, strip, lstrip, rstrip, \
replace_content, replace_content_by, forbid, assert_content, remove_infix_operator, \
forbid, assert_content, remove_infix_operator, \
add_error, error_on, recompile_grammar, left_associative, lean_left, set_config_value, \
get_config_value, XML_SERIALIZATION, SXPRESSION_SERIALIZATION, \
COMPACT_SERIALIZATION, JSON_SERIALIZATION, access_thread_locals, access_presets, \
@@ -77,7 +77,7 @@ class FlexibleEBNFGrammar(Grammar):
OR = Forward()
element = Forward()
expression = Forward()
source_hash__ = "7d9e44b282738715df9b2e51d8be4bd4"
source_hash__ = "eff7b5194e67993e79afbb25f4f0af40"
anonymous__ = re.compile('pure_elem$|EOF$')
static_analysis_pending__ = [] # type: List[bool]
parser_initialization__ = ["upon instantiation"]
@@ -99,6 +99,7 @@ class FlexibleEBNFGrammar(Grammar):
regexp = Series(RegExp('/(?:(?<!\\\\)\\\\(?:/)|[^/])*?/'), dwsp__)
plaintext = Series(RegExp('`(?:(?<!\\\\)\\\\`|[^`])*?`'), dwsp__)
literal = Alternative(Series(RegExp('"(?:(?<!\\\\)\\\\"|[^"])*?"'), dwsp__), Series(RegExp("'(?:(?<!\\\\)\\\\'|[^'])*?'"), dwsp__))
literals = OneOrMore(literal)
symbol = Series(RegExp('(?!\\d)\\w+'), dwsp__)
option = Alternative(Series(Series(Token("["), dwsp__), expression, Series(Token("]"), dwsp__), mandatory=1), Series(element, Series(Token("?"), dwsp__)))
repetition = Alternative(Series(Series(Token("{"), dwsp__), expression, Series(Token("}"), dwsp__), mandatory=1), Series(element, Series(Token("*"), dwsp__)))
@@ -114,7 +115,7 @@ class FlexibleEBNFGrammar(Grammar):
interleave = Series(difference, ZeroOrMore(Series(Series(Token("°"), dwsp__), Option(Series(Token("§"), dwsp__)), difference)))
sequence = Series(Option(Series(Token("§"), dwsp__)), Alternative(interleave, lookaround), ZeroOrMore(Series(Retrieve(AND), dwsp__, Option(Series(Token("§"), dwsp__)), Alternative(interleave, lookaround))))
expression.set(Series(sequence, ZeroOrMore(Series(Retrieve(OR), dwsp__, sequence))))
directive = Series(Series(Token("@"), dwsp__), symbol, Series(Token("="), dwsp__), Alternative(regexp, literal, symbol), ZeroOrMore(Series(Series(Token(","), dwsp__), Alternative(regexp, literal, symbol))), mandatory=1)
directive = Series(Series(Token("@"), dwsp__), symbol, Series(Token("="), dwsp__), Alternative(regexp, literals, symbol), ZeroOrMore(Series(Series(Token(","), dwsp__), Alternative(regexp, literals, symbol))), mandatory=1)
definition = Series(symbol, Retrieve(DEF), dwsp__, expression, Retrieve(ENDL), dwsp__, mandatory=1, err_msgs=error_messages__["definition"])
syntax = Series(Option(Series(dwsp__, RegExp(''))), ZeroOrMore(Alternative(definition, directive)), EOF, mandatory=2)
root__ = syntax
# EBNF-Grammar in EBNF
@ comment = /#.*(?:\n|$)/ # comments start with '#' and eat all chars up to and including '\n'
@ whitespace = /\s*/ # whitespace includes linefeed
@ literalws = right # trailing whitespace of literals will be ignored tacitly
@ anonymous = pure_elem, EOF
@ drop = whitespace, EOF # do not include these even in the concrete syntax tree
#: top-level
syntax = [~//] { definition | directive } §EOF
definition = symbol §:DEF~ expression :ENDL~
directive = "@" §symbol "="
(regexp | literal | symbol)
{ "," (regexp | literal | symbol) }
#: components
expression = sequence { :OR~ sequence }
sequence = ["§"] ( interleave | lookaround ) # "§" means all following terms mandatory
{ :AND~ ["§"] ( interleave | lookaround ) }
interleave = term { "°" ["§"] term }
lookaround = flowmarker (oneormore | pure_elem)
term = oneormore | repetition | option | pure_elem
#: elements
pure_elem = element § !/[?*+]/ # element strictly without a suffix
element = [retrieveop] symbol !DEF # negative lookahead to be sure it's not a definition
| literal
| plaintext
| regexp
| whitespace
| group
#: flow-operators
flowmarker = "!" | "&" # '!' negative lookahead, '&' positive lookahead
| "<-!" | "<-&" # '<-' negative lookbehind, '<-&' positive lookbehind
retrieveop = "::" | ":?" | ":" # '::' pop, ':?' optional pop, ':' retrieve
#: groups
group = "(" §expression ")"
oneormore = "{" expression "}+" | element "+"
repetition = "{" §expression "}" | element "*"
option = "[" §expression "]" | element "?"
#: leaf-elements
symbol = /(?!\d)\w+/~ # e.g. expression, term, parameter_list
literal = /"(?:(?<!\\)\\"|[^"])*?"/~ # e.g. "(", '+', 'while'
| /'(?:(?<!\\)\\'|[^'])*?'/~ # whitespace following literals will be ignored tacitly.
plaintext = /`(?:(?<!\\)\\`|[^`])*?`/~ # like literal but does not eat whitespace
regexp = /\/(?:(?<!\\)\\(?:\/)|[^\/])*?\//~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
whitespace = /~/~ # insignificant whitespace
#: delimiters
DEF = `=` | `:=` | `::=`
OR = `|`
AND = `,` | ``
ENDL = `;` | ``
EOF = !/./ [:?DEF] [:?OR] [:?AND] [:?ENDL] # [:?DEF], [:?OR], ... clear stack by eating stored value
@@ -624,6 +624,17 @@ class TestErrorCustomizationErrors:
result, messages, ast = compile_ebnf(lang)
assert len(messages) == 1
def test_long_error_message(self):
lang = """
document = series
@series_error = 'an error message that spreads\n over '
'several strings'
series = "A" § "B" "C"
"""
parser = grammar_provider(lang)()
result = parser('ADX')
assert "several strings" in str(result.errors)
class TestCustomizedResumeParsing:
def setup(self):
@@ -342,7 +342,7 @@ class TestRegex:
parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
result = parser(testdoc)
# log_parsing_history(parser, "test.log")
assert not result.error_flag
assert not result.error_flag, str(result.errors_sorted)
class TestGrammar:
@@ -704,7 +704,7 @@ class TestPopRetrieve:
def test_optional_match(self):
test1 = '<info>Hey, you</info>'
st = self.minilang_parser4(test1)
assert not st.error_flag
assert not st.error_flag, str(st.errors_sorted)
test12 = '<info>Hey, <emph>you</emph></info>'
st = self.minilang_parser4(test1)
assert not st.error_flag