Commit 41c88d26 authored by di68kap's avatar di68kap
Browse files

- DHParser/parse.py: resume and skip directives now jump to the reentry-point...

- DHParser/parse.py: resume and skip directives now jump to the reentry-point right after the match of the regular expression for re-entry
parent 6d1e7958
......@@ -739,10 +739,11 @@ class EBNFCompiler(Compiler):
if nd.tag_name == 'regexp':
return unrepr("re.compile(r'%s')" % self._extract_regex(nd))
elif nd.tag_name == 'literal':
s = nd.content.strip()
return s.strip('"') if s[0] == '"' else s.strip("'")
s = nd.content[1:-1] # remove quotation marks
return unrepr("re.compile(r'%s')" % escape_re(s))
return ''
def _gen_search_list(self, nodes: Sequence[Node]) -> List[Union[unrepr, str]]:
search_list = [] # type: List[Union[unrepr, str]]
for child in nodes:
......
......@@ -43,7 +43,7 @@ from DHParser.stringview import StringView, EMPTY_STRING_VIEW
from DHParser.syntaxtree import Node, FrozenNode, RootNode, WHITESPACE_PTYPE, \
TOKEN_PTYPE, ZOMBIE_TAG, ResultType
from DHParser.toolkit import sane_parser_name, escape_control_characters, re, cython, \
RX_NEVER_MATCH
RX_NEVER_MATCH, RxPatternType
__all__ = ('Parser',
......@@ -118,19 +118,22 @@ class ParserError(Exception):
return "%i: %s %s" % (self.node.pos, str(self.rest[:25]), repr(self.node))
ResumeList = List[Union[str, Any]] # list of strings or regular expressiones
ResumeList = List[RxPatternType]  # list of regular expressions
def reentry_point(rest: StringView, rules: ResumeList, comment_regex) -> int:
"""
Finds the point where parsing should resume after a ParserError has been caught.
The algorithm makes sure that this reentry-point does not lie inside a comment.
The re-entry point is always the point after the end of the match of the regular
expression defining the re-entry point. (Use look ahead, if you want to define
the re-entry point by what follows rather than by what text precedes the point.)
Args:
rest: The rest of the parsed text or, in other words, the point where
a ParserError was thrown.
rules: A list of strings or regular expressions. The rest of the text is
searched for each of these. The closest match is the point where
parsing will be resumed.
a ParserError was thrown.
rules: A list of regular expressions. The rest of the text is searched for
each of these. The closest match is the point where parsing will be
resumed.
comment_regex: A regular expression object that matches comments.
Returns:
The integer index of the closest reentry point or -1 if no reentry-point
......@@ -151,9 +154,9 @@ def reentry_point(rest: StringView, rules: ResumeList, comment_regex) -> int:
comments = None
return -1, -2
def str_search(s, start: int = 0) -> Tuple[int, int]:
nonlocal rest
return rest.find(s, start), len(rule)
# def str_search(s, start: int = 0) -> Tuple[int, int]:
# nonlocal rest
# return rest.find(s, start), len(s)
def rx_search(rx, start: int = 0) -> Tuple[int, int]:
nonlocal rest
......@@ -169,18 +172,17 @@ def reentry_point(rest: StringView, rules: ResumeList, comment_regex) -> int:
while a < b <= k:
a, b = next_comment()
while a <= k < b:
k, length = search_func(search_rule, k + length)
k, length = search_func(search_rule, k + max(length, 1))
while a < b <= k:
a, b = next_comment()
return k if k >= 0 else upper_limit
return k + length if k >= 0 else upper_limit
# find closest match
for rule in rules:
comments = rest.finditer(comment_regex)
if isinstance(rule, str):
pos = entry_point(str_search, rule)
else: # rule is a compiled regular expression
pos = entry_point(rx_search, rule)
assert not isinstance(rule, str), \
'Strings not allowed as search rules, use a regular expression instead.'
pos = entry_point(rx_search, rule)
closest_match = min(pos, closest_match)
# in case no rule matched return -1
......@@ -676,9 +678,9 @@ class Grammar:
of yet incomplete grammars class Grammar does not assume that this
is the case.
resume_rules__: A mapping of parser names to a list of regular expressions or search
strings that act as rules to find the the reentry point if a ParserError
was thrown during the execution of the parser with the respective name.
resume_rules__: A mapping of parser names to a list of regular expressions
that act as rules to find the reentry point if a ParserError was
thrown during the execution of the parser with the respective name.
parser_initialization__: Before the parser class (!) has been initialized,
which happens upon the first time it is instantiated (see
......
......@@ -54,6 +54,7 @@ __all__ = ('typing',
'cython_optimized',
'NEVER_MATCH_PATTERN',
'RX_NEVER_MATCH',
'RxPatternType',
're_find',
'escape_re',
'escape_control_characters',
......@@ -108,6 +109,12 @@ NEVER_MATCH_PATTERN = r'..(?<=^)'
RX_NEVER_MATCH = re.compile(NEVER_MATCH_PATTERN)
try:
RxPatternType = re.Pattern
except AttributeError:
RxPatternType = type(re.compile(''))
def re_find(s, r, pos=0, endpos=9223372036854775807):
"""
Returns the match of the first occurrence of the regular expression
......
......@@ -61,7 +61,7 @@ class ArithmeticGrammar(Grammar):
r"""Parser for an Arithmetic source file.
"""
expression = Forward()
source_hash__ = "b75119067b29e37cd0bfe66facbcad22"
source_hash__ = "4197ddd06ba30244927f160c6f46e30f"
static_analysis_pending__ = [True]
parser_initialization__ = ["upon instantiation"]
resume_rules__ = {}
......
......@@ -65,7 +65,7 @@ class ArithmeticRightRecursiveGrammar(Grammar):
sign = Forward()
tail = Forward()
term = Forward()
source_hash__ = "0c78c1f796133256eba3d3784356105a"
source_hash__ = "3d81c718b586fbd4490776d2cd4e3e53"
static_analysis_pending__ = [True]
parser_initialization__ = ["upon instantiation"]
resume_rules__ = {}
......
......@@ -62,7 +62,7 @@ class ArithmeticRightRecursiveGrammar(Grammar):
"""
expression = Forward()
term = Forward()
source_hash__ = "d98f130442f0561e9217728aceb8eec3"
source_hash__ = "9f9acd23245ae0a07680aa9cfda7952f"
static_analysis_pending__ = [True]
parser_initialization__ = ["upon instantiation"]
resume_rules__ = {}
......
......@@ -61,7 +61,7 @@ class ArithmeticSimpleGrammar(Grammar):
r"""Parser for an ArithmeticSimple source file.
"""
expression = Forward()
source_hash__ = "e255247d005cd6cb213068881d76ffb4"
source_hash__ = "057a467fa65d6c91e5658319e2c9469f"
static_analysis_pending__ = [True]
parser_initialization__ = ["upon instantiation"]
resume_rules__ = {}
......
......@@ -59,11 +59,12 @@ class BibTeXGrammar(Grammar):
r"""Parser for a BibTeX source file.
"""
text = Forward()
source_hash__ = "61400955f6b57b8ec517dd11b6563d47"
source_hash__ = "197c4727de18b0cb980ff33a8e33f5ca"
static_analysis_pending__ = [True]
parser_initialization__ = ["upon instantiation"]
resume_rules__ = {}
COMMENT__ = r'(?i)%[^\n]*\n'
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
wsp__ = Whitespace(WSP_RE__)
......
......@@ -63,7 +63,7 @@ class EBNFGrammar(Grammar):
r"""Parser for an EBNF source file.
"""
expression = Forward()
source_hash__ = "be91994c910201cdf0bd2da656c7cc01"
source_hash__ = "a7929c507e1b8319071d18dc9eaccdf7"
static_analysis_pending__ = [True]
parser_initialization__ = ["upon instantiation"]
resume_rules__ = {}
......
......@@ -62,7 +62,7 @@ class LaTeXGrammar(Grammar):
paragraph = Forward()
tabular_config = Forward()
text_element = Forward()
source_hash__ = "4e6ab6dede977be94d488bcc9990a5bb"
source_hash__ = "e01e15066b585d52cc71c0c60d2adff1"
static_analysis_pending__ = [True]
parser_initialization__ = ["upon instantiation"]
resume_rules__ = {}
......
......@@ -68,7 +68,7 @@ class XMLGrammar(Grammar):
extSubsetDecl = Forward()
ignoreSectContents = Forward()
markupdecl = Forward()
source_hash__ = "fabca55375f62d0a2f009cdfd76f0f77"
source_hash__ = "4cd0cef2b3f3559b014e4d34e5d8b1f6"
static_analysis_pending__ = [True]
parser_initialization__ = ["upon instantiation"]
resume_rules__ = {}
......
......@@ -62,7 +62,7 @@ class XMLSnippetGrammar(Grammar):
"""
Name = Forward()
element = Forward()
source_hash__ = "54cd2aa8a44307a7c802ab2014a483eb"
source_hash__ = "251e31d28ec63ce674dc7a67686acaf1"
static_analysis_pending__ = [True]
parser_initialization__ = ["upon instantiation"]
resume_rules__ = {}
......
......@@ -3,11 +3,14 @@
M1: """
{
"leading and trailing whitespace": true,
"leading and trailing whitespace": true
}
"""
[ast:json]
[fail:json]
M2: """
{
......@@ -17,10 +20,6 @@ M2: """
"""
[ast:json]
[fail:json]
[match:element]
......
......@@ -19,16 +19,16 @@
#
#######################################################################
@ object_resume = /(?<=\})/
@ object_resume = /\}\s*/
@ member_error = /\w+/, 'Possible non-numerical and non-string values are `true`, `false` or `null` (always written with small letters and without quotation marks).'
@ member_error = /["\'`´]/, 'String values must be enclosed by double-quotation marks: "..."!'
@ member_error = /\\/, 'Possible escaped values are /, \\, b, n, r, t, or u.'
@ member_error = /\d/, '{1} does not represent a valid number or other value.'
@ member_resume = /,|\}/
@ member_resume = /(?=,|\})/
@ string_error = '', 'Illegal character "{1}" in string.'
@ string_skip = /"/
@ string_skip = /(?=")/
json = ~ element EOF
element = value
......
......@@ -64,13 +64,13 @@ class jsonGrammar(Grammar):
"""
element = Forward()
value = Forward()
source_hash__ = "fe49705afe85da112b73e44a1690fde2"
source_hash__ = "ffc7450f5d46ff7934b8597317bc3393"
static_analysis_pending__ = [True]
parser_initialization__ = ["upon instantiation"]
string_skip__ = [re.compile(r'"')]
string_err_msg__ = [('', 'Illegal character "{1}" in string.')]
string_skip__ = [re.compile(r'(?=")')]
string_err_msg__ = [(re.compile(r''), 'Illegal character "{1}" in string.')]
member_err_msg__ = [(re.compile(r'\w+'), 'Possible non-numerical and non-string values are `true`, `false` or `null` (always written with small letters and without quotation marks).'), (re.compile(r'["\'`´]'), 'String values must be enclosed by double-quotation marks: "..."!'), (re.compile(r'\\'), 'Possible escaped values are /, \\, b, n, r, t, or u.'), (re.compile(r'\d'), '{1} does not represent a valid number or other value.')]
resume_rules__ = {'object': [re.compile(r'(?<=\})')], 'member': [re.compile(r',|\}')]}
resume_rules__ = {'object': [re.compile(r'\}\s*')], 'member': [re.compile(r'(?=,|\})')]}
COMMENT__ = r'(?:\/\/|#).*'
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
......
......@@ -64,7 +64,7 @@ class yamlGrammar(Grammar):
"""
element = Forward()
value = Forward()
source_hash__ = "32d03563d86b8ca074f0155c29076a36"
source_hash__ = "9660424d926b901b9654279f23e6c1f1"
static_analysis_pending__ = [True]
parser_initialization__ = ["upon instantiation"]
resume_rules__ = {}
......
......@@ -594,9 +594,9 @@ class TestErrorCustomizationErrors:
class TestCustomizedResumeParsing:
def setup(self):
lang = r"""
@ alpha_resume = 'BETA', GAMMA_STR
@ alpha_resume = /(?=BETA)/, /(?=GAMMA)/
@ beta_resume = GAMMA_RE
@ bac_resume = /GA\w+/
@ bac_resume = /(?=GA\w+)/
document = alpha [beta] gamma "."
alpha = "ALPHA" abc
abc = §"a" "b" "c"
......@@ -606,8 +606,7 @@ class TestCustomizedResumeParsing:
gamma = "GAMMA" §(cab | cba)
cab = "c" "a" §"b"
cba = "c" "b" §"a"
GAMMA_RE = /GA\w+/
GAMMA_STR = "GAMMA"
GAMMA_RE = /(?=GA\w+)/
"""
self.gr = grammar_provider(lang)()
......@@ -645,7 +644,7 @@ class TestInSeriesResume:
def setup(self):
lang = """
document = series
@series_skip = /B/, /C/, /D/, /E/, /F/, /G/
@series_skip = /(?=[BCDEFG])/
series = "A" §"B" "C" "D" "E" "F" "G"
"""
self.gr = grammar_provider(lang)()
......@@ -702,7 +701,7 @@ class TestAllOfResume:
def test_allof_resume_later(self):
lang = """
document = flow "."
@ flow_resume = '.'
@ flow_resume = /(?=\.)/
flow = allof | series
@ allof_error = '{} erwartet, {} gefunden :-('
allof = < "A" "B" § "C" "D" "E" "F" "G" >
......@@ -725,12 +724,12 @@ class TestAllOfResume:
def test_complex_resume_task(self):
lang = """
document = flow { flow } "."
@ flow_resume = '.'
@ flow_resume = /(?=[.])/
flow = allof | series
@ allof_error = '{} erwartet, {} gefunden :-('
@ allof_resume = 'E', 'A'
@ allof_resume = /(?=E)/, /(?=A)/
allof = < "A" "B" § "C" "D" "E" "F" "G" >
@ series_resume = 'E', 'A'
@ series_resume = /(?=E)/, /(?=A)/
series = "E" "X" §"Y" "Z"
"""
gr = grammar_provider(lang)()
......@@ -746,6 +745,7 @@ class TestAllOfResume:
st = gr('FCB_GAED.')
assert len(st.errors_sorted) == 2
st = gr('EXY EXYZ.')
print(st.errors)
assert len(st.errors_sorted) == 1
......
......@@ -27,7 +27,7 @@ scriptpath = os.path.dirname(__file__) or '.'
sys.path.append(os.path.abspath(os.path.join(scriptpath, '..')))
from DHParser.configuration import get_config_value, set_config_value
from DHParser.toolkit import compile_python_object
from DHParser.toolkit import compile_python_object, re
from DHParser.log import is_logging, log_ST, log_parsing_history
from DHParser.error import Error, is_error
from DHParser.parse import ParserError, Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
......@@ -483,7 +483,7 @@ class TestErrorRecovery:
def test_series_skip(self):
lang = """
document = series | /.*/
@series_skip = /[A-Z]/
@series_skip = /(?=[A-Z])/
series = "A" "B" §"C" "D"
"""
parser = grammar_provider(lang)()
......@@ -763,7 +763,7 @@ class TestReentryAfterError:
def test_simple_resume_rule(self):
gr = self.gr; gr.resume_rules = dict()
gr.resume_rules__['alpha'] = ['BETA']
gr.resume_rules__['alpha'] = [re.compile(r'(?=BETA)')]
content = 'ALPHA acb BETA bac GAMMA cab .'
cst = gr(content)
assert cst.error_flag
......@@ -774,7 +774,7 @@ class TestReentryAfterError:
def test_failing_resume_rule(self):
gr = self.gr; gr.resume_rules = dict()
gr.resume_rules__['alpha'] = ['XXX']
gr.resume_rules__['alpha'] = [re.compile(r'(?=XXX)')]
content = 'ALPHA acb BETA bac GAMMA cab .'
cst = gr(content)
assert cst.error_flag
......@@ -783,7 +783,7 @@ class TestReentryAfterError:
def test_severl_reentry_points(self):
gr = self.gr; gr.resume_rules = dict()
gr.resume_rules__['alpha'] = ['BETA', 'GAMMA']
gr.resume_rules__['alpha'] = [re.compile(r'(?=BETA)'), re.compile(r'(?=GAMMA)')]
content = 'ALPHA acb BETA bac GAMMA cab .'
cst = gr(content)
assert cst.error_flag
......@@ -794,7 +794,7 @@ class TestReentryAfterError:
def test_several_reentry_points_second_point_matching(self):
gr = self.gr; gr.resume_rules = dict()
gr.resume_rules__['alpha'] = ['BETA', 'GAMMA']
gr.resume_rules__['alpha'] = [re.compile(r'(?=BETA)'), re.compile(r'(?=GAMMA)')]
content = 'ALPHA acb GAMMA cab .'
cst = gr(content)
assert cst.error_flag
......@@ -805,9 +805,9 @@ class TestReentryAfterError:
def test_several_resume_rules_innermost_rule_matching(self):
gr = self.gr; gr.resume_rules = dict()
gr.resume_rules__['alpha'] = ['BETA', 'GAMMA']
gr.resume_rules__['beta'] = ['GAMMA']
gr.resume_rules__['bac'] = ['GAMMA']
gr.resume_rules__['alpha'] = [re.compile(r'(?=BETA)'), re.compile(r'(?=GAMMA)')]
gr.resume_rules__['beta'] = [re.compile(r'(?=GAMMA)')]
gr.resume_rules__['bac'] = [re.compile(r'(?=GAMMA)')]
content = 'ALPHA abc BETA bad GAMMA cab .'
cst = gr(content)
assert cst.error_flag
......@@ -828,7 +828,7 @@ class TestReentryAfterError:
lang = r"""
@ comment = /(?:\/\/.*)|(?:\/\*(?:.|\n)*?\*\/)/ # Kommentare im C++-Stil
document = block_A block_B
@ block_A_resume = /x/
@ block_A_resume = /(?=x)/
block_A = "a" §"b" "c"
block_B = "x" "y" "z"
"""
......@@ -845,13 +845,9 @@ class TestReentryAfterError:
# test regex-defined resume rule
grammar = grammar_provider(lang)()
print(grammar.resume_rules__)
mini_suite(grammar)
# test string-defined resume rule
alt_lang = lang.replace('@ block_A_resume = /x/',
'@ block_A_resume = "x"')
grammar = grammar_provider(alt_lang)()
mini_suite(grammar)
def test_unambiguous_error_location(self):
lang = r"""
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment