Commit ffdc5a38 authored by di68kap


parse.py: preparations for allowing free algorithms for searching re-entry points or picking error messages
parent b69d78ce
......@@ -32,9 +32,9 @@ from typing import Callable, Dict, List, Set, Tuple, Sequence, Union, Optional,
from DHParser.compile import CompilerError, Compiler, ResultTuple, compile_source, visitor_name
from DHParser.configuration import access_thread_locals, get_config_value
from DHParser.error import Error, AMBIGUOUS_ERROR_HANDLING, WARNING, REDECLARED_TOKEN_WARNING, REDEFINED_DIRECTIVE, \
UNUSED_ERROR_HANDLING_WARNING, INAPPROPRIATE_SYMBOL_FOR_DIRECTIVE, DIRECTIVE_FOR_NONEXISTANT_SYMBOL, \
UNDEFINED_SYMBOL_IN_TRANSTABLE_WARNING
from DHParser.error import Error, AMBIGUOUS_ERROR_HANDLING, WARNING, REDECLARED_TOKEN_WARNING,\
REDEFINED_DIRECTIVE, UNUSED_ERROR_HANDLING_WARNING, INAPPROPRIATE_SYMBOL_FOR_DIRECTIVE, \
DIRECTIVE_FOR_NONEXISTANT_SYMBOL, UNDEFINED_SYMBOL_IN_TRANSTABLE_WARNING
from DHParser.parse import Grammar, mixin_comment, mixin_nonempty, Forward, RegExp, Drop, \
Lookahead, NegativeLookahead, Alternative, Series, Option, ZeroOrMore, OneOrMore, Token, \
Capture, Retrieve, Pop, optional_last_value, GrammarError, Whitespace, INFINITE
......
......@@ -49,7 +49,44 @@ __all__ = ('ErrorCode',
'is_warning',
'has_errors',
'only_errors',
'adjust_error_locations')
'adjust_error_locations',
'NO_ERROR',
'NOTICE',
'WARNING',
'ERROR',
'FATAL',
'HIGHEST',
'RESUME_NOTICE',
'REDECLARED_TOKEN_WARNING',
'UNUSED_ERROR_HANDLING_WARNING',
'LEFT_RECURSION_WARNING',
'UNDEFINED_SYMBOL_IN_TRANSTABLE_WARNING',
'CANNOT_VERIFY_TRANSTABLE_WARNING',
'CAPTURE_DROPPED_CONTENT_WARNING',
'OPTIONAL_REDUNDANTLY_NESTED_WARNING',
'MANDATORY_CONTINUATION',
'MANDATORY_CONTINUATION_AT_EOF',
'PARSER_DID_NOT_MATCH',
'PARSER_LOOKAHEAD_FAILURE_ONLY',
'PARSER_STOPPED_BEFORE_END',
'PARSER_LOOKAHEAD_MATCH_ONLY',
'CAPTURE_STACK_NOT_EMPTY',
'MALFORMED_ERROR_STRING',
'AMBIGUOUS_ERROR_HANDLING',
'REDEFINED_DIRECTIVE',
'UNDEFINED_RETRIEVE',
'DIRECTIVE_FOR_NONEXISTANT_SYMBOL',
'INAPPROPRIATE_SYMBOL_FOR_DIRECTIVE',
'CAPTURE_WITHOUT_PARSERNAME',
'LOOKAHEAD_WITH_OPTIONAL_PARSER',
'BADLY_NESTED_OPTIONAL_PARSER',
'NARY_WITHOUT_PARSERS',
'BAD_MANDATORY_SETUP',
'DUPLICATE_PARSERS_IN_ALTERNATIVE',
'BAD_ORDER_OF_ALTERNATIVES',
'TREE_PROCESSING_CRASH',
'COMPILER_CRASH',
'AST_TRANSFORM_CRASH')
class ErrorCode(int):
......
......@@ -128,7 +128,9 @@ class ParserError(Exception):
return "%i: %s %s" % (self.node.pos, str(self.rest[:25]), repr(self.node))
ResumeList = List[RxPatternType]  # list of regular expressions
ResumeList = List[Union[RxPatternType, str, Callable]]  # list of strings, regular expressions or search functions
ReentryPointAlgorithm = Callable[[StringView, int], Tuple[int, int]]
# (text, start point) => (reentry point, match length)
@cython.locals(upper_limit=cython.int, closest_match=cython.int, pos=cython.int)
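The new ReentryPointAlgorithm alias spells out the interface that such free search algorithms are expected to implement: they receive the remaining text and a start offset and return the offset of the re-entry point together with the length of the match (or -1 if nothing was found). A minimal sketch of a conforming function, with a made-up rule, might look like this:

    def skip_to_next_semicolon(text, start: int = 0):
        # Hypothetical rule, for illustration only: resume parsing right
        # after the next semicolon in the remaining text; return (-1, 0)
        # if there is none within the rest of the text.
        i = text.find(';', start)
        return (i, 1) if i >= 0 else (-1, 0)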
......@@ -145,7 +147,8 @@ def reentry_point(rest: StringView,
Args:
rest: The rest of the parsed text or, in other words, the point where
a ParserError was thrown.
rules: A list of regular expressions. The rest of the text is searched for
rules: A list of strings, regular expressions or callables, i.e.
reentry-point search functions. The rest of the text is searched for
each of these. The closest match is the point where parsing will be
resumed.
comment_regex: A regular expression object that matches comments.
......@@ -171,9 +174,10 @@ def reentry_point(rest: StringView,
comments = None
return -1, -2
# def str_search(s, start: int = 0) -> Tuple[int, int]:
# nonlocal rest
# return rest.find(s, start, start + search_window), len(s)
@cython.locals(start=cython.int)
def str_search(s, start: int = 0) -> Tuple[int, int]:
nonlocal rest
return rest.find(s, start, start + search_window), len(s)
@cython.locals(start=cython.int, end=cython.int)
def rx_search(rx, start: int = 0) -> Tuple[int, int]:
......@@ -184,6 +188,10 @@ def reentry_point(rest: StringView,
return rest.index(begin), end - begin
return -1, 0
def algorithm_search(func: Callable, start: int = 0):
nonlocal rest
return func(rest, start)
@cython.locals(a=cython.int, b=cython.int, k=cython.int, length=cython.int)
def entry_point(search_func, search_rule) -> int:
a, b = next_comment()
......@@ -200,9 +208,13 @@ def reentry_point(rest: StringView,
# find closest match
for rule in rules:
comments = rest.finditer(comment_regex)
assert not isinstance(rule, str), \
'Strings not allowed as search rules, use a regular expression instead.'
pos = entry_point(rx_search, rule)
if callable(rule):
search_func = algorithm_search
elif isinstance(rule, str):
search_func = str_search
else:
search_func = rx_search
pos = entry_point(search_func, rule)
closest_match = min(pos, closest_match)
# in case no rule matched return -1
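With this dispatch in place, a single list of resume rules can mix all three kinds of entries; each rule is routed to str_search, rx_search or algorithm_search as appropriate. A hypothetical example (the rules themselves are invented for illustration):

    import re

    resume_rules = [
        'GAMMA',                    # plain string                  -> str_search
        re.compile(r'(?=GA\w+)'),   # compiled regular expression   -> rx_search
        skip_to_next_semicolon,     # callable (see sketch above)   -> algorithm_search
    ]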
......@@ -2039,7 +2051,7 @@ class OneOrMore(UnaryParser):
return None
MessagesType = List[Tuple[Union[str, Any], str]]
MessagesType = List[Tuple[Union[str, RxPatternType, Callable], str]]
NO_MANDATORY = 2**30
......@@ -2054,9 +2066,9 @@ class MandatoryNary(NaryParser):
parameter might change depending on the sub-class implementing
it.
err_msgs: A list of pairs of regular expressions (or simple
strings for that matter) and error messages that are chosen
if the regular expression matches the text where the error
occurred.
strings or boolean-valued functions) and error messages
that are chosen if the search rule matches the text
where the error occurred.
skip: A list of regular expressions. The rest of the text is searched for
each of these. The closest match is the point where parsing will be
resumed.
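For illustration, an err_msgs list combining the three kinds of search rules described above could look as follows (a hypothetical sketch, not taken from the code base; the placeholders are filled with the expected and the found text by the format-call further down):

    import re

    err_msgs = [
        ('garbage',                    'expected {0}, but found filler text'),   # plain string
        (re.compile(r'\d+'),           'numbers are not allowed here: {1}'),     # regular expression
        (lambda text: text[:1] == ';', 'superfluous semicolon in front of {0}'), # boolean-valued function
    ]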
......@@ -2136,8 +2148,12 @@ class MandatoryNary(NaryParser):
err_node = Node(ZOMBIE_TAG, text_[:i]).with_pos(location)
found = text_[:10].replace('\n', '\\n ') + '...'
for search, message in self.err_msgs:
rxs = not isinstance(search, str)
if (rxs and text_.match(search)) or (not rxs and text_.startswith(search)):
is_func = callable(search) # search rule is a function: StringView -> bool
is_str = isinstance(search, str) # search rule is a simple string
is_rxs = not is_func and not is_str # search rule is a regular expression
if (is_func and search(text_)) \
or (is_rxs and text_.match(search)) \
or (is_str and text_.startswith(search)):
try:
msg = message.format(expected, found)
break
......
......@@ -20,7 +20,7 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
# Quite slow under MS Windows, threrefore renamed to notest_server.py
# Quite slow under MS Windows! Therefore, renamed to notest_server.py
# so that it is not regularly called when running pytest/nosetest on
# the test directory.
......@@ -59,9 +59,9 @@ def compiler_dummy(src: str, log_dir: str='') -> str:
return src
def long_running(duration: float) -> str:
def long_running(duration: float) -> float:
time.sleep(float(duration))
return(duration)
return duration
def send_request(request: str, expect_response: bool = True) -> str:
......@@ -91,13 +91,10 @@ def json_rpc(method: str, params: dict) -> str:
class TestServer:
# def test_server(self):
# cs = Server(compiler_dummy)
# cs.run_server()
spawn = multiprocessing.get_start_method() == "spawn"
def setup(self):
stop_server('127.0.0.1', TEST_PORT)
self.spawn = multiprocessing.get_start_method() == "spawn"
def teardown(self):
stop_server('127.0.0.1', TEST_PORT)
......
......@@ -61,8 +61,7 @@ class SerializingTestCompiler(Compiler):
class TestCompilerClass:
def setup(self):
self.original = parse_sxpr('(A (B "1") (C (D (E "2") (F "3"))))')
original = parse_sxpr('(A (B "1") (C (D (E "2") (F "3"))))')
def test_zero_compiler(self):
"""Tests the fallback-method and boilerplate of the compiler."""
......
......@@ -30,8 +30,8 @@ sys.path.append(os.path.abspath(os.path.join(scriptpath, '..')))
from DHParser.toolkit import compile_python_object, re, DHPARSER_PARENTDIR
from DHParser.preprocess import nil_preprocessor
from DHParser import compile_source
from DHParser.error import has_errors, Error, PARSER_DID_NOT_MATCH, MANDATORY_CONTINUATION, REDEFINED_DIRECTIVE, \
UNUSED_ERROR_HANDLING_WARNING, AMBIGUOUS_ERROR_HANDLING
from DHParser.error import has_errors, Error, PARSER_DID_NOT_MATCH, MANDATORY_CONTINUATION, \
REDEFINED_DIRECTIVE, UNUSED_ERROR_HANDLING_WARNING, AMBIGUOUS_ERROR_HANDLING
from DHParser.syntaxtree import WHITESPACE_PTYPE
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, EBNFTransform, \
EBNFDirectives, get_ebnf_compiler, compile_ebnf, DHPARSER_IMPORTS
......@@ -143,10 +143,11 @@ class TestEBNFParser:
}
}
EBNF = get_ebnf_grammar()
def setup(self):
self.save_dir = os.getcwd()
os.chdir(scriptpath)
self.EBNF = get_ebnf_grammar()
def teardown(self):
clean_report('REPORT_TestEBNFParser')
......@@ -340,10 +341,9 @@ class TestSelfHosting:
class TestBoundaryCases:
def setup(self):
self.gr = get_ebnf_grammar()
self.tr = get_ebnf_transformer()
self.cp = get_ebnf_compiler()
gr = get_ebnf_grammar()
tr = get_ebnf_transformer()
cp = get_ebnf_compiler()
def test_empty_grammar(self):
t = self.gr("")
......@@ -638,23 +638,22 @@ class TestErrorCustomizationErrors:
class TestCustomizedResumeParsing:
def setup(self):
lang = r"""
@ alpha_resume = "BETA", "GAMMA"
@ beta_resume = GAMMA_RE
@ bac_resume = /(?=GA\w+)/
document = alpha [beta] gamma "."
alpha = "ALPHA" abc
abc = §"a" "b" "c"
beta = "BETA" (bac | bca)
bac = "b" "a" §"c"
bca = "b" "c" §"a"
gamma = "GAMMA" §(cab | cba)
cab = "c" "a" §"b"
cba = "c" "b" §"a"
GAMMA_RE = /(?=GA\w+)/
"""
self.gr = grammar_provider(lang)()
lang = r"""
@ alpha_resume = "BETA", "GAMMA"
@ beta_resume = GAMMA_RE
@ bac_resume = /(?=GA\w+)/
document = alpha [beta] gamma "."
alpha = "ALPHA" abc
abc = §"a" "b" "c"
beta = "BETA" (bac | bca)
bac = "b" "a" §"c"
bca = "b" "c" §"a"
gamma = "GAMMA" §(cab | cba)
cab = "c" "a" §"b"
cba = "c" "b" §"a"
GAMMA_RE = /(?=GA\w+)/
"""
gr = grammar_provider(lang)()
def test_several_resume_rules_innermost_rule_matching(self):
gr = self.gr
......@@ -750,14 +749,13 @@ class TestInSeriesResume:
class TestInterleaveResume:
def setup(self):
lang = """
document = allof
@ allof_error = '{} erwartet, {} gefunden :-('
@ allof_skip = "D", "E", "F", "G"
allof = "A" ° "B" ° §"C" ° "D" ° "E" ° "F" ° "G"
"""
self.gr = grammar_provider(lang)()
lang = """
document = allof
@ allof_error = '{} erwartet, {} gefunden :-('
@ allof_skip = "D", "E", "F", "G"
allof = "A" ° "B" ° §"C" ° "D" ° "E" ° "F" ° "G"
"""
gr = grammar_provider(lang)()
def test_garbage_added(self):
st = self.gr('BAGFCED')
......
......@@ -222,13 +222,12 @@ class TestInfiLoopsAndRecursion:
class TestFlowControl:
def setup(self):
self.t1 = """
All work and no play
makes Jack a dull boy
END
"""
self.t2 = "All word and not play makes Jack a dull boy END\n"
t1 = """
All work and no play
makes Jack a dull boy
END
"""
t2 = "All word and not play makes Jack a dull boy END\n"
def test_lookbehind(self):
ws = RegExp(r'\s*')
......@@ -348,17 +347,16 @@ class TestRegex:
class TestGrammar:
def setup(self):
grammar = r"""@whitespace = horizontal
haupt = textzeile LEERZEILE
textzeile = { WORT }+
WORT = /[^ \t]+/~
LEERZEILE = /\n[ \t]*(?=\n)/~
"""
self.pyparser, messages, _ = compile_source(grammar, None, get_ebnf_grammar(),
get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
assert self.pyparser
assert not messages
grammar = r"""@whitespace = horizontal
haupt = textzeile LEERZEILE
textzeile = { WORT }+
WORT = /[^ \t]+/~
LEERZEILE = /\n[ \t]*(?=\n)/~
"""
pyparser, messages, _ = compile_source(grammar, None, get_ebnf_grammar(),
get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
assert pyparser
assert not messages
def test_pos_values_initialized(self):
# checks whether pos values in the parsing result and in the
......@@ -868,9 +866,7 @@ class TestWhitespaceHandling:
ar = /A/
br = /B/
"""
def setup(self):
self.gr = grammar_provider(self.minilang)()
gr = grammar_provider(minilang)()
def test_token_whitespace(self):
st = self.gr("AB", 'doc')
......@@ -1015,19 +1011,18 @@ EOF = !/./ [:?DEF] [:?OR] [:?AND] [:?ENDL]
class TestReentryAfterError:
def setup(self):
lang = """
document = alpha [beta] gamma "."
alpha = "ALPHA" abc
abc = §"a" "b" "c"
beta = "BETA" (bac | bca)
bac = "b" "a" §"c"
bca = "b" "c" §"a"
gamma = "GAMMA" §(cab | cba)
cab = "c" "a" §"b"
cba = "c" "b" §"a"
"""
self.gr = grammar_provider(lang)()
testlang = """
document = alpha [beta] gamma "."
alpha = "ALPHA" abc
abc = §"a" "b" "c"
beta = "BETA" (bac | bca)
bac = "b" "a" §"c"
bca = "b" "c" §"a"
gamma = "GAMMA" §(cab | cba)
cab = "c" "a" §"b"
cba = "c" "b" §"a"
"""
gr = grammar_provider(testlang)()
def test_no_resume_rules(self):
gr = self.gr; gr.resume_rules = dict()
......@@ -1194,19 +1189,18 @@ class TestUnknownParserError:
class TestEarlyTokenWhitespaceDrop:
def setup(self):
self.lang = r"""
@ drop = token, whitespace
expression = term { ("+" | "-") term}
term = factor { ("*"|"/") factor}
factor = number | variable | "(" expression ")"
| constant | fixed
variable = /[a-z]/~
number = /\d+/~
constant = "A" | "B"
fixed = "X"
"""
self.gr = grammar_provider(self.lang)()
lang = r"""
@ drop = token, whitespace
expression = term { ("+" | "-") term}
term = factor { ("*"|"/") factor}
factor = number | variable | "(" expression ")"
| constant | fixed
variable = /[a-z]/~
number = /\d+/~
constant = "A" | "B"
fixed = "X"
"""
gr = grammar_provider(lang)()
def test_drop(self):
cst = self.gr('4 + 3 * 5')
......@@ -1223,12 +1217,11 @@ class TestEarlyTokenWhitespaceDrop:
class TestMetaParser:
def setup(self):
self.mp = MetaParser()
self.mp.grammar = Grammar() # override placeholder warning
self.mp.pname = "named"
self.mp.anonymous = False
self.mp.tag_name = self.mp.pname
mp = MetaParser()
mp.grammar = Grammar() # override placeholder warning
mp.pname = "named"
mp.anonymous = False
mp.tag_name = mp.pname
def test_return_value(self):
save = get_config_value('flatten_tree')
......
......@@ -58,10 +58,10 @@ class TestMakeToken:
class TestSourceMapping:
def setup(self):
self.code = "All persons are mortal AND Socrates is a person YIELDS Socrates is mortal"
self.tokenized = self.code.replace('AND', make_token('CONJUNCTION', 'AND')) \
.replace('YIELDS', make_token('IMPLICATION', 'YIELDS'))
code = "All persons are mortal AND Socrates is a person YIELDS Socrates is mortal"
tokenized = code.replace('AND', make_token('CONJUNCTION', 'AND')) \
.replace('YIELDS', make_token('IMPLICATION', 'YIELDS'))
def test_tokenized_to_original_mapping(self):
srcmap = tokenized_to_original_mapping(self.tokenized)
......@@ -84,67 +84,67 @@ class TestSourceMapping:
pos = source_map(0, srcmap)
def preprocess_indentation(src: str) -> str:
transformed = []
indent_level = 0
for line in src.split('\n'):
indent = len(line) - len(line.lstrip()) if line.strip() else indent_level * 4
assert indent % 4 == 0
if indent > indent_level * 4:
assert indent == (indent_level + 1) * 4, str(indent) # indent must be 4 spaces
indent_level += 1
line = make_token('BEGIN_INDENT') + line
elif indent <= (indent_level - 1) * 4:
while indent <= (indent_level - 1) * 4:
line = make_token('END_INDENT') + line
indent_level -= 1
assert indent == (indent_level + 1) * 4 # indent must be 4 spaces
else:
assert indent == indent_level * 4
transformed.append(line)
while indent_level > 0:
transformed[-1] += make_token('END_INDENT')
indent_level -= 1
tokenized = '\n'.join(transformed)
# print(prettyprint_tokenized(tokenized))
return tokenized
def preprocess_comments(src: str) -> Tuple[str, SourceMapFunc]:
lines = src.split('\n')
positions, offsets = [0], [0]
pos = 0
for i, line in enumerate(lines):
comment_pos = line.find('#')
if comment_pos >= 0:
pos += comment_pos
lines[i] = line[:comment_pos]
positions.append(pos - offsets[-1])
offsets.append(offsets[-1] + len(line) - comment_pos)
pos += len(lines[i])
positions.append(pos)
offsets.append(offsets[-1])
return '\n'.join(lines), partial(source_map, srcmap=SourceMap(positions, offsets))
class TestTokenParsing:
def preprocess_indentation(self, src: str) -> str:
transformed = []
indent_level = 0
for line in src.split('\n'):
indent = len(line) - len(line.lstrip()) if line.strip() else indent_level * 4
assert indent % 4 == 0
if indent > indent_level * 4:
assert indent == (indent_level + 1) * 4, str(indent) # indent must be 4 spaces
indent_level += 1
line = make_token('BEGIN_INDENT') + line
elif indent <= (indent_level - 1) * 4:
while indent <= (indent_level - 1) * 4:
line = make_token('END_INDENT') + line
indent_level -= 1
assert indent == (indent_level + 1) * 4 # indent must be 4 spaces
else:
assert indent == indent_level * 4
transformed.append(line)
while indent_level > 0:
transformed[-1] += make_token('END_INDENT')
indent_level -= 1
tokenized = '\n'.join(transformed)
# print(prettyprint_tokenized(tokenized))
return tokenized
def preprocess_comments(self, src: str) -> Tuple[str, SourceMapFunc]:
lines = src.split('\n')
positions, offsets = [0], [0]
pos = 0
for i, line in enumerate(lines):
comment_pos = line.find('#')
if comment_pos >= 0:
pos += comment_pos
lines[i] = line[:comment_pos]
positions.append(pos - offsets[-1])
offsets.append(offsets[-1] + len(line) - comment_pos)
pos += len(lines[i])
positions.append(pos)
offsets.append(offsets[-1])
return '\n'.join(lines), partial(source_map, srcmap=SourceMap(positions, offsets))
def setup(self):
self.ebnf = r"""
@ tokens = BEGIN_INDENT, END_INDENT
@ whitespace = /[ \t]*/
block = { line | indentBlock }+
line = ~/[^\x1b\x1c\x1d\n]*\n/
indentBlock = BEGIN_INDENT block END_INDENT
"""
set_config_value('max_parser_dropouts', 3)
self.grammar = grammar_provider(self.ebnf)()
self.code = lstrip_docstring("""
def func(x, y):
if x > 0: # a comment
if y > 0:
print(x) # another comment
print(y)
""")
self.tokenized = self.preprocess_indentation(self.code)
self.srcmap = tokenized_to_original_mapping(self.tokenized)
ebnf = r"""
@ tokens = BEGIN_INDENT, END_INDENT
@ whitespace = /[ \t]*/
block = { line | indentBlock }+
line = ~/[^\x1b\x1c\x1d\n]*\n/
indentBlock = BEGIN_INDENT block END_INDENT
"""
set_config_value('max_parser_dropouts', 3)
grammar = grammar_provider(ebnf)()
code = lstrip_docstring("""
def func(x, y):
if x > 0: # a comment
if y > 0:
print(x) # another comment
print(y)
""")
tokenized = preprocess_indentation(code)
srcmap = tokenized_to_original_mapping(tokenized)
def verify_mapping(self, teststr, orig_text, preprocessed_text, mapping):
mapped_pos = preprocessed_text.find(teststr)
......@@ -180,7 +180,7 @@ class TestTokenParsing:
previous_index = index
def test_non_token_preprocessor(self):
tokenized, mapping = self.preprocess_comments(self.code)
tokenized, mapping = preprocess_comments(self.code)
self.verify_mapping("def func", self.code, tokenized, mapping)
self.verify_mapping("x > 0:", self.code, tokenized, mapping)
self.verify_mapping("if y > 0:", self.code, tokenized, mapping)
......@@ -188,7 +188,7 @@ class TestTokenParsing:
self.verify_mapping("print(y)", self.code, tokenized, mapping)
def test_chained_preprocessors(self):
pchain = chain_preprocessors(self.preprocess_comments, self.preprocess_indentation)
pchain = chain_preprocessors(preprocess_comments, preprocess_indentation)
tokenized, mapping = pchain(self.code)
self.verify_mapping("def func", self.code, tokenized, mapping)
self.verify_mapping("x > 0:", self.code, tokenized, mapping)
......@@ -198,7 +198,7 @@ class TestTokenParsing:
def test_error_position(self):
orig_src = self.code.replace('#', '\x1b')
prepr = chain_preprocessors(self.preprocess_comments, self.preprocess_indentation)
prepr = chain_preprocessors(preprocess_comments, preprocess_indentation)
self.grammar.max_parser_dropouts__ = 3
result, messages, syntaxtree = compile_source(orig_src, prepr, self.grammar,
lambda i: i, lambda i: i)
......
......@@ -151,14 +151,13 @@ class TestStringView:
class TestTextBuffer:
def setup(self):
self.test_text = "\n