#!/usr/bin/env python3

"""test_parse.py - tests of the parsers-module of DHParser

Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import copy
import os
import sys
from functools import partial
from typing import List, Tuple

# Make the DHParser package in the parent directory importable when the
# tests are run from within the test directory.
scriptpath = os.path.dirname(__file__) or '.'
sys.path.append(os.path.abspath(os.path.join(scriptpath, '..')))

from DHParser.configuration import get_config_value, set_config_value
from DHParser.toolkit import compile_python_object, re
from DHParser.log import is_logging, log_ST, log_parsing_history
from DHParser.error import Error, is_error, adjust_error_locations, MANDATORY_CONTINUATION, \
    MALFORMED_ERROR_STRING, MANDATORY_CONTINUATION_AT_EOF, RESUME_NOTICE, PARSER_STOPPED_BEFORE_END, \
    PARSER_NEVER_TOUCHES_DOCUMENT
from DHParser.parse import ParserError, Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
    RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative, \
    Interleave, UnknownParserError, CombinedParser, Text, EMPTY_NODE, Capture, Drop, Whitespace, \
    GrammarError, Counted, Always, INFINITE
from DHParser import compile_source
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, \
    parse_ebnf, DHPARSER_IMPORTS, compile_ebnf
from DHParser.dsl import grammar_provider, create_parser, raw_compileEBNF
from DHParser.syntaxtree import Node, parse_sxpr
from DHParser.stringview import StringView
from DHParser.trace import set_tracer, trace_history, resume_notices_on


class TestWhitespace:
    """Placeholder for tests of whitespace/comment handling."""
    # TODO: add test cases here
    def test_whitespace_comment_mangling(self):
        pass

    def test_non_empty_derivation(self):
        pass


class TestParserError:
    """Tests of the ParserError exception class and of error reporting
    that depends on the parsing history."""

    def test_parser_error_str(self):
        # The string representation of a ParserError should mention both
        # the node's tag name and the text where the error occurred.
        pe = ParserError(Node('TAG', 'test').with_pos(0), StringView('Beispiel'), None, True)
        assert str(pe).find('Beispiel') >= 0 and str(pe).find('TAG') >= 0

    def test_false_lookahead_only_message(self):
        """PARSER_LOOKAHEAD_*_ONLY errors must not be reported if there is
        no lookahead parser in the history!"""
        lang = """
        word = letters { letters | `-` letters }
        letters = /[A-Za-z]+/
        """
        gr = grammar_provider(lang)()
        set_tracer(gr, trace_history)
        st = gr('hard-time')
        assert not st.errors
        st = gr('hard-')
        # 1045 is the lookahead-only error code that must not appear here
        assert st.errors and not any(e.code == 1045 for e in st.errors)


class TestParserClass:
    """Tests of generic functionality of the Parser base class."""

    def test_apply(self):
        # Applying the same visitor repeatedly to the parser tree must
        # always visit the parsers in the same order.
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        gr = grammar_provider(minilang)()
        l = []

        def visitor(context: List[Parser]):
            p = context[-1]
            l.append(p.pname + p.ptype)

        gr.root__.apply(visitor)
        s1 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s2 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s3 = ", ".join(l)
        assert s1 == s2 == s3

    def test_symbol(self):
        # associated_symbol() must find the symbol-parser ("word") to
        # which an anonymous sub-parser belongs.
        class MyGrammar(Grammar):
            wrong = Text('wrong')
            word = OneOrMore(wrong) + Whitespace(r'\s*') + OneOrMore(RegExp(r'\w+'))
            root__ = word
        gr = MyGrammar()
        regex = gr['word'].parsers[-1].parser
        result = gr.associated_symbol(regex).symbol
        assert result == 'word', result


class TestInfiLoopsAndRecursion:
    """Tests of left-recursive grammars and of the automatic breaking of
    infinite parsing loops (zero-length matches repeated forever)."""

    def test_very_simple(self):
        minilang = """
            term = term (`*`|`/`) factor | factor
            factor = /[0-9]+/
            """
        parser = grammar_provider(minilang)()
        snippet = "5*4*3"
        st = parser(snippet)
        assert not is_error(st.error_flag)

    def test_direct_left_recursion1(self):
        minilang = """@literalws = right
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        # print(raw_compileEBNF(minilang).result)
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), str(syntax_tree.errors_sorted)
        assert snippet == syntax_tree.content, str(syntax_tree)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct")

    def test_direct_left_recursion2(self):
        # Like test_direct_left_recursion1, but the recursion is reached
        # through an intermediate synonym ("ex").
        minilang = """@literalws = right
            expr = ex
            ex   = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content

    def test_indirect_left_recursion1(self):
        minilang = """@literalws = right
            Expr    = //~ (Product | Sum | Value)
            Product = Expr { ('*' | '/') Expr }+
            Sum     = Expr { ('+' | '-') Expr }+
            Value   = /[0-9.]+/~ | '(' §Expr ')'
            """
        # print(raw_compileEBNF(minilang).result)
        parser = grammar_provider(minilang)()
        snippet = "8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "7 + 8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "9 + 8 * (4 + 3)"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect")

    # # BEWARE: EXPERIMENTAL TEST can be long running
    # def test_indirect_left_recursion2(self):
    #     arithmetic_syntax = """@literalws = right
    #         expression     = addition | subtraction
    #         addition       = (expression | term) "+" (expression | term)
    #         subtraction    = (expression | term) "-" (expression | term)
    #         term           = multiplication | division
    #         multiplication = (term | factor) "*" (term | factor)
    #         division       = (term | factor) "/" (term | factor)
    #         factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
    #         group          = "(" expression ")"
    #         SIGN           = /[+-]/
    #         NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
    #         VARIABLE       = /[A-Za-z]/~
    #         """
    #     arithmetic = grammar_provider(arithmetic_syntax)()
    #     arithmetic.left_recursion_depth__ = 2
    #     assert arithmetic
    #     syntax_tree = arithmetic("(a + b) * (a - b)")
    #     assert syntax_tree.errors

    def test_break_inifnite_loop_ZeroOrMore(self):
        forever = ZeroOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_inifnite_loop_OneOrMore(self):
        forever = OneOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Counted(self):
        forever = Counted(Always(), (0, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (5, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (INFINITE, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (1000, INFINITE - 1))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Interleave(self):
        forever = Interleave(Always(), repetitions=[(0, INFINITE)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Interleave(Always(), Always(),
                             repetitions=[(5, INFINITE), (INFINITE, INFINITE)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Interleave(Always(), repetitions=[(1000, INFINITE - 1)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    # def test_infinite_loops(self):
    #     minilang = """forever = { // } \n"""
    #     try:
    #         parser_class = grammar_provider(minilang)
    #     except CompilationError as error:
    #         assert all(e.code == INFINITE_LOOP for e in error.errors)
    #     save = get_config_value('static_analysis')
    #     set_config_value('static_analysis', 'late')
    #     provider = grammar_provider(minilang)
    #     try:
    #         parser = provider()
    #     except GrammarError as error:
    #         assert error.errors[0][2].code == INFINITE_LOOP
    #     set_config_value('static_analysis', 'none')
    #     parser = provider()
    #     snippet = " "
    #     syntax_tree = parser(snippet)
    #     assert any(e.code == INFINITE_LOOP for e in syntax_tree.errors)
    #     res = parser.static_analysis()
    #     assert res and res[0][2].code == INFINITE_LOOP
    #     minilang = """not_forever = { / / } \n"""
    #     parser = grammar_provider(minilang)()
    #     res = parser.static_analysis()
    #     assert not res
    #     set_config_value('static_analysis', save)


# class TestStaticAnalysis:
#     def test_alternative(self):
#         lang = 'doc = "A" | "AB"'
#         parser = create_parser(lang)


class TestFlowControl:
    """Tests of the flow-control parsers Lookbehind and NegativeLookahead."""

    # t1 ends with "END" on its own line (preceded by a newline),
    # t2 has "END" on the same line as the preceding words.
    t1 = """
         All work and no play
         makes Jack a dull boy
         END
         """
    t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind(self):
        ws = RegExp(r'\s*');  ws.pname = "ws"
        end = RegExp("END");  end.pname = "end"
        # "END" only matches if preceded by whitespace containing a newline
        doc_end = Lookbehind(RegExp('\\s*?\\n')) + end
        word = RegExp(r'\w+');  word.pname = "word"
        sequence = OneOrMore(NegativeLookahead(end) + word + ws)
        document = ws + sequence + doc_end + ws
        parser = Grammar(document)
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()

        # partial parsing with individual named parsers
        cst = parser(self.t2, parser['ws'], complete_match=False)
        assert cst.did_match() and len(cst) == 0 and not cst.errors
        cst = parser(self.t2, parser['word'], complete_match=False)
        assert cst.did_match() and cst.content == "All" and not cst.errors
        cst = parser(self.t2, parser['end'], complete_match=False)
        assert not cst.did_match()

    def test_lookbehind_indirect(self):
        # Same scenario as test_lookbehind, but with the parsers defined
        # declaratively in a Grammar subclass.
        class LookbehindTestGrammar(Grammar):
            parser_initialization__ = ["upon instantiation"]
            ws = RegExp(r'\s*')
            end = RegExp('END')
            SUCC_LB = RegExp('\\s*?\\n')
            doc_end = Series(Lookbehind(SUCC_LB), end)
            word = RegExp(r'\w+')
            sequence = OneOrMore(Series(NegativeLookahead(end), word, ws))
            document = Series(ws, sequence, doc_end, ws)
            root__ = document

        parser = LookbehindTestGrammar()
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()


class TestRegex:
    """Tests of regular-expression parsing in EBNF grammars, including
    multi-line regexes, the @ignorecase directive and string tokens."""

    def test_multilineRegex(self):
        mlregex = r"""
        regex =  /\w+    # one or more alphabetical characters including the underscore
                  [+]    # followed by a plus sign
                  \w*    # possibly followed by more alpha chracters/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_multilineRegex_wo_Comments(self):
        mlregex = r"""
        regex =  /\w+
                  [+]
                  \w* /
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_ignore_case(self):
        # with @ignorecase = True, /alpha/ also matches 'Alpha'
        mlregex = r"""
        @ ignorecase = True
        regex = /alpha/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex(StringView('Alpha'))
        assert node
        assert rest == ''
        assert node.tag_name == "regex"
        assert str(node) == 'Alpha'

        # with @ignorecase = False, 'Alpha' must not match
        mlregex = r"""
        @ ignorecase = False
        regex = /alpha/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex(StringView('Alpha'))
        assert node is None

    def test_token(self):
        tokenlang = r"""@literalws = right
            @whitespace = linefeed
            lang        = "" begin_token {/\w+/ ""} end_token
            begin_token = "\begin{document}"
            end_token   = "\end{document}"
            """
        testdoc = r"""
            \begin{document}
            test
            \end{document}
            """
        result, messages, _ = compile_source(
            tokenlang, None, get_ebnf_grammar(), get_ebnf_transformer(),
            get_ebnf_compiler("TokenTest"))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        result = parser(testdoc)
        # log_parsing_history(parser, "test.log")
        assert not result.error_flag, str(result.errors_sorted)


class TestGrammar:
    """Tests of the Grammar class: position initialization, selective
    parsing, subclassing and the complete_match flag."""

    # The test grammar is compiled once at class-creation time; the
    # class-level asserts abort collection early if compilation fails.
    grammar = r"""@whitespace = horizontal
    haupt        = textzeile LEERZEILE
    textzeile    = { WORT }+
    WORT         = /[^ \t]+/~
    LEERZEILE    = /\n[ \t]*(?=\n)/~
    """
    pyparser, messages, _ = compile_source(grammar, None, get_ebnf_grammar(),
                                           get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
    assert pyparser, str(messages)
    assert not messages, str(messages)

    def test_pos_values_initialized(self):
        # checks whether pos values in the parsing result and in the
        # history record have been initialized
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("no_file_name*")
        for record in grammar.history__:
            assert not record.node or record.node.pos >= 0

    def test_select_parsing(self):
        # parsing may start at any named parser, not only the root
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("wort", "WORT")
        grammar("eine Zeile", "textzeile")
        grammar("kein Haupt", "haupt")
        grammar("so ist es richtig", "haupt")

    def test_grammar_subclassing(self):
        class Arithmetic(Grammar):
            r'''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
            root__ = expression

        grammar = Arithmetic()
        CST = grammar('3+4')
        assert not CST.error_flag, CST.as_sxpr()

    def test_incomplete_matching(self):
        """Tests whether the flag `complete_match` works as expected when
        calling a grammar object in order to parse a document."""
        gr = grammar_provider('word = ~/\\w+/\n')()
        st = gr('eins')
        assert not st.errors
        st = gr('eins zwei')
        assert st.errors[0].code == PARSER_STOPPED_BEFORE_END
        st = gr('eins zwei', complete_match=False)
        assert not st.errors

    def test_synonym(self):
        lang = r"""
            doc  = { word | number }
            word = /\w+/ S
            number = [VZ] /\d+/ S 
            S    = ~        # let S by a synonym for anonymous whitespace
            VZ   = "-"
        """
        gr = grammar_provider(lang)()
        st = gr('eins 1 zwei2drei 3')
        # set_config_value('compiled_EBNF_log', 'grammar.log')
        gr = grammar_provider("@drop = whitespace, strings" + lang)()
        st = gr('eins 1 zwei2drei 3')
        st = gr('-3')
        assert str(gr['S']) == "S = ~", str(gr['S'])


class TestSeries:
    """Tests of the Series parser, in particular the mandatory-marker (§)."""

    def test_non_mandatory(self):
        lang = """
        document = series | /.*/
        series = "A" "B" "C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD")
        assert not st.error_flag
        st = parser("A_CD")
        assert not st.error_flag
        st = parser("AB_D")
        assert not st.error_flag

    def test_mandatory(self):
        """Test for the §-operator. The Series-parser should raise an
        error for any non-match that occurs after the mandatory-operator.
        """
        lang = """
        document = series | /.*/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD");  assert not st.error_flag
        st = parser("A_CD");  assert not st.error_flag
        st = parser("AB_D");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION
        # transitivity of mandatory-operator
        st = parser("ABC_");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

    def test_series_composition(self):
        # Composing two Series with "+" must preserve the position of the
        # mandatory marker relative to the composed sequence.
        TA, TB, TC, TD, TE = (TKN(b) for b in "ABCDE")
        s1 = Series(TA, TB, TC, mandatory=2)
        s2 = Series(TD, TE)

        combined = Alternative(s1 + s2, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("ABCDE");  assert not st.error_flag
        st = parser("A_CDE");  assert not st.error_flag
        st = parser("AB_DE");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION
        st = parser("ABC_E");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

        combined = Alternative(s2 + s1, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("DEABC");  assert not st.error_flag
        st = parser("_EABC");  assert not st.error_flag
        st = parser("D_ABC");  assert not st.error_flag
        st = parser("DE_BC");  assert not st.error_flag
        st = parser("DEA_C");  assert not st.error_flag
        st = parser("DEAB_");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

    # def test_boundary_cases(self):
    #     lang = """
    #     document = series | §!single | /.*/
    #     series = "A" "B" §"C" "D"
    #     single = "E"
    #     """
    #     parser_class = grammar_provider(lang)
    #     parser = parser_class()
    #     print(parser.python_src__)
    #     print(parser_class.python_src__)

    def test_ebnf_serialization(self):
        ebnf_grammar = get_ebnf_grammar()
        # TODO: Add test here
        ebnf = ebnf_grammar.as_ebnf()
        # print(ebnf)


class TestAllOfSomeOf:
    """Tests of Interleave used with all-of (every parser must match)
    and some-of (optional, via repetitions) semantics."""

    def test_allOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'

    def test_allOf_completeness(self):
        """Test that an error is raised if not  all parsers of an AllOf-List
        match."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('B').error_flag

    def test_allOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag

    def test_someOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'
        st = Grammar(prefixes)('B')
        assert st.error_flag
        # with (0, 1)-repetitions every constituent becomes optional
        prefixes = Interleave(TKN("B"), TKN("A"), repetitions=((0, 1), (0, 1)))
        assert Grammar(prefixes)('A B').content == 'A B'
        st = Grammar(prefixes)('B')
        assert not st.error_flag
        assert st.content == 'B'

    def test_someOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag


class TestInterleave:
    """Tests of the Interleave parser with and without explicit repetitions."""

    def test_interleave_most_simple(self):
        letterset = Interleave(Text("A"), Text("B"), Text("C"))
        gr = Grammar(letterset)
        st = gr('ABC')
        assert not st.errors, str(st.errors)
        assert st.content == "ABC"
        st = gr('BCA')
        assert not st.errors
        assert st.content == "BCA"
        st = gr('BCBA')
        assert st.errors
        st = gr('AB')
        assert st.errors

    def test_interleave(self):
        # "A" may occur 1-1000 times, "B" at most once, "C" exactly once
        letterset = Interleave(Text("A"), Text("B"), Text("C"),
                               repetitions=[(1, 1000), (0, 1), (1, 1)])
        gr = Grammar(letterset)
        st = gr('AABC')
        assert not st.errors
        st = gr('BACAAA')
        assert not st.errors
        st = gr('ABCC')
        assert st.errors
        st = gr('AAACAAA')
        assert not st.errors
        st = gr('AAABAAA')
        assert st.errors


class TestErrorRecovery:
    """Tests of error recovery via the @..._skip directives, which let the
    parser resume after a failed mandatory (§) item."""

    def test_series_skip(self):
        lang = """
        document = series | /.*/
        @series_skip = /(?=[A-Z])/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser('AB_D')
        assert len(st.errors) == 1  # no additional "stopped before end"-error!
        resume_notices_on(parser)
        st = parser('AB_D')
        assert len(st.errors) == 2 and any(err.code == RESUME_NOTICE for err in st.errors)
        assert 'Skipping' in str(st.errors_sorted[1])

    def test_Interleave_skip(self):
        lang = """
        document = allof | /.*/
        @allof_skip = /[A-Z]/
        allof = "A" ° §"B" ° "C" ° "D"
        """
        parser = grammar_provider(lang)()
        st = parser('CADB')
        assert 'allof' in st and st['allof'].content == "CADB"
        st = parser('_BCD')
        assert st.equals(parse_sxpr('(document "_BCD")'))
        st = parser('_ABC')
        assert st.equals(parse_sxpr('(document "_ABC")'))
        st = parser('A_CD')
        assert st['allof'].content == "A_CD"
        st = parser('AB_D')
        assert st['allof'].content == "AB_D"
        st = parser('A__D')
        assert st['allof'].content == "A__D"
        st = parser('CA_D')
        assert st['allof'].content == "CA_D"
        st = parser('A_CB')
        assert st['allof'].content == "A_CB"
        st = parser('BC_A')
        assert 'allof' not in st


class TestPopRetrieve:
    """Tests for the variable-handling parsers Capture, Retrieve (":" / ":?")
    and Pop ("::"), e.g. matching a closing delimiter against the previously
    captured opening delimiter."""

    mini_language = r"""
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
        delimiter      = delimiter_sign  # never use delimiter between capture and pop except for retrival!
        delimiter_sign = /`+/
        text           = /[^`]+/
        """
    mini_lang2 = r"""
        @braces_filter = matching_bracket()
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """
    mini_lang3 = r"""@literalws = right
        document       = { text | env }
        env            = (specialtag | opentag) text [ closespecial | closetag ]
        opentag        = "<" name ">"
        specialtag     = "<" /ABC/ !name ">"
        closetag       = close_slash | close_star
        close_slash    = "<" ::name "/>"
        close_star     = "<" ::name "*>"
        closespecial   = "<" /ABC/~ ">"
        name           = /\w+/~
        text           = /[^<>]+/
        """
    mini_lang4 = r"""@literalws = right
        document       = { text | env }
        env            = opentag document closetag
        opentag        = "<" name ">"
        closetag       = "</" :?name ">"
        name           = /\w+/~
        text           = /[^<>]+/
    """

    def setup(self):
        # nose-style setup-hook (still supported by pytest): compile the
        # grammars freshly for every test method
        self.minilang_parser = grammar_provider(self.mini_language)()
        self.minilang_parser2 = grammar_provider(self.mini_lang2)()
        self.minilang_parser3 = grammar_provider(self.mini_lang3)()
        self.minilang_parser4 = grammar_provider(self.mini_lang4)()

    @staticmethod
    def has_tag_name(node, name):
        return node.tag_name == name # and not isinstance(node.parser, Retrieve)

    def test_capture_assertions(self):
        # capturing dropped content is an error ...
        try:
            _ = Grammar(Capture(Drop(Whitespace(r'\s*'))))
            assert False, "GrammarError expected!"
        except GrammarError:
            pass
        # ... even if the dropped content is nested inside a series
        try:
            _ = Grammar(Capture(Series(Text(' '), Drop(Whitespace(r'\s*')))))
            assert False, "GrammarError expected!"
        except GrammarError:
            pass
        # a named capture of plain content is fine
        cp = Capture(RegExp(r'\w+'))
        cp.pname = "capture"
        _ = Grammar(cp)

    def test_compile_mini_language(self):
        assert self.minilang_parser
        assert self.minilang_parser2
        assert self.minilang_parser3
        assert self.minilang_parser4

    def test_stackhandling(self):
        ambigous_opening = "<ABCnormal> normal tag <ABCnormal*>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        ambigous_opening = "<ABCnormal> normal tag <ABCnormal/>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        forgot_closing_tag = "<em> where is the closing tag?"
        syntax_tree = self.minilang_parser3(forgot_closing_tag)
        assert syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em/>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em*>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

    def test_optional_match(self):
        # the ":?"-retrieve in mini_lang4 allows the closing tag to omit the name
        test1 = '<info>Hey, you</info>'
        st = self.minilang_parser4(test1)
        assert not st.error_flag, str(st.errors_sorted)
        test12 = '<info>Hey, <emph>you</emph></info>'
        st = self.minilang_parser4(test1)
        assert not st.error_flag
        test2 = '<info>Hey, you</>'
        st = self.minilang_parser4(test2)
        assert not st.error_flag
        test3 = '<info>Hey, <emph>you</></>'
        st = self.minilang_parser4(test3)
        assert not st.error_flag
        test4 = '<info>Hey, <emph>you</></info>'
        st = self.minilang_parser4(test4)
        assert not st.error_flag

    def test_rollback_behaviour_of_optional_match(self):
        # after a failed match the variable stack must have been rolled back
        test1 = '<info>Hey, you</info*>'
        st = self.minilang_parser4(test1)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag
        test2 = '<info>Hey, you</*>'
        st = self.minilang_parser4(test2)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag

    def test_cache_neutrality(self):
        """Test that packrat-caching does not interfere with the variable-
        changing parsers: Capture and Retrieve."""
        lang = r"""@literalws = right
            text = opening closing
            opening = (unmarked_package | marked_package)
            closing = ::variable
            unmarked_package = package "."
            marked_package = package "*" "."
            package = "(" variable ")"
            variable = /\w+/~
            """
        case = "(secret)*. secret"
        gr = grammar_provider(lang)()
        st = gr(case)
        assert not st.error_flag, str(st.errors_sorted)

    def test_single_line(self):
        teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted, \
            ''.join(str(error) for error in syntax_tree.errors_sorted)
        # opening (captured) and closing (popped) delimiter must be identical
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line(self):
        teststr = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_single_line_complement(self):
        teststr = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        # with the matching_bracket-filter the popped braces mirror the
        # captured ones: same length, but different (complementary) characters
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert len(delim) == len(pop)
        assert delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line_complement(self):
        teststr = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_autoretrieve(self):
        lang = r"""@literalws = right
            document   = { definition } § EOF
            definition = symbol :defsign value
            symbol     = /\w+/~
            defsign    = "=" | ":="
            value      = /\d+/~
            EOF        = !/./ [ :?defsign ]   # eat up captured defsigns
        """
        # code, _, _ = compile_ebnf(lang)
        # print(code)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.error_flag
        st1 = st
        st = parser("")
        assert not st.error_flag

        # the position of the EOF-definition within the grammar must not matter:
        # move it up right after the document-definition ...
        lines = [line for line in lang.split('\n') if line.strip()]
        eof_line = lines.pop()
        lines.insert(2, eof_line)
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors, str(st.errors)
        assert st.equals(st1)

        # ... and one line further down
        del lines[2]
        lines.insert(3, eof_line)
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)

        # and, finally...
        lang_variant = r"""@literalws = right
            document   = { definition } § EOF
            symbol     = /\w+/~
            defsign    = "=" | ":="
            value      = /\d+/~
            EOF        = !/./ :?defsign   # eat up captured defsign, only if it has been retrieved
            definition = symbol :defsign value
        """
        parser = grammar_provider(lang_variant)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)
        st = parser('')
        assert "EOF expected" in str(st.errors)


class TestWhitespaceHandling:
    """Tests that insignificant whitespace is eaten by string-tokens
    (@literalws = right) but not by plain regular expressions."""

    minilang = """@literalws = right
        doc = A B
        A = "A"
        B = "B"
        Rdoc = ar br
        ar = /A/
        br = /B/
        """
    gr = grammar_provider(minilang)()

    def test_token_whitespace(self):
        # string-tokens absorb trailing whitespace
        st = self.gr("AB", 'doc')
        assert not st.error_flag
        st = self.gr("A B", 'doc')
        assert not st.error_flag

    def test_regexp_whitespace(self):
        # plain regexes do not absorb whitespace
        st = self.gr("AB", 'Rdoc')
        assert not st.error_flag
        st = self.gr("A B", 'Rdoc')
        assert st.error_flag


class TestErrorReporting:
    """Tests that errors behind a lookahead or a mandatory-marker (§) are
    propagated correctly to the root of the parsing process."""

    grammar = """
        root      = series alpha | anything
        series    = subseries &alpha
        subseries = alpha §beta
        alpha     = /[a-z]+/
        beta      = /[A-Z]+/
        anything  = /.*/
        """

    def setup(self):
        # nose-style setup-hook (still supported by pytest)
        self.parser = grammar_provider(self.grammar)()

    def test_error_propagation(self):
        testcode1 = "halloB"
        testcode2 = "XYZ"
        testcode3 = "hallo "
        cst = self.parser(testcode1)
        assert not cst.error_flag, str(cst.errors_sorted)
        # "XYZ" falls through to the catch-all /.*/-branch without an error
        cst = self.parser(testcode2)
        assert not cst.error_flag
        # the mandatory beta fails and the error must not be swallowed
        cst = self.parser(testcode3)
        assert cst.error_flag


class TestBorderlineCases:
    """Tests of minimal grammars on empty and non-matching documents."""

    def test_not_matching(self):
        minilang = """parser = /X/\n"""
        gr = grammar_provider(minilang)()
        cst = gr('X', 'parser')
        assert not cst.error_flag
        # a non-matching as well as an empty document yield a
        # PARSER_STOPPED_BEFORE_END error
        cst = gr(' ', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_STOPPED_BEFORE_END
        cst = gr('', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_STOPPED_BEFORE_END

    def test_matching(self):
        minilang = """parser = /.?/"""
        gr = grammar_provider(minilang)()
        cst = gr(' ', 'parser')
        assert not cst.error_flag
        # /.?/ consumes at most one character; the rest remains unparsed
        cst = gr('  ', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_STOPPED_BEFORE_END
        # but the empty document is a legitimate match of /.?/
        cst = gr('', 'parser')
        assert not cst.error_flag

EBNF_with_Errors = r"""# Test code with errors. All places marked by a "$" should yield and error

@ comment    = /#.*(?:\n|$)/
@ whitespace = /\s*/
@ literalws  = right
@ anonymous  = pure_elem, EOF
@ drop       = whitespace, EOF


# re-entry-rules for resuming after parsing-error