#!/usr/bin/env python3

"""test_parse.py - tests of the parsers-module of DHParser


Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

eckhart's avatar
eckhart committed
22
import copy
23
import os
24
import sys
25
from functools import partial
26
from typing import List, Tuple
27

28
29
scriptpath = os.path.dirname(__file__) or '.'
sys.path.append(os.path.abspath(os.path.join(scriptpath, '..')))
30

eckhart's avatar
eckhart committed
31
from DHParser.configuration import get_config_value, set_config_value
32
from DHParser.toolkit import compile_python_object, re
33
from DHParser.log import is_logging, log_ST, log_parsing_history
eckhart's avatar
eckhart committed
34
35
36
from DHParser.error import Error, is_error, adjust_error_locations, MANDATORY_CONTINUATION, \
    MALFORMED_ERROR_STRING, MANDATORY_CONTINUATION_AT_EOF, RESUME_NOTICE, PARSER_STOPPED_BEFORE_END, \
    PARSER_NEVER_TOUCHES_DOCUMENT
37
from DHParser.parse import ParserError, Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
eckhart's avatar
eckhart committed
38
    RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative, \
39
    Interleave, UnknownParserError, CombinedParser, Text, EMPTY_NODE, Capture, Drop, Whitespace, \
40
    GrammarError, Counted, Always, INFINITE
41
from DHParser import compile_source
42
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, \
43
    parse_ebnf, DHPARSER_IMPORTS, compile_ebnf
eckhart's avatar
eckhart committed
44
from DHParser.dsl import grammar_provider, create_parser, raw_compileEBNF
45
from DHParser.syntaxtree import Node, parse_sxpr
46
from DHParser.stringview import StringView
47
from DHParser.trace import set_tracer, trace_history, resume_notices_on
48
49


50
51
52
53
54
55

class TestWhitespace:
    """Placeholder test suite for whitespace/comment handling.

    The scraped blame-view artifacts have been removed so the class is
    valid Python again; the actual test cases are still to be written.
    """

    # TODO: add test cases here
    def test_whitespace_comment_mangling(self):
        pass

    def test_non_empty_derivation(self):
        pass

59

60
61
class TestParserError:
    """Tests for the ParserError exception and error reporting."""

    def test_parser_error_str(self):
        # The string representation of a ParserError must mention both the
        # node's tag name and the text the parser was working on.
        pe = ParserError(Node('TAG', 'test').with_pos(0), StringView('Beispiel'), None, True)
        assert str(pe).find('Beispiel') >= 0 and str(pe).find('TAG') >= 0

    def test_false_lookahead_only_message(self):
        """PARSER_LOOKAHEAD_*_ONLY errors must not be reported if there is
        no lookahead parser in the history!"""
        lang = """
        word = letters { letters | `-` letters }
        letters = /[A-Za-z]+/
        """
        gr = grammar_provider(lang)()
        set_tracer(gr, trace_history)
        st = gr('hard-time')
        assert not st.errors
        st = gr('hard-')
        # a real error must be reported, but not a lookahead-only error (1045)
        assert st.errors and not any(e.code == 1045 for e in st.errors)

eckhart's avatar
eckhart committed
79

eckhart's avatar
eckhart committed
80
81
82
83
84
85
86
87
88
class TestParserClass:
    """Tests of generic Parser-class functionality (visitors, symbol lookup)."""

    def test_apply(self):
        # applying the same visitor repeatedly must traverse the parser tree
        # in the same order every time (i.e. apply() is deterministic)
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        gr = grammar_provider(minilang)()
        l = []

        def visitor(context: List[Parser]):
            p = context[-1]
            l.append(p.pname + p.ptype)

        gr.root__.apply(visitor)
        s1 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s2 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s3 = ", ".join(l)
        assert s1 == s2 == s3

    def test_symbol(self):
        # associated_symbol() must find the symbol ("word") to which an
        # anonymous sub-parser (the trailing RegExp) belongs
        class MyGrammar(Grammar):
            wrong = Text('wrong')
            word = OneOrMore(wrong) + Whitespace(r'\s*') + OneOrMore(RegExp(r'\w+'))
            root__ = word
        gr = MyGrammar()
        regex = gr['word'].parsers[-1].parser
        result = gr.associated_symbol(regex).symbol
        assert result == 'word', result
111

eckhart's avatar
eckhart committed
112

113
class TestInfiLoopsAndRecursion:
    """Tests of left-recursion support and automatic infinite-loop breaking."""

    def test_direct_left_recursion1(self):
        minilang = """@literalws = right
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        # print(raw_compileEBNF(minilang).result)
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), str(syntax_tree.errors_sorted)
        assert snippet == syntax_tree.content, str(syntax_tree)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct")

    def test_direct_left_recursion2(self):
        # same as above, but the left-recursive symbol is reached via a synonym
        minilang = """@literalws = right
            expr = ex
            ex   = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content

    def test_indirect_left_recursion1(self):
        minilang = """@literalws = right
            Expr    = //~ (Product | Sum | Value)
            Product = Expr { ('*' | '/') Expr }+
            Sum     = Expr { ('+' | '-') Expr }+
            Value   = /[0-9.]+/~ | '(' §Expr ')'
            """
        parser = grammar_provider(minilang)()
        snippet = "8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "7 + 8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "9 + 8 * (4 + 3)"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect")

    # # BEWARE: EXPERIMENTAL TEST can be long running
    # def test_indirect_left_recursion2(self):
    #     arithmetic_syntax = """@literalws = right
    #         expression     = addition | subtraction
    #         addition       = (expression | term) "+" (expression | term)
    #         subtraction    = (expression | term) "-" (expression | term)
    #         term           = multiplication | division
    #         multiplication = (term | factor) "*" (term | factor)
    #         division       = (term | factor) "/" (term | factor)
    #         factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
    #         group          = "(" expression ")"
    #         SIGN           = /[+-]/
    #         NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
    #         VARIABLE       = /[A-Za-z]/~
    #         """
    #     arithmetic = grammar_provider(arithmetic_syntax)()
    #     arithmetic.left_recursion_depth__ = 2
    #     assert arithmetic
    #     syntax_tree = arithmetic("(a + b) * (a - b)")
    #     assert syntax_tree.errors

    def test_break_inifnite_loop_ZeroOrMore(self):
        forever = ZeroOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_inifnite_loop_OneOrMore(self):
        forever = OneOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Counted(self):
        forever = Counted(Always(), (0, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (5, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (INFINITE, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (1000, INFINITE - 1))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Interleave(self):
        forever = Interleave(Always(), repetitions = [(0, INFINITE)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Interleave(Always(), Always(),
                             repetitions = [(5, INFINITE), (INFINITE, INFINITE)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Interleave(Always(), repetitions = [(1000, INFINITE - 1)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    # def test_infinite_loops(self):
    #     minilang = """forever = { // } \n"""
    #     try:
    #         parser_class = grammar_provider(minilang)
    #     except CompilationError as error:
    #         assert all(e.code == INFINITE_LOOP for e in error.errors)
    #     save = get_config_value('static_analysis')
    #     set_config_value('static_analysis', 'late')
    #     provider = grammar_provider(minilang)
    #     try:
    #         parser = provider()
    #     except GrammarError as error:
    #         assert error.errors[0][2].code == INFINITE_LOOP
    #     set_config_value('static_analysis', 'none')
    #     parser = provider()
    #     snippet = " "
    #     syntax_tree = parser(snippet)
    #     assert any(e.code == INFINITE_LOOP for e in syntax_tree.errors)
    #     res = parser.static_analysis()
    #     assert res and res[0][2].code == INFINITE_LOOP
    #     minilang = """not_forever = { / / } \n"""
    #     parser = grammar_provider(minilang)()
    #     res = parser.static_analysis()
    #     assert not res
    #     set_config_value('static_analysis', save)
249

eckhart's avatar
eckhart committed
250

251
252
253
254
255
256
# class TestStaticAnalysis:
#     def test_alternative(self):
#         lang = 'doc = "A" | "AB"'
#         parser = create_parser(lang)


Eckhart Arnold's avatar
Eckhart Arnold committed
257
class TestFlowControl:
    """Tests of lookahead/lookbehind flow-control parsers."""

    t1 = """
         All work and no play
         makes Jack a dull boy
         END
         """
    t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind(self):
        ws = RegExp(r'\s*');  ws.pname = "ws"
        end = RegExp("END");  end.pname = "end"
        # "END" must be preceded by a line break for the document to match
        doc_end = Lookbehind(RegExp('\\s*?\\n')) + end
        word = RegExp(r'\w+');  word.pname = "word"
        sequence = OneOrMore(NegativeLookahead(end) + word + ws)
        document = ws + sequence + doc_end + ws
        parser = Grammar(document)
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()

        # partial matching with a selected start parser
        cst = parser(self.t2, parser['ws'], complete_match=False)
        assert cst.did_match() and len(cst) == 0 and not cst.errors
        cst = parser(self.t2, parser['word'], complete_match=False)
        assert cst.did_match() and cst.content == "All" and not cst.errors
        cst = parser(self.t2, parser['end'], complete_match=False)
        assert not cst.did_match()

    def test_lookbehind_indirect(self):
        class LookbehindTestGrammar(Grammar):
            parser_initialization__ = ["upon instantiation"]
            ws = RegExp(r'\s*')
            end = RegExp('END')
            SUCC_LB = RegExp('\\s*?\\n')
            doc_end = Series(Lookbehind(SUCC_LB), end)
            word = RegExp(r'\w+')
            sequence = OneOrMore(Series(NegativeLookahead(end), word, ws))
            document = Series(ws, sequence, doc_end, ws)
            root__ = document

        parser = LookbehindTestGrammar()
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()


305
306
307
308
309
310
311
class TestRegex:
    """Tests of regular-expression parsers compiled from EBNF sources."""

    def test_multilineRegex(self):
        mlregex = r"""
        regex =  /\w+    # one or more alphabetical characters including the underscore
                  [+]    # followed by a plus sign
                  \w*    # possibly followed by more alpha chracters/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_multilineRegex_wo_Comments(self):
        mlregex = r"""
        regex =  /\w+
                  [+]
                  \w* /
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_ignore_case(self):
        # with @ignorecase = True, /alpha/ must also match 'Alpha'
        mlregex = r"""
        @ ignorecase = True
        regex = /alpha/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex(StringView('Alpha'))
        assert node
        assert rest == ''
        assert node.tag_name == "regex"
        assert str(node) == 'Alpha'

        # with @ignorecase = False, /alpha/ must NOT match 'Alpha'
        mlregex = r"""
        @ ignorecase = False
        regex = /alpha/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex(StringView('Alpha'))
        assert node is None

    def test_token(self):
        tokenlang = r"""@literalws = right
            @whitespace = linefeed
            lang        = "" begin_token {/\w+/ ""} end_token
            begin_token = "\begin{document}"
            end_token   = "\end{document}"
            """
        testdoc = r"""
            \begin{document}
            test
            \end{document}
            """
        result, messages, _ = compile_source(
            tokenlang, None, get_ebnf_grammar(), get_ebnf_transformer(),
            get_ebnf_compiler("TokenTest"))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        result = parser(testdoc)
        # log_parsing_history(parser, "test.log")
        assert not result.error_flag, str(result.errors_sorted)
387

388

389
class TestGrammar:
    """Tests of the Grammar class: position tracking, partial parsing,
    grammar subclassing and synonym handling."""

    grammar = r"""@whitespace = horizontal
    haupt        = textzeile LEERZEILE
    textzeile    = { WORT }+
    WORT         = /[^ \t]+/~
    LEERZEILE    = /\n[ \t]*(?=\n)/~
    """
    # compiled once at class level and shared by the tests below
    pyparser, messages, _ = compile_source(grammar, None, get_ebnf_grammar(),
                                           get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
    assert pyparser
    assert not messages, str(messages)

    def test_pos_values_initialized(self):
        # checks whether pos values in the parsing result and in the
        # history record have been initialized
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("no_file_name*")
        for record in grammar.history__:
            assert not record.node or record.node.pos >= 0

    def test_select_parsing(self):
        # parsing may start at any symbol of the grammar, not just the root
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("wort", "WORT")
        grammar("eine Zeile", "textzeile")
        grammar("kein Haupt", "haupt")
        grammar("so ist es richtig", "haupt")

    def test_grammar_subclassing(self):
        class Arithmetic(Grammar):
            r'''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
            root__ = expression

        grammar = Arithmetic()
        CST = grammar('3+4')
        assert not CST.error_flag, CST.as_sxpr()

    def test_incomplete_matching(self):
        """Tests whether the flag `complete_match` works as expected when
        calling a grammar object in order to parse a document."""
        gr = grammar_provider('word = ~/\\w+/\n')()
        st = gr('eins')
        assert not st.errors
        st = gr('eins zwei')
        assert st.errors[0].code == PARSER_STOPPED_BEFORE_END
        st = gr('eins zwei', complete_match=False)
        assert not st.errors

    def test_synonym(self):
        lang = r"""
            doc  = { word | number }
            word = /\w+/ S
            number = [VZ] /\d+/ S 
            S    = ~        # let S by a synonym for anonymous whitespace
            VZ   = "-"
        """
        gr = grammar_provider(lang)()
        st = gr('eins 1 zwei2drei 3')
        # set_config_value('compiled_EBNF_log', 'grammar.log')
        gr = grammar_provider("@drop = whitespace, strings" + lang)()
        st = gr('eins 1 zwei2drei 3')
        st = gr('-3')
        assert str(gr['S']) == "S = ~", str(gr['S'])
461

462

463
464
465
466
467
468
469
class TestSeries:
    """Tests of the Series parser, in particular the mandatory-marker (§)."""

    def test_non_mandatory(self):
        lang = """
        document = series | /.*/
        series = "A" "B" "C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD")
        assert not st.error_flag
        st = parser("A_CD")
        assert not st.error_flag
        st = parser("AB_D")
        assert not st.error_flag

    def test_mandatory(self):
        """Test for the §-operator. The Series-parser should raise an
        error for any non-match that occurs after the mandatory-operator.
        """
        lang = """
        document = series | /.*/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD");  assert not st.error_flag
        st = parser("A_CD");  assert not st.error_flag
        st = parser("AB_D");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION
        # transitivity of mandatory-operator
        st = parser("ABC_");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

    def test_series_composition(self):
        # composing series with '+' must preserve the mandatory-marker position
        TA, TB, TC, TD, TE = (TKN(b) for b in "ABCDE")
        s1 = Series(TA, TB, TC, mandatory=2)
        s2 = Series(TD, TE)

        combined = Alternative(s1 + s2, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("ABCDE");  assert not st.error_flag
        st = parser("A_CDE");  assert not st.error_flag
        st = parser("AB_DE");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION
        st = parser("ABC_E");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

        combined = Alternative(s2 + s1, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("DEABC");  assert not st.error_flag
        st = parser("_EABC");  assert not st.error_flag
        st = parser("D_ABC");  assert not st.error_flag
        st = parser("DE_BC");  assert not st.error_flag
        st = parser("DEA_C");  assert not st.error_flag
        st = parser("DEAB_");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

    # def test_boundary_cases(self):
    #     lang = """
    #     document = series | §!single | /.*/
    #     series = "A" "B" §"C" "D"
    #     single = "E"
    #     """
    #     parser_class = grammar_provider(lang)
    #     parser = parser_class()
    #     print(parser.python_src__)
    #     print(parser_class.python_src__)

    def test_ebnf_serialization(self):
        ebnf_grammar = get_ebnf_grammar()
        # TODO: Add test here
        ebnf = ebnf_grammar.as_ebnf()
        # print(ebnf)
534
535


536
537
538
class TestAllOfSomeOf:
    """Tests of Interleave used in 'all of' and 'some of' configurations."""

    def test_allOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'

    def test_allOf_completeness(self):
        """Test that an error is raised if not all parsers of an AllOf-List
        match."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('B').error_flag

    def test_allOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag

    def test_someOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'
        st = Grammar(prefixes)('B')
        assert st.error_flag
        # with (0, 1)-repetitions each element becomes optional
        prefixes = Interleave(TKN("B"), TKN("A"), repetitions=((0, 1), (0, 1)))
        assert Grammar(prefixes)('A B').content == 'A B'
        st = Grammar(prefixes)('B')
        assert not st.error_flag
        assert st.content == 'B'

    def test_someOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag


581
582
class TestInterleave:
    """Tests of the Interleave parser and its repetition bounds."""

    def test_interleave_most_simple(self):
        letterset = Interleave(Text("A"), Text("B"), Text("C"))
        gr = Grammar(letterset)
        st = gr('ABC')
        assert not st.errors, str(st.errors)
        assert st.content == "ABC"
        st = gr('BCA')
        assert not st.errors
        assert st.content == "BCA"
        st = gr('BCBA')
        assert st.errors
        st = gr('AB')
        assert st.errors

    def test_interleave(self):
        # A may occur 1-1000 times, B at most once, C exactly once
        letterset = Interleave(Text("A"), Text("B"), Text("C"),
                               repetitions=[(1, 1000), (0, 1), (1, 1)])
        gr = Grammar(letterset)
        st = gr('AABC')
        assert not st.errors
        st = gr('BACAAA')
        assert not st.errors
        st = gr('ABCC')
        assert st.errors
        st = gr('AAACAAA')
        assert not st.errors
        st = gr('AAABAAA')
        assert st.errors


612
613
614
615
class TestErrorRecovery:
    """Tests of resuming after errors via @..._skip directives."""

    def test_series_skip(self):
        lang = """
        document = series | /.*/
        @series_skip = /(?=[A-Z])/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser('AB_D')
        assert len(st.errors) == 1  # no additional "stopped before end"-error!
        resume_notices_on(parser)
        st = parser('AB_D')
        assert len(st.errors) == 2 and any(err.code == RESUME_NOTICE for err in st.errors)
        assert 'Skipping' in str(st.errors_sorted[1])

    def test_Interleave_skip(self):
        lang = """
        document = allof | /.*/
        @allof_skip = /[A-Z]/
        allof = "A" ° §"B" ° "C" ° "D"
        """
        parser = grammar_provider(lang)()
        st = parser('CADB')
        assert 'allof' in st and st['allof'].content == "CADB"
        st = parser('_BCD')
        assert st.equals(parse_sxpr('(document "_BCD")'))
        st = parser('_ABC')
        assert st.equals(parse_sxpr('(document "_ABC")'))
        st = parser('A_CD')
        assert st['allof'].content == "A_CD"
        st = parser('AB_D')
        assert st['allof'].content == "AB_D"
        st = parser('A__D')
        assert st['allof'].content == "A__D"
        st = parser('CA_D')
        assert st['allof'].content == "CA_D"
        st = parser('A_CB')
        assert st['allof'].content == "A_CB"
        st = parser('BC_A')
        assert 'allof' not in st
652
653


654
class TestPopRetrieve:
655
    mini_language = r"""
656
657
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
658
        delimiter      = delimiter_sign  # never use delimiter between capture and pop except for retrival!
659
        delimiter_sign = /`+/
eckhart's avatar
eckhart committed
660
        text           = /[^`]+/
661
        """
662
    mini_lang2 = r"""
663
        @braces_filter = matching_bracket()
664
665
666
667
668
669
670
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """
671
    mini_lang3 = r"""@literalws = right
672
        document       = { text | env }
eckhart's avatar
eckhart committed
673
        env            = (specialtag | opentag) text [ closespecial | closetag ]
674
675
        opentag        = "<" name ">"
        specialtag     = "<" /ABC/ !name ">"
eckhart's avatar
eckhart committed
676
        closetag       = close_slash | close_star
677
678
679
680
681
682
        close_slash    = "<" ::name "/>"
        close_star     = "<" ::name "*>"
        closespecial   = "<" /ABC/~ ">"
        name           = /\w+/~
        text           = /[^<>]+/
        """
683
    mini_lang4 = r"""@literalws = right
684
685
686
687
688
689
690
        document       = { text | env }
        env            = opentag document closetag
        opentag        = "<" name ">"
        closetag       = "</" :?name ">"
        name           = /\w+/~
        text           = /[^<>]+/        
    """
691
692

    def setup(self):
693
694
695
        self.minilang_parser = grammar_provider(self.mini_language)()
        self.minilang_parser2 = grammar_provider(self.mini_lang2)()
        self.minilang_parser3 = grammar_provider(self.mini_lang3)()
696
        self.minilang_parser4 = grammar_provider(self.mini_lang4)()
697
698

    @staticmethod
699
    def has_tag_name(node, name):
700
        return node.tag_name == name # and not isinstance(node.parser, Retrieve)
701

702
703
    def test_capture_assertions(self):
        try:
704
705
706
            _ = Grammar(Capture(Drop(Whitespace(r'\s*'))))
            assert False, "GrammarError expected!"
        except GrammarError as ge:
707
708
            pass
        try:
709
            _ = Grammar(Capture(Series(Text(' '), Drop(Whitespace(r'\s*')))))
710
            assert False, "ValueError expected!"
711
        except GrammarError:
712
            pass
713
714
715
        cp = Capture(RegExp(r'\w+'))
        cp.pname = "capture"
        _ = Grammar(cp)
716

717
718
719
720
    def test_compile_mini_language(self):
        assert self.minilang_parser
        assert self.minilang_parser2
        assert self.minilang_parser3
721
        assert self.minilang_parser4
722
723
724
725

    def test_stackhandling(self):
        ambigous_opening = "<ABCnormal> normal tag <ABCnormal*>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
726
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)
727
728
729

        ambigous_opening = "<ABCnormal> normal tag <ABCnormal/>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
730
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)
731
732
733

        forgot_closing_tag = "<em> where is the closing tag?"
        syntax_tree = self.minilang_parser3(forgot_closing_tag)
734
        assert syntax_tree.error_flag, str(syntax_tree.errors_sorted)
735
736
737

        proper = "<em> has closing tag <em/>"
        syntax_tree = self.minilang_parser3(proper)
738
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)
739
740
741

        proper = "<em> has closing tag <em*>"
        syntax_tree = self.minilang_parser3(proper)
742
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)
743

744
745
746
    def test_optional_match(self):
        test1 = '<info>Hey, you</info>'
        st = self.minilang_parser4(test1)
747
        assert not st.error_flag, str(st.errors_sorted)
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
        test12 = '<info>Hey, <emph>you</emph></info>'
        st = self.minilang_parser4(test1)
        assert not st.error_flag
        test2 = '<info>Hey, you</>'
        st = self.minilang_parser4(test2)
        assert not st.error_flag
        test3 = '<info>Hey, <emph>you</></>'
        st = self.minilang_parser4(test3)
        assert not st.error_flag
        test4 = '<info>Hey, <emph>you</></info>'
        st = self.minilang_parser4(test4)
        assert not st.error_flag

    def test_rollback_behaviour_of_optional_match(self):
        test1 = '<info>Hey, you</info*>'
        st = self.minilang_parser4(test1)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag
        test2 = '<info>Hey, you</*>'
        st = self.minilang_parser4(test2)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag

771
    def test_cache_neutrality(self):
772
773
        """Test that packrat-caching does not interfere with the variable-
        changing parsers: Capture and Retrieve."""
774
        lang = r"""@literalws = right
775
776
777
778
779
780
781
782
783
            text = opening closing
            opening = (unmarked_package | marked_package)
            closing = ::variable
            unmarked_package = package "."
            marked_package = package "*" "."
            package = "(" variable ")"
            variable = /\w+/~
            """
        case = "(secret)*. secret"
784
        gr = grammar_provider(lang)()
785
        st = gr(case)
786
        assert not st.error_flag, str(st.errors_sorted)
787
788
789
790

    def test_single_line(self):
        teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        syntax_tree = self.minilang_parser(teststr)
791
792
        assert not syntax_tree.errors_sorted, \
            ''.join(str(error) for error in syntax_tree.errors_sorted)
793
794
795
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
796
797
        assert delim == pop
        if is_logging():
798
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")
799
800
801
802
803
804
805
806
807
808
809

    def test_multi_line(self):
        teststr = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        syntax_tree = self.minilang_parser(teststr)
810
        assert not syntax_tree.errors_sorted
811
812
813
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
814
815
        assert delim == pop
        if is_logging():
816
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")
817
818
819
820

    def test_single_line_complement(self):
        teststr = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        syntax_tree = self.minilang_parser2(teststr)
821
        assert not syntax_tree.errors_sorted
822
823
824
825
826
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert len(delim) == len(pop)
        assert delim != pop
827
        if is_logging():
828
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")
829
830
831
832
833
834
835
836
837
838
839

    def test_multi_line_complement(self):
        teststr = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        syntax_tree = self.minilang_parser2(teststr)
840
        assert not syntax_tree.errors_sorted
841
842
843
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
844
845
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
846
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")
847

848
    def test_autoretrieve(self):
849
        lang = r"""@literalws = right
850
            document   = { definition } § EOF
851
            definition = symbol :defsign value
852
            symbol     = /\w+/~                      
853
854
            defsign    = "=" | ":="
            value      = /\d+/~
eckhart's avatar
eckhart committed
855
            EOF        = !/./ [ :?defsign ]   # eat up captured defsigns
856
        """
857
858
        # code, _, _ = compile_ebnf(lang)
        # print(code)
859
        parser = grammar_provider(lang)()
860
861
862
        st = parser("X := 1")
        assert not st.error_flag
        st1 = st
863
864
        st = parser("")
        assert not st.error_flag
865
866
867

        lines = [line for line in lang.split('\n') if line.strip()]
        eof_line = lines.pop()
868
        lines.insert(2, eof_line)
869
870
871
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
872
        assert not st.errors, str(st.errors)
873
874
        assert st.equals(st1)

875
876
        del lines[2]
        lines.insert(3, eof_line)
877
878
879
880
881
882
883
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)

        # and, finally...
884
        lang_variant = r"""@literalws = right
885
886
887
888
            document   = { definition } § EOF
            symbol     = /\w+/~                      
            defsign    = "=" | ":="
            value      = /\d+/~
889
            EOF        = !/./ :?defsign   # eat up captured defsign, only if it has been retrieved
890
891
892
893
894
895
            definition = symbol :defsign value
        """
        parser = grammar_provider(lang_variant)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)
896
897
        st = parser('')
        assert "EOF expected" in str(st.errors)


class TestWhitespaceHandling:
    """Checks that insignificant whitespace is consumed after string tokens
    (with @literalws = right) but not after plain regular expressions."""

    minilang = """@literalws = right
        doc = A B
        A = "A"
        B = "B"
        Rdoc = ar br
        ar = /A/
        br = /B/
        """
    # parser is compiled once for the whole class (grammar is static)
    gr = grammar_provider(minilang)()

    def test_token_whitespace(self):
        # token "A" eats trailing whitespace, so both inputs parse
        st = self.gr("AB", 'doc')
        assert not st.error_flag
        st = self.gr("A B", 'doc')
        assert not st.error_flag

    def test_regexp_whitespace(self):
        # bare regexps do not consume trailing whitespace -> "A B" fails
        st = self.gr("AB", 'Rdoc')
        assert not st.error_flag
        st = self.gr("A B", 'Rdoc')
        assert st.error_flag


class TestErrorReporting:
    grammar = """
        root      = series alpha | anything
eckhart's avatar
eckhart committed
927
        series    = subseries &alpha
di68kap's avatar
di68kap committed
928
929
930
931
932
933
934
935
936
937
938
939
940
941
        subseries = alpha §beta
        alpha     = /[a-z]+/
        beta      = /[A-Z]+/
        anything  = /.*/
        """

    def setup(self):
        self.parser = grammar_provider(self.grammar)()

    def test_error_propagation(self):
        testcode1 = "halloB"
        testcode2 = "XYZ"
        testcode3 = "hallo "
        cst = self.parser(testcode1)
942
        assert not cst.error_flag, str(cst.errors_sorted)
di68kap's avatar
di68kap committed
943
944
945
946
        cst = self.parser(testcode2)
        assert not cst.error_flag
        cst = self.parser(testcode3)
        assert cst.error_flag


class TestBorderlineCases:
    """Edge cases: empty input, input the root parser cannot match at all,
    and input that is only partially consumed."""

    def test_not_matching(self):
        minilang = """parser = /X/\n"""
        gr = grammar_provider(minilang)()
        cst = gr('X', 'parser')
        assert not cst.error_flag
        # non-matching as well as empty input stop the parser right away
        cst = gr(' ', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_STOPPED_BEFORE_END
        cst = gr('', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_STOPPED_BEFORE_END

    def test_matching(self):
        minilang = """parser = /.?/"""
        gr = grammar_provider(minilang)()
        cst = gr(' ', 'parser')
        assert not cst.error_flag
        # /.?/ consumes at most one character; a second char remains unparsed
        cst = gr('  ', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_STOPPED_BEFORE_END
        # ... but the empty string is a legitimate (empty) match of /.?/
        cst = gr('', 'parser')
        assert not cst.error_flag


EBNF_with_Errors = r"""# Test code with errors. All places marked by a "$" should yield and error

@ comment    = /#.*(?:\n|$)/
@ whitespace = /\s*/
@ literalws  = right
@ anonymous  = pure_elem, EOF
@ drop       = whitespace, EOF


# re-entry-rules for resuming after parsing-error
@ definition_resume = /\n\s*(?=@|\w+\w*\s*=)/
@ directive_resume  = /\n\s*(?=@|\w+\w*\s*=)/

# specialized error messages for certain cases

@ definition_error  = /,/, 'Delimiter "," not expected in definition!\nEither this was meant to '
                           'be a directive and the directive symbol @ is missing\nor the error is '
                           'due to inconsistent use of the comma as a delimiter\nfor the elements '
                           'of a sequence.'

#: top-level

syntax     = [~//] { definition | directive } §EOF
definition = symbol §:DEF~ expression :ENDL~
directive  = "@" §symbol "="
             (regexp | literals | symbol)
             { "," (regexp | literals | symbol) }

#: components