#!/usr/bin/env python3

"""test_parse.py - tests of the parsers-module of DHParser


Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

eckhart's avatar
eckhart committed
22
import copy
23
import os
24
import sys
25
from functools import partial
26
from typing import List, Tuple
27

28
29
scriptpath = os.path.dirname(__file__) or '.'
sys.path.append(os.path.abspath(os.path.join(scriptpath, '..')))
30

eckhart's avatar
eckhart committed
31
from DHParser.configuration import get_config_value, set_config_value
32
from DHParser.toolkit import compile_python_object, re
33
from DHParser.log import is_logging, log_ST, log_parsing_history
eckhart's avatar
eckhart committed
34
35
36
from DHParser.error import Error, is_error, adjust_error_locations, MANDATORY_CONTINUATION, \
    MALFORMED_ERROR_STRING, MANDATORY_CONTINUATION_AT_EOF, RESUME_NOTICE, PARSER_STOPPED_BEFORE_END, \
    PARSER_NEVER_TOUCHES_DOCUMENT
37
from DHParser.parse import ParserError, Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
eckhart's avatar
eckhart committed
38
    RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative, \
di68kap's avatar
di68kap committed
39
    Interleave, UnknownParserError, CombinedParser, Token, EMPTY_NODE, Capture, Drop, Whitespace, \
40
    GrammarError, Counted, Always, INFINITE
41
from DHParser import compile_source
42
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, \
43
    parse_ebnf, DHPARSER_IMPORTS, compile_ebnf
44
from DHParser.dsl import grammar_provider, create_parser
45
from DHParser.syntaxtree import Node, parse_sxpr
46
from DHParser.stringview import StringView
47
from DHParser.trace import set_tracer, trace_history, resume_notices_on
48
49


50
51
52
53
54
55

class TestWhitespace:
    """Placeholder test class for whitespace/comment handling.

    Both methods are still empty stubs; the actual test cases
    remain to be written.
    """
    # TODO: add test cases here
    def test_whitespace_comment_mangling(self):
        pass

    def test_non_empty_derivation(self):
        pass

59

60
61
class TestParserError:
    """Tests concerning ParserError-exceptions and lookahead-error reporting."""

    def test_parser_error_str(self):
        # The string representation of a ParserError should contain both the
        # text passed to the error and the tag name of the error's node.
        pe = ParserError(Node('TAG', 'test').with_pos(0), StringView('Beispiel'), None, True)
        representation = str(pe)
        assert representation.find('Beispiel') >= 0
        assert representation.find('TAG') >= 0

    def test_false_lookahead_only_message(self):
        """PARSER_LOOKAHEAD_*_ONLY errors must not be reported if there is
        no lookahead parser in the history!"""
        lang = """
        word = letters { letters | `-` letters }
        letters = /[A-Za-z]+/
        """
        parser = grammar_provider(lang)()
        set_tracer(parser, trace_history)
        tree = parser('hard-time')
        assert not tree.errors
        tree = parser('hard-')
        # errors are expected, but none of them may carry code 1045
        # (the "lookahead only"-error), because no lookahead parser occurs
        # in this grammar — TODO confirm 1045 is the lookahead-only code
        assert tree.errors
        assert not any(err.code == 1045 for err in tree.errors)

eckhart's avatar
eckhart committed
79

eckhart's avatar
eckhart committed
80
81
82
83
84
85
86
87
88
class TestParserClass:
    def test_apply(self):
        minilang ="""
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        gr = grammar_provider(minilang)()
        l = []
89
90
        def visitor(context: List[Parser]):
            p = context[-1]
di68kap's avatar
di68kap committed
91
            l.append(p.pname + p.ptype)
eckhart's avatar
eckhart committed
92
93
94
95
96
97
98
99
100
101
        gr.root__.apply(visitor)
        s1 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s2 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s3 = ", ".join(l)
        assert s1 == s2 == s3

102
103
104
105
106
107
108
109
    def test_symbol(self):
        class MyGrammar(Grammar):
            wrong = Token('wrong')
            word = OneOrMore(wrong) + Whitespace(r'\s*') + OneOrMore(RegExp(r'\w+'))
            root__ = word
        gr = MyGrammar()
        regex = gr['word'].parsers[-1].parser
        result = gr.associated_symbol(regex).symbol
110
        assert result == 'word', result
111

eckhart's avatar
eckhart committed
112

113
class TestInfiLoopsAndRecursion:
    """Tests of left-recursion support and automatic infinite-loop breaking."""

    def test_direct_left_recursion1(self):
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        tree = parser(snippet)
        assert not is_error(tree.error_flag), str(tree.errors_sorted)
        # the parsed tree must reproduce the input verbatim
        assert snippet == tree.content, str(tree)
        if is_logging():
            log_ST(tree, "test_LeftRecursion_direct.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct")

    def test_direct_left_recursion2(self):
        # same as above, but with the left-recursion hidden behind a synonym
        minilang = """
            expr = ex
            ex   = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        tree = parser(snippet)
        assert not is_error(tree.error_flag), tree.errors_sorted
        assert snippet == tree.content

    def test_indirect_left_recursion1(self):
        minilang = """
            Expr    = //~ (Product | Sum | Value)
            Product = Expr { ('*' | '/') Expr }+
            Sum     = Expr { ('+' | '-') Expr }+
            Value   = /[0-9.]+/~ | '(' §Expr ')'
            """
        parser = grammar_provider(minilang)()
        for snippet in ("8 * 4", "7 + 8 * 4", "9 + 8 * (4 + 3)"):
            tree = parser(snippet)
            assert not is_error(tree.error_flag), tree.errors_sorted
        # the last (and most complex) snippet must round-trip verbatim
        assert snippet == tree.content
        if is_logging():
            log_ST(tree, "test_LeftRecursion_indirect.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect")

    # # BEWARE: EXPERIMENTAL TEST can be long running
    # def test_indirect_left_recursion2(self):
    #     arithmetic_syntax = """
    #         expression     = addition | subtraction
    #         addition       = (expression | term) "+" (expression | term)
    #         subtraction    = (expression | term) "-" (expression | term)
    #         term           = multiplication | division
    #         multiplication = (term | factor) "*" (term | factor)
    #         division       = (term | factor) "/" (term | factor)
    #         factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
    #         group          = "(" expression ")"
    #         SIGN           = /[+-]/
    #         NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
    #         VARIABLE       = /[A-Za-z]/~
    #         """
    #     arithmetic = grammar_provider(arithmetic_syntax)()
    #     arithmetic.left_recursion_depth__ = 2
    #     assert arithmetic
    #     syntax_tree = arithmetic("(a + b) * (a - b)")
    #     assert syntax_tree.errors

    def test_break_inifnite_loop_ZeroOrMore(self):
        # a zero-length match repeated forever must be broken automatically
        looper = ZeroOrMore(RegExp(''))
        result = Grammar(looper)('')
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_inifnite_loop_OneOrMore(self):
        # same automatic loop-breaking for OneOrMore
        looper = OneOrMore(RegExp(''))
        result = Grammar(looper)('')
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Counted(self):
        # if any of these takes very long, something is wrong
        for bounds in ((0, INFINITE), (5, INFINITE),
                       (INFINITE, INFINITE), (1000, INFINITE - 1)):
            looper = Counted(Always(), bounds)
            result = Grammar(looper)('')
            assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Interleave(self):
        # if any of these takes very long, something is wrong
        looper = Interleave(Always(), repetitions=[(0, INFINITE)])
        result = Grammar(looper)('')
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        looper = Interleave(Always(), Always(),
                            repetitions=[(5, INFINITE), (INFINITE, INFINITE)])
        result = Grammar(looper)('')
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        looper = Interleave(Always(), repetitions=[(1000, INFINITE - 1)])
        result = Grammar(looper)('')
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    # def test_infinite_loops(self):
    #     minilang = """forever = { // } \n"""
    #     try:
    #         parser_class = grammar_provider(minilang)
    #     except CompilationError as error:
    #         assert all(e.code == INFINITE_LOOP for e in error.errors)
    #     save = get_config_value('static_analysis')
    #     set_config_value('static_analysis', 'late')
    #     provider = grammar_provider(minilang)
    #     try:
    #         parser = provider()
    #     except GrammarError as error:
    #         assert error.errors[0][2].code == INFINITE_LOOP
    #     set_config_value('static_analysis', 'none')
    #     parser = provider()
    #     snippet = " "
    #     syntax_tree = parser(snippet)
    #     assert any(e.code == INFINITE_LOOP for e in syntax_tree.errors)
    #     res = parser.static_analysis()
    #     assert res and res[0][2].code == INFINITE_LOOP
    #     minilang = """not_forever = { / / } \n"""
    #     parser = grammar_provider(minilang)()
    #     res = parser.static_analysis()
    #     assert not res
    #     set_config_value('static_analysis', save)
248

eckhart's avatar
eckhart committed
249

250
251
252
253
254
255
# class TestStaticAnalysis:
#     def test_alternative(self):
#         lang = 'doc = "A" | "AB"'
#         parser = create_parser(lang)


Eckhart Arnold's avatar
Eckhart Arnold committed
256
class TestFlowControl:
    """Tests of the flow-control-parsers Lookbehind and NegativeLookahead."""

    t1 = """
         All work and no play
         makes Jack a dull boy
         END
         """
    t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind(self):
        ws = RegExp(r'\s*')
        ws.pname = "ws"
        end = RegExp("END")
        end.pname = "end"
        # "END" only counts as document end if preceded by a line break
        doc_end = Lookbehind(RegExp('\\s*?\\n')) + end
        word = RegExp(r'\w+')
        word.pname = "word"
        sequence = OneOrMore(NegativeLookahead(end) + word + ws)
        document = ws + sequence + doc_end + ws
        parser = Grammar(document)

        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()

        # partial parsing with an explicitly selected start parser
        cst = parser(self.t2, parser['ws'], complete_match=False)
        assert cst.did_match() and len(cst) == 0 and not cst.errors
        cst = parser(self.t2, parser['word'], complete_match=False)
        assert cst.did_match() and cst.content == "All" and not cst.errors
        cst = parser(self.t2, parser['end'], complete_match=False)
        assert not cst.did_match()

    def test_lookbehind_indirect(self):
        # same grammar as above, but defined declaratively via subclassing
        class LookbehindTestGrammar(Grammar):
            parser_initialization__ = ["upon instantiation"]
            ws = RegExp(r'\s*')
            end = RegExp('END')
            SUCC_LB = RegExp('\\s*?\\n')
            doc_end = Series(Lookbehind(SUCC_LB), end)
            word = RegExp(r'\w+')
            sequence = OneOrMore(Series(NegativeLookahead(end), word, ws))
            document = Series(ws, sequence, doc_end, ws)
            root__ = document

        parser = LookbehindTestGrammar()
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()


304
305
306
307
308
309
310
class TestRegex:
    """Tests of regular-expression parsing, including multiline regexes."""

    @staticmethod
    def _compile_grammar(ebnf_src, compiler_name):
        """Compile `ebnf_src` and return an instance of the generated
        grammar class; fails the test if compilation reports errors."""
        result, messages, _ = compile_source(
            ebnf_src, None, get_ebnf_grammar(),
            get_ebnf_transformer(), get_ebnf_compiler(compiler_name))
        assert result
        assert not messages, str(messages)
        return compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()

    def test_multilineRegex(self):
        mlregex = r"""
        regex =  /\w+    # one or more alphabetical characters including the underscore
                  [+]    # followed by a plus sign
                  \w*    # possibly followed by more alpha chracters/
        """
        parser = self._compile_grammar(mlregex, 'MultilineRegexTest')
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_multilineRegex_wo_Comments(self):
        mlregex = r"""
        regex =  /\w+
                  [+]
                  \w* /
        """
        parser = self._compile_grammar(mlregex, 'MultilineRegexTest')
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_ignore_case(self):
        # with @ignorecase = True, 'Alpha' must match /alpha/
        mlregex = r"""
        @ ignorecase = True
        regex = /alpha/
        """
        parser = self._compile_grammar(mlregex, 'MultilineRegexTest')
        node, rest = parser.regex(StringView('Alpha'))
        assert node
        assert rest == ''
        assert node.tag_name == "regex"
        assert str(node) == 'Alpha'

        # with @ignorecase = False, the very same match must fail
        mlregex = r"""
        @ ignorecase = False
        regex = /alpha/
        """
        parser = self._compile_grammar(mlregex, 'MultilineRegexTest')
        node, rest = parser.regex(StringView('Alpha'))
        assert node is None

    def test_token(self):
        tokenlang = r"""
            @whitespace = linefeed
            lang        = "" begin_token {/\w+/ ""} end_token
            begin_token = "\begin{document}"
            end_token   = "\end{document}"
            """
        testdoc = r"""
            \begin{document}
            test
            \end{document}
            """
        parser = self._compile_grammar(tokenlang, "TokenTest")
        result = parser(testdoc)
        # log_parsing_history(parser, "test.log")
        assert not result.error_flag, str(result.errors_sorted)
386

387

388
class TestGrammar:
    """Tests of the Grammar class: position values, sub-parser selection,
    subclassing, partial matching and synonym-definitions."""

    # this grammar is compiled once and shared by several test methods
    grammar = r"""@whitespace = horizontal
    haupt        = textzeile LEERZEILE
    textzeile    = { WORT }+
    WORT         = /[^ \t]+/~
    LEERZEILE    = /\n[ \t]*(?=\n)/~
    """
    pyparser, messages, _ = compile_source(grammar, None, get_ebnf_grammar(),
                                           get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
    assert pyparser
    assert not messages, str(messages)

    def test_pos_values_initialized(self):
        # checks whether pos values in the parsing result and in the
        # history records have been initialized
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("no_file_name*")
        for record in grammar.history__:
            assert not record.node or record.node.pos >= 0

    def test_select_parsing(self):
        # parsing may start at any named parser, not just the root
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("wort", "WORT")
        grammar("eine Zeile", "textzeile")
        grammar("kein Haupt", "haupt")
        grammar("so ist es richtig", "haupt")

    def test_grammar_subclassing(self):
        class Arithmetic(Grammar):
            r'''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
            root__ = expression

        grammar = Arithmetic()
        CST = grammar('3+4')
        assert not CST.error_flag, CST.as_sxpr()

    def test_incomplete_matching(self):
        """Tests whether the flag `complete_match` works as expected when
        calling a grammar object in order to parse a document."""
        parser = grammar_provider('word = ~/\\w+/\n')()
        tree = parser('eins')
        assert not tree.errors
        tree = parser('eins zwei')
        assert tree.errors[0].code == PARSER_STOPPED_BEFORE_END
        tree = parser('eins zwei', complete_match=False)
        assert not tree.errors

    def test_synonym(self):
        lang = r"""
            doc  = { word | number }
            word = /\w+/ S
            number = [VZ] /\d+/ S 
            S    = ~        # let S by a synonym for anonymous whitespace
            VZ   = "-"
        """
        parser = grammar_provider(lang)()
        tree = parser('eins 1 zwei2drei 3')
        # set_config_value('compiled_EBNF_log', 'grammar.log')
        parser = grammar_provider("@drop = whitespace, token" + lang)()
        tree = parser('eins 1 zwei2drei 3')
        tree = parser('-3')
        assert str(parser['S']) == "S = ~", str(parser['S'])
460

461

462
463
464
465
466
467
468
class TestSeries:
    """Tests of the Series parser and the mandatory-operator (§)."""

    def test_non_mandatory(self):
        lang = """
        document = series | /.*/
        series = "A" "B" "C" "D"
        """
        parser = grammar_provider(lang)()
        # without a §-marker any non-match simply falls through to /.*/
        st = parser("ABCD")
        assert not st.error_flag
        st = parser("A_CD")
        assert not st.error_flag
        st = parser("AB_D")
        assert not st.error_flag

    def test_mandatory(self):
        """Test for the §-operator. The Series-parser should raise an
        error for any non-match that occurs after the mandatory-operator.
        """
        lang = """
        document = series | /.*/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD")
        assert not st.error_flag
        st = parser("A_CD")
        assert not st.error_flag
        st = parser("AB_D")
        assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION
        # transitivity of mandatory-operator
        st = parser("ABC_")
        assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

    def test_series_composition(self):
        TA, TB, TC, TD, TE = (TKN(b) for b in "ABCDE")
        s1 = Series(TA, TB, TC, mandatory=2)
        s2 = Series(TD, TE)

        # the mandatory-marker of s1 must survive appending s2
        combined = Alternative(s1 + s2, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("ABCDE")
        assert not st.error_flag
        st = parser("A_CDE")
        assert not st.error_flag
        st = parser("AB_DE")
        assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION
        st = parser("ABC_E")
        assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

        # when s1 comes second, the marker shifts accordingly
        combined = Alternative(s2 + s1, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("DEABC")
        assert not st.error_flag
        st = parser("_EABC")
        assert not st.error_flag
        st = parser("D_ABC")
        assert not st.error_flag
        st = parser("DE_BC")
        assert not st.error_flag
        st = parser("DEA_C")
        assert not st.error_flag
        st = parser("DEAB_")
        assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

    # def test_boundary_cases(self):
    #     lang = """
    #     document = series | §!single | /.*/
    #     series = "A" "B" §"C" "D"
    #     single = "E"
    #     """
    #     parser_class = grammar_provider(lang)
    #     parser = parser_class()
    #     print(parser.python_src__)
    #     print(parser_class.python_src__)

    def test_ebnf_serialization(self):
        ebnf_grammar = get_ebnf_grammar()
        # TODO: Add test here
        ebnf = ebnf_grammar.as_ebnf()
        # print(ebnf)
533
534


535
536
537
class TestAllOfSomeOf:
    """Tests of Interleave used in 'all of' and 'some of' configurations."""

    def test_allOf_order(self):
        """Parsers of an AllOf-list must match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'

    def test_allOf_completeness(self):
        """An error must be raised if not all parsers of an AllOf-list
        match."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('B').error_flag

    def test_allOf_redundance(self):
        """One and the same parser may be listed several times and must
        then be matched as many times."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag

    def test_someOf_order(self):
        """Parsers of a SomeOf-list must match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'
        tree = Grammar(prefixes)('B')
        assert tree.error_flag
        # with (0, 1)-repetitions each sub-parser becomes optional
        prefixes = Interleave(TKN("B"), TKN("A"), repetitions=((0, 1), (0, 1)))
        assert Grammar(prefixes)('A B').content == 'A B'
        tree = Grammar(prefixes)('B')
        assert not tree.error_flag
        assert tree.content == 'B'

    def test_someOf_redundance(self):
        """One and the same parser may be listed several times and must
        then be matched as many times."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag


580
581
582
583
584
class TestInterleave:
    """Tests of the Interleave-parser with and without repetition bounds."""

    def test_interleave_most_simple(self):
        letterset = Interleave(Token("A"), Token("B"), Token("C"))
        parser = Grammar(letterset)
        # any permutation of the three letters is accepted ...
        tree = parser('ABC')
        assert not tree.errors, str(tree.errors)
        assert tree.content == "ABC"
        tree = parser('BCA')
        assert not tree.errors
        assert tree.content == "BCA"
        # ... but repetitions and omissions are not
        tree = parser('BCBA')
        assert tree.errors
        tree = parser('AB')
        assert tree.errors

    def test_interleave(self):
        # "A" may occur 1-1000 times, "B" at most once, "C" exactly once
        letterset = Interleave(Token("A"), Token("B"), Token("C"),
                               repetitions=[(1, 1000), (0, 1), (1, 1)])
        parser = Grammar(letterset)
        tree = parser('AABC')
        assert not tree.errors
        tree = parser('BACAAA')
        assert not tree.errors
        tree = parser('ABCC')
        assert tree.errors
        tree = parser('AAACAAA')
        assert not tree.errors
        tree = parser('AAABAAA')
        assert tree.errors


611
612
613
614
class TestErrorRecovery:
    """Tests of error recovery via @..._skip-directives."""

    def test_series_skip(self):
        lang = """
        document = series | /.*/
        @series_skip = /(?=[A-Z])/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        tree = parser('AB_D')
        assert len(tree.errors) == 1  # no additional "stopped before end"-error!
        # with resume notices enabled, a second (notice-)error appears
        resume_notices_on(parser)
        tree = parser('AB_D')
        assert len(tree.errors) == 2
        assert any(err.code == RESUME_NOTICE for err in tree.errors)
        assert 'Skipping' in str(tree.errors_sorted[1])

    def test_Interleave_skip(self):
        lang = """
        document = allof | /.*/
        @allof_skip = /[A-Z]/
        allof = "A" ° §"B" ° "C" ° "D"
        """
        parser = grammar_provider(lang)()
        # all four letters present (in any order): a full 'allof'-match
        tree = parser('CADB')
        assert 'allof' in tree and tree['allof'].content == "CADB"
        # a leading failure before the mandatory part: no 'allof'-node at all
        tree = parser('_BCD')
        assert tree.equals(parse_sxpr('(document "_BCD")'))
        tree = parser('_ABC')
        assert tree.equals(parse_sxpr('(document "_ABC")'))
        # recovery inside 'allof': the node covers the complete input
        for doc in ('A_CD', 'AB_D', 'A__D', 'CA_D', 'A_CB'):
            tree = parser(doc)
            assert tree['allof'].content == doc
        tree = parser('BC_A')
        assert 'allof' not in tree
651
652


653
class TestPopRetrieve:
    """Tests for the variable-handling parsers Capture (symbol), Retrieve
    (:symbol / :?symbol) and Pop (::symbol), e.g. for matching opening and
    closing delimiters of varying length."""

    mini_language = r"""
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
        delimiter      = delimiter_sign  # never use delimiter between capture and pop except for retrival!
        delimiter_sign = /`+/
        text           = /[^`]+/
        """
    mini_lang2 = r"""
        @braces_filter = matching_bracket()
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """
    mini_lang3 = r"""
        document       = { text | env }
        env            = (specialtag | opentag) text [ closespecial | closetag ]
        opentag        = "<" name ">"
        specialtag     = "<" /ABC/ !name ">"
        closetag       = close_slash | close_star
        close_slash    = "<" ::name "/>"
        close_star     = "<" ::name "*>"
        closespecial   = "<" /ABC/~ ">"
        name           = /\w+/~
        text           = /[^<>]+/
        """
    mini_lang4 = r"""
        document       = { text | env }
        env            = opentag document closetag
        opentag        = "<" name ">"
        closetag       = "</" :?name ">"
        name           = /\w+/~
        text           = /[^<>]+/        
    """

    def setup(self):
        self.minilang_parser = grammar_provider(self.mini_language)()
        self.minilang_parser2 = grammar_provider(self.mini_lang2)()
        self.minilang_parser3 = grammar_provider(self.mini_lang3)()
        self.minilang_parser4 = grammar_provider(self.mini_lang4)()

    @staticmethod
    def has_tag_name(node, name):
        return node.tag_name == name # and not isinstance(node.parser, Retrieve)

    def test_capture_assertions(self):
        # Capturing dropped content is meaningless and must be rejected.
        try:
            _ = Grammar(Capture(Drop(Whitespace(r'\s*'))))
            assert False, "GrammarError expected!"
        except GrammarError:
            pass
        try:
            _ = Grammar(Capture(Series(Token(' '), Drop(Whitespace(r'\s*')))))
            assert False, "ValueError expected!"
        except GrammarError:
            pass
        # a named Capture of retained content is fine
        cp = Capture(RegExp(r'\w+'))
        cp.pname = "capture"
        _ = Grammar(cp)

    def test_compile_mini_language(self):
        assert self.minilang_parser
        assert self.minilang_parser2
        assert self.minilang_parser3
        assert self.minilang_parser4

    def test_stackhandling(self):
        # "ABC..."-tags are ambiguous w.r.t. specialtag vs. opentag; the
        # variable stack must stay consistent in all of these cases.
        ambigous_opening = "<ABCnormal> normal tag <ABCnormal*>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        ambigous_opening = "<ABCnormal> normal tag <ABCnormal/>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        forgot_closing_tag = "<em> where is the closing tag?"
        syntax_tree = self.minilang_parser3(forgot_closing_tag)
        assert syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em/>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em*>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

    def test_optional_match(self):
        # closetag uses the optional retrieve ":?name", so "</>" matches, too
        test1 = '<info>Hey, you</info>'
        st = self.minilang_parser4(test1)
        assert not st.error_flag, str(st.errors_sorted)
        test12 = '<info>Hey, <emph>you</emph></info>'
        st = self.minilang_parser4(test12)  # fix: originally re-parsed test1
        assert not st.error_flag
        test2 = '<info>Hey, you</>'
        st = self.minilang_parser4(test2)
        assert not st.error_flag
        test3 = '<info>Hey, <emph>you</></>'
        st = self.minilang_parser4(test3)
        assert not st.error_flag
        test4 = '<info>Hey, <emph>you</></info>'
        st = self.minilang_parser4(test4)
        assert not st.error_flag

    def test_rollback_behaviour_of_optional_match(self):
        # even after a failed match, no captured variable may linger
        test1 = '<info>Hey, you</info*>'
        st = self.minilang_parser4(test1)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag
        test2 = '<info>Hey, you</*>'
        st = self.minilang_parser4(test2)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag

    def test_cache_neutrality(self):
        """Test that packrat-caching does not interfere with the variable-
        changing parsers: Capture and Retrieve."""
        lang = r"""
            text = opening closing
            opening = (unmarked_package | marked_package)
            closing = ::variable
            unmarked_package = package "."
            marked_package = package "*" "."
            package = "(" variable ")"
            variable = /\w+/~
            """
        case = "(secret)*. secret"
        gr = grammar_provider(lang)()
        st = gr(case)
        assert not st.error_flag, str(st.errors_sorted)

    def test_single_line(self):
        teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted, \
            ''.join(str(error) for error in syntax_tree.errors_sorted)
        # first and last "delimiter"-node must carry the same delimiter string
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line(self):
        teststr = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_single_line_complement(self):
        # matching_bracket()-filter: closing braces mirror the opening ones,
        # so the retrieved text differs from the captured text
        teststr = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert len(delim) == len(pop)
        assert delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line_complement(self):
        teststr = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_autoretrieve(self):
        lang = r"""
            document   = { definition } § EOF
            definition = symbol :defsign value
            symbol     = /\w+/~                      
            defsign    = "=" | ":="
            value      = /\d+/~
            EOF        = !/./ [ :?defsign ]   # eat up captured defsigns
        """
        # code, _, _ = compile_ebnf(lang)
        # print(code)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.error_flag
        st1 = st
        st = parser("")
        assert not st.error_flag

        # the result must not depend on where the EOF-definition occurs
        lines = [line for line in lang.split('\n') if line.strip()]
        eof_line = lines.pop()
        lines.insert(1, eof_line)
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)

        del lines[1]
        lines.insert(2, eof_line)
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)

        # and, finally...
        lang_variant = r"""
            document   = { definition } § EOF
            symbol     = /\w+/~                      
            defsign    = "=" | ":="
            value      = /\d+/~
            EOF        = !/./ :?defsign   # eat up captured defsign, only if it has been retrieved
            definition = symbol :defsign value
        """
        parser = grammar_provider(lang_variant)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)
        st = parser('')
        assert "EOF expected" in str(st.errors)

class TestWhitespaceHandling:
    """Checks that insignificant whitespace is consumed by string-tokens
    ("...") but not by plain regular-expression-parsers (/.../)."""

    minilang = """
        doc = A B
        A = "A"
        B = "B"
        Rdoc = ar br
        ar = /A/
        br = /B/
        """
    gr = grammar_provider(minilang)()

    def test_token_whitespace(self):
        # string-tokens implicitly eat adjacent insignificant whitespace
        st = self.gr("AB", 'doc')
        assert not st.error_flag
        st = self.gr("A B", 'doc')
        assert not st.error_flag

    def test_regexp_whitespace(self):
        # bare regular expressions do not consume any whitespace
        st = self.gr("AB", 'Rdoc')
        assert not st.error_flag
        st = self.gr("A B", 'Rdoc')
        assert st.error_flag

class TestErrorReporting:
    """Tests that errors raised behind a lookahead (&) do not erroneously
    propagate to the calling parsers."""

    grammar = """
        root      = series alpha | anything
        series    = subseries &alpha
        subseries = alpha §beta
        alpha     = /[a-z]+/
        beta      = /[A-Z]+/
        anything  = /.*/
        """

    def setup(self):
        self.parser = grammar_provider(self.grammar)()

    def test_error_propagation(self):
        testcode1 = "halloB"   # fails only the &alpha-lookahead -> fallback
        testcode2 = "XYZ"      # matched by the "anything"-fallback
        testcode3 = "hallo "   # mandatory (§) beta missing -> genuine error
        cst = self.parser(testcode1)
        assert not cst.error_flag, str(cst.errors_sorted)
        cst = self.parser(testcode2)
        assert not cst.error_flag
        cst = self.parser(testcode3)
        assert cst.error_flag

class TestBorderlineCases:
    """Tests parsing of empty documents and documents that are not fully
    matched; both should yield PARSER_STOPPED_BEFORE_END where appropriate."""

    def test_not_matching(self):
        minilang = """parser = /X/\n"""
        gr = grammar_provider(minilang)()
        cst = gr('X', 'parser')
        assert not cst.error_flag
        cst = gr(' ', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_STOPPED_BEFORE_END
        cst = gr('', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_STOPPED_BEFORE_END

    def test_matching(self):
        # /.?/ matches at most one character, including the empty string
        minilang = """parser = /.?/"""
        gr = grammar_provider(minilang)()
        cst = gr(' ', 'parser')
        assert not cst.error_flag
        cst = gr('  ', 'parser')
        # only the first character is consumed -> "stopped before end"
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_STOPPED_BEFORE_END
        cst = gr('', 'parser')
        assert not cst.error_flag

EBNF_with_Errors = r"""# Test code with errors. All places marked by a "$" should yield and error

@ comment    = /#.*(?:\n|$)/
@ whitespace = /\s*/
@ literalws  = right
@ anonymous  = pure_elem, EOF
@ drop       = whitespace, EOF


# re-entry-rules for resuming after parsing-error
@ definition_resume = /\n\s*(?=@|\w+\w*\s*=)/
@ directive_resume  = /\n\s*(?=@|\w+\w*\s*=)/

# specialized error messages for certain cases

@ definition_error  = /,/, 'Delimiter "," not expected in definition!\nEither this was meant to '
                           'be a directive and the directive symbol @ is missing\nor the error is '
                           'due to inconsistent use of the comma as a delimiter\nfor the elements '
                           'of a sequence.'

#: top-level

syntax     = [~//] { definition | directive } §EOF
definition = symbol §:DEF~ expression :ENDL~
directive  = "@" §symbol "="
             (regexp | literals | symbol)
             { "," (regexp | literals | symbol) }

#: components

expression = sequence { :OR~ sequence }