#!/usr/bin/python3

"""test_parse.py - tests of the parsers-module of DHParser

Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import os
import sys
from functools import partial

scriptpath = os.path.dirname(__file__) or '.'
sys.path.append(os.path.abspath(os.path.join(scriptpath, '..')))

from DHParser.configuration import get_config_value, set_config_value
from DHParser.toolkit import compile_python_object, re
from DHParser.log import is_logging, log_ST, log_parsing_history
from DHParser.error import Error, is_error
from DHParser.parse import ParserError, Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
    RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative, AllOf, SomeOf, \
    UnknownParserError, MetaParser, EMPTY_NODE
from DHParser import compile_source
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, DHPARSER_IMPORTS
from DHParser.dsl import grammar_provider
from DHParser.syntaxtree import Node, parse_sxpr
from DHParser.stringview import StringView

class TestWhitespace:
    """Placeholder test class for whitespace/comment handling."""
    # TODO: add test cases here
    def test_whitespace_comment_mangling(self):
        # placeholder - not yet implemented
        pass

    def test_non_empty_derivation(self):
        # placeholder - not yet implemented
        pass

class TestParserError:
    """Tests of the ParserError exception."""

    def test_parser_error_str(self):
        """The string form of a ParserError should mention both the node's
        tag name and the text at the error location."""
        error = ParserError(Node('TAG', 'test').with_pos(0), StringView('Beispiel'), None, True)
        message = str(error)
        assert 'Beispiel' in message
        assert 'TAG' in message

    def test_false_lookahead_only_message(self):
        """Error.PARSER_LOOKAHEAD_*_ONLY errors must not be reported when
        there is no lookahead parser in the history."""
        lang = """
        word = letters { letters | `-` letters }
        letters = /[A-Za-z]+/
        """
        parser = grammar_provider(lang)()
        tree = parser('hard-time', track_history=True)
        assert not tree.errors
        tree = parser('hard-', track_history=True)
        assert tree.errors
        # 1045 is the lookahead-only error code — presumably
        # Error.PARSER_LOOKAHEAD_*_ONLY; TODO: use the named constant
        assert all(e.code != 1045 for e in tree.errors)

class TestParserClass:
    """Tests of methods of the Parser base class."""

    def test_apply(self):
        """Repeated calls of Parser.apply() on the same grammar must visit
        the parsers in the same order every time."""
        minilang ="""
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        grammar = grammar_provider(minilang)()
        visited = []

        def visitor(p: Parser):
            visited.append(p.pname + p.ptype)

        # run the traversal three times and compare the recorded orders
        snapshots = []
        for _ in range(3):
            visited.clear()
            grammar.root__.apply(visitor)
            snapshots.append(", ".join(visited))
        assert snapshots[0] == snapshots[1] == snapshots[2]


class TestInfiLoopsAndRecursion:
    """Tests of left-recursion support and automatic infinite-loop breaking."""

    def test_direct_left_recursion1(self):
        # `expr` refers to itself in its own first position
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), str(syntax_tree.errors_sorted)
        # the parsed tree must reproduce the input text exactly
        assert snippet == syntax_tree.content, str(syntax_tree)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct")

    def test_direct_left_recursion2(self):
        # same as above, but with an additional indirection through `ex`
        minilang = """
            expr = ex
            ex   = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content

    def test_indirect_left_recursion1(self):
        # Expr is left-recursive only indirectly, via Product/Sum
        minilang = """
            Expr    = //~ (Product | Sum | Value)
            Product = Expr { ('*' | '/') Expr }+
            Sum     = Expr { ('+' | '-') Expr }+
            Value   = /[0-9.]+/~ | '(' §Expr ')'
            """
        parser = grammar_provider(minilang)()
        assert parser
        snippet = "8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "7 + 8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "9 + 8 * (4 + 3)"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect")

    # # BEWARE: EXPERIMENTAL TEST can be long running
    # def test_indirect_left_recursion2(self):
    #     arithmetic_syntax = """
    #         expression     = addition | subtraction
    #         addition       = (expression | term) "+" (expression | term)
    #         subtraction    = (expression | term) "-" (expression | term)
    #         term           = multiplication | division
    #         multiplication = (term | factor) "*" (term | factor)
    #         division       = (term | factor) "/" (term | factor)
    #         factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
    #         group          = "(" expression ")"
    #         SIGN           = /[+-]/
    #         NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
    #         VARIABLE       = /[A-Za-z]/~
    #         """
    #     arithmetic = grammar_provider(arithmetic_syntax)()
    #     arithmetic.left_recursion_depth__ = 2
    #     assert arithmetic
    #     syntax_tree = arithmetic("(a + b) * (a - b)")
    #     assert syntax_tree.errors

    def test_break_inifnite_loop_ZeroOrMore(self):
        forever = ZeroOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(:EMPTY__, )", repr(result)

    def test_break_inifnite_loop_OneOrMore(self):
        forever = OneOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(:EMPTY__, )", str(result)

    # def test_infinite_loops(self):
    #     minilang = """forever = { // } \n"""
    #     try:
    #         parser_class = grammar_provider(minilang)
    #     except CompilationError as error:
    #         assert all(e.code == Error.INFINITE_LOOP for e in error.errors)
    #     save = get_config_value('static_analysis')
    #     set_config_value('static_analysis', 'late')
    #     provider = grammar_provider(minilang)
    #     try:
    #         parser = provider()
    #     except GrammarError as error:
    #         assert error.errors[0][2].code == Error.INFINITE_LOOP
    #     set_config_value('static_analysis', 'none')
    #     parser = provider()
    #     snippet = " "
    #     syntax_tree = parser(snippet)
    #     assert any(e.code == Error.INFINITE_LOOP for e in syntax_tree.errors)
    #     res = parser.static_analysis()
    #     assert res and res[0][2].code == Error.INFINITE_LOOP
    #     minilang = """not_forever = { / / } \n"""
    #     parser = grammar_provider(minilang)()
    #     res = parser.static_analysis()
    #     assert not res
    #     set_config_value('static_analysis', save)
class TestFlowControl:
    """Tests of the lookahead/lookbehind flow-control parsers."""

    def setup(self):
        # t1 ends with END on its own line (preceded by a newline) — valid;
        # t2 has END at the end of the same line — must be rejected below.
        self.t1 = """
        All work and no play
        makes Jack a dull boy
        END
        """
        self.t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind(self):
        ws = RegExp(r'\s*')
        end = RegExp("END")
        # END only counts if preceded by a line break (lookbehind)
        doc_end = Lookbehind(RegExp('\\s*?\\n')) + end
        word = RegExp(r'\w+')
        sequence = OneOrMore(NegativeLookahead(end) + word + ws)
        document = ws + sequence + doc_end + ws

        parser = Grammar(document)
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()

    def test_lookbehind_indirect(self):
        # same grammar as above, but built as a Grammar subclass where the
        # lookbehind target (SUCC_LB) is a named class-level parser
        class LookbehindTestGrammar(Grammar):
            parser_initialization__ = ["upon instantiation"]
            ws = RegExp(r'\s*')
            end = RegExp('END')
            SUCC_LB = RegExp('\\s*?\\n')
            doc_end = Series(Lookbehind(SUCC_LB), end)
            word = RegExp(r'\w+')
            sequence = OneOrMore(Series(NegativeLookahead(end), word, ws))
            document = Series(ws, sequence, doc_end, ws)
            root__ = document

        parser = LookbehindTestGrammar()
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()


class TestRegex:
    """Tests of regular-expression parsing in EBNF grammars."""

    def test_multilineRegex(self):
        """Regexes spread over several lines with #-comments must compile
        and match just like single-line regexes."""
        mlregex = r"""
        regex =  /\w+    # one or more alphabetical characters including the underscore
                  [+]    # followed by a plus sign
                  \w*    # possibly followed by more alpha chracters/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_multilineRegex_wo_Comments(self):
        """Multiline regexes without comments must work as well."""
        mlregex = r"""
        regex =  /\w+
                  [+]
                  \w* /
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_ignore_case(self):
        """The ``@ ignorecase`` directive toggles case-insensitive matching.

        NOTE(review): this method was previously named ``text_ignore_case``,
        so the test framework never collected it; renamed so it actually runs.
        """
        mlregex = r"""
        @ ignorecase = True
        regex = /alpha/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        # with ignorecase, /alpha/ must match 'Alpha' completely
        node, rest = parser.regex('Alpha')
        assert node
        assert not node.error_flag
        assert rest == ''
        assert node.tag_name == "regex"
        assert str(node) == 'Alpha'

        mlregex = r"""
        @ ignorecase = False
        regex = /alpha/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        # without ignorecase, 'Alpha' must not match /alpha/
        node, rest = parser.regex('Alpha')
        assert node.error_flag

    def test_token(self):
        """String tokens containing backslashes (e.g. LaTeX commands) must
        be matched literally."""
        tokenlang = r"""
            @whitespace = linefeed
            lang        = "" begin_token {/\w+/ ""} end_token
            begin_token = "\begin{document}"
            end_token   = "\end{document}"
            """
        testdoc = r"""
            \begin{document}
            test
            \end{document}
            """
        result, messages, _ = compile_source(
            tokenlang, None, get_ebnf_grammar(), get_ebnf_transformer(),
            get_ebnf_compiler("TokenTest"))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        result = parser(testdoc)
        # log_parsing_history(parser, "test.log")
        assert not result.error_flag

class TestGrammar:
    """Tests of the Grammar class and grammar objects."""

    def setup(self):
        # compile the EBNF source once; each test re-instantiates the
        # generated grammar class from the compiled python source
        grammar = r"""@whitespace = horizontal
        haupt        = textzeile LEERZEILE
        textzeile    = { WORT }+
        WORT         = /[^ \t]+/~
        LEERZEILE    = /\n[ \t]*(?=\n)/~
        """
        self.pyparser, messages, _ = compile_source(grammar, None, get_ebnf_grammar(),
                                    get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
        assert self.pyparser
        assert not messages

    def test_pos_values_initialized(self):
        # checks whether pos values in the parsing result and in the
        # history record have been initialized
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("no_file_name*")
        for record in grammar.history__:
            assert not record.node or record.node.pos >= 0

    def test_select_parsing(self):
        # a grammar object can be called with any named rule as start symbol
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("wort", "WORT")
        grammar("eine Zeile", "textzeile")
        grammar("kein Haupt", "haupt")
        grammar("so ist es richtig", "haupt")

    def test_grammar_subclassing(self):
        # grammars can also be written by hand as Grammar subclasses
        class Arithmetic(Grammar):
            r'''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
            root__ = expression

        grammar = Arithmetic()
        CST = grammar('3+4')
        assert not CST.error_flag, CST.as_sxpr()

    def test_incomplete_matching(self):
        """Tests whether the flag `complete_match` works as expected when
        calling a grammar object in order to parse a document."""
        gr = grammar_provider('word = ~/\\w+/\n')()
        st = gr('eins')
        assert not st.errors
        st = gr('eins zwei')
        assert st.errors[0].code == Error.PARSER_STOPPED_BEFORE_END
        st = gr('eins zwei', complete_match=False)
        assert not st.errors

class TestSeries:
    """Tests of the Series-parser and the mandatory-marker (§)."""

    def test_non_mandatory(self):
        """Without a §-marker a failing series simply falls through to the
        catch-all alternative and produces no error."""
        lang = """
        document = series | /.*/
        series = "A" "B" "C" "D"
        """
        parser = grammar_provider(lang)()
        for sample in ("ABCD", "A_CD", "AB_D"):
            tree = parser(sample)
            assert not tree.error_flag

    def test_mandatory(self):
        """Test for the §-operator. The Series-parser should raise an
        error for any non-match that occurs after the mandatory-operator.
        """
        lang = """
        document = series | /.*/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        assert not parser("ABCD").error_flag
        assert not parser("A_CD").error_flag
        tree = parser("AB_D")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == Error.MANDATORY_CONTINUATION
        # transitivity of mandatory-operator
        tree = parser("ABC_")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == Error.MANDATORY_CONTINUATION

    def test_series_composition(self):
        """Adding two Series must preserve the mandatory-position of each
        constituent series."""
        tok_a, tok_b, tok_c, tok_d, tok_e = (TKN(b) for b in "ABCDE")
        abc = Series(tok_a, tok_b, tok_c, mandatory=2)
        de = Series(tok_d, tok_e)

        parser = Grammar(Alternative(abc + de, RegExp('.*')))
        assert not parser("ABCDE").error_flag
        assert not parser("A_CDE").error_flag
        tree = parser("AB_DE")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == Error.MANDATORY_CONTINUATION
        tree = parser("ABC_E")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == Error.MANDATORY_CONTINUATION

        parser = Grammar(Alternative(de + abc, RegExp('.*')))
        for sample in ("DEABC", "_EABC", "D_ABC", "DE_BC", "DEA_C"):
            assert not parser(sample).error_flag
        tree = parser("DEAB_")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == Error.MANDATORY_CONTINUATION

    # def test_boundary_cases(self):
    #     lang = """
    #     document = series | §!single | /.*/
    #     series = "A" "B" §"C" "D"
    #     single = "E"
    #     """
    #     parser_class = grammar_provider(lang)
    #     parser = parser_class()
    #     print(parser.python_src__)
    #     print(parser_class.python_src__)
class TestAllOfSomeOf:
    """Tests of the unordered AllOf- and SomeOf-parsers."""

    def test_allOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        prefixes = AllOf(TKN("A"), TKN("B"))
        for sample in ('A B', 'B A'):
            assert Grammar(prefixes)(sample).content == sample
        # alternative form
        prefixes = AllOf(Series(TKN("B"), TKN("A")))
        assert Grammar(prefixes)('A B').content == 'A B'

    def test_allOf_completeness(self):
        """Test that an error is raised if not all parsers of an AllOf-List
        match."""
        prefixes = AllOf(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('B').error_flag

    def test_allOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = AllOf(TKN("A"), TKN("B"), TKN("A"))
        for sample in ('A A B', 'A B A', 'B A A'):
            assert Grammar(prefixes)(sample).content == sample
        assert Grammar(prefixes)('A B B').error_flag

    def test_someOf_order(self):
        """Test that parsers of a SomeOf-List can match in arbitrary order."""
        prefixes = SomeOf(TKN("A"), TKN("B"))
        for sample in ('A B', 'B A'):
            assert Grammar(prefixes)(sample).content == sample
        # alternative form
        prefixes = SomeOf(Alternative(TKN("B"), TKN("A")))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B').content == 'B'

    def test_someOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = SomeOf(TKN("A"), TKN("B"), TKN("A"))
        for sample in ('A A B', 'A B A', 'B A A'):
            assert Grammar(prefixes)(sample).content == sample
        assert Grammar(prefixes)('A B B').error_flag


class TestErrorRecovery:
    """Tests of re-entry ("skip") after mandatory-marker violations."""

    def test_series_skip(self):
        lang = """
        document = series | /.*/
        @series_skip = /(?=[A-Z])/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        tree = parser('AB_D')
        # re-entry must leave exactly one error,
        # i.e. no additional "stopped before end"-error!
        assert len(tree.errors) == 1

    def test_AllOf_skip(self):
        lang = """
        document = allof | /.*/
        @allof_skip = /[A-Z]/
        allof = < "A" §"B" "C" "D" >
        """
        parser = grammar_provider(lang)()

        tree = parser('CADB')
        assert 'allof' in tree and tree['allof'].content == "CADB"

        # these fall back to the document-level catch-all
        tree = parser('_BCD')
        assert tree.equals(parse_sxpr('(document "_BCD")'))
        tree = parser('_ABC')
        assert tree.equals(parse_sxpr('(document "_ABC")'))

        # with re-entry, the allof-node still covers the whole input
        for sample in ('A_CD', 'AB_D', 'A__D', 'CA_D', 'BC_A'):
            tree = parser(sample)
            assert tree['allof'].content == sample


class TestPopRetrieve:
    """Tests of the variable-stack parsers Capture (:...), Retrieve (::...)
    and Pop (:...), using matching/complementary delimiter languages."""

    # backtick-delimited code blocks; closing delimiter must equal the
    # captured opening delimiter
    mini_language = r"""
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
        delimiter      = delimiter_sign  # never use delimiter between capture and pop except for retrival!
        delimiter_sign = /`+/
        text           = /[^`]+/
        """
    # brace-delimited variant with a counterpart-filter: closing braces
    # mirror (complement) the captured opening braces
    mini_lang2 = r"""
        @braces_filter=counterpart
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """
    # tag-like language where the captured name is popped by either of two
    # closing forms; used to test stack handling with ambiguous openings
    mini_lang3 = r"""
        document       = { text | env }
        env            = (specialtag | opentag) text [closespecial | closetag]
        opentag        = "<" name ">"
        specialtag     = "<" /ABC/ !name ">"
        closetag       = close_slash | close_star
        close_slash    = "<" ::name "/>"
        close_star     = "<" ::name "*>"
        closespecial   = "<" /ABC/~ ">"
        name           = /\w+/~
        text           = /[^<>]+/
        """

    def setup(self):
        self.minilang_parser = grammar_provider(self.mini_language)()
        self.minilang_parser2 = grammar_provider(self.mini_lang2)()
        self.minilang_parser3 = grammar_provider(self.mini_lang3)()

    @staticmethod
    def opening_delimiter(node, name):
        # selector: node was produced by the named (capturing) parser
        return node.tag_name == name # and not isinstance(node.parser, Retrieve)

    @staticmethod
    def closing_delimiter(node):
        # selector: node was produced by a Pop- or Retrieve-parser
        return node.tag_name in {':Pop', ':Retrieve'}
        # return isinstance(node.parser, Retrieve)

    def test_compile_mini_language(self):
        assert self.minilang_parser
        assert self.minilang_parser2
        assert self.minilang_parser3

    def test_stackhandling(self):
        ambigous_opening = "<ABCnormal> normal tag <ABCnormal*>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        ambigous_opening = "<ABCnormal> normal tag <ABCnormal/>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        forgot_closing_tag = "<em> where is the closing tag?"
        syntax_tree = self.minilang_parser3(forgot_closing_tag)
        assert syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em/>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em*>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

    def test_cache_neutrality(self):
        """Test that packrat-caching does not interfere with the variable-
        changing parsers: Capture and Retrieve."""
        lang = r"""
            text = opening closing
            opening = (unmarked_package | marked_package)
            closing = ::variable
            unmarked_package = package "."
            marked_package = package "*" "."
            package = "(" variable ")"
            variable = /\w+/~
            """
        case = "(secret)*. secret"
        gr = grammar_provider(lang)()
        st = gr(case)
        assert not st.error_flag, str(st.errors_sorted)

    def test_single_line(self):
        teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        # opening and closing delimiter must be identical
        delim = str(next(syntax_tree.select_if(partial(self.opening_delimiter, name="delimiter"))))
        pop = str(next(syntax_tree.select_if(self.closing_delimiter)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line(self):
        teststr = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        delim = str(next(syntax_tree.select_if(partial(self.opening_delimiter, name="delimiter"))))
        pop = str(next(syntax_tree.select_if(self.closing_delimiter)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_single_line_complement(self):
        teststr = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        # with the counterpart-filter the closing delimiter mirrors the
        # opening one: same length, different characters
        delim = str(next(syntax_tree.select_if(partial(self.opening_delimiter, name="braces"))))
        pop = str(next(syntax_tree.select_if(self.closing_delimiter)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line_complement(self):
        teststr = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        delim = str(next(syntax_tree.select_if(partial(self.opening_delimiter, name="braces"))))
        pop = str(next(syntax_tree.select_if(self.closing_delimiter)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")
class TestWhitespaceHandling:
    """Insignificant whitespace is implied by string-tokens ("A") but not
    by plain regular expressions (/A/)."""

    minilang = """
        doc = A B
        A = "A"
        B = "B"
        Rdoc = ar br
        ar = /A/
        br = /B/
        """

    def setup(self):
        self.gr = grammar_provider(self.minilang)()

    def test_token_whitespace(self):
        # token-based rule accepts input with and without whitespace
        assert not self.gr("AB", 'doc').error_flag
        assert not self.gr("A B", 'doc').error_flag

    def test_regexp_whitespace(self):
        # regex-based rule does not tolerate intervening whitespace
        assert not self.gr("AB", 'Rdoc').error_flag
        assert self.gr("A B", 'Rdoc').error_flag


class TestErrorReporting:
    """A failure behind a mandatory marker (§) must surface as an error."""

    grammar = """
        root      = series alpha | anything
        series    = subseries &alpha
        subseries = alpha §beta
        alpha     = /[a-z]+/
        beta      = /[A-Z]+/
        anything  = /.*/
        """

    def setup(self):
        self.parser = grammar_provider(self.grammar)()

    def test_error_propagation(self):
        # "halloB" satisfies `subseries` and "XYZ" falls back to `anything`:
        # neither may report an error
        cst = self.parser("halloB")
        assert not cst.error_flag, str(cst.errors_sorted)
        cst = self.parser("XYZ")
        assert not cst.error_flag
        # "hallo " enters `subseries` but misses the mandatory `beta`
        cst = self.parser("hallo ")
        assert cst.error_flag


class TestBorderlineCases:
    """Edge cases: empty documents and parsers that do not match at all."""

    def test_not_matching(self):
        gr = grammar_provider("""parser = /X/\n""")()
        # a matching document yields no error ...
        assert not gr('X', 'parser').error_flag
        # ... while non-matching as well as empty input report
        # PARSER_DID_NOT_MATCH as the first error
        for document in (' ', ''):
            cst = gr(document, 'parser')
            assert cst.error_flag
            assert cst.errors_sorted[0].code == Error.PARSER_DID_NOT_MATCH

    def test_matching(self):
        gr = grammar_provider("""parser = /.?/""")()
        assert not gr(' ', 'parser').error_flag
        # only one character can be consumed; a second one stops the parser
        cst = gr('  ', 'parser')
        assert cst.error_flag
        assert cst.errors_sorted[0].code == Error.PARSER_STOPPED_BEFORE_END
        # /.?/ also matches the empty document
        assert not gr('', 'parser').error_flag


class TestReentryAfterError:
    """Tests of error-recovery ("resume") after a failed mandatory item (§).

    Reentry points are regular expressions registered per parser name in
    ``Grammar.resume_rules__``.  After a mandatory-continuation error the
    parser skips ahead to the first position where one of these expressions
    matches and continues from there, so that a single mistake produces a
    single error message instead of a cascade.

    FIX: the tests previously reset ``gr.resume_rules`` (without trailing
    underscores), which only created an unused attribute; the attribute the
    parser actually consults — and that the tests themselves populate — is
    ``resume_rules__``.  Also fixed the typo in the name of
    ``test_several_reentry_points``.
    """

    def setup(self):
        lang = """
        document = alpha [beta] gamma "."
          alpha = "ALPHA" abc
            abc = §"a" "b" "c"
          beta = "BETA" (bac | bca)
            bac = "b" "a" §"c"
            bca = "b" "c" §"a"
          gamma = "GAMMA" §(cab | cba)
            cab = "c" "a" §"b"
            cba = "c" "b" §"a"
        """
        self.gr = grammar_provider(lang)()

    def test_no_resume_rules(self):
        # without resume rules, errors are reported but parsing still
        # captures the whole document
        gr = self.gr;  gr.resume_rules__ = dict()
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')

    def test_no_resume_rules_partial_parsing(self):
        gr = self.gr;  gr.resume_rules__ = dict()
        content = 'ALPHA acb'
        cst = gr(content, 'alpha')
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')

    def test_simple_resume_rule(self):
        gr = self.gr;  gr.resume_rules__ = dict()
        gr.resume_rules__['alpha'] = [re.compile(r'(?=BETA)')]
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only one error message
        assert len(cst.errors_sorted) == 1

    def test_failing_resume_rule(self):
        # a reentry regex that matches nowhere must not crash the parser
        gr = self.gr;  gr.resume_rules__ = dict()
        gr.resume_rules__['alpha'] = [re.compile(r'(?=XXX)')]
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        # assert cst.pick('alpha').content.startswith('ALPHA')

    def test_several_reentry_points(self):
        gr = self.gr;  gr.resume_rules__ = dict()
        gr.resume_rules__['alpha'] = [re.compile(r'(?=BETA)'), re.compile(r'(?=GAMMA)')]
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only one error message
        assert len(cst.errors_sorted) == 1

    def test_several_reentry_points_second_point_matching(self):
        # when the first reentry point does not occur in the document,
        # recovery falls through to the second one
        gr = self.gr;  gr.resume_rules__ = dict()
        gr.resume_rules__['alpha'] = [re.compile(r'(?=BETA)'), re.compile(r'(?=GAMMA)')]
        content = 'ALPHA acb GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only one error message
        assert len(cst.errors_sorted) == 1

    def test_several_resume_rules_innermost_rule_matching(self):
        gr = self.gr;  gr.resume_rules__ = dict()
        gr.resume_rules__['alpha'] = [re.compile(r'(?=BETA)'), re.compile(r'(?=GAMMA)')]
        gr.resume_rules__['beta'] = [re.compile(r'(?=GAMMA)')]
        gr.resume_rules__['bac'] = [re.compile(r'(?=GAMMA)')]
        content = 'ALPHA abc BETA bad GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only one error message
        assert len(cst.errors_sorted) == 1
        # multiple failures
        content = 'ALPHA acb BETA bad GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # there should be only two error messages
        assert len(cst.errors_sorted) == 2

    def test_skip_comment_on_resume(self):
        lang = r"""
            @ comment =  /(?:\/\/.*)|(?:\/\*(?:.|\n)*?\*\/)/  # Kommentare im C++-Stil
            document = block_A block_B
            @ block_A_resume = /(?=x)/
            block_A = "a" §"b" "c"
            block_B = "x" "y" "z"
        """
        def mini_suite(grammar):
            # error-free document, recovery not needed
            tree = grammar('abc/*x*/xyz')
            assert not tree.errors
            # resuming must also work when comments precede the reentry point
            tree = grammar('abDxyz')
            mandatory_cont = (Error.MANDATORY_CONTINUATION, Error.MANDATORY_CONTINUATION_AT_EOF)
            assert len(tree.errors) == 1 and tree.errors[0].code in mandatory_cont
            tree = grammar('abD/*x*/xyz')
            assert len(tree.errors) == 1 and tree.errors[0].code in mandatory_cont
            tree = grammar('aD /*x*/ c /* a */ /*x*/xyz')
            assert len(tree.errors) == 1 and tree.errors[0].code in mandatory_cont

        # test regex-defined resume rule
        grammar = grammar_provider(lang)()
        mini_suite(grammar)

    def test_unambiguous_error_location(self):
        lang = r"""
            @ drop        = whitespace, token  # drop tokens and whitespace early
           
            @object_resume = /(?<=\})/
           
            json       = ~ value EOF
            value      = object | string 
            object     = "{" [member { "," §member }] "}"
            member     = string §":" value
            string     = `"` CHARACTERS `"` ~

            CHARACTERS = { /[^"\\]+/ }                  
            EOF      =  !/./        # no more characters ahead, end of file reached
            """
        test_case = """{
                "missing member": "abcdef",
            }"""
        gr = grammar_provider(lang)()
        cst = gr(test_case)
        assert any(err.code == Error.MANDATORY_CONTINUATION for err in cst.errors)


class TestConfiguredErrorMessages:
    """Badly formed @..._error directives must be reported alongside the
    parsing error that triggers them."""

    def test_configured_error_message(self):
        lang = """
            document = series | /.*/
            @series_error = "a badly configured error message {5}"
            series = /X/ | head §"C" "D"
            head = "A" "B"
            """
        parser = grammar_provider(lang)()
        st = parser("AB_D")
        assert st.error_flag
        # the broken placeholder "{5}" is flagged first, followed by the
        # mandatory-continuation error that the custom message was meant for
        codes = [error.code for error in st.errors_sorted]
        assert codes[0] == Error.MALFORMED_ERROR_STRING
        assert codes[1] == Error.MANDATORY_CONTINUATION


class TestUnknownParserError:
    """Requesting a start symbol that the grammar does not define fails."""

    def test_unknown_parser_error(self):
        gr = Grammar()
        try:
            gr("", "NonExistantParser")
        except UnknownParserError:
            return  # expected outcome
        assert False, "UnknownParserError expected!"


class TestEarlyTokenWhitespaceDrop:
    """The @drop-directive removes tokens/whitespace already while parsing."""

    def setup(self):
        self.lang = r"""
            @ drop = token, whitespace
            expression = term  { ("+" | "-") term}
            term       = factor  { ("*"|"/") factor}
            factor     = number | variable | "("  expression  ")"
                       | constant | fixed
            variable   = /[a-z]/~
            number     = /\d+/~
            constant   = "A" | "B"
            fixed      = "X"
            """
        self.gr = grammar_provider(self.lang)()

    def test_drop(self):
        tree = self.gr('4 + 3 * 5')
        # neither anonymous tokens nor whitespace survive in the tree
        assert not tree.pick(':Token')
        assert not tree.pick(':Whitespace')
        # tokens occurring inside a compound definition ("A" | "B")
        # are dropped as well ...
        tree = self.gr('A + B')
        hits = tree.select_if(lambda node: node.content == 'A')
        try:
            next(hits)
            assert False, "Tokens in compound expressions should be dropped!"
        except StopIteration:
            pass
        # ... whereas a definition consisting of a single token ("fixed")
        # keeps its content
        tree = self.gr('X * y')
        assert next(tree.select_if(lambda node: node.content == 'X'))


class TestMetaParser:
    def setup(self):
        """Prepare a bare MetaParser posing as a named parser "named"."""
        self.mp = MetaParser()
        self.mp.grammar = Grammar()  # override placeholder warning
        self.mp.anonymous = False
        self.mp.pname = "named"
        # tag nodes produced by this parser with its own name
        self.mp.tag_name = self.mp.pname

    def test_return_value(self):
958
959
        save = get_config_value('flatten_tree_while_parsing')
        set_config_value('flatten_tree_while_parsing', True)
960
        nd = self.mp._return_value(Node('tagged', 'non-empty'))
Eckhart Arnold's avatar
Eckhart Arnold committed
961
962
963
964
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert len(nd.children) == 1
        assert nd.children[0].tag_name == 'tagged'
        assert nd.children[0].result == "non-empty"
965
        nd = self.mp._return_value(Node('tagged', ''))
Eckhart Arnold's avatar
Eckhart Arnold committed
966
967
968
969
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert len(nd.children) == 1
        assert nd.children[0].tag_name == 'tagged'
        assert not nd.children[0].result
970
        nd = self.mp._return_value(Node(':anonymous', 'content'))
Eckhart Arnold's avatar
Eckhart Arnold committed
971
972
973
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert not nd.children
        assert nd.result == 'content'
974
        nd = self.mp._return_value(Node(':anonymous', ''))
Eckhart Arnold's avatar
Eckhart Arnold committed
975
976
977
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert not nd.children
        assert not nd.content
978
979
980
        nd = self.mp._return_value(EMPTY_NODE)
        assert nd.tag_name == 'named' and not nd.children, nd.as_sxpr()
        self.mp.pname = ''
981
        self.mp.anonymous = True
982
983
        self.mp.tag_name = ':unnamed'
        nd = self.mp._return_value(Node('tagged', 'non-empty'))
Eckhart Arnold's avatar
Eckhart Arnold committed
984
985
986
        assert nd.tag_name == 'tagged', nd.as_sxpr()
        assert len(nd.children) == 0
        assert nd.content == 'non-empty'
987
        nd = self.mp._return_value(Node('tagged', ''))
Eckhart Arnold's avatar
Eckhart Arnold committed
988
989
990
        assert nd.tag_name == 'tagged', nd.as_sxpr()
        assert len(nd.children) == 0
        assert not nd.content
991
        nd = self.mp._return_value(Node(':anonymous', 'content'))
Eckhart Arnold's avatar
Eckhart Arnold committed
992
993
994
        assert nd.tag_name == ':anonymous', nd.as_sxpr()
        assert not nd.children
        assert nd.result == 'content'
995
        nd = self.mp._return_value(Node('', ''))
Eckhart Arnold's avatar
Eckhart Arnold committed
996
997
998
        assert nd.tag_name == '', nd.as_sxpr()
        assert not nd.children
        assert not nd.content
999
1000
        assert self.mp._return_value(None) == EMPTY_NODE
        assert self.mp._return_value(EMPTY_NODE) == EMPTY_NODE