#!/usr/bin/env python3
"""test_parse.py - tests of the parsers-module of DHParser

Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

eckhart's avatar
eckhart committed
22
import copy
23
import os
24
import sys
25
from functools import partial
26
from typing import List
27

28
29
scriptpath = os.path.dirname(__file__) or '.'
sys.path.append(os.path.abspath(os.path.join(scriptpath, '..')))
30

eckhart's avatar
eckhart committed
31
from DHParser.configuration import get_config_value, set_config_value
32
from DHParser.toolkit import compile_python_object, re
33
from DHParser.log import is_logging, log_ST, log_parsing_history
34
35
from DHParser.error import Error, is_error, adjust_error_locations, MANDATORY_CONTINUATION, PARSER_DID_NOT_MATCH, \
    MALFORMED_ERROR_STRING, MANDATORY_CONTINUATION_AT_EOF, RESUME_NOTICE, PARSER_STOPPED_BEFORE_END
36
from DHParser.parse import ParserError, Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
eckhart's avatar
eckhart committed
37
    RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative, \
38
39
    Interleave, UnknownParserError, MetaParser, Token, EMPTY_NODE, Capture, Drop, Whitespace, \
    GrammarError
40
from DHParser import compile_source
41
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, \
eckhart's avatar
eckhart committed
42
    parse_ebnf, DHPARSER_IMPORTS
43
from DHParser.dsl import grammar_provider
44
from DHParser.syntaxtree import Node, parse_sxpr
45
from DHParser.stringview import StringView
46
from DHParser.trace import set_tracer, trace_history, resume_notices_on
47
48


49
50
51
52
53
54

class TestWhitespace:
    """Placeholder for tests of whitespace and comment handling."""
    # TODO: add test cases here

    def test_whitespace_comment_mangling(self):
        pass

    def test_non_empty_derivation(self):
        pass

58

59
60
class TestParserError:
    """Tests concerning the ParserError-exception."""

    def test_parser_error_str(self):
        # the string representation must mention both the node's tag
        # and the remaining text
        err = ParserError(Node('TAG', 'test').with_pos(0), StringView('Beispiel'), None, True)
        message = str(err)
        assert message.find('Beispiel') >= 0 and message.find('TAG') >= 0

    def test_false_lookahead_only_message(self):
        """PARSER_LOOKAHEAD_*_ONLY errors must not be reported if there
        no lookahead parser in the history!"""
        lang = """
        word = letters { letters | `-` letters }
        letters = /[A-Za-z]+/
        """
        grammar = grammar_provider(lang)()
        set_tracer(grammar, trace_history)
        tree = grammar('hard-time')
        assert not tree.errors
        tree = grammar('hard-')
        # error code 1045 (lookahead-only) must never appear here
        assert tree.errors and not any(e.code == 1045 for e in tree.errors)

eckhart's avatar
eckhart committed
78

eckhart's avatar
eckhart committed
79
80
81
82
83
84
85
86
87
class TestParserClass:
    """Tests of generic functionality of the Parser base class."""

    def test_apply(self):
        # applying the same visitor repeatedly must traverse the parser
        # tree in a stable order
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        gr = grammar_provider(minilang)()
        trace = []

        def visitor(context: List[Parser]):
            parser = context[-1]
            trace.append(parser.pname + parser.ptype)

        snapshots = []
        for _ in range(3):
            del trace[:]
            gr.root__.apply(visitor)
            snapshots.append(", ".join(trace))
        assert snapshots[0] == snapshots[1] == snapshots[2]

    def test_symbol(self):
        # associated_symbol() must find the symbol a nested parser belongs to
        class MyGrammar(Grammar):
            wrong = Token('wrong')
            word = OneOrMore(wrong) + Whitespace(r'\s*') + OneOrMore(RegExp(r'\w+'))
            root__ = word
        grammar = MyGrammar()
        inner_regex = grammar['word'].parsers[-1].parser
        symbol = grammar.associated_symbol(inner_regex).symbol
        assert symbol == 'word', symbol
110

eckhart's avatar
eckhart committed
111

112
class TestInfiLoopsAndRecursion:
    """Tests for left-recursive grammars and automatic infinite-loop breaking."""

    def test_direct_left_recursion1(self):
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        tree = parser(snippet)
        assert not is_error(tree.error_flag), str(tree.errors_sorted)
        assert snippet == tree.content, str(tree)
        if is_logging():
            log_ST(tree, "test_LeftRecursion_direct.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct")

    def test_direct_left_recursion2(self):
        # same as above, but with the left recursion behind a synonym
        minilang = """
            expr = ex
            ex   = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        tree = parser(snippet)
        assert not is_error(tree.error_flag), tree.errors_sorted
        assert snippet == tree.content

    def test_indirect_left_recursion1(self):
        minilang = """
            Expr    = //~ (Product | Sum | Value)
            Product = Expr { ('*' | '/') Expr }+
            Sum     = Expr { ('+' | '-') Expr }+
            Value   = /[0-9.]+/~ | '(' §Expr ')'
            """
        parser = grammar_provider(minilang)()
        assert parser
        # each snippet must parse without errors
        for snippet in ("8 * 4", "7 + 8 * 4", "9 + 8 * (4 + 3)"):
            tree = parser(snippet)
            assert not is_error(tree.error_flag), tree.errors_sorted
        assert snippet == tree.content
        if is_logging():
            log_ST(tree, "test_LeftRecursion_indirect.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect")

    # # BEWARE: EXPERIMENTAL TEST can be long running
    # def test_indirect_left_recursion2(self):
    #     arithmetic_syntax = """
    #         expression     = addition | subtraction
    #         addition       = (expression | term) "+" (expression | term)
    #         subtraction    = (expression | term) "-" (expression | term)
    #         term           = multiplication | division
    #         multiplication = (term | factor) "*" (term | factor)
    #         division       = (term | factor) "/" (term | factor)
    #         factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
    #         group          = "(" expression ")"
    #         SIGN           = /[+-]/
    #         NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
    #         VARIABLE       = /[A-Za-z]/~
    #         """
    #     arithmetic = grammar_provider(arithmetic_syntax)()
    #     arithmetic.left_recursion_depth__ = 2
    #     assert arithmetic
    #     syntax_tree = arithmetic("(a + b) * (a - b)")
    #     assert syntax_tree.errors

    def test_break_inifnite_loop_ZeroOrMore(self):
        forever = ZeroOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_inifnite_loop_OneOrMore(self):
        forever = OneOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    # def test_infinite_loops(self):
    #     minilang = """forever = { // } \n"""
    #     try:
    #         parser_class = grammar_provider(minilang)
    #     except CompilationError as error:
    #         assert all(e.code == INFINITE_LOOP for e in error.errors)
    #     save = get_config_value('static_analysis')
    #     set_config_value('static_analysis', 'late')
    #     provider = grammar_provider(minilang)
    #     try:
    #         parser = provider()
    #     except GrammarError as error:
    #         assert error.errors[0][2].code == INFINITE_LOOP
    #     set_config_value('static_analysis', 'none')
    #     parser = provider()
    #     snippet = " "
    #     syntax_tree = parser(snippet)
    #     assert any(e.code == INFINITE_LOOP for e in syntax_tree.errors)
    #     res = parser.static_analysis()
    #     assert res and res[0][2].code == INFINITE_LOOP
    #     minilang = """not_forever = { / / } \n"""
    #     parser = grammar_provider(minilang)()
    #     res = parser.static_analysis()
    #     assert not res
    #     set_config_value('static_analysis', save)
222

eckhart's avatar
eckhart committed
223

Eckhart Arnold's avatar
Eckhart Arnold committed
224
class TestFlowControl:
    """Tests of the flow-control-parsers (lookahead/lookbehind)."""

    def setup(self):
        # t1 ends with "END" on its own line (lookbehind matches),
        # t2 has "END" in the middle of the last line (lookbehind fails)
        self.t1 = """
        All work and no play
        makes Jack a dull boy
        END
        """
        self.t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind(self):
        ws = RegExp(r'\s*')
        end = RegExp("END")
        doc_end = Lookbehind(RegExp('\\s*?\\n')) + end
        word = RegExp(r'\w+')
        sequence = OneOrMore(NegativeLookahead(end) + word + ws)
        document = ws + sequence + doc_end + ws

        parser = Grammar(document)
        tree = parser(self.t1)
        assert not tree.error_flag, tree.as_sxpr()
        tree = parser(self.t2)
        assert tree.error_flag, tree.as_sxpr()

    def test_lookbehind_indirect(self):
        # same grammar, but assembled as a Grammar-subclass, so that the
        # lookbehind refers to a named symbol (SUCC_LB)
        class LookbehindTestGrammar(Grammar):
            parser_initialization__ = ["upon instantiation"]
            ws = RegExp(r'\s*')
            end = RegExp('END')
            SUCC_LB = RegExp('\\s*?\\n')
            doc_end = Series(Lookbehind(SUCC_LB), end)
            word = RegExp(r'\w+')
            sequence = OneOrMore(Series(NegativeLookahead(end), word, ws))
            document = Series(ws, sequence, doc_end, ws)
            root__ = document

        parser = LookbehindTestGrammar()
        tree = parser(self.t1)
        assert not tree.error_flag, tree.as_sxpr()
        tree = parser(self.t2)
        assert tree.error_flag, tree.as_sxpr()


266
267
268
269
270
271
272
class TestRegex:
    """Tests of regular-expression-parsers compiled from EBNF-sources."""

    def test_multilineRegex(self):
        mlregex = r"""
        regex =  /\w+    # one or more alphabetical characters including the underscore
                  [+]    # followed by a plus sign
                  \w*    # possibly followed by more alpha chracters/
        """
        code, errors, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert code
        assert not errors, str(errors)
        grammar = compile_python_object(DHPARSER_IMPORTS + code, r'\w+Grammar$')()
        node = grammar('abc+def', grammar.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_multilineRegex_wo_Comments(self):
        # multiline regexes must also work without verbose-style comments
        mlregex = r"""
        regex =  /\w+
                  [+]
                  \w* /
        """
        code, errors, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert code
        assert not errors, str(errors)
        grammar = compile_python_object(DHPARSER_IMPORTS + code, r'\w+Grammar$')()
        node = grammar('abc+def', grammar.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_ignore_case(self):
        mlregex = r"""
        @ ignorecase = True
        regex = /alpha/
        """
        code, errors, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert code
        assert not errors
        grammar = compile_python_object(DHPARSER_IMPORTS + code, r'\w+Grammar$')()
        node, rest = grammar.regex(StringView('Alpha'))
        assert node
        assert rest == ''
        assert node.tag_name == "regex"
        assert str(node) == 'Alpha'

        # with ignorecase turned off, 'Alpha' must not match /alpha/
        mlregex = r"""
        @ ignorecase = False
        regex = /alpha/
        """
        code, errors, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert code
        assert not errors
        grammar = compile_python_object(DHPARSER_IMPORTS + code, r'\w+Grammar$')()
        node, rest = grammar.regex(StringView('Alpha'))
        assert node is None

    def test_token(self):
        tokenlang = r"""
            @whitespace = linefeed
            lang        = "" begin_token {/\w+/ ""} end_token
            begin_token = "\begin{document}"
            end_token   = "\end{document}"
            """
        testdoc = r"""
            \begin{document}
            test
            \end{document}
            """
        code, errors, _ = compile_source(
            tokenlang, None, get_ebnf_grammar(), get_ebnf_transformer(),
            get_ebnf_compiler("TokenTest"))
        assert code
        assert not errors, str(errors)
        grammar = compile_python_object(DHPARSER_IMPORTS + code, r'\w+Grammar$')()
        tree = grammar(testdoc)
        # log_parsing_history(grammar, "test.log")
        assert not tree.error_flag, str(tree.errors_sorted)
348

349

350
class TestGrammar:
    """Tests of basic functionality of the Grammar-class."""

    def setup(self):
        # compile a small EBNF-grammar; the generated python source is
        # kept in self.pyparser for use by the individual tests
        ebnf = r"""@whitespace = horizontal
        haupt        = textzeile LEERZEILE
        textzeile    = { WORT }+
        WORT         = /[^ \t]+/~
        LEERZEILE    = /\n[ \t]*(?=\n)/~
        """
        self.pyparser, messages, _ = compile_source(ebnf, None, get_ebnf_grammar(),
                                    get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
        assert self.pyparser
        assert not messages

    def test_pos_values_initialized(self):
        # checks whether pos values in the parsing result and in the
        # history record have been initialized
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("no_file_name*")
        assert all(not record.node or record.node.pos >= 0
                   for record in grammar.history__)

    def test_select_parsing(self):
        # parsing can be started from any symbol of the grammar
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        for document, start_symbol in (("wort", "WORT"),
                                       ("eine Zeile", "textzeile"),
                                       ("kein Haupt", "haupt"),
                                       ("so ist es richtig", "haupt")):
            grammar(document, start_symbol)

    def test_grammar_subclassing(self):
        class Arithmetic(Grammar):
            r'''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
            root__ = expression

        tree = Arithmetic()('3+4')
        assert not tree.error_flag, tree.as_sxpr()

    def test_incomplete_matching(self):
        """Tests whether the flag `complete_match` works as expected when
        calling a grammar object in order to parse a document."""
        parser = grammar_provider('word = ~/\\w+/\n')()
        tree = parser('eins')
        assert not tree.errors
        tree = parser('eins zwei')
        assert tree.errors[0].code == PARSER_STOPPED_BEFORE_END
        tree = parser('eins zwei', complete_match=False)
        assert not tree.errors

    def test_synonym(self):
        lang = r"""
            doc  = { word | number }
            word = /\w+/ S
            number = [VZ] /\d+/ S 
            S    = ~        # let S by a synonym for anonymous whitespace
            VZ   = "-"
        """
        parser = grammar_provider(lang)()
        tree = parser('eins 1 zwei2drei 3')
        # set_config_value('compiled_EBNF_log', 'grammar.log')
        parser = grammar_provider("@drop = whitespace, token" + lang)()
        tree = parser('eins 1 zwei2drei 3')
        tree = parser('-3')
        assert str(parser['S']) == "S = ~", str(parser['S'])
423

424

425
426
427
428
429
430
431
class TestSeries:
    """Tests of the Series-parser and the mandatory-marker (§)."""

    def test_non_mandatory(self):
        lang = """
        document = series | /.*/
        series = "A" "B" "C" "D"
        """
        parser = grammar_provider(lang)()
        # without a mandatory-marker a failed series falls back to /.*/
        for document in ("ABCD", "A_CD", "AB_D"):
            tree = parser(document)
            assert not tree.error_flag

    def test_mandatory(self):
        """Test for the §-operator. The Series-parser should raise an
        error for any non-match that occurs after the mandatory-operator.
        """
        lang = """
        document = series | /.*/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        assert not parser("ABCD").error_flag
        assert not parser("A_CD").error_flag
        tree = parser("AB_D")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == MANDATORY_CONTINUATION
        # transitivity of mandatory-operator
        tree = parser("ABC_")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == MANDATORY_CONTINUATION

    def test_series_composition(self):
        TA, TB, TC, TD, TE = (TKN(ch) for ch in "ABCDE")
        s1 = Series(TA, TB, TC, mandatory=2)
        s2 = Series(TD, TE)

        # mandatory-position must survive appending another series
        parser = Grammar(Alternative(s1 + s2, RegExp('.*')))
        assert not parser("ABCDE").error_flag
        assert not parser("A_CDE").error_flag
        tree = parser("AB_DE")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == MANDATORY_CONTINUATION
        tree = parser("ABC_E")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == MANDATORY_CONTINUATION

        # ... and prepending one shifts it accordingly
        parser = Grammar(Alternative(s2 + s1, RegExp('.*')))
        for document in ("DEABC", "_EABC", "D_ABC", "DE_BC", "DEA_C"):
            assert not parser(document).error_flag
        tree = parser("DEAB_")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == MANDATORY_CONTINUATION

    # def test_boundary_cases(self):
    #     lang = """
    #     document = series | §!single | /.*/
    #     series = "A" "B" §"C" "D"
    #     single = "E"
    #     """
    #     parser_class = grammar_provider(lang)
    #     parser = parser_class()
    #     print(parser.python_src__)
    #     print(parser_class.python_src__)

    def test_ebnf_serialization(self):
        ebnf_grammar = get_ebnf_grammar()
        # TODO: Add test here
        ebnf = ebnf_grammar.as_ebnf()
        # print(ebnf)
496
497


498
499
500
class TestAllOfSomeOf:
    """Tests of Interleave used with all-of / some-of semantics."""

    def test_allOf_order(self):
        """Parsers of an AllOf-list must match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'

    def test_allOf_completeness(self):
        """An error must be raised unless all parsers of an AllOf-list match."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('B').error_flag

    def test_allOf_redundance(self):
        """One and the same parser may be listed several times and must
        then be matched as many times."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag

    def test_someOf_order(self):
        """Parsers of a SomeOf-list must match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'
        tree = Grammar(prefixes)('B')
        assert tree.error_flag
        # with optional repetitions a single 'B' suffices
        prefixes = Interleave(TKN("B"), TKN("A"), repetitions=((0, 1), (0, 1)))
        assert Grammar(prefixes)('A B').content == 'A B'
        tree = Grammar(prefixes)('B')
        assert not tree.error_flag
        assert tree.content == 'B'

    def test_someOf_redundance(self):
        """One and the same parser may be listed several times and must
        then be matched as many times."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag


543
544
545
546
547
class TestInterleave:
    """Tests of the Interleave-parser."""

    def test_interleave_most_simple(self):
        letterset = Interleave(Token("A"), Token("B"), Token("C"))
        gr = Grammar(letterset)
        # any permutation of the three letters matches ...
        for document in ('ABC', 'BCA'):
            tree = gr(document)
            assert not tree.errors, str(tree.errors)
            assert tree.content == document
        # ... but neither repetitions nor omissions
        assert gr('BCBA').errors
        assert gr('AB').errors

    def test_interleave(self):
        # per-parser repetition ranges: A 1-1000 times, B 0-1, C exactly 1
        letterset = Interleave(Token("A"), Token("B"), Token("C"),
                               repetitions=[(1, 1000), (0, 1), (1, 1)])
        gr = Grammar(letterset)
        for document in ('AABC', 'BACAAA', 'AAACAAA'):
            assert not gr(document).errors
        for document in ('ABCC', 'AAABAAA'):
            assert gr(document).errors


574
575
576
577
class TestErrorRecovery:
    """Tests of error recovery via @..._skip-directives."""

    def test_series_skip(self):
        lang = """
        document = series | /.*/
        @series_skip = /(?=[A-Z])/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        tree = parser('AB_D')
        assert len(tree.errors) == 1  # no additional "stopped before end"-error!
        # with resume notices turned on, a second, notice-level "error" appears
        resume_notices_on(parser)
        tree = parser('AB_D')
        assert len(tree.errors) == 2 and any(err.code == RESUME_NOTICE for err in tree.errors)
        assert 'Skipping' in str(tree.errors_sorted[1])

    def test_Interleave_skip(self):
        lang = """
        document = allof | /.*/
        @allof_skip = /[A-Z]/
        allof = "A" ° §"B" ° "C" ° "D"
        """
        parser = grammar_provider(lang)()
        tree = parser('CADB')
        assert 'allof' in tree and tree['allof'].content == "CADB"
        # a failure before the mandatory-marker falls back to /.*/
        tree = parser('_BCD')
        assert tree.equals(parse_sxpr('(document "_BCD")'))
        tree = parser('_ABC')
        assert tree.equals(parse_sxpr('(document "_ABC")'))
        # skipping works regardless of where the error occurs
        for document in ('A_CD', 'AB_D', 'A__D', 'CA_D', 'A_CB'):
            tree = parser(document)
            assert tree['allof'].content == document
        tree = parser('BC_A')
        assert 'allof' not in tree
614
615


616
class TestPopRetrieve:
617
    mini_language = r"""
618
619
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
620
        delimiter      = delimiter_sign  # never use delimiter between capture and pop except for retrival!
621
        delimiter_sign = /`+/
eckhart's avatar
eckhart committed
622
        text           = /[^`]+/
623
        """
624
    mini_lang2 = r"""
625
        @braces_filter = matching_bracket
626
627
628
629
630
631
632
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """
633
    mini_lang3 = r"""
634
635
636
637
        document       = { text | env }
        env            = (specialtag | opentag) text [closespecial | closetag]
        opentag        = "<" name ">"
        specialtag     = "<" /ABC/ !name ">"
eckhart's avatar
eckhart committed
638
        closetag       = close_slash | close_star
639
640
641
642
643
644
        close_slash    = "<" ::name "/>"
        close_star     = "<" ::name "*>"
        closespecial   = "<" /ABC/~ ">"
        name           = /\w+/~
        text           = /[^<>]+/
        """
645
646
647
648
649
650
651
652
    mini_lang4 = r"""
        document       = { text | env }
        env            = opentag document closetag
        opentag        = "<" name ">"
        closetag       = "</" :?name ">"
        name           = /\w+/~
        text           = /[^<>]+/        
    """
653
654

    def setup(self):
655
656
657
        self.minilang_parser = grammar_provider(self.mini_language)()
        self.minilang_parser2 = grammar_provider(self.mini_lang2)()
        self.minilang_parser3 = grammar_provider(self.mini_lang3)()
658
        self.minilang_parser4 = grammar_provider(self.mini_lang4)()
659
660

    @staticmethod
661
    def has_tag_name(node, name):
662
        return node.tag_name == name # and not isinstance(node.parser, Retrieve)
663

664
665
    def test_capture_assertions(self):
        try:
666
667
668
            _ = Grammar(Capture(Drop(Whitespace(r'\s*'))))
            assert False, "GrammarError expected!"
        except GrammarError as ge:
669
670
            pass
        try:
671
            _ = Grammar(Capture(Series(Token(' '), Drop(Whitespace(r'\s*')))))
672
            assert False, "ValueError expected!"
673
        except GrammarError:
674
            pass
675
676
677
        cp = Capture(RegExp(r'\w+'))
        cp.pname = "capture"
        _ = Grammar(cp)
678

679
680
681
682
    def test_compile_mini_language(self):
        assert self.minilang_parser
        assert self.minilang_parser2
        assert self.minilang_parser3
683
        assert self.minilang_parser4
684
685
686
687

    def test_stackhandling(self):
        ambigous_opening = "<ABCnormal> normal tag <ABCnormal*>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
688
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)
689
690
691

        ambigous_opening = "<ABCnormal> normal tag <ABCnormal/>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
692
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)
693
694
695

        forgot_closing_tag = "<em> where is the closing tag?"
        syntax_tree = self.minilang_parser3(forgot_closing_tag)
696
        assert syntax_tree.error_flag, str(syntax_tree.errors_sorted)
697
698
699

        proper = "<em> has closing tag <em/>"
        syntax_tree = self.minilang_parser3(proper)
700
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)
701
702
703

        proper = "<em> has closing tag <em*>"
        syntax_tree = self.minilang_parser3(proper)
704
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)
705

706
707
708
    def test_optional_match(self):
        test1 = '<info>Hey, you</info>'
        st = self.minilang_parser4(test1)
709
        assert not st.error_flag, str(st.errors_sorted)
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
        test12 = '<info>Hey, <emph>you</emph></info>'
        st = self.minilang_parser4(test1)
        assert not st.error_flag
        test2 = '<info>Hey, you</>'
        st = self.minilang_parser4(test2)
        assert not st.error_flag
        test3 = '<info>Hey, <emph>you</></>'
        st = self.minilang_parser4(test3)
        assert not st.error_flag
        test4 = '<info>Hey, <emph>you</></info>'
        st = self.minilang_parser4(test4)
        assert not st.error_flag

    def test_rollback_behaviour_of_optional_match(self):
        test1 = '<info>Hey, you</info*>'
        st = self.minilang_parser4(test1)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag
        test2 = '<info>Hey, you</*>'
        st = self.minilang_parser4(test2)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag

733
    def test_cache_neutrality(self):
734
735
        """Test that packrat-caching does not interfere with the variable-
        changing parsers: Capture and Retrieve."""
736
        lang = r"""
737
738
739
740
741
742
743
744
745
            text = opening closing
            opening = (unmarked_package | marked_package)
            closing = ::variable
            unmarked_package = package "."
            marked_package = package "*" "."
            package = "(" variable ")"
            variable = /\w+/~
            """
        case = "(secret)*. secret"
746
        gr = grammar_provider(lang)()
747
        st = gr(case)
748
        assert not st.error_flag, str(st.errors_sorted)
749
750
751
752

    def test_single_line(self):
        teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        syntax_tree = self.minilang_parser(teststr)
753
754
        assert not syntax_tree.errors_sorted, \
            ''.join(str(error) for error in syntax_tree.errors_sorted)
755
756
757
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
758
759
        assert delim == pop
        if is_logging():
760
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")
761
762
763
764
765
766
767
768
769
770
771

    def test_multi_line(self):
        teststr = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        syntax_tree = self.minilang_parser(teststr)
772
        assert not syntax_tree.errors_sorted
773
774
775
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
776
777
        assert delim == pop
        if is_logging():
778
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")
779
780
781
782

    def test_single_line_complement(self):
        teststr = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        syntax_tree = self.minilang_parser2(teststr)
783
        assert not syntax_tree.errors_sorted
784
785
786
787
788
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert len(delim) == len(pop)
        assert delim != pop
789
        if is_logging():
790
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")
791
792
793
794
795
796
797
798
799
800
801

    def test_multi_line_complement(self):
        teststr = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        syntax_tree = self.minilang_parser2(teststr)
802
        assert not syntax_tree.errors_sorted
803
804
805
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
806
807
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
808
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")
809

810
    def test_autoretrieve(self):
811
812
        lang = r"""
            document   = { definition } § EOF
813
            definition = symbol :defsign value
814
            symbol     = /\w+/~                      
815
816
            defsign    = "=" | ":="
            value      = /\d+/~
817
            EOF        = !/./ [:?defsign]   # eat up captured defsigns
818
        """
819
820
        # code, _, _ = compile_ebnf(lang)
        # print(code)
821
        parser = grammar_provider(lang)()
822
823
824
        st = parser("X := 1")
        assert not st.error_flag
        st1 = st
825
826
        st = parser("")
        assert not st.error_flag
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850

        lines = [line for line in lang.split('\n') if line.strip()]
        eof_line = lines.pop()
        lines.insert(1, eof_line)
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)

        del lines[1]
        lines.insert(2, eof_line)
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)

        # and, finally...
        lang_variant = r"""
            document   = { definition } § EOF
            symbol     = /\w+/~                      
            defsign    = "=" | ":="
            value      = /\d+/~
851
            EOF        = !/./ :?defsign   # eat up captured defsign, only if it has been retrieved
852
853
854
855
856
857
            definition = symbol :defsign value
        """
        parser = grammar_provider(lang_variant)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)
858
859
        st = parser('')
        assert "EOF expected" in str(st.errors)
di68kap's avatar
di68kap committed
860

861

862
class TestWhitespaceHandling:
    """Tests that insignificant whitespace is tolerated by token parsers
    but rejected by plain regexp parsers."""

    minilang = """
        doc = A B
        A = "A"
        B = "B"
        Rdoc = ar br
        ar = /A/
        br = /B/
        """

    def setup(self):
        self.gr = grammar_provider(self.minilang)()

    def test_token_whitespace(self):
        # tokens ("A", "B") absorb adjacent whitespace
        st = self.gr("AB", 'doc')
        assert not st.error_flag
        st = self.gr("A B", 'doc')
        assert not st.error_flag

    def test_regexp_whitespace(self):
        # plain regexps (/A/, /B/) do not absorb whitespace
        st = self.gr("AB", 'Rdoc')
        assert not st.error_flag
        st = self.gr("A B", 'Rdoc')
        assert st.error_flag


class TestErrorReporting:
    """Tests that mandatory-continuation (§) errors propagate correctly
    through nested series and lookahead parsers."""

    grammar = """
        root      = series alpha | anything
        series    = subseries &alpha
        subseries = alpha §beta
        alpha     = /[a-z]+/
        beta      = /[A-Z]+/
        anything  = /.*/
        """

    def setup(self):
        self.parser = grammar_provider(self.grammar)()

    def test_error_propagation(self):
        testcode1 = "halloB"
        testcode2 = "XYZ"
        testcode3 = "hallo "
        cst = self.parser(testcode1)
        assert not cst.error_flag, str(cst.errors_sorted)
        # "XYZ" falls back to the `anything`-alternative without error
        cst = self.parser(testcode2)
        assert not cst.error_flag
        # "hallo " commits to `subseries` and then fails on §beta
        cst = self.parser(testcode3)
        assert cst.error_flag
class TestBorderlineCases:
    """Tests of parser behaviour on empty and minimal input."""

    def test_not_matching(self):
        minilang = """parser = /X/\n"""
        gr = grammar_provider(minilang)()
        cst = gr('X', 'parser')
        assert not cst.error_flag
        cst = gr(' ', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_DID_NOT_MATCH
        cst = gr('', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_DID_NOT_MATCH

    def test_matching(self):
        minilang = """parser = /.?/"""
        gr = grammar_provider(minilang)()
        cst = gr(' ', 'parser')
        assert not cst.error_flag
        # /.?/ matches only one character; a second one remains unconsumed
        cst = gr('  ', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_STOPPED_BEFORE_END
        # the empty string is a valid match for /.?/
        cst = gr('', 'parser')
        assert not cst.error_flag


eckhart's avatar
eckhart committed
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
EBNF_with_Errors = r"""# Test code with errors. All places marked by a "$" should yield and error

@ comment    = /#.*(?:\n|$)/
@ whitespace = /\s*/
@ literalws  = right
@ anonymous  = pure_elem, EOF
@ drop       = whitespace, EOF


# re-entry-rules for resuming after parsing-error
@ definition_resume = /\n\s*(?=@|\w+\w*\s*=)/
@ directive_resume  = /\n\s*(?=@|\w+\w*\s*=)/

# specialized error messages for certain cases

@ definition_error  = /,/, 'Delimiter "," not expected in definition!\nEither this was meant to '
                           'be a directive and the directive symbol @ is missing\nor the error is '
                           'due to inconsistent use of the comma as a delimiter\nfor the elements '
                           'of a sequence.'

#: top-level

syntax     = [~//] { definition | directive } §EOF
definition = symbol §:DEF~ expression :ENDL~
directive  = "@" §symbol "="
             (regexp | literals | symbol)
             { "," (regexp | literals | symbol) }

#: components

expression = sequence { :OR~ sequence }
sequence   = ["§"] ( interleave | lookaround )
             { :AND~ ["§"] ( interleave | lookaround ) }
interleave = difference { "°" ["§"] difference }
lookaround = flowmarker § (oneormore | pure_elem)
difference = term ["-" § (oneormore $ pure_elem)]               # <- ERROR
term       = oneormore | repetition | option | pure_elem        # resuming expected her

#: elements

pure_elem  = element § !/[?*+]/
element    = [retrieveop] symbol !DEF
           | literal
           | plaintext
           | regexp
           | whitespace
           | group$                                             # <- ERROR

#: flow-operators

flowmarker = "!"  | "&"                                         # resuming expected her
           | "<-!" | "<-&"
retr$ieveop = "::" | ":?" | ":"

#: groups

group      = "(" §expression ")"
oneormore  = "{" expression "}+" | element "+"
repetition = "{" §expressi$on "}" | element "*"                 # <- ERROR
option     = "[" §expression "]" | element "?"                  # resuming expected here

#: leaf-elements

symbol     = /(?!\d)\w+/~
$literals   = { literal }+                                      # <- ERROR
literal    = /"(?:(?<!\\)\\"|[^"])*?"/~                         # resuming expected her