test_parse.py 33.5 KB
Newer Older
1
2
#!/usr/bin/python3

3
"""test_parse.py - tests of the parsers-module of DHParser
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import sys
23
from functools import partial
24

Eckhart Arnold's avatar
Eckhart Arnold committed
25
sys.path.extend(['../', './'])
26

27
from DHParser.toolkit import compile_python_object, get_config_value, set_config_value
28
from DHParser.log import logging, is_logging, log_ST, log_parsing_history
29
from DHParser.error import Error, is_error
Eckhart Arnold's avatar
Eckhart Arnold committed
30
from DHParser.parse import Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
eckhart's avatar
eckhart committed
31
    RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative, AllOf, SomeOf, \
32
    UnknownParserError, MetaParser, GrammarError, EMPTY_NODE
33
from DHParser import compile_source
34
35
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, DHPARSER_IMPORTS
from DHParser.dsl import grammar_provider, CompilationError
Eckhart Arnold's avatar
Eckhart Arnold committed
36
from DHParser.syntaxtree import Node
37
38


eckhart's avatar
eckhart committed
39
40
41
42
43
44
45
46
47
48
class TestParserClass:
    def test_apply(self):
        """Applying the same visitor to the grammar several times in a row
        must traverse the parsers in the same order every time."""
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        gr = grammar_provider(minilang)()
        visited = []

        def visitor(p: Parser):
            # record "name + type" of every parser that apply() touches
            visited.append(p.pname + p.ptype)

        traces = []
        for _ in range(3):
            visited.clear()
            gr.root__.apply(visitor)
            traces.append(", ".join(visited))
        assert traces[0] == traces[1] == traces[2]


61
class TestInfiLoopsAndRecursion:
    """Tests of left-recursive grammars and of the static detection of
    grammars that would loop forever on some input."""

    def test_direct_left_recursion1(self):
        # 'expr' refers to itself as its own first element: direct left recursion
        minilang ="""
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), str(syntax_tree.errors_sorted)
        # the parsed tree must reproduce the input verbatim
        assert snippet == syntax_tree.content, str(syntax_tree)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct")

    def test_direct_left_recursion2(self):
        # same as above, but the recursion goes through a trivial alias 'ex'
        minilang = """
            expr = ex
            ex   = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content

    def test_indirect_left_recursion1(self):
        # 'Expr' recurs only through 'Product'/'Sum': indirect left recursion
        minilang = """
            Expr    = //~ (Product | Sum | Value)
            Product = Expr { ('*' | '/') Expr }+
            Sum     = Expr { ('+' | '-') Expr }+
            Value   = /[0-9.]+/~ | '(' Expr ')'
            """
        parser = grammar_provider(minilang)()
        assert parser
        snippet = "8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "7 + 8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "9 + 8 * (4 + 3)"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect")

    def test_infinite_loops(self):
        # {//} repeats an empty match forever -> must be flagged as INFINITE_LOOP
        minilang = """forever = { // } \n"""
        # NOTE(review): if grammar_provider does NOT raise here, the assertion
        # in the except-branch is silently skipped — confirm this is intended.
        try:
            parser_class = grammar_provider(minilang)
        except CompilationError as error:
            assert all(e.code == Error.INFINITE_LOOP for e in error.errors)
        # with 'late' static analysis the error surfaces on instantiation ...
        save = get_config_value('static_analysis')
        set_config_value('static_analysis', 'late')
        provider = grammar_provider(minilang)
        try:
            parser = provider()
        except GrammarError as error:
            assert error.errors[0][2].code == Error.INFINITE_LOOP
        # ... with 'none' it surfaces only as a parsing error at runtime
        set_config_value('static_analysis', 'none')
        parser = provider()
        snippet = " "
        syntax_tree = parser(snippet)
        assert any(e.code == Error.INFINITE_LOOP for e in syntax_tree.errors)
        # explicit static analysis must report the loop as well
        res = parser.static_analysis()
        assert res and res[0][2].code == Error.INFINITE_LOOP
        # a repeated non-empty match (/ /) is fine
        minilang = """not_forever = { / / } \n"""
        parser = grammar_provider(minilang)()
        res = parser.static_analysis()
        assert not res
        # restore the configuration for the following tests
        set_config_value('static_analysis', save)
140

Eckhart Arnold's avatar
Eckhart Arnold committed
141
class TestFlowControl:
    """Tests of the flow-control parsers Lookbehind and NegativeLookahead."""

    def setup(self):
        # t1 ends with "END" preceded by a line break -> Lookbehind succeeds
        self.t1 = """
        All work and no play
        makes Jack a dull boy
        END
        """
        # t2 has "END" on the same line as the text -> Lookbehind must fail
        self.t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind(self):
        ws = RegExp(r'\s*')
        end = RegExp("END")
        # "END" only counts as document end if preceded by a newline
        doc_end = Lookbehind(RegExp('\\s*?\\n')) + end
        word = RegExp(r'\w+')
        # consume words up to, but not including, the terminating "END"
        sequence = OneOrMore(NegativeLookahead(end) + word + ws)
        document = ws + sequence + doc_end + ws

        parser = Grammar(document)
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()

    def test_lookbehind_indirect(self):
        # same grammar as above, but built declaratively as a Grammar subclass
        class LookbehindTestGrammar(Grammar):
            parser_initialization__ = ["upon instantiation"]
            ws = RegExp(r'\s*')
            end = RegExp('END')
            SUCC_LB = RegExp('\\s*?\\n')
            doc_end = Series(Lookbehind(SUCC_LB), end)
            word = RegExp(r'\w+')
            sequence = OneOrMore(Series(NegativeLookahead(end), word, ws))
            document = Series(ws, sequence, doc_end, ws)
            root__ = document

        parser = LookbehindTestGrammar()
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()


183
184
185
186
187
188
189
class TestRegex:
    """Tests of regular-expression parsers in EBNF grammars: multi-line
    regexes (with and without comments), the @ignorecase directive and
    string tokens containing escape sequences."""

    def test_multilineRegex(self):
        mlregex = r"""
        regex =  /\w+    # one or more alphabetical characters including the underscore
                  [+]    # followed by a plus sign
                  \w*    # possibly followed by more alpha chracters/
        """
        result, messages, syntax_tree = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        # instantiate the generated grammar class from the compiled source
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_multilineRegex_wo_Comments(self):
        # same regex, but spread over lines without trailing comments
        mlregex = r"""
        regex =  /\w+ 
                  [+]  
                  \w* /
        """
        result, messages, syntax_tree = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_ignore_case(self):
        # FIX: this method was named 'text_ignore_case', so pytest's
        # 'test_*' discovery never collected it and the test never ran.
        mlregex = r"""
        @ ignorecase = True
        regex = /alpha/
        """
        result, messages, syntax_tree = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        # with @ignorecase the capitalized input must still match
        node, rest = parser.regex('Alpha')
        assert node
        assert not node.error_flag
        assert rest == ''
        assert node.tag_name == "regex"
        assert str(node) == 'Alpha'

        mlregex = r"""
        @ ignorecase = False
        regex = /alpha/
        """
        result, messages, syntax_tree = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        # without @ignorecase, 'Alpha' no longer matches /alpha/
        node, rest = parser.regex('Alpha')
        assert node.error_flag

    def test_token(self):
        # string tokens may contain backslash escapes (LaTeX-style markers)
        tokenlang = r"""
            @whitespace = linefeed
            lang        = "" begin_token {/\w+/ ""} end_token
            begin_token = "\begin{document}"
            end_token   = "\end{document}"
            """
        testdoc = r"""
            \begin{document}
            test
            \end{document}
            """
        result, messages, syntax_tree = compile_source(tokenlang, None, get_ebnf_grammar(),
                                    get_ebnf_transformer(), get_ebnf_compiler("TokenTest"))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        result = parser(testdoc)
        # log_parsing_history(parser, "test.log")
        assert not result.error_flag

267

268
class TestGrammar:
    """Tests of basic Grammar services: position bookkeeping during logging,
    parsing with an explicitly selected start symbol, and direct subclassing
    of Grammar."""

    def setup(self):
        # compile an EBNF grammar to Python parser source once per test;
        # self.pyparser holds the generated source code as a string
        grammar = r"""@whitespace = horizontal
        haupt        = textzeile LEERZEILE
        textzeile    = { WORT }+
        WORT         = /[^ \t]+/~
        LEERZEILE    = /\n[ \t]*(?=\n)/~
        """
        self.pyparser, messages, syntax_tree = compile_source(grammar, None, get_ebnf_grammar(),
                                                              get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
        assert self.pyparser
        assert not messages

    def test_pos_values_initialized(self):
        # checks whether pos values in the parsing result and in the
        # history record have been initialized
        with logging("LOGS"):
            grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
            grammar("no_file_name*")
        for record in grammar.history__:
            assert not record.node or record.node.pos >= 0

    def test_select_parsing(self):
        # parsing may start from any named symbol, not just the root
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("wort", "WORT")
        grammar("eine Zeile", "textzeile")
        grammar("kein Haupt", "haupt")
        grammar("so ist es richtig", "haupt")

    def test_grammar_subclassing(self):
        # a Grammar may be written by hand as a class with parser attributes;
        # 'Forward' breaks the cyclic reference of 'expression' to itself
        class Arithmetic(Grammar):
            r'''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
            root__ = expression

        grammar = Arithmetic()
        CST = grammar('3+4')
        assert not CST.error_flag, CST.as_sxpr()

316

317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
class TestSeries:
    def test_non_mandatory(self):
        """Without the mandatory marker a failing series simply backtracks,
        so the /.*/ fallback matches and no error is flagged."""
        lang = """
        document = series | /.*/
        series = "A" "B" "C" "D"
        """
        parser = grammar_provider(lang)()
        for snippet in ("ABCD", "A_CD", "AB_D"):
            assert not parser(snippet).error_flag

    def test_mandatory(self):
        """Test for the §-operator. The Series-parser should raise an
        error for any non-match that occurs after the mandatory-operator.
        """
        lang = """
        document = series | /.*/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        assert not parser("ABCD").error_flag
        assert not parser("A_CD").error_flag
        tree = parser("AB_D")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == Error.MANDATORY_CONTINUATION
        # transitivity of mandatory-operator
        tree = parser("ABC_")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == Error.MANDATORY_CONTINUATION

    def test_series_composition(self):
        """Composing two Series with '+' must preserve the position of the
        mandatory marker relative to the composed sequence."""
        TA, TB, TC, TD, TE = (TKN(b) for b in "ABCDE")
        first = Series(TA, TB, TC, mandatory=2)
        second = Series(TD, TE)

        # mandatory part sits in the front section of the composition
        parser = Grammar(Alternative(first + second, RegExp('.*')))
        assert not parser("ABCDE").error_flag
        assert not parser("A_CDE").error_flag
        tree = parser("AB_DE")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == Error.MANDATORY_CONTINUATION
        tree = parser("ABC_E")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == Error.MANDATORY_CONTINUATION

        # mandatory part sits in the rear section of the composition
        parser = Grammar(Alternative(second + first, RegExp('.*')))
        for snippet in ("DEABC", "_EABC", "D_ABC", "DE_BC", "DEA_C"):
            assert not parser(snippet).error_flag
        tree = parser("DEAB_")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == Error.MANDATORY_CONTINUATION

    # def test_boundary_cases(self):
    #     lang = """
    #     document = series | §!single | /.*/
    #     series = "A" "B" §"C" "D"
    #     single = "E"
    #     """
    #     parser_class = grammar_provider(lang)
    #     parser = parser_class()
    #     print(parser.python_src__)
    #     print(parser_class.python_src__)
eckhart's avatar
eckhart committed
382

383

384
385
386
class TestAllOfSomeOf:
    def test_allOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        combo = AllOf(TKN("A"), TKN("B"))
        for source in ('A B', 'B A'):
            assert Grammar(combo)(source).content == source
        # alternative form: the members of a Series passed to AllOf
        # can likewise match in either order
        combo = AllOf(Series(TKN("B"), TKN("A")))
        assert Grammar(combo)('A B').content == 'A B'

    def test_allOf_completeness(self):
        """Test that an error is raised if not all parsers of an AllOf-List
        match."""
        combo = AllOf(TKN("A"), TKN("B"))
        assert Grammar(combo)('B').error_flag

    def test_allOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        combo = AllOf(TKN("A"), TKN("B"), TKN("A"))
        for source in ('A A B', 'A B A', 'B A A'):
            assert Grammar(combo)(source).content == source
        # 'B' appears twice in the input but only once in the list
        assert Grammar(combo)('A B B').error_flag

    def test_someOf_order(self):
        """Test that parsers of a SomeOf-List can match in arbitrary order."""
        combo = SomeOf(TKN("A"), TKN("B"))
        for source in ('A B', 'B A'):
            assert Grammar(combo)(source).content == source
        # alternative form: unlike AllOf, not all members need to match
        combo = SomeOf(Alternative(TKN("B"), TKN("A")))
        assert Grammar(combo)('A B').content == 'A B'
        assert Grammar(combo)('B').content == 'B'

    def test_someOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        combo = SomeOf(TKN("A"), TKN("B"), TKN("A"))
        for source in ('A A B', 'A B A', 'B A A'):
            assert Grammar(combo)(source).content == source
        assert Grammar(combo)('A B B').error_flag


429
class TestPopRetrieve:
    """Tests of the variable-handling parsers Capture (symbol), Retrieve
    (:symbol) and Pop (::symbol), which match context-dependent delimiters
    such as backtick fences or brace runs of equal length."""

    # backtick-fenced code blocks: closing fence must equal the opening one
    mini_language = r"""
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
        delimiter      = delimiter_sign  # never use delimiter between capture and pop except for retrival!
        delimiter_sign = /`+/
        text           = /[^`]+/
        """
    # brace-fenced blocks with @braces_filter=counterpart: '{{{' closed by '}}}'
    mini_lang2 = r"""
        @braces_filter=counterpart
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """
    # XML-like tags where the closing tag must repeat the captured name
    mini_lang3 = r"""
        document       = { text | env }
        env            = (specialtag | opentag) text [closespecial | closetag]
        opentag        = "<" name ">"
        specialtag     = "<" /ABC/ !name ">"
        closetag       = close_slash | close_star
        close_slash    = "<" ::name "/>"
        close_star     = "<" ::name "*>"
        closespecial   = "<" /ABC/~ ">"
        name           = /\w+/~
        text           = /[^<>]+/
        """

    def setup(self):
        self.minilang_parser = grammar_provider(self.mini_language)()
        self.minilang_parser2 = grammar_provider(self.mini_lang2)()
        self.minilang_parser3 = grammar_provider(self.mini_lang3)()

    @staticmethod
    def opening_delimiter(node, name):
        # select-predicate: node is the capture of the opening delimiter
        return node.tag_name == name # and not isinstance(node.parser, Retrieve)

    @staticmethod
    def closing_delimiter(node):
        # select-predicate: node stems from a Pop- or Retrieve-parser
        return node.tag_name in {':Pop', ':Retrieve'}
        # return isinstance(node.parser, Retrieve)

    def test_compile_mini_language(self):
        assert self.minilang_parser
        assert self.minilang_parser2
        assert self.minilang_parser3

    def test_stackhandling(self):
        # "<ABCnormal>" could be read as specialtag or opentag; the parser
        # must still pop the right name for the closing tag
        ambigous_opening = "<ABCnormal> normal tag <ABCnormal*>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        ambigous_opening = "<ABCnormal> normal tag <ABCnormal/>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        forgot_closing_tag = "<em> where is the closing tag?"
        syntax_tree = self.minilang_parser3(forgot_closing_tag)
        assert syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em/>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em*>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

    def test_cache_neutrality(self):
        """Test that packrat-caching does not interfere with the variable-
        changing parsers: Capture and Retrieve."""
        lang = r"""
            text = opening closing
            opening = (unmarked_package | marked_package)
            closing = ::variable
            unmarked_package = package "."
            marked_package = package "*" "."
            package = "(" variable ")"
            variable = /\w+/~
            """
        case = "(secret)*. secret"
        gr = grammar_provider(lang)()
        st = gr(case)
        assert not st.error_flag, str(st.errors_sorted)

    def test_single_line(self):
        teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        # opening fence and popped closing fence must be identical
        delim = str(next(syntax_tree.select(partial(self.opening_delimiter, name="delimiter"))))
        pop = str(next(syntax_tree.select(self.closing_delimiter)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line(self):
        teststr = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        delim = str(next(syntax_tree.select(partial(self.opening_delimiter, name="delimiter"))))
        pop = str(next(syntax_tree.select(self.closing_delimiter)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_single_line_complement(self):
        teststr = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        # with the counterpart-filter the braces match in length but differ
        # in kind ('{{{' vs '}}}')
        delim = str(next(syntax_tree.select(partial(self.opening_delimiter, name="braces"))))
        pop = str(next(syntax_tree.select(self.closing_delimiter)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line_complement(self):
        teststr = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        delim = str(next(syntax_tree.select(partial(self.opening_delimiter, name="braces"))))
        pop = str(next(syntax_tree.select(self.closing_delimiter)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")
569
570


571
class TestWhitespaceHandling:
    """Insignificant whitespace: token definitions ("A") tolerate it,
    plain regex definitions (/A/) do not."""

    minilang = """
        doc = A B
        A = "A"
        B = "B"
        Rdoc = ar br
        ar = /A/
        br = /B/
        """

    def setup(self):
        self.gr = grammar_provider(self.minilang)()

    def test_token_whitespace(self):
        # token parsers absorb adjacent insignificant whitespace
        assert not self.gr("AB", 'doc').error_flag
        assert not self.gr("A B", 'doc').error_flag

    def test_regexp_whitespace(self):
        # bare regex parsers do not skip whitespace on their own
        assert not self.gr("AB", 'Rdoc').error_flag
        assert self.gr("A B", 'Rdoc').error_flag
di68kap's avatar
di68kap committed
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614


class TestErrorReporting:
    grammar = """
        root      = series alpha | anything
        series    = subseries &alpha 
        subseries = alpha §beta
        alpha     = /[a-z]+/
        beta      = /[A-Z]+/
        anything  = /.*/
        """

    def setup(self):
        self.parser = grammar_provider(self.grammar)()

    def test_error_propagation(self):
        """A mandatory failure inside 'subseries' must only be reported when
        no alternative (here: 'anything') can still match the input."""
        cases = [
            ("halloB", False),   # alpha + beta: clean parse
            ("XYZ", False),      # falls through to 'anything'
            ("hallo ", True),    # alpha matched but §beta fails
        ]
        for source, expect_error in cases:
            cst = self.parser(source)
            assert bool(cst.error_flag) == expect_error, str(cst.errors_sorted)
620
621


di68kap's avatar
di68kap committed
622
623
624
625
626
627
628
class TestBorderlineCases:
    """Degenerate inputs: empty documents and minimal one-symbol grammars."""

    def test_not_matching(self):
        gr = grammar_provider("""parser = /X/""")()
        assert not gr('X', 'parser').error_flag
        # neither a blank nor an empty document matches /X/
        for document in (' ', ''):
            cst = gr(document, 'parser')
            assert cst.error_flag
            assert cst.errors_sorted[0].code == Error.PARSER_DID_NOT_MATCH

    def test_matching(self):
        gr = grammar_provider("""parser = /.?/""")()
        assert not gr(' ', 'parser').error_flag
        # /.?/ consumes only one character, so two characters leave a rest
        cst = gr('  ', 'parser')
        assert cst.error_flag
        assert cst.errors_sorted[0].code == Error.PARSER_STOPPED_BEFORE_END
        # the empty document is a legitimate match for an optional regex
        assert not gr('', 'parser').error_flag


Eckhart Arnold's avatar
Eckhart Arnold committed
644
class TestReentryAfterError:
    """Tests of error-recovery: after a mandatory (§) failure the parser may
    skip ahead to a configured reentry point and continue parsing, so that
    one mistake yields one error message instead of a cascade."""

    def setup(self):
        lang = """
        document = alpha [beta] gamma "."
          alpha = "ALPHA" abc
            abc = §"a" "b" "c"
          beta = "BETA" (bac | bca)
            bac = "b" "a" §"c"
            bca = "b" "c" §"a"
          gamma = "GAMMA" §(cab | cba)
            cab = "c" "a" §"b"
            cba = "c" "b" §"a"
        """
        self.gr = grammar_provider(lang)()

    # NOTE(review): the tests below assign `gr.resume_rules = dict()` but then
    # write entries into `gr.resume_rules__`. Unless `resume_rules` is a
    # property aliasing `resume_rules__`, the plain assignment would create an
    # unrelated attribute and NOT clear previously configured rules — confirm.

    def test_no_resume_rules(self):
        gr = self.gr;  gr.resume_rules = dict()
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        # even on error the tree must cover the complete input
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')

    def test_no_resume_rules_partial_parsing(self):
        gr = self.gr;  gr.resume_rules = dict()
        content = 'ALPHA acb'
        cst = gr(content, 'alpha')
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')

    def test_simple_resume_rule(self):
        gr = self.gr;  gr.resume_rules = dict()
        # after a failure in 'alpha', skip ahead to the string 'BETA'
        gr.resume_rules__['alpha'] = ['BETA']
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only on error message
        assert len(cst.errors_sorted) == 1

    def test_failing_resume_rule(self):
        gr = self.gr;  gr.resume_rules = dict()
        # 'XXX' never occurs in the input, so resuming cannot succeed
        gr.resume_rules__['alpha'] = ['XXX']
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        # assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only on error message

    def test_severl_reentry_points(self):
        gr = self.gr;  gr.resume_rules = dict()
        gr.resume_rules__['alpha'] = ['BETA', 'GAMMA']
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only on error message
        assert len(cst.errors_sorted) == 1

    def test_several_reentry_points_second_point_matching(self):
        gr = self.gr;  gr.resume_rules = dict()
        gr.resume_rules__['alpha'] = ['BETA', 'GAMMA']
        # 'BETA' is absent from the input, so the parser must resume at 'GAMMA'
        content = 'ALPHA acb GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only on error message
        assert len(cst.errors_sorted) == 1

    def test_several_resume_rules_innermost_rule_matching(self):
        gr = self.gr;  gr.resume_rules = dict()
        gr.resume_rules__['alpha'] = ['BETA', 'GAMMA']
        gr.resume_rules__['beta'] = ['GAMMA']
        gr.resume_rules__['bac'] = ['GAMMA']
        content = 'ALPHA abc BETA bad GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only on error message
        assert len(cst.errors_sorted) == 1
        # multiple failures
        content = 'ALPHA acb BETA bad GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only on error message
        assert len(cst.errors_sorted) == 2
738

739

740
741
742
743
744
745
746
747
748
749
class TestConfiguredErrorMessages:
    """Tests of user-configured error messages (``@..._error`` directives)."""

    def test_(self):
        lang = """
            document = series | /.*/
            @series_error = "a badly configured error message {5}"
            series = "X" | head §"C" "D"
            head = "A" "B"
            """
        parser = grammar_provider(lang)()
        st = parser("AB_D");  assert st.error_flag
        # the invalid format field "{5}" in the configured message must be
        # reported in addition to the mandatory-continuation error itself
        assert st.errors_sorted[0].code == Error.MALFORMED_ERROR_STRING
        assert st.errors_sorted[1].code == Error.MANDATORY_CONTINUATION


class TestUnknownParserError:
    """Calling a grammar with a non-existing start parser must raise
    ``UnknownParserError``."""

    def test_unknown_parser_error(self):
        gr = Grammar()
        try:
            gr("", "NonExistantParser")
            assert False, "UnknownParserError expected!"
        except UnknownParserError:
            pass


class TestEarlyTokenWhitespaceDrop:
    """Tests of the ``@ drop = token, whitespace`` directive, which removes
    anonymous token and whitespace nodes already while parsing."""

    def setup(self):
        self.lang = r"""
            @ drop = token, whitespace
            expression = term  { ("+" | "-") term}
            term       = factor  { ("*"|"/") factor}
            factor     = number | variable | "("  expression  ")" 
                       | constant | fixed
            variable   = /[a-z]/~
            number     = /\d+/~
            constant   = "A" | "B"
            fixed      = "X"   
            """
        self.gr = grammar_provider(self.lang)()

    def test_drop(self):
        # anonymous tokens and whitespace must not appear in the tree
        cst = self.gr('4 + 3 * 5')
        assert not cst.pick(':Token')
        assert not cst.pick(':Whitespace')
        # tokens inside compound expressions ("A" within constant) are dropped
        cst = self.gr('A + B')
        try:
            _ = next(cst.select(lambda node: node.content == 'A'))
            assert False, "Tokens in compound expressions should be dropped!"
        except StopIteration:
            pass
        # but a token that makes up a parser on its own (fixed = "X") survives
        cst = self.gr('X * y')
        assert next(cst.select(lambda node: node.content == 'X'))


class TestMetaParser:
    """White-box tests of ``MetaParser._return_value`` and
    ``MetaParser._return_values`` (node flattening while parsing)."""

    def setup(self):
        self.mp = MetaParser()
        self.mp.grammar = Grammar()  # override placeholder warning
        self.mp.pname = "named"
        self.mp.tag_name = self.mp.pname

    def test_return_value(self):
        # run with tree-flattening switched on; restore the setting at the end
        save = get_config_value('flatten_tree_while_parsing')
        set_config_value('flatten_tree_while_parsing', True)

        # named parser, named child: child is kept as single child
        nd = self.mp._return_value(Node('tagged', 'non-empty'))
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert len(nd.children) == 1
        assert nd.children[0].tag_name == 'tagged'
        assert nd.children[0].result == "non-empty"
        # named parser, empty named child: child still kept
        nd = self.mp._return_value(Node('tagged', ''))
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert len(nd.children) == 1
        assert nd.children[0].tag_name == 'tagged'
        assert not nd.children[0].result
        # named parser, anonymous child: child is flattened away
        nd = self.mp._return_value(Node(':anonymous', 'content'))
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert not nd.children
        assert nd.result == 'content'
        nd = self.mp._return_value(Node(':anonymous', ''))
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert not nd.children
        assert not nd.content
        nd = self.mp._return_value(EMPTY_NODE)
        assert nd.tag_name == 'named' and not nd.children, nd.as_sxpr()

        # now the same cases for an unnamed (anonymous) parser
        self.mp.pname = ''
        self.mp.tag_name = ':unnamed'
        # named child is passed through unchanged
        nd = self.mp._return_value(Node('tagged', 'non-empty'))
        assert nd.tag_name == 'tagged', nd.as_sxpr()
        assert len(nd.children) == 0
        assert nd.content == 'non-empty'
        nd = self.mp._return_value(Node('tagged', ''))
        assert nd.tag_name == 'tagged', nd.as_sxpr()
        assert len(nd.children) == 0
        assert not nd.content
        nd = self.mp._return_value(Node(':anonymous', 'content'))
        assert nd.tag_name == ':anonymous', nd.as_sxpr()
        assert not nd.children
        assert nd.result == 'content'
        nd = self.mp._return_value(Node('', ''))
        assert nd.tag_name == '', nd.as_sxpr()
        assert not nd.children
        assert not nd.content
        # None and EMPTY_NODE both collapse to EMPTY_NODE
        assert self.mp._return_value(None) == EMPTY_NODE
        assert self.mp._return_value(EMPTY_NODE) == EMPTY_NODE

        set_config_value('flatten_tree_while_parsing', save)

    def test_return_values(self):
        self.mp.pname = "named"
        self.mp.tag_name = self.mp.pname
        # EMPTY_NODE entries must not survive in the returned tuple as-is
        rv = self.mp._return_values((Node('tag', 'content'), EMPTY_NODE))
        assert rv[-1].tag_name != EMPTY_NODE.tag_name, rv[-1].tag_name


class TestStaticAnalysis:
    """Static grammar analysis must detect parsers that can loop forever
    (repetitions whose body can match the empty string)."""

    # NOTE(review): this grammar string is deliberately *not* a raw string;
    # \s, \w etc. are invalid escapes kept literally, while \n and \t become
    # real characters — left byte-identical to preserve the test's behavior.
    bibtex_grammar = """# bad bibtex-grammar
    @ whitespace  = /\s*/
    @ ignorecase  = True
    @ comment     = //

    bibliography = { preamble | comment | entry }
    
    preamble      = "@Preamble{" /"/ pre_code /"/~ §"}"
    pre_code      = { /[^"%]+/ | /%.*\n/ }
    
    comment       = "@Comment{" text §"}"
    
    entry         = /@/ type "{" key { "," field §"=" content } [","] §"}"
    type          = WORD
    key           = NO_BLANK_STRING
    field         = WORD
    content       = "{" text "}" | plain_content
    
    plain_content = COMMA_TERMINATED_STRING
    text          = { CONTENT_STRING | "{" text "}" }
 
    WORD          = /\w+/~
    NO_BLANK_STRING         = /[^ \t\n,%]+/~
    COMMA_TERMINATED_STRING = { /[^,%]+/ | &/%/~ }      # BOOM !!!
    CONTENT_STRING = { /[^{}%]+/ | &/%/~ }+             # BOOM !!!
    
    EOF           =  !/./    
    """

    def test_static_analysis(self):
        # defer the analysis until instantiation; restore the setting afterwards
        save = get_config_value('static_analysis')
        set_config_value('static_analysis', 'late')
        gr_class = grammar_provider(self.bibtex_grammar, 'BibTex')
        try:
            gr_instance = gr_class()
        except GrammarError as error:
            # exactly the two "BOOM" parsers must be flagged as infinite loops
            affected_parsers = {e[0] for e in error.errors}
            assert affected_parsers == {'CONTENT_STRING', 'COMMA_TERMINATED_STRING'}
            assert all(e[2].code == Error.INFINITE_LOOP for e in error.errors)
        set_config_value('static_analysis', save)


if __name__ == "__main__":
    from DHParser.testing import runner
    # run all test classes defined in this module, with logging disabled
    with logging(False):
        runner("", globals())