#!/usr/bin/python3

"""test_parse.py - tests of the parsers-module of DHParser
Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import sys
from functools import partial

scriptpath = os.path.dirname(__file__) or '.'
sys.path.append(os.path.abspath(os.path.join(scriptpath, '..')))

from DHParser.configuration import get_config_value, set_config_value
from DHParser.toolkit import compile_python_object
from DHParser.log import is_logging, log_ST, log_parsing_history
from DHParser.error import Error, is_error
from DHParser.parse import ParserError, Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
    RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative, AllOf, SomeOf, \
    UnknownParserError, MetaParser, GrammarError, EMPTY_NODE
from DHParser import compile_source
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, DHPARSER_IMPORTS
from DHParser.dsl import grammar_provider, CompilationError
from DHParser.syntaxtree import Node
from DHParser.stringview import StringView

class TestParserError:
    """Tests for the ParserError exception class."""

    def test_parser_error_str(self):
        # The string representation of a ParserError should mention both
        # the text at the error location and the tag name of the node.
        pe = ParserError(Node('TAG', 'test').with_pos(0), StringView('Beispiel'), None, True)
        assert str(pe).find('Beispiel') >= 0 and str(pe).find('TAG') >= 0
class TestParserClass:
    """Tests for the basic Parser class machinery."""

    def test_apply(self):
        # Grammar.root__.apply() must visit every parser exactly once per
        # call and yield the same visiting order on repeated invocations.
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        gr = grammar_provider(minilang)()
        l = []
        def visitor(p: Parser):
            l.append(p.pname + p.ptype)
        gr.root__.apply(visitor)
        s1 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s2 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s3 = ", ".join(l)
        # apply() must be idempotent with respect to visiting order
        assert s1 == s2 == s3
class TestInfiLoopsAndRecursion:
    """Tests for left-recursive grammars and automatic breaking of
    infinite parser loops."""

    def test_direct_left_recursion1(self):
        # 'expr' refers to itself as the first element of an alternative
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), str(syntax_tree.errors_sorted)
        assert snippet == syntax_tree.content, str(syntax_tree)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct")

    def test_direct_left_recursion2(self):
        # same as above, but the recursion is reached through a synonym
        minilang = """
            expr = ex
            ex   = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content

    def test_indirect_left_recursion1(self):
        # 'Expr' is left-recursive only indirectly via Product/Sum
        minilang = """
            Expr    = //~ (Product | Sum | Value)
            Product = Expr { ('*' | '/') Expr }+
            Sum     = Expr { ('+' | '-') Expr }+
            Value   = /[0-9.]+/~ | '(' §Expr ')'
            """
        parser = grammar_provider(minilang)()
        assert parser
        snippet = "8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "7 + 8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "9 + 8 * (4 + 3)"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect")

    # # BEWARE: EXPERIMENTAL TEST can be long running
    # def test_indirect_left_recursion2(self):
    #     arithmetic_syntax = """
    #         expression     = addition | subtraction
    #         addition       = (expression | term) "+" (expression | term)
    #         subtraction    = (expression | term) "-" (expression | term)
    #         term           = multiplication | division
    #         multiplication = (term | factor) "*" (term | factor)
    #         division       = (term | factor) "/" (term | factor)
    #         factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
    #         group          = "(" expression ")"
    #         SIGN           = /[+-]/
    #         NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
    #         VARIABLE       = /[A-Za-z]/~
    #         """
    #     arithmetic = grammar_provider(arithmetic_syntax)()
    #     arithmetic.left_recursion_depth__ = 2
    #     assert arithmetic
    #     syntax_tree = arithmetic("(a + b) * (a - b)")
    #     assert syntax_tree.errors

    def test_break_inifnite_loop_ZeroOrMore(self):
        # a zero-length match inside ZeroOrMore would loop forever;
        # the Grammar must detect and break this automatically
        forever = ZeroOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(:EMPTY__, )", repr(result)

    def test_break_inifnite_loop_OneOrMore(self):
        # same loop-breaking check for OneOrMore
        forever = OneOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(:EMPTY__, )", repr(result)

    # def test_infinite_loops(self):
    #     minilang = """forever = { // } \n"""
    #     try:
    #         parser_class = grammar_provider(minilang)
    #     except CompilationError as error:
    #         assert all(e.code == Error.INFINITE_LOOP for e in error.errors)
    #     save = get_config_value('static_analysis')
    #     set_config_value('static_analysis', 'late')
    #     provider = grammar_provider(minilang)
    #     try:
    #         parser = provider()
    #     except GrammarError as error:
    #         assert error.errors[0][2].code == Error.INFINITE_LOOP
    #     set_config_value('static_analysis', 'none')
    #     parser = provider()
    #     snippet = " "
    #     syntax_tree = parser(snippet)
    #     assert any(e.code == Error.INFINITE_LOOP for e in syntax_tree.errors)
    #     res = parser.static_analysis()
    #     assert res and res[0][2].code == Error.INFINITE_LOOP
    #     minilang = """not_forever = { / / } \n"""
    #     parser = grammar_provider(minilang)()
    #     res = parser.static_analysis()
    #     assert not res
    #     set_config_value('static_analysis', save)
class TestFlowControl:
    """Tests for flow-control parsers (Lookbehind, NegativeLookahead)."""

    def setup(self):
        # t1 ends with END on its own line (lookbehind matches),
        # t2 has END on the same line as the preceding words (it must not)
        self.t1 = """
        All work and no play
        makes Jack a dull boy
        END
        """
        self.t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind(self):
        ws = RegExp(r'\s*')
        end = RegExp("END")
        # END only counts as document end if preceded by a newline
        doc_end = Lookbehind(RegExp('\\s*?\\n')) + end
        word = RegExp(r'\w+')
        sequence = OneOrMore(NegativeLookahead(end) + word + ws)
        document = ws + sequence + doc_end + ws
        parser = Grammar(document)
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()

    def test_lookbehind_indirect(self):
        # same grammar, but the lookbehind target is an independent,
        # named parser referenced indirectly
        class LookbehindTestGrammar(Grammar):
            parser_initialization__ = ["upon instantiation"]
            ws = RegExp(r'\s*')
            end = RegExp('END')
            SUCC_LB = RegExp('\\s*?\\n')
            doc_end = Series(Lookbehind(SUCC_LB), end)
            word = RegExp(r'\w+')
            sequence = OneOrMore(Series(NegativeLookahead(end), word, ws))
            document = Series(ws, sequence, doc_end, ws)
            root__ = document

        parser = LookbehindTestGrammar()
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()
class TestRegex:
    """Tests for regular-expression parsers in EBNF grammars."""

    def test_multilineRegex(self):
        # verbose multi-line regexes with embedded comments must compile
        mlregex = r"""
        regex =  /\w+    # one or more alphabetical characters including the underscore
                  [+]    # followed by a plus sign
                  \w*    # possibly followed by more alpha chracters/
        """
        result, messages, syntax_tree = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_multilineRegex_wo_Comments(self):
        # multi-line regexes must also work without trailing comments
        mlregex = r"""
        regex =  /\w+
                  [+]
                  \w* /
        """
        result, messages, syntax_tree = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    # NOTE(review): the 'text_' prefix means pytest never collects this
    # method — presumably a typo for 'test_ignore_case'; confirm before
    # renaming, since the test has likely never been executed.
    def text_ignore_case(self):
        mlregex = r"""
        @ ignorecase = True
        regex = /alpha/
        """
        result, messages, syntax_tree = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex('Alpha')
        assert node
        assert not node.error_flag
        assert rest == ''
        assert node.tag_name == "regex"
        assert str(node) == 'Alpha'

        mlregex = r"""
        @ ignorecase = False
        regex = /alpha/
        """
        result, messages, syntax_tree = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex('Alpha')
        assert node.error_flag

    def test_token(self):
        # string tokens containing backslashes (LaTeX-style) must survive
        # grammar compilation and match literally
        tokenlang = r"""
            @whitespace = linefeed
            lang        = "" begin_token {/\w+/ ""} end_token
            begin_token = "\begin{document}"
            end_token   = "\end{document}"
            """
        testdoc = r"""
            \begin{document}
            test
            \end{document}
            """
        result, messages, syntax_tree = compile_source(
            tokenlang, None, get_ebnf_grammar(), get_ebnf_transformer(),
            get_ebnf_compiler("TokenTest"))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        result = parser(testdoc)
        # log_parsing_history(parser, "test.log")
        assert not result.error_flag
class TestGrammar:
    """Tests for the Grammar class: position tracking, partial parsing
    and subclassing."""

    def setup(self):
        grammar = r"""@whitespace = horizontal
        haupt        = textzeile LEERZEILE
        textzeile    = { WORT }+
        WORT         = /[^ \t]+/~
        LEERZEILE    = /\n[ \t]*(?=\n)/~
        """
        self.pyparser, messages, syntax_tree = compile_source(grammar, None, get_ebnf_grammar(),
                                                              get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
        assert self.pyparser
        assert not messages

    def test_pos_values_initialized(self):
        # checks whether pos values in the parsing result and in the
        # history record have been initialized
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("no_file_name*")
        for record in grammar.history__:
            assert not record.node or record.node.pos >= 0

    def test_select_parsing(self):
        # a grammar can be invoked with any named rule as start symbol
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("wort", "WORT")
        grammar("eine Zeile", "textzeile")
        grammar("kein Haupt", "haupt")
        grammar("so ist es richtig", "haupt")

    def test_grammar_subclassing(self):
        # grammars can also be defined directly in Python by subclassing
        class Arithmetic(Grammar):
            r'''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
            root__ = expression

        grammar = Arithmetic()
        CST = grammar('3+4')
        assert not CST.error_flag, CST.as_sxpr()
class TestSeries:
    """Tests for the Series parser, in particular the mandatory-marker §."""

    def test_non_mandatory(self):
        # without §, a failed series simply falls through to the
        # alternative /.*/ and produces no error
        lang = """
        document = series | /.*/
        series = "A" "B" "C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD");
        assert not st.error_flag
        st = parser("A_CD");
        assert not st.error_flag
        st = parser("AB_D");
        assert not st.error_flag

    def test_mandatory(self):
        """Test for the §-operator. The Series-parser should raise an
        error for any non-match that occurs after the mandatory-operator.
        """
        lang = """
        document = series | /.*/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD");  assert not st.error_flag
        st = parser("A_CD");  assert not st.error_flag
        st = parser("AB_D");  assert st.error_flag
        assert st.errors_sorted[0].code == Error.MANDATORY_CONTINUATION
        # transitivity of mandatory-operator
        st = parser("ABC_");  assert st.error_flag
        assert st.errors_sorted[0].code == Error.MANDATORY_CONTINUATION

    def test_series_composition(self):
        # composing series with + must preserve the mandatory position
        TA, TB, TC, TD, TE = (TKN(b) for b in "ABCDE")
        s1 = Series(TA, TB, TC, mandatory=2)
        s2 = Series(TD, TE)

        combined = Alternative(s1 + s2, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("ABCDE");  assert not st.error_flag
        st = parser("A_CDE");  assert not st.error_flag
        st = parser("AB_DE");  assert st.error_flag
        assert st.errors_sorted[0].code == Error.MANDATORY_CONTINUATION
        st = parser("ABC_E");  assert st.error_flag
        assert st.errors_sorted[0].code == Error.MANDATORY_CONTINUATION

        combined = Alternative(s2 + s1, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("DEABC");  assert not st.error_flag
        st = parser("_EABC");  assert not st.error_flag
        st = parser("D_ABC");  assert not st.error_flag
        st = parser("DE_BC");  assert not st.error_flag
        st = parser("DEA_C");  assert not st.error_flag
        st = parser("DEAB_");  assert st.error_flag
        assert st.errors_sorted[0].code == Error.MANDATORY_CONTINUATION

    # def test_boundary_cases(self):
    #     lang = """
    #     document = series | §!single | /.*/
    #     series = "A" "B" §"C" "D"
    #     single = "E"
    #     """
    #     parser_class = grammar_provider(lang)
    #     parser = parser_class()
    #     print(parser.python_src__)
    #     print(parser_class.python_src__)
class TestAllOfSomeOf:
    """Tests for the unordered-sequence parsers AllOf and SomeOf."""

    def test_allOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        prefixes = AllOf(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'
        # aternative Form
        prefixes = AllOf(Series(TKN("B"), TKN("A")))
        assert Grammar(prefixes)('A B').content == 'A B'

    def test_allOf_completeness(self):
        """Test that an error is raised if not  all parsers of an AllOf-List
        match."""
        prefixes = AllOf(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('B').error_flag

    def test_allOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = AllOf(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag

    def test_someOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        prefixes = SomeOf(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'
        # aternative Form
        prefixes = SomeOf(Alternative(TKN("B"), TKN("A")))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B').content == 'B'

    def test_someOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = SomeOf(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag
class TestPopRetrieve:
    """Tests for the variable-storing parsers Capture (:), Retrieve and
    Pop (::), e.g. for matching opening and closing delimiters."""

    mini_language = r"""
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
        delimiter      = delimiter_sign  # never use delimiter between capture and pop except for retrival!
        delimiter_sign = /`+/
        text           = /[^`]+/
        """
    mini_lang2 = r"""
        @braces_filter=counterpart
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """
    mini_lang3 = r"""
        document       = { text | env }
        env            = (specialtag | opentag) text [closespecial | closetag]
        opentag        = "<" name ">"
        specialtag     = "<" /ABC/ !name ">"
        closetag       = close_slash | close_star
        close_slash    = "<" ::name "/>"
        close_star     = "<" ::name "*>"
        closespecial   = "<" /ABC/~ ">"
        name           = /\w+/~
        text           = /[^<>]+/
        """

    def setup(self):
        self.minilang_parser = grammar_provider(self.mini_language)()
        self.minilang_parser2 = grammar_provider(self.mini_lang2)()
        self.minilang_parser3 = grammar_provider(self.mini_lang3)()

    @staticmethod
    def opening_delimiter(node, name):
        # select opening delimiter nodes by tag name
        return node.tag_name == name # and not isinstance(node.parser, Retrieve)

    @staticmethod
    def closing_delimiter(node):
        # closing delimiters are produced by Pop/Retrieve parsers
        return node.tag_name in {':Pop', ':Retrieve'}
        # return isinstance(node.parser, Retrieve)

    def test_compile_mini_language(self):
        assert self.minilang_parser
        assert self.minilang_parser2
        assert self.minilang_parser3

    def test_stackhandling(self):
        ambigous_opening = "<ABCnormal> normal tag <ABCnormal*>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        ambigous_opening = "<ABCnormal> normal tag <ABCnormal/>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        forgot_closing_tag = "<em> where is the closing tag?"
        syntax_tree = self.minilang_parser3(forgot_closing_tag)
        assert syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em/>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em*>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

    def test_cache_neutrality(self):
        """Test that packrat-caching does not interfere with the variable-
        changing parsers: Capture and Retrieve."""
        lang = r"""
            text = opening closing
            opening = (unmarked_package | marked_package)
            closing = ::variable
            unmarked_package = package "."
            marked_package = package "*" "."
            package = "(" variable ")"
            variable = /\w+/~
            """
        case = "(secret)*. secret"
        gr = grammar_provider(lang)()
        st = gr(case)
        assert not st.error_flag, str(st.errors_sorted)

    def test_single_line(self):
        teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        delim = str(next(syntax_tree.select_if(partial(self.opening_delimiter, name="delimiter"))))
        pop = str(next(syntax_tree.select_if(self.closing_delimiter)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line(self):
        teststr = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        delim = str(next(syntax_tree.select_if(partial(self.opening_delimiter, name="delimiter"))))
        pop = str(next(syntax_tree.select_if(self.closing_delimiter)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_single_line_complement(self):
        # with @braces_filter=counterpart, closing braces must mirror
        # (not equal) the captured opening braces
        teststr = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        delim = str(next(syntax_tree.select_if(partial(self.opening_delimiter, name="braces"))))
        pop = str(next(syntax_tree.select_if(self.closing_delimiter)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line_complement(self):
        teststr = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        delim = str(next(syntax_tree.select_if(partial(self.opening_delimiter, name="braces"))))
        pop = str(next(syntax_tree.select_if(self.closing_delimiter)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")
class TestWhitespaceHandling:
    """Tests that insignificant whitespace is absorbed by string tokens
    but not by plain regular expressions."""

    minilang = """
        doc = A B
        A = "A"
        B = "B"
        Rdoc = ar br
        ar = /A/
        br = /B/
        """

    def setup(self):
        self.gr = grammar_provider(self.minilang)()

    def test_token_whitespace(self):
        # string tokens ("A", "B") implicitly eat adjacent whitespace
        st = self.gr("AB", 'doc')
        assert not st.error_flag
        st = self.gr("A B", 'doc')
        assert not st.error_flag

    def test_regexp_whitespace(self):
        # bare regexes (/A/, /B/) do not absorb whitespace
        st = self.gr("AB", 'Rdoc')
        assert not st.error_flag
        st = self.gr("A B", 'Rdoc')
        assert st.error_flag
class TestErrorReporting:
    """Tests that mandatory-continuation errors do not leak out of
    lookahead contexts."""

    grammar = """
        root      = series alpha | anything
        series    = subseries &alpha
        subseries = alpha §beta
        alpha     = /[a-z]+/
        beta      = /[A-Z]+/
        anything  = /.*/
        """

    def setup(self):
        self.parser = grammar_provider(self.grammar)()

    def test_error_propagation(self):
        testcode1 = "halloB"
        testcode2 = "XYZ"
        testcode3 = "hallo "
        cst = self.parser(testcode1)
        assert not cst.error_flag, str(cst.errors_sorted)
        # the failing §beta inside the lookahead must not surface as an
        # error, because 'anything' still matches
        cst = self.parser(testcode2)
        assert not cst.error_flag
        cst = self.parser(testcode3)
        assert cst.error_flag
class TestBorderlineCases:
    """Tests for borderline inputs: empty documents and complete
    non-matches."""

    def test_not_matching(self):
        minilang = """parser = /X/\n"""
        gr = grammar_provider(minilang)()
        cst = gr('X', 'parser')
        assert not cst.error_flag
        cst = gr(' ', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == Error.PARSER_DID_NOT_MATCH
        cst = gr('', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == Error.PARSER_DID_NOT_MATCH

    def test_matching(self):
        minilang = """parser = /.?/"""
        gr = grammar_provider(minilang)()
        cst = gr(' ', 'parser')
        assert not cst.error_flag
        # /.?/ consumes only one character; a second one remains unparsed
        cst = gr('  ', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == Error.PARSER_STOPPED_BEFORE_END
        cst = gr('', 'parser')
        assert not cst.error_flag
class TestReentryAfterError:
687
    def setup(self):
Eckhart Arnold's avatar
Eckhart Arnold committed
688
689
690
691
692
        lang = """
        document = alpha [beta] gamma "."
          alpha = "ALPHA" abc
            abc = §"a" "b" "c"
          beta = "BETA" (bac | bca)
693
694
            bac = "b" "a" §"c"
            bca = "b" "c" §"a"
Eckhart Arnold's avatar
Eckhart Arnold committed
695
696
697
698
          gamma = "GAMMA" §(cab | cba)
            cab = "c" "a" §"b"
            cba = "c" "b" §"a"
        """
699
        self.gr = grammar_provider(lang)()
700

701
702
    def test_no_resume_rules(self):
        gr = self.gr;  gr.resume_rules = dict()
703
704
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
705
706
707
708
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')

eckhart's avatar
eckhart committed
709
710
711
712
713
714
715
716
    def test_no_resume_rules_partial_parsing(self):
        gr = self.gr;  gr.resume_rules = dict()
        content = 'ALPHA acb'
        cst = gr(content, 'alpha')
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')

717
718
    def test_simple_resume_rule(self):
        gr = self.gr;  gr.resume_rules = dict()
719
        gr.resume_rules__['alpha'] = ['BETA']
720
        content = 'ALPHA acb BETA bac GAMMA cab .'
721
722
723
724
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
725
        # because of resuming, there should be only one error message
726
        assert len(cst.errors_sorted) == 1
727

728
729
    def test_failing_resume_rule(self):
        gr = self.gr;  gr.resume_rules = dict()
730
        gr.resume_rules__['alpha'] = ['XXX']
731
        content = 'ALPHA acb BETA bac GAMMA cab .'
732
733
734
735
736
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        # assert cst.pick('alpha').content.startswith('ALPHA')

737
738
    def test_severl_reentry_points(self):
        gr = self.gr;  gr.resume_rules = dict()
739
        gr.resume_rules__['alpha'] = ['BETA', 'GAMMA']
740
        content = 'ALPHA acb BETA bac GAMMA cab .'
741
742
743
744
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
745
        # because of resuming, there should be only one error message
746
        assert len(cst.errors_sorted) == 1
747

748
749
    def test_several_reentry_points_second_point_matching(self):
        gr = self.gr;  gr.resume_rules = dict()
750
751
752
        gr.resume_rules__['alpha'] = ['BETA', 'GAMMA']
        content = 'ALPHA acb GAMMA cab .'
        cst = gr(content)
Eckhart Arnold's avatar
Eckhart Arnold committed
753
        assert cst.error_flag
754
        assert cst.content == content
755
        assert cst.pick('alpha').content.startswith('ALPHA')
756
        # because of resuming, there should be only one error message
757
        assert len(cst.errors_sorted) == 1
758

759
760
761
762
763
764
765
766
767
768
    def test_several_resume_rules_innermost_rule_matching(self):
        gr = self.gr;  gr.resume_rules = dict()
        gr.resume_rules__['alpha'] = ['BETA', 'GAMMA']
        gr.resume_rules__['beta'] = ['GAMMA']
        gr.resume_rules__['bac'] = ['GAMMA']
        content = 'ALPHA abc BETA bad GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
769
        # because of resuming, there should be only one error message
770
        assert len(cst.errors_sorted) == 1
771
772
773
774
775
776
        # multiple failures
        content = 'ALPHA acb BETA bad GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
777
        # there should be only two error messages
778
        assert len(cst.errors_sorted) == 2
779

780
    def test_skip_comment_on_resume(self):
781
782
        lang = r"""
            @ comment =  /(?:\/\/.*)|(?:\/\*(?:.|\n)*?\*\/)/  # Kommentare im C++-Stil
783
784
785
786
            document = block_A block_B
            @ block_A_resume = /x/
            block_A = "a" §"b" "c"
            block_B = "x" "y" "z"
787
        """
788
789
790
791
792
793
794
795
796
797
798
799
        def mini_suite(grammar):
            tree = grammar('abc/*x*/xyz')
            assert not tree.errors
            tree = grammar('abDxyz')
            mandatory_cont = (Error.MANDATORY_CONTINUATION, Error.MANDATORY_CONTINUATION_AT_EOF)
            assert len(tree.errors) == 1 and tree.errors[0].code in mandatory_cont
            tree = grammar('abD/*x*/xyz')
            assert len(tree.errors) == 1 and tree.errors[0].code in mandatory_cont
            tree = grammar('aD /*x*/ c /* a */ /*x*/xyz')
            assert len(tree.errors) == 1 and tree.errors[0].code in mandatory_cont

        # test regex-defined resume rule
800
        grammar = grammar_provider(lang)()
801
        mini_suite(grammar)
802

803
804
805
806
807
        # test string-defined resume rule
        alt_lang = lang.replace('@ block_A_resume = /x/',
                                '@ block_A_resume = "x"')
        grammar = grammar_provider(alt_lang)()
        mini_suite(grammar)
808

class TestConfiguredErrorMessages:
    """A malformed @…_error directive must itself be reported in addition to
    the parser error it was meant to customize."""

    def test_configured_error_message(self):
        lang = """
            document = series | /.*/
            @series_error = "a badly configured error message {5}"
            series = /X/ | head §"C" "D"
            head = "A" "B"
            """
        parser = grammar_provider(lang)()
        st = parser("AB_D")
        assert st.error_flag
        # the broken "{5}" placeholder yields a MALFORMED_ERROR_STRING error
        # alongside the mandatory-continuation failure it should have replaced
        assert st.errors_sorted[0].code == Error.MALFORMED_ERROR_STRING
        assert st.errors_sorted[1].code == Error.MANDATORY_CONTINUATION


class TestUnknownParserError:
    """Requesting a start symbol that does not exist must raise
    UnknownParserError rather than failing silently."""

    def test_unknown_parser_error(self):
        gr = Grammar()
        try:
            gr("", "NonExistantParser")
            assert False, "UnknownParserError expected!"
        except UnknownParserError:
            pass


class TestEarlyTokenWhitespaceDrop:
    """Tests the '@ drop = token, whitespace' directive: anonymous tokens and
    whitespace are removed during parsing, while named nodes survive."""

    def setup(self):
        self.lang = r"""
            @ drop = token, whitespace
            expression = term  { ("+" | "-") term}
            term       = factor  { ("*"|"/") factor}
            factor     = number | variable | "("  expression  ")"
                       | constant | fixed
            variable   = /[a-z]/~
            number     = /\d+/~
            constant   = "A" | "B"
            fixed      = "X"
            """
        self.gr = grammar_provider(self.lang)()

    def test_drop(self):
        cst = self.gr('4 + 3 * 5')
        # anonymous tokens and whitespace must not appear in the tree
        assert not cst.pick(':Token')
        assert not cst.pick(':Whitespace')
        cst = self.gr('A + B')
        try:
            _ = next(cst.select_if(lambda node: node.content == 'A'))
            assert False, "Tokens in compound expressions should be dropped!"
        except StopIteration:
            pass
        # 'X' stems from the named parser 'fixed' and must be kept
        cst = self.gr('X * y')
        assert next(cst.select_if(lambda node: node.content == 'X'))

class TestMetaParser:
    """Tests of MetaParser's node-construction helpers (_return_value,
    _return_values) with respect to naming, flattening and empty nodes."""

    def setup(self):
        self.mp = MetaParser()
        self.mp.grammar = Grammar()  # override placeholder warning
        self.mp.pname = "named"
        self.mp.tag_name = self.mp.pname

    def test_return_value(self):
        # temporarily force tree flattening while parsing
        save = get_config_value('flatten_tree_while_parsing')
        set_config_value('flatten_tree_while_parsing', True)

        # named parser + named child: child is kept as a subtree
        nd = self.mp._return_value(Node('tagged', 'non-empty'))
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert len(nd.children) == 1
        assert nd.children[0].tag_name == 'tagged'
        assert nd.children[0].result == "non-empty"
        nd = self.mp._return_value(Node('tagged', ''))
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert len(nd.children) == 1
        assert nd.children[0].tag_name == 'tagged'
        assert not nd.children[0].result

        # named parser + anonymous child: child content is flattened in
        nd = self.mp._return_value(Node(':anonymous', 'content'))
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert not nd.children
        assert nd.result == 'content'
        nd = self.mp._return_value(Node(':anonymous', ''))
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert not nd.children
        assert not nd.content
        nd = self.mp._return_value(EMPTY_NODE)
        assert nd.tag_name == 'named' and not nd.children, nd.as_sxpr()

        # unnamed parser: nodes pass through without re-tagging
        self.mp.pname = ''
        self.mp.tag_name = ':unnamed'
        nd = self.mp._return_value(Node('tagged', 'non-empty'))
        assert nd.tag_name == 'tagged', nd.as_sxpr()
        assert len(nd.children) == 0
        assert nd.content == 'non-empty'
        nd = self.mp._return_value(Node('tagged', ''))
        assert nd.tag_name == 'tagged', nd.as_sxpr()
        assert len(nd.children) == 0
        assert not nd.content
        nd = self.mp._return_value(Node(':anonymous', 'content'))
        assert nd.tag_name == ':anonymous', nd.as_sxpr()
        assert not nd.children
        assert nd.result == 'content'
        nd = self.mp._return_value(Node('', ''))
        assert nd.tag_name == '', nd.as_sxpr()
        assert not nd.children
        assert not nd.content
        assert self.mp._return_value(None) == EMPTY_NODE
        assert self.mp._return_value(EMPTY_NODE) == EMPTY_NODE

        set_config_value('flatten_tree_while_parsing', save)

    def test_return_values(self):
        self.mp.pname = "named"
        self.mp.tag_name = self.mp.pname
        rv = self.mp._return_values((Node('tag', 'content'), EMPTY_NODE))
        # the EMPTY_NODE sentinel must not survive inside the result tuple
        assert rv[-1].tag_name != EMPTY_NODE.tag_name, rv[-1].tag_name

    def test_in_context(self):
        minilang = r"""
            term       = factor  { (DIV|MUL) factor}
            factor     = NUMBER | VARIABLE
            MUL        = "*" | &factor
            DIV        = "/"
            NUMBER     = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
            VARIABLE   = /[A-Za-z]/~
            """
        gr = grammar_provider(minilang)()
        cst = gr("2x")
        assert bool(cst.pick('MUL')), "Named empty nodes should not be dropped!!!"
if __name__ == "__main__":
    # Run all Test* classes in this module via DHParser's own test runner.
    from DHParser.testing import runner
    runner("", globals())