#!/usr/bin/env python3

"""test_parse.py - tests of the parsers-module of DHParser

Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
22
import copy
import os
import sys
from functools import partial
from typing import List, Tuple

scriptpath = os.path.dirname(__file__) or '.'
sys.path.append(os.path.abspath(os.path.join(scriptpath, '..')))

from DHParser.configuration import get_config_value, set_config_value
from DHParser.toolkit import compile_python_object, re
from DHParser.log import is_logging, log_ST, log_parsing_history, start_logging
from DHParser.error import Error, is_error, adjust_error_locations, MANDATORY_CONTINUATION, \
    MALFORMED_ERROR_STRING, MANDATORY_CONTINUATION_AT_EOF, RESUME_NOTICE, PARSER_STOPPED_BEFORE_END, \
    PARSER_NEVER_TOUCHES_DOCUMENT
from DHParser.parse import ParserError, Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
    RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative, \
    Interleave, UnknownParserError, CombinedParser, Text, EMPTY_NODE, Capture, Drop, Whitespace, \
    GrammarError, Counted, Always, INFINITE
from DHParser import compile_source
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, \
    parse_ebnf, DHPARSER_IMPORTS, compile_ebnf
from DHParser.dsl import grammar_provider, create_parser, raw_compileEBNF
from DHParser.syntaxtree import Node, parse_sxpr
from DHParser.stringview import StringView
from DHParser.trace import set_tracer, trace_history, resume_notices_on

class TestWhitespace:
    """Placeholder tests for whitespace/comment handling.

    Reconstructed: blame-view artifacts (line numbers, avatar lines) had
    been interleaved with the code and made the block unparsable.
    """
    # TODO: add test cases here
    def test_whitespace_comment_mangling(self):
        pass

    def test_non_empty_derivation(self):
        pass

59

60
61
class TestParserError:
    """Tests for the ParserError exception and lookahead-related error codes.

    Reconstructed: blame-view artifacts had been interleaved with the code.
    """

    def test_parser_error_str(self):
        # The string representation of a ParserError should mention both the
        # partially matched node's tag name and the remaining text.
        pe = ParserError(Node('TAG', 'test').with_pos(0), StringView('Beispiel'), None, True)
        assert str(pe).find('Beispiel') >= 0 and str(pe).find('TAG') >= 0

    def test_false_lookahead_only_message(self):
        """PARSER_LOOKAHEAD_*_ONLY errors must not be reported if there
        no lookahead parser in the history!"""
        lang = """
        word = letters { letters | `-` letters }
        letters = /[A-Za-z]+/
        """
        gr = grammar_provider(lang)()
        set_tracer(gr, trace_history)
        st = gr('hard-time')
        assert not st.errors
        st = gr('hard-')
        # error code 1045 (lookahead-only failure) must not appear here
        assert st.errors and not any(e.code == 1045 for e in st.errors)

eckhart's avatar
eckhart committed
79

eckhart's avatar
eckhart committed
80
81
82
83
84
85
86
87
88
class TestParserClass:
    """Tests of generic Parser-class behaviour (visitor application,
    symbol association).

    Reconstructed: blame-view artifacts had been interleaved with the code.
    """

    def test_apply(self):
        # Applying the same visitor repeatedly must yield the same traversal
        # order every time (i.e. ``apply`` is deterministic and idempotent).
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        gr = grammar_provider(minilang)()
        l = []

        def visitor(context: List[Parser]):
            p = context[-1]
            l.append(p.pname + p.ptype)

        gr.root__.apply(visitor)
        s1 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s2 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s3 = ", ".join(l)
        assert s1 == s2 == s3

    def test_symbol(self):
        # ``associated_symbol`` should find the named symbol ("word") that an
        # anonymous sub-parser (the trailing RegExp) belongs to.
        class MyGrammar(Grammar):
            wrong = Text('wrong')
            word = OneOrMore(wrong) + Whitespace(r'\s*') + OneOrMore(RegExp(r'\w+'))
            root__ = word
        gr = MyGrammar()
        regex = gr['word'].parsers[-1].parser
        result = gr.associated_symbol(regex).symbol
        assert result == 'word', result
class TestInfiLoopsAndRecursion:
    """Tests for left-recursive grammars and for the automatic breaking of
    infinite parsing loops (zero-length matches repeated without limit)."""

    def setup(self):
        pass
        # set_config_value('history_tracking', True)
        # set_config_value('resume_notices', True)
        # start_logging('LOGS')

    def test_very_simple(self):
        # direct left recursion in a single rule ("term" calls itself first)
        minilang = """
            term = term (`*`|`/`) factor | factor
            factor = /[0-9]+/
            """
        grammar_factory = grammar_provider(minilang)
        parser = grammar_factory()
        snippet = "5*4*3*2"
        # set_tracer(parser, trace_history)
        st = parser(snippet)
        if is_logging():
            log_ST(st, 'test_LeftRecursion_very_simple.cst')
            log_parsing_history(parser, 'test_LeftRecursion_very_simple')
        assert not is_error(st.error_flag), str(st.errors)
        st = parser("1*2*3*4*5*6*7*8*9")
        # if is_logging():
        #     log_ST(st, 'test_LeftRecursion_very_simple_2.cst')
        #     log_parsing_history(parser, 'test_LeftRecursion_very_simple_2')
        assert not is_error(st.error_flag)

    def test_direct_left_recursion1(self):
        # two mutually independent directly left-recursive rules
        minilang = """@literalws = right
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        # print(raw_compileEBNF(minilang).result)
        assert parser
        syntax_tree = parser(snippet)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct1.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct1")
        assert not is_error(syntax_tree.error_flag), str(syntax_tree.errors_sorted)
        assert snippet == syntax_tree.content, str(syntax_tree)

    def test_direct_left_recursion2(self):
        # left recursion hidden behind synonym rules (expr -> ex -> expr)
        minilang = """@literalws = right
            expr = ex
            ex   = expr ("+"|"-") term | term
            term = tr
            tr   = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct2.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct2")

    def test_indirect_left_recursion1(self):
        # Expr refers to Product/Sum which in turn start with Expr
        minilang = """@literalws = right
            Expr    = //~ (Product | Sum | Value)
            Product = Expr { ('*' | '/') Expr }+
            Sum     = Expr { ('+' | '-') Expr }+
            Value   = /[0-9.]+/~ | '(' §Expr ')'
            """
        # print(raw_compileEBNF(minilang).result)
        parser = grammar_provider(minilang)()
        snippet = "8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "7 + 8 * 4"
        syntax_tree = parser(snippet)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect1.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect1")
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "9 + 8 * (4 + 3)"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        snippet = "9 + 8 * (4 - 3 / (5 - 1))"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect1.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect1")

    # BEWARE: EXPERIMENTAL TEST can be long running
    def test_indirect_left_recursion2(self):
        # without "term" as fallback in "expression" parsing must fail
        arithmetic_syntax = r"""@literalws = right
            expression     = addition | subtraction  # | term
            addition       = (expression | term) "+" (expression | term)
            subtraction    = (expression | term) "-" (expression | term)
            term           = multiplication | division  # | factor
            multiplication = (term | factor) "*" (term | factor)
            division       = (term | factor) "/" (term | factor)
            factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
            group          = "(" expression ")"
            SIGN           = /[+-]/
            NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
            VARIABLE       = /[A-Za-z]/~
            """
        arithmetic = grammar_provider(arithmetic_syntax)()
        assert arithmetic
        syntax_tree = arithmetic("(a + b) * (a - b)")
        assert syntax_tree.errors
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect2.cst")
            log_parsing_history(arithmetic, "test_LeftRecursion_indirect2")

    def test_indirect_left_recursion3(self):
        # same grammar as above but with the fallbacks enabled: must succeed
        arithmetic_syntax = r"""@literalws = right
            expression     = addition | subtraction | term
            addition       = (expression | term) "+" (expression | term)
            subtraction    = (expression | term) "-" (expression | term)
            term           = multiplication | division | factor
            multiplication = (term | factor) "*" (term | factor)
            division       = (term | factor) "/" (term | factor)
            factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
            group          = "(" expression ")"
            SIGN           = /[+-]/
            NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
            VARIABLE       = /[A-Za-z]/~
            """
        arithmetic = grammar_provider(arithmetic_syntax)()
        assert arithmetic
        syntax_tree = arithmetic("(a + b) * (a - b)")
        assert not syntax_tree.errors
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect3.cst")
            log_parsing_history(arithmetic, "test_LeftRecursion_indirect3")


    def test_break_inifnite_loop_ZeroOrMore(self):
        forever = ZeroOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_inifnite_loop_OneOrMore(self):
        forever = OneOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Counted(self):
        forever = Counted(Always(), (0, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (5, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (INFINITE, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (1000, INFINITE - 1))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Interleave(self):
        forever = Interleave(Always(), repetitions = [(0, INFINITE)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Interleave(Always(), Always(),
                             repetitions = [(5, INFINITE), (INFINITE, INFINITE)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Interleave(Always(), repetitions = [(1000, INFINITE - 1)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    # def test_infinite_loops(self):
    #     minilang = """forever = { // } \n"""
    #     try:
    #         parser_class = grammar_provider(minilang)
    #     except CompilationError as error:
    #         assert all(e.code == INFINITE_LOOP for e in error.errors)
    #     save = get_config_value('static_analysis')
    #     set_config_value('static_analysis', 'late')
    #     provider = grammar_provider(minilang)
    #     try:
    #         parser = provider()
    #     except GrammarError as error:
    #         assert error.errors[0][2].code == INFINITE_LOOP
    #     set_config_value('static_analysis', 'none')
    #     parser = provider()
    #     snippet = " "
    #     syntax_tree = parser(snippet)
    #     assert any(e.code == INFINITE_LOOP for e in syntax_tree.errors)
    #     res = parser.static_analysis()
    #     assert res and res[0][2].code == INFINITE_LOOP
    #     minilang = """not_forever = { / / } \n"""
    #     parser = grammar_provider(minilang)()
    #     res = parser.static_analysis()
    #     assert not res
    #     set_config_value('static_analysis', save)

# class TestStaticAnalysis:
#     def test_alternative(self):
#         lang = 'doc = "A" | "AB"'
#         parser = create_parser(lang)

class TestFlowControl:
    """Tests for flow-control parsers (Lookbehind, NegativeLookahead) and
    for parsing with a start parser other than the root.

    Reconstructed: blame-view artifacts had been interleaved with the code.
    """
    t1 = """
         All work and no play
         makes Jack a dull boy
         END
         """
    t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind(self):
        ws = RegExp(r'\s*');  ws.pname = "ws"
        end = RegExp("END");  end.pname = "end"
        # "END" only counts if preceded by a line break (lookbehind)
        doc_end = Lookbehind(RegExp('\\s*?\\n')) + end
        word = RegExp(r'\w+');  word.pname = "word"
        sequence = OneOrMore(NegativeLookahead(end) + word + ws)
        document = ws + sequence + doc_end + ws
        parser = Grammar(document)
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()

        # parsing with an explicit start parser and partial matching
        cst = parser(self.t2, parser['ws'], complete_match=False)
        assert cst.did_match() and len(cst) == 0 and not cst.errors
        cst = parser(self.t2, parser['word'], complete_match=False)
        assert cst.did_match() and cst.content == "All" and not cst.errors
        cst = parser(self.t2, parser['end'], complete_match=False)
        assert not cst.did_match()

    def test_lookbehind_indirect(self):
        # same grammar, but defined declaratively as a Grammar subclass
        class LookbehindTestGrammar(Grammar):
            parser_initialization__ = ["upon instantiation"]
            ws = RegExp(r'\s*')
            end = RegExp('END')
            SUCC_LB = RegExp('\\s*?\\n')
            doc_end = Series(Lookbehind(SUCC_LB), end)
            word = RegExp(r'\w+')
            sequence = OneOrMore(Series(NegativeLookahead(end), word, ws))
            document = Series(ws, sequence, doc_end, ws)
            root__ = document

        parser = LookbehindTestGrammar()
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()


368
369
370
371
372
373
374
class TestRegex:
    """Tests of regular-expression parsing in EBNF source: multi-line
    regexes, comments inside regexes, the @ignorecase directive and
    string tokens.

    Reconstructed: blame-view artifacts had been interleaved with the code.
    """

    def test_multilineRegex(self):
        mlregex = r"""
        regex =  /\w+    # one or more alphabetical characters including the underscore
                  [+]    # followed by a plus sign
                  \w*    # possibly followed by more alpha chracters/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_multilineRegex_wo_Comments(self):
        mlregex = r"""
        regex =  /\w+
                  [+]
                  \w* /
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_ignore_case(self):
        # with @ignorecase = True, /alpha/ must also match 'Alpha'
        mlregex = r"""
        @ ignorecase = True
        regex = /alpha/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex(StringView('Alpha'))
        assert node
        assert rest == ''
        assert node.tag_name == "regex"
        assert str(node) == 'Alpha'

        # with @ignorecase = False, matching 'Alpha' must fail
        mlregex = r"""
        @ ignorecase = False
        regex = /alpha/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex(StringView('Alpha'))
        assert node is None

    def test_token(self):
        tokenlang = r"""@literalws = right
            @whitespace = linefeed
            lang        = "" begin_token {/\w+/ ""} end_token
            begin_token = "\begin{document}"
            end_token   = "\end{document}"
            """
        testdoc = r"""
            \begin{document}
            test
            \end{document}
            """
        result, messages, _ = compile_source(
            tokenlang, None, get_ebnf_grammar(), get_ebnf_transformer(),
            get_ebnf_compiler("TokenTest"))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        result = parser(testdoc)
        # log_parsing_history(parser, "test.log")
        assert not result.error_flag, str(result.errors_sorted)
450

451

452
class TestGrammar:
    """Tests of the Grammar class: position initialization, parsing with a
    selected start symbol, declarative subclassing, partial matching and
    synonym rules.

    Reconstructed: blame-view artifacts had been interleaved with the code.
    """
    grammar = r"""@whitespace = horizontal
    haupt        = textzeile LEERZEILE
    textzeile    = { WORT }+
    WORT         = /[^ \t]+/~
    LEERZEILE    = /\n[ \t]*(?=\n)/~
    """
    pyparser, messages, _ = compile_source(grammar, None, get_ebnf_grammar(),
                                           get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
    assert pyparser, str(messages)
    assert not messages, str(messages)

    def test_pos_values_initialized(self):
        # checks whether pos values in the parsing result and in the
        # history record have been initialized
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("no_file_name*")
        for record in grammar.history__:
            assert not record.node or record.node.pos >= 0

    def test_select_parsing(self):
        # parsing may start at any named symbol, not only the root
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("wort", "WORT")
        grammar("eine Zeile", "textzeile")
        grammar("kein Haupt", "haupt")
        grammar("so ist es richtig", "haupt")

    def test_grammar_subclassing(self):
        class Arithmetic(Grammar):
            r'''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
            root__ = expression

        grammar = Arithmetic()
        CST = grammar('3+4')
        assert not CST.error_flag, CST.as_sxpr()

    def test_incomplete_matching(self):
        """Tests whether the flag `complete_match` works as expected when
        calling a grammar object in order to parse a document."""
        gr = grammar_provider('word = ~/\\w+/\n')()
        st = gr('eins')
        assert not st.errors
        st = gr('eins zwei')
        assert st.errors[0].code == PARSER_STOPPED_BEFORE_END
        st = gr('eins zwei', complete_match=False)
        assert not st.errors

    def test_synonym(self):
        lang = r"""
            doc  = { word | number }
            word = /\w+/ S
            number = [VZ] /\d+/ S 
            S    = ~        # let S by a synonym for anonymous whitespace
            VZ   = "-"
        """
        gr = grammar_provider(lang)()
        st = gr('eins 1 zwei2drei 3')
        # set_config_value('compiled_EBNF_log', 'grammar.log')
        gr = grammar_provider("@drop = whitespace, strings" + lang)()
        st = gr('eins 1 zwei2drei 3')
        st = gr('-3')
        assert str(gr['S']) == "S = ~", str(gr['S'])
524

525

526
527
528
529
530
531
532
class TestSeries:
    """Tests of the Series parser, in particular the mandatory-operator (§)
    and composition of Series objects.

    Reconstructed: blame-view artifacts had been interleaved with the code.
    """

    def test_non_mandatory(self):
        lang = """
        document = series | /.*/
        series = "A" "B" "C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD")
        assert not st.error_flag
        st = parser("A_CD")
        assert not st.error_flag
        st = parser("AB_D")
        assert not st.error_flag

    def test_mandatory(self):
        """Test for the §-operator. The Series-parser should raise an
        error for any non-match that occurs after the mandatory-operator.
        """
        lang = """
        document = series | /.*/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD");  assert not st.error_flag
        st = parser("A_CD");  assert not st.error_flag
        st = parser("AB_D");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION
        # transitivity of mandatory-operator
        st = parser("ABC_");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

    def test_series_composition(self):
        TA, TB, TC, TD, TE = (TKN(b) for b in "ABCDE")
        s1 = Series(TA, TB, TC, mandatory=2)
        s2 = Series(TD, TE)

        combined = Alternative(s1 + s2, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("ABCDE");  assert not st.error_flag
        st = parser("A_CDE");  assert not st.error_flag
        st = parser("AB_DE");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION
        st = parser("ABC_E");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

        combined = Alternative(s2 + s1, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("DEABC");  assert not st.error_flag
        st = parser("_EABC");  assert not st.error_flag
        st = parser("D_ABC");  assert not st.error_flag
        st = parser("DE_BC");  assert not st.error_flag
        st = parser("DEA_C");  assert not st.error_flag
        st = parser("DEAB_");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

    # def test_boundary_cases(self):
    #     lang = """
    #     document = series | §!single | /.*/
    #     series = "A" "B" §"C" "D"
    #     single = "E"
    #     """
    #     parser_class = grammar_provider(lang)
    #     parser = parser_class()
    #     print(parser.python_src__)
    #     print(parser_class.python_src__)

    def test_ebnf_serialization(self):
        ebnf_grammar = get_ebnf_grammar()
        # TODO: Add test here
        ebnf = ebnf_grammar.as_ebnf()
        # print(ebnf)
597
598


599
600
601
class TestAllOfSomeOf:
    """Tests of Interleave used to emulate the older AllOf/SomeOf parsers:
    order-independence, completeness and redundancy of matches.

    Reconstructed: blame-view artifacts had been interleaved with the code.
    """

    def test_allOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'

    def test_allOf_completeness(self):
        """Test that an error is raised if not  all parsers of an AllOf-List
        match."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('B').error_flag

    def test_allOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag

    def test_someOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'
        st = Grammar(prefixes)('B')
        assert st.error_flag
        # with (0, 1)-repetitions each element becomes optional
        prefixes = Interleave(TKN("B"), TKN("A"), repetitions=((0, 1), (0, 1)))
        assert Grammar(prefixes)('A B').content == 'A B'
        st = Grammar(prefixes)('B')
        assert not st.error_flag
        assert st.content == 'B'

    def test_someOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag


644
645
class TestInterleave:
    """Tests of the Interleave parser with and without custom repetition
    ranges.

    Reconstructed: blame-view artifacts had been interleaved with the code.
    """

    def test_interleave_most_simple(self):
        letterset = Interleave(Text("A"), Text("B"), Text("C"))
        gr = Grammar(letterset)
        st = gr('ABC')
        assert not st.errors, str(st.errors)
        assert st.content == "ABC"
        st = gr('BCA')
        assert not st.errors
        assert st.content == "BCA"
        st = gr('BCBA')
        assert st.errors
        st = gr('AB')
        assert st.errors

    def test_interleave(self):
        # "A" may occur 1-1000 times, "B" is optional, "C" exactly once
        letterset = Interleave(Text("A"), Text("B"), Text("C"),
                               repetitions=[(1, 1000), (0, 1), (1, 1)])
        gr = Grammar(letterset)
        st = gr('AABC')
        assert not st.errors
        st = gr('BACAAA')
        assert not st.errors
        st = gr('ABCC')
        assert st.errors
        st = gr('AAACAAA')
        assert not st.errors
        st = gr('AAABAAA')
        assert st.errors


675
676
677
678
class TestErrorRecovery:
    """Tests of re-entry ("skip") after a failure of a mandatory (§) item."""

    def test_series_skip(self):
        # @series_skip tells the parser where to resume after the mandatory
        # "C" could not be matched.
        lang = """
        document = series | /.*/
        @series_skip = /(?=[A-Z])/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser('AB_D')
        assert len(st.errors) == 1  # no additional "stopped before end"-error!
        # with resume notices turned on, the skip shows up as a second "error"
        resume_notices_on(parser)
        st = parser('AB_D')
        assert len(st.errors) == 2 and any(err.code == RESUME_NOTICE for err in st.errors)
        assert 'Skipping' in str(st.errors_sorted[1])

    def test_Interleave_skip(self):
        # Same skip-mechanism, but inside an Interleave ("°") parser.
        lang = """
        document = allof | /.*/
        @allof_skip = /[A-Z]/
        allof = "A" ° §"B" ° "C" ° "D"
        """
        parser = grammar_provider(lang)()
        st = parser('CADB')
        # any order of A, B, C, D matches
        assert 'allof' in st and st['allof'].content == "CADB"
        # failures before the mandatory "B": whole document falls back to /.*/
        st = parser('_BCD')
        assert st.equals(parse_sxpr('(document "_BCD")'))
        st = parser('_ABC')
        assert st.equals(parse_sxpr('(document "_ABC")'))
        # failures after "A" has been consumed: skip within allof
        st = parser('A_CD')
        assert st['allof'].content == "A_CD"
        st = parser('AB_D')
        assert st['allof'].content == "AB_D"
        st = parser('A__D')
        assert st['allof'].content == "A__D"
        st = parser('CA_D')
        assert st['allof'].content == "CA_D"
        st = parser('A_CB')
        assert st['allof'].content == "A_CB"
        # here no re-entry succeeds; allof does not appear in the tree
        st = parser('BC_A')
        assert 'allof' not in st
715
716


717
class TestPopRetrieve:
    """Tests of the variable-stack parsers Capture (symbol on the left of a
    capture), Retrieve (":symbol") and Pop ("::symbol")."""

    # backtick-delimited code blocks: the closing delimiter must consist of
    # exactly the same number of backticks as the captured opening delimiter
    mini_language = r"""
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
        delimiter      = delimiter_sign  # never use delimiter between capture and pop except for retrieval!
        delimiter_sign = /`+/
        text           = /[^`]+/
        """
    # brace-delimited variant; matching_bracket() maps captured "{"s onto
    # the corresponding "}"s when popping
    mini_lang2 = r"""
        @braces_filter = matching_bracket()
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """
    # XML-like tags with two alternative closing forms ("/>" and "*>") plus
    # a special tag that is not captured at all
    mini_lang3 = r"""@literalws = right
        document       = { text | env }
        env            = (specialtag | opentag) text [ closespecial | closetag ]
        opentag        = "<" name ">"
        specialtag     = "<" /ABC/ !name ">"
        closetag       = close_slash | close_star
        close_slash    = "<" ::name "/>"
        close_star     = "<" ::name "*>"
        closespecial   = "<" /ABC/~ ">"
        name           = /\w+/~
        text           = /[^<>]+/
        """
    # ":?name" makes the retrieval optional: "</>" closes the innermost tag
    mini_lang4 = r"""@literalws = right
        document       = { text | env }
        env            = opentag document closetag
        opentag        = "<" name ">"
        closetag       = "</" :?name ">"
        name           = /\w+/~
        text           = /[^<>]+/        
    """
754
755

    def setup(self):
        """Compile each of the four mini-languages into a fresh parser
        before every test."""
        for attr, ebnf_source in (('minilang_parser', self.mini_language),
                                  ('minilang_parser2', self.mini_lang2),
                                  ('minilang_parser3', self.mini_lang3),
                                  ('minilang_parser4', self.mini_lang4)):
            setattr(self, attr, grammar_provider(ebnf_source)())
760
761

    @staticmethod
    def has_tag_name(node, name):
        """Return True if `node` carries exactly the tag name `name`."""
        return name == node.tag_name
764

765
766
    def test_capture_assertions(self):
        """Grammar() must reject ill-formed Capture-parsers, but accept a
        named capture of a content-bearing parser."""
        # capturing a dropped (content-less) parser can never work
        try:
            _ = Grammar(Capture(Drop(Whitespace(r'\s*'))))
            assert False, "GrammarError expected!"
        except GrammarError:
            pass
        # the same holds when the dropped parser is nested inside a series
        try:
            _ = Grammar(Capture(Series(Text(' '), Drop(Whitespace(r'\s*')))))
            # fixed: message wrongly announced a ValueError although a
            # GrammarError is expected (and caught) here
            assert False, "GrammarError expected!"
        except GrammarError:
            pass
        # a named capture of a RegExp is legal
        cp = Capture(RegExp(r'\w+'))
        cp.pname = "capture"
        _ = Grammar(cp)
779

780
781
782
783
    def test_compile_mini_language(self):
        """All four grammars must have been compiled successfully by setup()."""
        compiled = (self.minilang_parser, self.minilang_parser2,
                    self.minilang_parser3, self.minilang_parser4)
        for parser in compiled:
            assert parser
785
786
787
788

    def test_stackhandling(self):
        """Captured tag names must be pushed and popped correctly, even when
        an opening tag starts with the same characters as a specialtag."""
        well_formed = (
            "<ABCnormal> normal tag <ABCnormal*>",
            "<ABCnormal> normal tag <ABCnormal/>",
            "<em> has closing tag <em/>",
            "<em> has closing tag <em*>",
        )
        for snippet in well_formed:
            tree = self.minilang_parser3(snippet)
            assert not tree.error_flag, str(tree.errors_sorted)
        # a missing closing tag must be reported as an error
        tree = self.minilang_parser3("<em> where is the closing tag?")
        assert tree.error_flag, str(tree.errors_sorted)
806

807
    def test_optional_match(self):
        """The optional retrieval ':?name' in mini_lang4 lets '</>' close
        the innermost open tag without repeating its name."""
        test1 = '<info>Hey, you</info>'
        st = self.minilang_parser4(test1)
        assert not st.error_flag, str(st.errors_sorted)
        test12 = '<info>Hey, <emph>you</emph></info>'
        # fixed: the original called the parser with test1 again, so the
        # nested-tag case was never actually exercised
        st = self.minilang_parser4(test12)
        assert not st.error_flag
        test2 = '<info>Hey, you</>'
        st = self.minilang_parser4(test2)
        assert not st.error_flag
        test3 = '<info>Hey, <emph>you</></>'
        st = self.minilang_parser4(test3)
        assert not st.error_flag
        test4 = '<info>Hey, <emph>you</></info>'
        st = self.minilang_parser4(test4)
        assert not st.error_flag
832
833
834
835
836
837
838
839
840
841
842

    def test_rollback_behaviour_of_optional_match(self):
        """After a failed parse no captured tag name may linger on the
        grammar's variable stack."""
        for broken in ('<info>Hey, you</info*>', '<info>Hey, you</*>'):
            result = self.minilang_parser4(broken)
            assert not self.minilang_parser4.variables__['name']
            assert result.error_flag

843
    def test_cache_neutrality_1(self):
        """Test that packrat-caching does not interfere with the variable-
        changing parsers: Capture and Retrieve."""
        # "opening" first tries unmarked_package, which captures the variable
        # before failing on the missing "." — presumably the cached failure
        # must not retain that stale capture (TODO confirm against parse.py)
        lang = r"""@literalws = right
            text = opening closing
            opening = (unmarked_package | marked_package)
            closing = ::variable
            unmarked_package = package "."
            marked_package = package "*" "."
            package = "(" variable ")"
            variable = /\w+/~
            """
        case = "(secret)*. secret"
        gr = grammar_provider(lang)()
        st = gr(case)
        assert not st.error_flag, str(st.errors_sorted)
859

860
861
862
863
864
865
866
867
868
869
870
    def test_cache_neutrality_2(self):
        """Caching must stay neutral when the same `delimiter`-capture
        occurs in both alternatives of `document`."""
        lang = r'''document = variantA | variantB
            variantA  = delimiter `X` ::delimiter `!` 
            variantB  = `A` delimiter ::delimiter `!` 
            delimiter = `A` | `X`
        '''
        gr = grammar_provider(lang)()
        # matched by variantB: delimiter captures the 'X'
        case = 'AXA!'
        st = gr(case)
        assert not st.errors
        # variantA fails after capturing; variantB must still succeed
        case = 'AXX!'
        # debugging scaffold, enable when the test fails:
        # set_config_value('history_tracking', True)
        # start_logging('LOGS')
        # set_tracer(gr, trace_history)
        st = gr(case)
        # log_parsing_history(gr, 'test_cache_neutrality_2')
        assert not st.errors
        assert str(st) == "AXX!"
        # print(st.as_sxpr())

    def test_cache_neutrality_3(self):
        """Like test_cache_neutrality_2, but with an additional `check`-rule
        that peeks at the captured variable (:delimiter) without popping it.

        Fixed: the parameter was misspelled ``selfself`` instead of ``self``.
        """
        lang = r'''document = variantA | variantB
            variantA  = delimiter `X` check ::delimiter `!` 
            variantB  = `A` delimiter check ::delimiter `!`
            check = :delimiter 
            delimiter = `A` | `X`
        '''
        gr = grammar_provider(lang)()
        case = 'AXXX!'
        st = gr(case)
        assert not st.errors
896

897
898
899
    def test_single_line(self):
        """Backtick delimiters within a single line: the popped closing
        delimiter must equal the captured opening one."""
        source = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        tree = self.minilang_parser(source)
        assert not tree.errors_sorted, \
            ''.join(str(error) for error in tree.errors_sorted)
        is_delimiter = partial(self.has_tag_name, name="delimiter")
        opening = str(next(tree.select_if(is_delimiter)))
        closing = str(next(tree.select_if(is_delimiter, reverse=True)))
        assert opening == closing
        if is_logging():
            log_ST(tree, "test_PopRetrieve_single_line.cst")
908
909
910
911
912
913
914
915
916
917
918

    def test_multi_line(self):
        """Backtick delimiters spanning several lines: first and last
        delimiter node in the tree must carry the same text."""
        source = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        tree = self.minilang_parser(source)
        assert not tree.errors_sorted
        is_delimiter = partial(self.has_tag_name, name="delimiter")
        opening = str(next(tree.select_if(is_delimiter)))
        closing = str(next(tree.select_if(is_delimiter, reverse=True)))
        assert opening == closing
        if is_logging():
            log_ST(tree, "test_PopRetrieve_multi_line.cst")
926
927
928
929

    def test_single_line_complement(self):
        """With @braces_filter the closing braces mirror (not repeat) the
        captured opening braces — same length, different characters."""
        source = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        tree = self.minilang_parser2(source)
        assert not tree.errors_sorted
        is_braces = partial(self.has_tag_name, name="braces")
        opening = str(next(tree.select_if(is_braces)))
        closing = str(next(tree.select_if(is_braces, reverse=True)))
        assert len(opening) == len(closing) and opening != closing
        if is_logging():
            log_ST(tree, "test_PopRetrieve_single_line.cst")
938
939
940
941
942
943
944
945
946
947
948

    def test_multi_line_complement(self):
        """Mirrored braces across several lines: equal length, but opening
        and closing delimiter differ."""
        source = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        tree = self.minilang_parser2(source)
        assert not tree.errors_sorted
        is_braces = partial(self.has_tag_name, name="braces")
        opening = str(next(tree.select_if(is_braces)))
        closing = str(next(tree.select_if(is_braces, reverse=True)))
        assert len(opening) == len(closing)
        assert opening != closing
        if is_logging():
            log_ST(tree, "test_PopRetrieve_multi_line.cst")
956

957
    def test_autoretrieve(self):
958
        lang = r"""@literalws = right
959
            document   = { definition } § EOF
960
            definition = symbol :defsign value
961
            symbol     = /\w+/~                      
962
963
            defsign    = "=" | ":="
            value      = /\d+/~
eckhart's avatar
eckhart committed
964
            EOF        = !/./ [ :?defsign ]   # eat up captured defsigns
965
        """
966
        # print(raw_compileEBNF(lang).result)
967
        parser = grammar_provider(lang)()
968
        st = parser("X := 1")
969
        assert not st.error_flag, str(st.errors)
970
        st1 = st
971
972
        st = parser("")
        assert not st.error_flag
973
974
975

        lines = [line for line in lang.split('\n') if line.strip()]
        eof_line = lines.pop()
976
        lines.insert(2, eof_line)
977
978
979
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
980
        assert not st.errors, str(st.errors)
981
982
        assert st.equals(st1)

983
984
        del lines[2]
        lines.insert(3, eof_line)
985
986
987
988
989
990
991
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)

        # and, finally...
992
        lang_variant = r"""@literalws = right
993
994
995
996
            document   = { definition } § EOF
            symbol     = /\w+/~                      
            defsign    = "=" | ":="
            value      = /\d+/~
997
            EOF        = !/./ :?defsign   # eat up captured defsign, only if it has been retrieved
998
999
1000
            definition = symbol :defsign value
        """
        parser = grammar_provider(lang_variant)()