#!/usr/bin/env python3

"""test_parse.py - tests of the parsers-module of DHParser

Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

eckhart's avatar
eckhart committed
22
import copy
23
import os
24
import sys
25
from functools import partial
26
from typing import List, Tuple
27

28
29
scriptpath = os.path.dirname(__file__) or '.'
sys.path.append(os.path.abspath(os.path.join(scriptpath, '..')))
30

eckhart's avatar
eckhart committed
31
from DHParser.configuration import get_config_value, set_config_value
32
from DHParser.toolkit import compile_python_object, re
33
from DHParser.log import is_logging, log_ST, log_parsing_history, start_logging
eckhart's avatar
eckhart committed
34
35
36
from DHParser.error import Error, is_error, adjust_error_locations, MANDATORY_CONTINUATION, \
    MALFORMED_ERROR_STRING, MANDATORY_CONTINUATION_AT_EOF, RESUME_NOTICE, PARSER_STOPPED_BEFORE_END, \
    PARSER_NEVER_TOUCHES_DOCUMENT
37
from DHParser.parse import ParserError, Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
eckhart's avatar
eckhart committed
38
    RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative, \
39
    Interleave, UnknownParserError, CombinedParser, Text, EMPTY_NODE, Capture, Drop, Whitespace, \
40
    GrammarError, Counted, Always, INFINITE
41
from DHParser import compile_source
42
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, \
43
    parse_ebnf, DHPARSER_IMPORTS, compile_ebnf
eckhart's avatar
eckhart committed
44
from DHParser.dsl import grammar_provider, create_parser, raw_compileEBNF
45
from DHParser.syntaxtree import Node, parse_sxpr
46
from DHParser.stringview import StringView
47
from DHParser.trace import set_tracer, trace_history, resume_notices_on
48
49


50
51
52
53
54
55

class TestWhitespace:
    """Placeholder tests concerning whitespace handling."""
    # TODO: add test cases here

    def test_whitespace_comment_mangling(self):
        pass

    def test_non_empty_derivation(self):
        pass

59

60
61
class TestParserError:
    """Tests for the ParserError-exception."""

    def test_parser_error_str(self):
        """The string representation of a ParserError should mention both
        the node's tag name and the text where the error occurred."""
        error = ParserError(Node('TAG', 'test').with_pos(0),
                            StringView('Beispiel'), None, True)
        error_text = str(error)
        assert error_text.find('Beispiel') >= 0
        assert error_text.find('TAG') >= 0

    def test_false_lookahead_only_message(self):
        """PARSER_LOOKAHEAD_*_ONLY errors must not be reported if there
        no lookahead parser in the history!"""
        lang = """
        word = letters { letters | `-` letters }
        letters = /[A-Za-z]+/
        """
        grammar = grammar_provider(lang)()
        set_tracer(grammar, trace_history)
        tree = grammar('hard-time')
        assert not tree.errors
        tree = grammar('hard-')
        # there must be errors, but none of them may be the (presumably)
        # lookahead-only error code 1045 -- TODO confirm code's symbolic name
        assert tree.errors
        assert not any(err.code == 1045 for err in tree.errors)

eckhart's avatar
eckhart committed
79

eckhart's avatar
eckhart committed
80
81
82
83
84
85
86
87
88
class TestParserClass:
    """Tests for basic functionality of the Parser-class."""

    def test_apply(self):
        """`Grammar.root__.apply()` should visit the very same parsers in
        the very same order, no matter how often it is invoked in a row."""
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        grammar = grammar_provider(minilang)()
        visited = []

        def visitor(context: List[Parser]):
            parser = context[-1]
            visited.append(parser.pname + parser.ptype)

        snapshots = []
        for _ in range(3):
            visited.clear()
            grammar.root__.apply(visitor)
            snapshots.append(", ".join(visited))
        assert snapshots[0] == snapshots[1] == snapshots[2]

    def test_symbol(self):
        """`associated_symbol()` should yield the named symbol-parser to
        which an anonymous sub-parser belongs."""
        class MyGrammar(Grammar):
            wrong = Text('wrong')
            word = OneOrMore(wrong) + Whitespace(r'\s*') + OneOrMore(RegExp(r'\w+'))
            root__ = word

        grammar = MyGrammar()
        regex = grammar['word'].parsers[-1].parser
        symbol = grammar.associated_symbol(regex).symbol
        assert symbol == 'word', symbol
111

eckhart's avatar
eckhart committed
112

113
class TestInfiLoopsAndRecursion:
    """Tests that neither left-recursive grammars nor parsers that match
    the empty string lead to infinite loops or unbounded recursion."""

    def setup(self):
        # FIX: removed a stray, dead ``pass``-statement that preceded
        # these configuration calls
        set_config_value('history_tracking', True)
        # set_config_value('resume_notices', True)
        start_logging('LOGS')

    def test_very_simple(self):
        minilang = """
            term = term (`*`|`/`) factor | factor
            factor = /[0-9]+/
            """
        grammar_factory = grammar_provider(minilang)
        parser = grammar_factory()
        snippet = "5*4*3*2"
        # set_tracer(parser, trace_history)
        st = parser(snippet)
        if is_logging():
            log_ST(st, 'test_LeftRecursion_very_simple.cst')
            log_parsing_history(parser, 'test_LeftRecursion_very_simple')
        assert not is_error(st.error_flag), str(st.errors)
        st = parser("1*2*3*4*5*6*7*8*9")
        # if is_logging():
        #     log_ST(st, 'test_LeftRecursion_very_simple_2.cst')
        #     log_parsing_history(parser, 'test_LeftRecursion_very_simple_2')
        assert not is_error(st.error_flag)

    def test_direct_left_recursion1(self):
        minilang = """@literalws = right
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        # print(raw_compileEBNF(minilang).result)
        assert parser
        syntax_tree = parser(snippet)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct1.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct1")
        assert not is_error(syntax_tree.error_flag), str(syntax_tree.errors_sorted)
        assert snippet == syntax_tree.content, str(syntax_tree)

    def test_direct_left_recursion2(self):
        minilang = """@literalws = right
            expr = ex
            ex   = expr ("+"|"-") term | term
            term = tr
            tr   = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct2.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct2")

    def test_indirect_left_recursion1(self):
        minilang = """@literalws = right
            Expr    = //~ (Product | Sum | Value)
            Product = Expr { ('*' | '/') Expr }+
            Sum     = Expr { ('+' | '-') Expr }+
            Value   = /[0-9.]+/~ | '(' §Expr ')'
            """
        # print(raw_compileEBNF(minilang).result)
        parser = grammar_provider(minilang)()
        snippet = "8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "7 + 8 * 4"
        syntax_tree = parser(snippet)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect1.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect1")
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "9 + 8 * (4 + 3)"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        snippet = "9 + 8 * (4 - 3 / (5 - 1))"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect1.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect1")

    # BEWARE: EXPERIMENTAL TEST can be long running
    def test_indirect_left_recursion2(self):
        # FIX: raw string, so that \d and \. inside the regular expressions
        # are not treated as (invalid) Python escape sequences
        arithmetic_syntax = r"""@literalws = right
            expression     = addition | subtraction  # | term
            addition       = (expression | term) "+" (expression | term)
            subtraction    = (expression | term) "-" (expression | term)
            term           = multiplication | division  # | factor
            multiplication = (term | factor) "*" (term | factor)
            division       = (term | factor) "/" (term | factor)
            factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
            group          = "(" expression ")"
            SIGN           = /[+-]/
            NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
            VARIABLE       = /[A-Za-z]/~
            """
        arithmetic = grammar_provider(arithmetic_syntax)()
        assert arithmetic
        syntax_tree = arithmetic("(a + b) * (a - b)")
        assert syntax_tree.errors
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect2.cst")
            log_parsing_history(arithmetic, "test_LeftRecursion_indirect2")

    def test_indirect_left_recursion3(self):
        # FIX: raw string (see test_indirect_left_recursion2)
        arithmetic_syntax = r"""@literalws = right
            expression     = addition | subtraction | term
            addition       = (expression | term) "+" (expression | term)
            subtraction    = (expression | term) "-" (expression | term)
            term           = multiplication | division | factor
            multiplication = (term | factor) "*" (term | factor)
            division       = (term | factor) "/" (term | factor)
            factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
            group          = "(" expression ")"
            SIGN           = /[+-]/
            NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
            VARIABLE       = /[A-Za-z]/~
            """
        arithmetic = grammar_provider(arithmetic_syntax)()
        assert arithmetic
        syntax_tree = arithmetic("(a + b) * (a - b)")
        assert not syntax_tree.errors
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect3.cst")
            log_parsing_history(arithmetic, "test_LeftRecursion_indirect3")

    def test_break_inifnite_loop_ZeroOrMore(self):
        forever = ZeroOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_inifnite_loop_OneOrMore(self):
        forever = OneOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Counted(self):
        forever = Counted(Always(), (0, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (5, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (INFINITE, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (1000, INFINITE - 1))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Interleave(self):
        forever = Interleave(Always(), repetitions=[(0, INFINITE)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Interleave(Always(), Always(),
                             repetitions=[(5, INFINITE), (INFINITE, INFINITE)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Interleave(Always(), repetitions=[(1000, INFINITE - 1)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    # def test_infinite_loops(self):
    #     minilang = """forever = { // } \n"""
    #     try:
    #         parser_class = grammar_provider(minilang)
    #     except CompilationError as error:
    #         assert all(e.code == INFINITE_LOOP for e in error.errors)
    #     save = get_config_value('static_analysis')
    #     set_config_value('static_analysis', 'late')
    #     provider = grammar_provider(minilang)
    #     try:
    #         parser = provider()
    #     except GrammarError as error:
    #         assert error.errors[0][2].code == INFINITE_LOOP
    #     set_config_value('static_analysis', 'none')
    #     parser = provider()
    #     snippet = " "
    #     syntax_tree = parser(snippet)
    #     assert any(e.code == INFINITE_LOOP for e in syntax_tree.errors)
    #     res = parser.static_analysis()
    #     assert res and res[0][2].code == INFINITE_LOOP
    #     minilang = """not_forever = { / / } \n"""
    #     parser = grammar_provider(minilang)()
    #     res = parser.static_analysis()
    #     assert not res
    #     set_config_value('static_analysis', save)
312

eckhart's avatar
eckhart committed
313

314
315
316
317
318
319
# class TestStaticAnalysis:
#     def test_alternative(self):
#         lang = 'doc = "A" | "AB"'
#         parser = create_parser(lang)


Eckhart Arnold's avatar
Eckhart Arnold committed
320
class TestFlowControl:
    """Tests for flow-control-parsers, i.e. Lookbehind and NegativeLookahead."""

    t1 = """
         All work and no play
         makes Jack a dull boy
         END
         """
    t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind(self):
        ws = RegExp(r'\s*')
        ws.pname = "ws"
        end = RegExp("END")
        end.pname = "end"
        doc_end = Lookbehind(RegExp(r'\s*?\n')) + end
        word = RegExp(r'\w+')
        word.pname = "word"
        sequence = OneOrMore(NegativeLookahead(end) + word + ws)
        document = ws + sequence + doc_end + ws
        parser = Grammar(document)

        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()

        # parsing may also start with any other named parser of the grammar
        cst = parser(self.t2, parser['ws'], complete_match=False)
        assert cst.did_match() and len(cst) == 0 and not cst.errors
        cst = parser(self.t2, parser['word'], complete_match=False)
        assert cst.did_match() and cst.content == "All" and not cst.errors
        cst = parser(self.t2, parser['end'], complete_match=False)
        assert not cst.did_match()

    def test_lookbehind_indirect(self):
        class LookbehindTestGrammar(Grammar):
            parser_initialization__ = ["upon instantiation"]
            ws = RegExp(r'\s*')
            end = RegExp('END')
            SUCC_LB = RegExp(r'\s*?\n')
            doc_end = Series(Lookbehind(SUCC_LB), end)
            word = RegExp(r'\w+')
            sequence = OneOrMore(Series(NegativeLookahead(end), word, ws))
            document = Series(ws, sequence, doc_end, ws)
            root__ = document

        parser = LookbehindTestGrammar()
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()


368
369
370
371
372
373
374
class TestRegex:
    """Tests for regular-expression-parsers within EBNF-grammars."""

    @staticmethod
    def _grammar_object(ebnf: str, grammar_name: str):
        """Compiles the EBNF-source `ebnf` to a grammar class named after
        `grammar_name` and returns an instance of that class.  Fails the
        test if the compilation produces errors."""
        result, messages, _ = compile_source(
            ebnf, None, get_ebnf_grammar(),
            get_ebnf_transformer(), get_ebnf_compiler(grammar_name))
        assert result
        assert not messages, str(messages)
        return compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()

    def test_multilineRegex(self):
        mlregex = r"""
        regex =  /\w+    # one or more alphabetical characters including the underscore
                  [+]    # followed by a plus sign
                  \w*    # possibly followed by more alpha chracters/
        """
        parser = self._grammar_object(mlregex, 'MultilineRegexTest')
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_multilineRegex_wo_Comments(self):
        mlregex = r"""
        regex =  /\w+
                  [+]
                  \w* /
        """
        parser = self._grammar_object(mlregex, 'MultilineRegexTest')
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_ignore_case(self):
        mlregex = r"""
        @ ignorecase = True
        regex = /alpha/
        """
        parser = self._grammar_object(mlregex, 'MultilineRegexTest')
        node, rest = parser.regex(StringView('Alpha'))
        assert node
        assert rest == ''
        assert node.tag_name == "regex"
        assert str(node) == 'Alpha'

        mlregex = r"""
        @ ignorecase = False
        regex = /alpha/
        """
        parser = self._grammar_object(mlregex, 'MultilineRegexTest')
        node, rest = parser.regex(StringView('Alpha'))
        assert node is None

    def test_token(self):
        tokenlang = r"""@literalws = right
            @whitespace = linefeed
            lang        = "" begin_token {/\w+/ ""} end_token
            begin_token = "\begin{document}"
            end_token   = "\end{document}"
            """
        testdoc = r"""
            \begin{document}
            test
            \end{document}
            """
        parser = self._grammar_object(tokenlang, "TokenTest")
        result = parser(testdoc)
        # log_parsing_history(parser, "test.log")
        assert not result.error_flag, str(result.errors_sorted)
450

451

452
class TestGrammar:
    """Tests for the Grammar-class itself."""

    grammar = r"""@whitespace = horizontal
    haupt        = textzeile LEERZEILE
    textzeile    = { WORT }+
    WORT         = /[^ \t]+/~
    LEERZEILE    = /\n[ \t]*(?=\n)/~
    """
    pyparser, messages, _ = compile_source(grammar, None, get_ebnf_grammar(),
                                           get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
    assert pyparser, str(messages)
    assert not messages, str(messages)

    def _new_grammar_instance(self):
        """Returns a fresh instance of the grammar compiled above."""
        return compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()

    def test_pos_values_initialized(self):
        # checks whether pos values in the parsing result and in the
        # history record have been initialized
        grammar = self._new_grammar_instance()
        grammar("no_file_name*")
        for record in grammar.history__:
            assert not record.node or record.node.pos >= 0

    def test_select_parsing(self):
        """Parsing can be started with any named parser of the grammar,
        not only the root parser."""
        grammar = self._new_grammar_instance()
        grammar("wort", "WORT")
        grammar("eine Zeile", "textzeile")
        grammar("kein Haupt", "haupt")
        grammar("so ist es richtig", "haupt")

    def test_grammar_subclassing(self):
        class Arithmetic(Grammar):
            r'''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
            root__ = expression

        grammar = Arithmetic()
        tree = grammar('3+4')
        assert not tree.error_flag, tree.as_sxpr()

    def test_incomplete_matching(self):
        """Tests whether the flag `complete_match` works as expected when
        calling a grammar object in order to parse a document."""
        parser = grammar_provider('word = ~/\\w+/\n')()
        tree = parser('eins')
        assert not tree.errors
        tree = parser('eins zwei')
        assert tree.errors[0].code == PARSER_STOPPED_BEFORE_END
        tree = parser('eins zwei', complete_match=False)
        assert not tree.errors

    def test_synonym(self):
        lang = r"""
            doc  = { word | number }
            word = /\w+/ S
            number = [VZ] /\d+/ S 
            S    = ~        # let S by a synonym for anonymous whitespace
            VZ   = "-"
        """
        parser = grammar_provider(lang)()
        tree = parser('eins 1 zwei2drei 3')
        # set_config_value('compiled_EBNF_log', 'grammar.log')
        parser = grammar_provider("@drop = whitespace, strings" + lang)()
        tree = parser('eins 1 zwei2drei 3')
        tree = parser('-3')
        assert str(parser['S']) == "S = ~", str(parser['S'])
524

525

526
527
528
529
530
531
532
class TestSeries:
    """Tests for the Series-parser and the mandatory-marker (§)."""

    def test_non_mandatory(self):
        lang = """
        document = series | /.*/
        series = "A" "B" "C" "D"
        """
        parser = grammar_provider(lang)()
        # without a mandatory-marker, any deviation simply falls back to /.*/
        for snippet in ("ABCD", "A_CD", "AB_D"):
            tree = parser(snippet)
            assert not tree.error_flag

    def test_mandatory(self):
        """Test for the §-operator. The Series-parser should raise an
        error for any non-match that occurs after the mandatory-operator.
        """
        lang = """
        document = series | /.*/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        tree = parser("ABCD")
        assert not tree.error_flag
        tree = parser("A_CD")
        assert not tree.error_flag
        tree = parser("AB_D")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == MANDATORY_CONTINUATION
        # transitivity of mandatory-operator
        tree = parser("ABC_")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == MANDATORY_CONTINUATION

    def test_series_composition(self):
        TA, TB, TC, TD, TE = (TKN(ch) for ch in "ABCDE")
        s1 = Series(TA, TB, TC, mandatory=2)
        s2 = Series(TD, TE)

        # the mandatory-property of s1 must survive composition with s2
        parser = Grammar(Alternative(s1 + s2, RegExp('.*')))
        tree = parser("ABCDE")
        assert not tree.error_flag
        tree = parser("A_CDE")
        assert not tree.error_flag
        tree = parser("AB_DE")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == MANDATORY_CONTINUATION
        tree = parser("ABC_E")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == MANDATORY_CONTINUATION

        # ... also when s1 appears as the second component
        parser = Grammar(Alternative(s2 + s1, RegExp('.*')))
        for snippet in ("DEABC", "_EABC", "D_ABC", "DE_BC", "DEA_C"):
            tree = parser(snippet)
            assert not tree.error_flag
        tree = parser("DEAB_")
        assert tree.error_flag
        assert tree.errors_sorted[0].code == MANDATORY_CONTINUATION

    # def test_boundary_cases(self):
    #     lang = """
    #     document = series | §!single | /.*/
    #     series = "A" "B" §"C" "D"
    #     single = "E"
    #     """
    #     parser_class = grammar_provider(lang)
    #     parser = parser_class()
    #     print(parser.python_src__)
    #     print(parser_class.python_src__)

    def test_ebnf_serialization(self):
        ebnf_grammar = get_ebnf_grammar()
        # TODO: Add test here
        ebnf = ebnf_grammar.as_ebnf()
        # print(ebnf)
597
598


599
600
601
class TestAllOfSomeOf:
    """Tests for interleave-parsing where sub-parsers may match in any order."""

    def test_allOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        pair = Interleave(TKN("A"), TKN("B"))
        assert Grammar(pair)('A B').content == 'A B'
        assert Grammar(pair)('B A').content == 'B A'

    def test_allOf_completeness(self):
        """Test that an error is raised if not  all parsers of an AllOf-List
        match."""
        pair = Interleave(TKN("A"), TKN("B"))
        assert Grammar(pair)('B').error_flag

    def test_allOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        triplet = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(triplet)('A A B').content == 'A A B'
        assert Grammar(triplet)('A B A').content == 'A B A'
        assert Grammar(triplet)('B A A').content == 'B A A'
        assert Grammar(triplet)('A B B').error_flag

    def test_someOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        pair = Interleave(TKN("A"), TKN("B"))
        assert Grammar(pair)('A B').content == 'A B'
        assert Grammar(pair)('B A').content == 'B A'
        tree = Grammar(pair)('B')
        assert tree.error_flag
        # with (0, 1)-repetitions, single components may be omitted
        pair = Interleave(TKN("B"), TKN("A"), repetitions=((0, 1), (0, 1)))
        assert Grammar(pair)('A B').content == 'A B'
        tree = Grammar(pair)('B')
        assert not tree.error_flag
        assert tree.content == 'B'

    def test_someOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        triplet = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(triplet)('A A B').content == 'A A B'
        assert Grammar(triplet)('A B A').content == 'A B A'
        assert Grammar(triplet)('B A A').content == 'B A A'
        assert Grammar(triplet)('A B B').error_flag


644
645
class TestInterleave:
    """Tests for the Interleave-parser."""

    def test_interleave_most_simple(self):
        grammar = Grammar(Interleave(Text("A"), Text("B"), Text("C")))
        tree = grammar('ABC')
        assert not tree.errors, str(tree.errors)
        assert tree.content == "ABC"
        tree = grammar('BCA')
        assert not tree.errors
        assert tree.content == "BCA"
        # duplicates and omissions are errors
        assert grammar('BCBA').errors
        assert grammar('AB').errors

    def test_interleave(self):
        grammar = Grammar(Interleave(Text("A"), Text("B"), Text("C"),
                                     repetitions=[(1, 1000), (0, 1), (1, 1)]))
        # "A" may occur 1-1000 times, "B" at most once, "C" exactly once
        for snippet, should_match in [('AABC', True), ('BACAAA', True),
                                      ('ABCC', False), ('AAACAAA', True),
                                      ('AAABAAA', False)]:
            tree = grammar(snippet)
            assert bool(tree.errors) != should_match, snippet


675
676
677
678
class TestErrorRecovery:
    """Tests of the skip/resume error-recovery directives."""

    def test_series_skip(self):
        lang = """
        document = series | /.*/
        @series_skip = /(?=[A-Z])/
        series = "A" "B" §"C" "D"
        """
        gr = grammar_provider(lang)()
        tree = gr('AB_D')
        # the skip-directive resynchronizes, so no additional
        # "stopped before end"-error is reported
        assert len(tree.errors) == 1
        resume_notices_on(gr)
        tree = gr('AB_D')
        assert len(tree.errors) == 2
        assert any(err.code == RESUME_NOTICE for err in tree.errors)
        assert 'Skipping' in str(tree.errors_sorted[1])

    def test_Interleave_skip(self):
        lang = """
        document = allof | /.*/
        @allof_skip = /[A-Z]/
        allof = "A" ° §"B" ° "C" ° "D"
        """
        gr = grammar_provider(lang)()
        tree = gr('CADB')
        assert 'allof' in tree and tree['allof'].content == "CADB"
        # recovery impossible -> fall back to the catch-all /.*/-alternative
        tree = gr('_BCD')
        assert tree.equals(parse_sxpr('(document "_BCD")'))
        tree = gr('_ABC')
        assert tree.equals(parse_sxpr('(document "_ABC")'))
        # single misplaced characters inside the group are skipped over
        for doc in ('A_CD', 'AB_D', 'A__D', 'CA_D', 'A_CB'):
            assert gr(doc)['allof'].content == doc
        tree = gr('BC_A')
        assert 'allof' not in tree
715
716


717
class TestPopRetrieve:
    """Tests of the stack-based Capture/Pop/Retrieve-parsers which allow
    matching variable delimiters (code-fences, XML-like tags etc.)."""

    # backtick-fenced code blocks: the closing fence must repeat the
    # captured opening fence exactly (popped via ::delimiter)
    mini_language = r"""
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
        delimiter      = delimiter_sign  # never use delimiter between capture and pop except for retrival!
        delimiter_sign = /`+/
        text           = /[^`]+/
        """
    # brace-fenced variant: the matching_bracket()-filter turns the
    # captured "{{{" into the complementary "}}}" on retrieval
    mini_lang2 = r"""
        @braces_filter = matching_bracket()
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """
    # XML-like tags: the closing tag pops (::name) the captured tag name
    mini_lang3 = r"""@literalws = right
        document       = { text | env }
        env            = (specialtag | opentag) text [ closespecial | closetag ]
        opentag        = "<" name ">"
        specialtag     = "<" /ABC/ !name ">"
        closetag       = close_slash | close_star
        close_slash    = "<" ::name "/>"
        close_star     = "<" ::name "*>"
        closespecial   = "<" /ABC/~ ">"
        name           = /\w+/~
        text           = /[^<>]+/
        """
    # closing tag with an *optional* pop (:?name), i.e. "</>" is allowed, too
    mini_lang4 = r"""@literalws = right
        document       = { text | env }
        env            = opentag document closetag
        opentag        = "<" name ">"
        closetag       = "</" :?name ">"
        name           = /\w+/~
        text           = /[^<>]+/        
    """

    def setup(self):
        # compile one parser per mini-grammar; re-built before each test
        self.minilang_parser = grammar_provider(self.mini_language)()
        self.minilang_parser2 = grammar_provider(self.mini_lang2)()
        self.minilang_parser3 = grammar_provider(self.mini_lang3)()
        self.minilang_parser4 = grammar_provider(self.mini_lang4)()

    @staticmethod
    def has_tag_name(node, name):
        """Match-function: True if `node` carries the tag-name `name`."""
        return node.tag_name == name # and not isinstance(node.parser, Retrieve)

    def test_capture_assertions(self):
        """Capture must reject sub-parsers whose content is dropped."""
        try:
            _ = Grammar(Capture(Drop(Whitespace(r'\s*'))))
            assert False, "GrammarError expected!"
        except GrammarError:  # binding removed: exception object was unused
            pass
        try:
            _ = Grammar(Capture(Series(Text(' '), Drop(Whitespace(r'\s*')))))
            # BUGFIX: message formerly said "ValueError expected!" although
            # a GrammarError is what is caught and expected here
            assert False, "GrammarError expected!"
        except GrammarError:
            pass
        # a *named* capture of a plain regular expression is legal
        cp = Capture(RegExp(r'\w+'))
        cp.pname = "capture"
        _ = Grammar(cp)

    def test_compile_mini_language(self):
        """All four mini-grammars must compile to working parsers."""
        assert self.minilang_parser
        assert self.minilang_parser2
        assert self.minilang_parser3
        assert self.minilang_parser4

    def test_stackhandling(self):
        """Ambiguous openings and missing closing tags with mini_lang3."""
        ambigous_opening = "<ABCnormal> normal tag <ABCnormal*>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        ambigous_opening = "<ABCnormal> normal tag <ABCnormal/>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        forgot_closing_tag = "<em> where is the closing tag?"
        syntax_tree = self.minilang_parser3(forgot_closing_tag)
        assert syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em/>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em*>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

    def test_optional_match(self):
        """The optional pop :?name also accepts anonymous closing tags."""
        test1 = '<info>Hey, you</info>'
        st = self.minilang_parser4(test1)
        assert not st.error_flag, str(st.errors_sorted)
        test12 = '<info>Hey, <emph>you</emph></info>'
        # BUGFIX: formerly parsed test1 again, so test12 was never exercised
        st = self.minilang_parser4(test12)
        assert not st.error_flag
        test2 = '<info>Hey, you</>'
        st = self.minilang_parser4(test2)
        assert not st.error_flag
        test3 = '<info>Hey, <emph>you</></>'
        st = self.minilang_parser4(test3)
        assert not st.error_flag
        test4 = '<info>Hey, <emph>you</></info>'
        st = self.minilang_parser4(test4)
        assert not st.error_flag

    def test_rollback_behaviour_of_optional_match(self):
        """A failed match must roll the variable stack back completely."""
        test1 = '<info>Hey, you</info*>'
        st = self.minilang_parser4(test1)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag
        test2 = '<info>Hey, you</*>'
        st = self.minilang_parser4(test2)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag

    def test_cache_neutrality(self):
        """Test that packrat-caching does not interfere with the variable-
        changing parsers: Capture and Retrieve."""
        lang = r"""@literalws = right
            text = opening closing
            opening = (unmarked_package | marked_package)
            closing = ::variable
            unmarked_package = package "."
            marked_package = package "*" "."
            package = "(" variable ")"
            variable = /\w+/~
            """
        # "package" is tried twice at the same position (via both
        # alternatives), which exercises the packrat cache
        case = "(secret)*. secret"
        gr = grammar_provider(lang)()
        st = gr(case)
        assert not st.error_flag, str(st.errors_sorted)

    def test_single_line(self):
        teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted, \
            ''.join(str(error) for error in syntax_tree.errors_sorted)
        # the popped delimiter must equal the captured one
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line(self):
        teststr = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        # the popped delimiter must equal the captured one
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_single_line_complement(self):
        teststr = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        # with the matching_bracket()-filter the popped braces are the
        # complement ("}}}") of the captured ones ("{{{"), same length
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert len(delim) == len(pop)
        assert delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line_complement(self):
        teststr = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        # popped braces: same length, complementary characters
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_autoretrieve(self):
        """The result must not depend on where EOF appears in the grammar."""
        lang = r"""@literalws = right
            document   = { definition } § EOF
            definition = symbol :defsign value
            symbol     = /\w+/~                      
            defsign    = "=" | ":="
            value      = /\d+/~
            EOF        = !/./ [ :?defsign ]   # eat up captured defsigns
        """
        # print(raw_compileEBNF(lang).result)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.error_flag, str(st.errors)
        st1 = st
        st = parser("")
        assert not st.error_flag

        # moving the EOF-definition up in the grammar must not change anything
        lines = [line for line in lang.split('\n') if line.strip()]
        eof_line = lines.pop()
        lines.insert(2, eof_line)
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors, str(st.errors)
        assert st.equals(st1)

        # ... nor does moving it one position further down
        del lines[2]
        lines.insert(3, eof_line)
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)

        # and, finally...
        lang_variant = r"""@literalws = right
            document   = { definition } § EOF
            symbol     = /\w+/~                      
            defsign    = "=" | ":="
            value      = /\d+/~
            EOF        = !/./ :?defsign   # eat up captured defsign, only if it has been retrieved
            definition = symbol :defsign value
        """
        parser = grammar_provider(lang_variant)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)
        # with a non-optional retrieve, empty input misses the EOF-token
        st = parser('')
        assert "EOF expected" in str(st.errors)
di68kap's avatar
di68kap committed
960

961

962
class TestWhitespaceHandling:
963
    minilang = """@literalws = right
964
965
966
967
968
969
970
        doc = A B
        A = "A"
        B = "B"
        Rdoc = ar br
        ar = /A/
        br = /B/
        """
971
    gr = grammar_provider(minilang)()
972
973
974
975
976
977
978
979
980
981
982
983

    def test_token_whitespace(self):
        st = self.gr("AB", 'doc')
        assert not st.error_flag
        st = self.gr("A B", 'doc')
        assert not st.error_flag

    def test_regexp_whitespace(self):
        st = self.gr("AB", 'Rdoc')
        assert not st.error_flag
        st =