#!/usr/bin/env python3

"""test_parse.py - tests of the parsers-module of DHParser

Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import copy
import os
import sys
from functools import partial
from typing import List, Tuple

# Make the DHParser package (one directory up) importable when the tests
# are run directly from the test directory.
scriptpath = os.path.dirname(__file__) or '.'
sys.path.append(os.path.abspath(os.path.join(scriptpath, '..')))

from DHParser.configuration import get_config_value, set_config_value
from DHParser.toolkit import compile_python_object, re
from DHParser.log import is_logging, log_ST, log_parsing_history
from DHParser.error import Error, is_error, adjust_error_locations, MANDATORY_CONTINUATION, \
    PARSER_DID_NOT_MATCH, MALFORMED_ERROR_STRING, MANDATORY_CONTINUATION_AT_EOF, \
    RESUME_NOTICE, PARSER_STOPPED_BEFORE_END
from DHParser.parse import ParserError, Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
    RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative, \
    Interleave, UnknownParserError, CombinedParser, Token, EMPTY_NODE, Capture, Drop, Whitespace, \
    GrammarError, Counted, Always, INFINITE
from DHParser import compile_source
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, \
    parse_ebnf, DHPARSER_IMPORTS, compile_ebnf
from DHParser.dsl import grammar_provider, create_parser
from DHParser.syntaxtree import Node, parse_sxpr
from DHParser.stringview import StringView
from DHParser.trace import set_tracer, trace_history, resume_notices_on

class TestWhitespace:
    """Placeholder tests for whitespace handling."""
    # TODO: add test cases here
    def test_whitespace_comment_mangling(self):
        pass

    def test_non_empty_derivation(self):
        pass
class TestParserError:
    """Tests of the ParserError exception and of lookahead-related
    error reporting."""

    def test_parser_error_str(self):
        # The string representation of a ParserError should mention both
        # the node's tag name and the remaining text.
        pe = ParserError(Node('TAG', 'test').with_pos(0), StringView('Beispiel'), None, True)
        assert str(pe).find('Beispiel') >= 0 and str(pe).find('TAG') >= 0

    def test_false_lookahead_only_message(self):
        """PARSER_LOOKAHEAD_*_ONLY errors must not be reported if there
        is no lookahead parser in the history!"""
        lang = """
        word = letters { letters | `-` letters }
        letters = /[A-Za-z]+/
        """
        gr = grammar_provider(lang)()
        set_tracer(gr, trace_history)
        st = gr('hard-time')
        assert not st.errors
        st = gr('hard-')
        # NOTE(review): 1045 is assumed to be the lookahead-only error code
        # here — confirm against DHParser.error's constants.
        assert st.errors and not any(e.code == 1045 for e in st.errors)
class TestParserClass:
    """Tests of basic Parser-object behavior (visitor application,
    symbol association)."""

    def test_apply(self):
        # applying a visitor repeatedly must yield the same traversal order
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        gr = grammar_provider(minilang)()
        l = []

        def visitor(context: List[Parser]):
            p = context[-1]
            l.append(p.pname + p.ptype)

        gr.root__.apply(visitor)
        s1 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s2 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s3 = ", ".join(l)
        assert s1 == s2 == s3

    def test_symbol(self):
        # associated_symbol() should find the symbol-parser a nested
        # anonymous parser belongs to
        class MyGrammar(Grammar):
            wrong = Token('wrong')
            word = OneOrMore(wrong) + Whitespace(r'\s*') + OneOrMore(RegExp(r'\w+'))
            root__ = word
        gr = MyGrammar()
        regex = gr['word'].parsers[-1].parser
        result = gr.associated_symbol(regex).symbol
        assert result == 'word', result
class TestInfiLoopsAndRecursion:
    """Tests of left-recursion support and of the automatic breaking of
    infinite loops caused by parsers that match the empty string."""

    def test_direct_left_recursion1(self):
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), str(syntax_tree.errors_sorted)
        assert snippet == syntax_tree.content, str(syntax_tree)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct")

    def test_direct_left_recursion2(self):
        # same as above, but the left recursion is hidden behind a synonym
        minilang = """
            expr = ex
            ex   = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content

    def test_indirect_left_recursion1(self):
        minilang = """
            Expr    = //~ (Product | Sum | Value)
            Product = Expr { ('*' | '/') Expr }+
            Sum     = Expr { ('+' | '-') Expr }+
            Value   = /[0-9.]+/~ | '(' §Expr ')'
            """
        parser = grammar_provider(minilang)()
        assert parser
        snippet = "8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "7 + 8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "9 + 8 * (4 + 3)"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect")

    # # BEWARE: EXPERIMENTAL TEST can be long running
    # def test_indirect_left_recursion2(self):
    #     arithmetic_syntax = """
    #         expression     = addition | subtraction
    #         addition       = (expression | term) "+" (expression | term)
    #         subtraction    = (expression | term) "-" (expression | term)
    #         term           = multiplication | division
    #         multiplication = (term | factor) "*" (term | factor)
    #         division       = (term | factor) "/" (term | factor)
    #         factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
    #         group          = "(" expression ")"
    #         SIGN           = /[+-]/
    #         NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
    #         VARIABLE       = /[A-Za-z]/~
    #         """
    #     arithmetic = grammar_provider(arithmetic_syntax)()
    #     arithmetic.left_recursion_depth__ = 2
    #     assert arithmetic
    #     syntax_tree = arithmetic("(a + b) * (a - b)")
    #     assert syntax_tree.errors

    def test_break_inifnite_loop_ZeroOrMore(self):
        forever = ZeroOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_inifnite_loop_OneOrMore(self):
        forever = OneOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Counted(self):
        forever = Counted(Always(), (0, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (5, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (INFINITE, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (1000, 1000000000000))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Interleave(self):
        forever = Interleave(Always(), repetitions = [(0, INFINITE)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Interleave(Always(), Always(),
                             repetitions = [(5, INFINITE), (INFINITE, INFINITE)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Interleave(Always(), repetitions = [(1000, 1000000000000)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    # def test_infinite_loops(self):
    #     minilang = """forever = { // } \n"""
    #     try:
    #         parser_class = grammar_provider(minilang)
    #     except CompilationError as error:
    #         assert all(e.code == INFINITE_LOOP for e in error.errors)
    #     save = get_config_value('static_analysis')
    #     set_config_value('static_analysis', 'late')
    #     provider = grammar_provider(minilang)
    #     try:
    #         parser = provider()
    #     except GrammarError as error:
    #         assert error.errors[0][2].code == INFINITE_LOOP
    #     set_config_value('static_analysis', 'none')
    #     parser = provider()
    #     snippet = " "
    #     syntax_tree = parser(snippet)
    #     assert any(e.code == INFINITE_LOOP for e in syntax_tree.errors)
    #     res = parser.static_analysis()
    #     assert res and res[0][2].code == INFINITE_LOOP
    #     minilang = """not_forever = { / / } \n"""
    #     parser = grammar_provider(minilang)()
    #     res = parser.static_analysis()
    #     assert not res
    #     set_config_value('static_analysis', save)
class TestFlowControl:
    """Tests of flow-control parsers (Lookbehind, NegativeLookahead) and
    of partial (incomplete-match) parsing."""

    t1 = """
         All work and no play
         makes Jack a dull boy
         END
         """
    t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind(self):
        ws = RegExp(r'\s*');  ws.pname = "ws"
        end = RegExp("END");  end.pname = "end"
        # "END" must be preceded by whitespace that ends with a newline
        doc_end = Lookbehind(RegExp('\\s*?\\n')) + end
        word = RegExp(r'\w+');  word.pname = "word"
        sequence = OneOrMore(NegativeLookahead(end) + word + ws)
        document = ws + sequence + doc_end + ws
        parser = Grammar(document)
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()

        # partial matches with a selected start parser
        cst = parser(self.t2, parser['ws'], complete_match=False)
        assert cst.did_match() and len(cst) == 0 and not cst.errors
        cst = parser(self.t2, parser['word'], complete_match=False)
        assert cst.did_match() and cst.content == "All" and not cst.errors
        cst = parser(self.t2, parser['end'], complete_match=False)
        assert not cst.did_match()

    def test_lookbehind_indirect(self):
        class LookbehindTestGrammar(Grammar):
            parser_initialization__ = ["upon instantiation"]
            ws = RegExp(r'\s*')
            end = RegExp('END')
            SUCC_LB = RegExp('\\s*?\\n')
            doc_end = Series(Lookbehind(SUCC_LB), end)
            word = RegExp(r'\w+')
            sequence = OneOrMore(Series(NegativeLookahead(end), word, ws))
            document = Series(ws, sequence, doc_end, ws)
            root__ = document

        parser = LookbehindTestGrammar()
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()
class TestRegex:
    """Tests of regular-expression parsers compiled from EBNF source."""

    def test_multilineRegex(self):
        mlregex = r"""
        regex =  /\w+    # one or more alphabetical characters including the underscore
                  [+]    # followed by a plus sign
                  \w*    # possibly followed by more alpha chracters/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_multilineRegex_wo_Comments(self):
        mlregex = r"""
        regex =  /\w+
                  [+]
                  \w* /
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_ignore_case(self):
        mlregex = r"""
        @ ignorecase = True
        regex = /alpha/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex(StringView('Alpha'))
        assert node
        assert rest == ''
        assert node.tag_name == "regex"
        assert str(node) == 'Alpha'

        mlregex = r"""
        @ ignorecase = False
        regex = /alpha/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex(StringView('Alpha'))
        assert node is None

    def test_token(self):
        tokenlang = r"""
            @whitespace = linefeed
            lang        = "" begin_token {/\w+/ ""} end_token
            begin_token = "\begin{document}"
            end_token   = "\end{document}"
            """
        testdoc = r"""
            \begin{document}
            test
            \end{document}
            """
        result, messages, _ = compile_source(
            tokenlang, None, get_ebnf_grammar(), get_ebnf_transformer(),
            get_ebnf_compiler("TokenTest"))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        result = parser(testdoc)
        # log_parsing_history(parser, "test.log")
        assert not result.error_flag, str(result.errors_sorted)
class TestGrammar:
    """Tests of Grammar-object behavior: position initialization, parsing
    with a selected start symbol, subclassing and synonym handling."""

    # class-level fixture: a compiled test grammar shared by several tests
    grammar = r"""@whitespace = horizontal
    haupt        = textzeile LEERZEILE
    textzeile    = { WORT }+
    WORT         = /[^ \t]+/~
    LEERZEILE    = /\n[ \t]*(?=\n)/~
    """
    pyparser, messages, _ = compile_source(grammar, None, get_ebnf_grammar(),
                                           get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
    assert pyparser
    assert not messages

    def test_pos_values_initialized(self):
        # checks whether pos values in the parsing result and in the
        # history record have been initialized
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("no_file_name*")
        for record in grammar.history__:
            assert not record.node or record.node.pos >= 0

    def test_select_parsing(self):
        # parsing may start at any symbol of the grammar
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("wort", "WORT")
        grammar("eine Zeile", "textzeile")
        grammar("kein Haupt", "haupt")
        grammar("so ist es richtig", "haupt")

    def test_grammar_subclassing(self):
        class Arithmetic(Grammar):
            r'''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
            root__ = expression

        grammar = Arithmetic()
        CST = grammar('3+4')
        assert not CST.error_flag, CST.as_sxpr()

    def test_incomplete_matching(self):
        """Tests whether the flag `complete_match` works as expected when
        calling a grammar object in order to parse a document."""
        gr = grammar_provider('word = ~/\\w+/\n')()
        st = gr('eins')
        assert not st.errors
        st = gr('eins zwei')
        assert st.errors[0].code == PARSER_STOPPED_BEFORE_END
        st = gr('eins zwei', complete_match=False)
        assert not st.errors

    def test_synonym(self):
        lang = r"""
            doc  = { word | number }
            word = /\w+/ S
            number = [VZ] /\d+/ S 
            S    = ~        # let S by a synonym for anonymous whitespace
            VZ   = "-"
        """
        gr = grammar_provider(lang)()
        st = gr('eins 1 zwei2drei 3')
        # set_config_value('compiled_EBNF_log', 'grammar.log')
        gr = grammar_provider("@drop = whitespace, token" + lang)()
        st = gr('eins 1 zwei2drei 3')
        st = gr('-3')
        assert str(gr['S']) == "S = ~", str(gr['S'])
class TestSeries:
    """Tests of the Series parser, in particular the mandatory-marker (§)."""

    def test_non_mandatory(self):
        lang = """
        document = series | /.*/
        series = "A" "B" "C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD")
        assert not st.error_flag
        st = parser("A_CD")
        assert not st.error_flag
        st = parser("AB_D")
        assert not st.error_flag

    def test_mandatory(self):
        """Test for the §-operator. The Series-parser should raise an
        error for any non-match that occurs after the mandatory-operator.
        """
        lang = """
        document = series | /.*/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD");  assert not st.error_flag
        st = parser("A_CD");  assert not st.error_flag
        st = parser("AB_D");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION
        # transitivity of mandatory-operator
        st = parser("ABC_");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

    def test_series_composition(self):
        TA, TB, TC, TD, TE = (TKN(b) for b in "ABCDE")
        s1 = Series(TA, TB, TC, mandatory=2)
        s2 = Series(TD, TE)

        combined = Alternative(s1 + s2, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("ABCDE");  assert not st.error_flag
        st = parser("A_CDE");  assert not st.error_flag
        st = parser("AB_DE");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION
        st = parser("ABC_E");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

        combined = Alternative(s2 + s1, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("DEABC");  assert not st.error_flag
        st = parser("_EABC");  assert not st.error_flag
        st = parser("D_ABC");  assert not st.error_flag
        st = parser("DE_BC");  assert not st.error_flag
        st = parser("DEA_C");  assert not st.error_flag
        st = parser("DEAB_");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

    # def test_boundary_cases(self):
    #     lang = """
    #     document = series | §!single | /.*/
    #     series = "A" "B" §"C" "D"
    #     single = "E"
    #     """
    #     parser_class = grammar_provider(lang)
    #     parser = parser_class()
    #     print(parser.python_src__)
    #     print(parser_class.python_src__)

    def test_ebnf_serialization(self):
        ebnf_grammar = get_ebnf_grammar()
        # TODO: Add test here
        ebnf = ebnf_grammar.as_ebnf()
        # print(ebnf)
class TestAllOfSomeOf:
    """Tests of Interleave used in the 'all of' / 'some of' role."""

    def test_allOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'

    def test_allOf_completeness(self):
        """Test that an error is raised if not all parsers of an AllOf-List
        match."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('B').error_flag

    def test_allOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag

    def test_someOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'
        st = Grammar(prefixes)('B')
        assert st.error_flag
        # with (0, 1)-repetitions each part becomes optional
        prefixes = Interleave(TKN("B"), TKN("A"), repetitions=((0, 1), (0, 1)))
        assert Grammar(prefixes)('A B').content == 'A B'
        st = Grammar(prefixes)('B')
        assert not st.error_flag
        assert st.content == 'B'

    def test_someOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag
class TestInterleave:
    """Tests of the Interleave parser with and without repetition ranges."""

    def test_interleave_most_simple(self):
        letterset = Interleave(Token("A"), Token("B"), Token("C"))
        gr = Grammar(letterset)
        st = gr('ABC')
        assert not st.errors, str(st.errors)
        assert st.content == "ABC"
        st = gr('BCA')
        assert not st.errors
        assert st.content == "BCA"
        st = gr('BCBA')
        assert st.errors
        st = gr('AB')
        assert st.errors

    def test_interleave(self):
        letterset = Interleave(Token("A"), Token("B"), Token("C"),
                               repetitions=[(1, 1000), (0, 1), (1, 1)])
        gr = Grammar(letterset)
        st = gr('AABC')
        assert not st.errors
        st = gr('BACAAA')
        assert not st.errors
        st = gr('ABCC')
        assert st.errors
        st = gr('AAACAAA')
        assert not st.errors
        st = gr('AAABAAA')
        assert st.errors
class TestErrorRecovery:
    """Tests of resuming after errors via @..._skip directives."""

    def test_series_skip(self):
        lang = """
        document = series | /.*/
        @series_skip = /(?=[A-Z])/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser('AB_D')
        assert len(st.errors) == 1  # no additional "stopped before end"-error!
        resume_notices_on(parser)
        st = parser('AB_D')
        assert len(st.errors) == 2 and any(err.code == RESUME_NOTICE for err in st.errors)
        assert 'Skipping' in str(st.errors_sorted[1])

    def test_Interleave_skip(self):
        lang = """
        document = allof | /.*/
        @allof_skip = /[A-Z]/
        allof = "A" ° §"B" ° "C" ° "D"
        """
        parser = grammar_provider(lang)()
        st = parser('CADB')
        assert 'allof' in st and st['allof'].content == "CADB"
        st = parser('_BCD')
        assert st.equals(parse_sxpr('(document "_BCD")'))
        st = parser('_ABC')
        assert st.equals(parse_sxpr('(document "_ABC")'))
        st = parser('A_CD')
        assert st['allof'].content == "A_CD"
        st = parser('AB_D')
        assert st['allof'].content == "AB_D"
        st = parser('A__D')
        assert st['allof'].content == "A__D"
        st = parser('CA_D')
        assert st['allof'].content == "CA_D"
        st = parser('A_CB')
        assert st['allof'].content == "A_CB"
        st = parser('BC_A')
        assert 'allof' not in st
class TestPopRetrieve:
    """Tests for the variable-handling parsers Capture, Retrieve and Pop
    (EBNF-notation: ``symbol``, ``:symbol``, ``::symbol``, ``:?symbol``)."""

    # Backtick-fenced code blocks: the closing delimiter must pop the same
    # string that the opening delimiter captured.
    mini_language = r"""
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
        delimiter      = delimiter_sign  # never use delimiter between capture and pop except for retrieval!
        delimiter_sign = /`+/
        text           = /[^`]+/
        """

    # Brace-fenced blocks with a filter that matches the *complementary*
    # bracket (e.g. "{{{" is closed by "}}}").
    mini_lang2 = r"""
        @braces_filter = matching_bracket()
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """

    # XML-like tags where the closing tag pops the captured tag-name.
    mini_lang3 = r"""
        document       = { text | env }
        env            = (specialtag | opentag) text [ closespecial | closetag ]
        opentag        = "<" name ">"
        specialtag     = "<" /ABC/ !name ">"
        closetag       = close_slash | close_star
        close_slash    = "<" ::name "/>"
        close_star     = "<" ::name "*>"
        closespecial   = "<" /ABC/~ ">"
        name           = /\w+/~
        text           = /[^<>]+/
        """

    # Optional pop (":?name"): the closing tag may omit the name.
    mini_lang4 = r"""
        document       = { text | env }
        env            = opentag document closetag
        opentag        = "<" name ">"
        closetag       = "</" :?name ">"
        name           = /\w+/~
        text           = /[^<>]+/        
    """

    def setup(self):
        self.minilang_parser = grammar_provider(self.mini_language)()
        self.minilang_parser2 = grammar_provider(self.mini_lang2)()
        self.minilang_parser3 = grammar_provider(self.mini_lang3)()
        self.minilang_parser4 = grammar_provider(self.mini_lang4)()

    @staticmethod
    def has_tag_name(node, name):
        return node.tag_name == name # and not isinstance(node.parser, Retrieve)

    def test_capture_assertions(self):
        # Capturing a parser that can only return the empty string (or
        # nothing at all, because of Drop) must be rejected by Grammar.
        try:
            _ = Grammar(Capture(Drop(Whitespace(r'\s*'))))
            assert False, "GrammarError expected!"
        except GrammarError:
            pass
        try:
            _ = Grammar(Capture(Series(Token(' '), Drop(Whitespace(r'\s*')))))
            assert False, "ValueError expected!"
        except GrammarError:
            pass
        # A named Capture of a non-empty regex is legitimate.
        cp = Capture(RegExp(r'\w+'))
        cp.pname = "capture"
        _ = Grammar(cp)

    def test_compile_mini_language(self):
        assert self.minilang_parser
        assert self.minilang_parser2
        assert self.minilang_parser3
        assert self.minilang_parser4

    def test_stackhandling(self):
        # "<ABCnormal>" could be parsed as specialtag or opentag; the
        # variable stack must stay consistent either way.
        ambigous_opening = "<ABCnormal> normal tag <ABCnormal*>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        ambigous_opening = "<ABCnormal> normal tag <ABCnormal/>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        forgot_closing_tag = "<em> where is the closing tag?"
        syntax_tree = self.minilang_parser3(forgot_closing_tag)
        assert syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em/>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em*>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

    def test_optional_match(self):
        # ":?name" allows the closing tag to either repeat the name or
        # leave it out ("</>").
        test1 = '<info>Hey, you</info>'
        st = self.minilang_parser4(test1)
        assert not st.error_flag, str(st.errors_sorted)
        test12 = '<info>Hey, <emph>you</emph></info>'
        st = self.minilang_parser4(test1)
        assert not st.error_flag
        test2 = '<info>Hey, you</>'
        st = self.minilang_parser4(test2)
        assert not st.error_flag
        test3 = '<info>Hey, <emph>you</></>'
        st = self.minilang_parser4(test3)
        assert not st.error_flag
        test4 = '<info>Hey, <emph>you</></info>'
        st = self.minilang_parser4(test4)
        assert not st.error_flag

    def test_rollback_behaviour_of_optional_match(self):
        # On failure, captured variables must be rolled back completely.
        test1 = '<info>Hey, you</info*>'
        st = self.minilang_parser4(test1)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag
        test2 = '<info>Hey, you</*>'
        st = self.minilang_parser4(test2)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag

    def test_cache_neutrality(self):
        """Test that packrat-caching does not interfere with the variable-
        changing parsers: Capture and Retrieve."""
        lang = r"""
            text = opening closing
            opening = (unmarked_package | marked_package)
            closing = ::variable
            unmarked_package = package "."
            marked_package = package "*" "."
            package = "(" variable ")"
            variable = /\w+/~
            """
        case = "(secret)*. secret"
        gr = grammar_provider(lang)()
        st = gr(case)
        assert not st.error_flag, str(st.errors_sorted)

    def test_single_line(self):
        teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted, \
            ''.join(str(error) for error in syntax_tree.errors_sorted)
        # Opening and closing delimiter must be identical strings.
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line(self):
        teststr = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_single_line_complement(self):
        teststr = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        # With matching_bracket() the popped string is the *complement*
        # ("}}}"), hence equal length but different characters.
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert len(delim) == len(pop)
        assert delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line_complement(self):
        teststr = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_autoretrieve(self):
        lang = r"""
            document   = { definition } § EOF
            definition = symbol :defsign value
            symbol     = /\w+/~                      
            defsign    = "=" | ":="
            value      = /\d+/~
            EOF        = !/./ [ :?defsign ]   # eat up captured defsigns
        """
        # code, _, _ = compile_ebnf(lang)
        # print(code)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.error_flag
        st1 = st
        st = parser("")
        assert not st.error_flag

        # The position of the EOF-definition within the grammar must not
        # make a difference:
        lines = [line for line in lang.split('\n') if line.strip()]
        eof_line = lines.pop()
        lines.insert(1, eof_line)
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)

        del lines[1]
        lines.insert(2, eof_line)
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)

        # and, finally...
        lang_variant = r"""
            document   = { definition } § EOF
            symbol     = /\w+/~                      
            defsign    = "=" | ":="
            value      = /\d+/~
            EOF        = !/./ :?defsign   # eat up captured defsign, only if it has been retrieved
            definition = symbol :defsign value
        """
        parser = grammar_provider(lang_variant)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)
        st = parser('')
        assert "EOF expected" in str(st.errors)
class TestWhitespaceHandling:
    """Tokens ("...") absorb adjacent insignificant whitespace;
    plain regexes (/.../) do not."""

    minilang = """
        doc = A B
        A = "A"
        B = "B"
        Rdoc = ar br
        ar = /A/
        br = /B/
        """
    gr = grammar_provider(minilang)()

    def test_token_whitespace(self):
        st = self.gr("AB", 'doc')
        assert not st.error_flag
        st = self.gr("A B", 'doc')
        assert not st.error_flag

    def test_regexp_whitespace(self):
        st = self.gr("AB", 'Rdoc')
        assert not st.error_flag
        # /A/ and /B/ do not eat whitespace, so "A B" must fail.
        st = self.gr("A B", 'Rdoc')
        assert st.error_flag


class TestErrorReporting:
    """Tests that mandatory-continuation errors (§) propagate correctly
    through lookahead (&) and alternative parsers."""

    grammar = """
        root      = series alpha | anything
        series    = subseries &alpha
        subseries = alpha §beta
        alpha     = /[a-z]+/
        beta      = /[A-Z]+/
        anything  = /.*/
        """

    def setup(self):
        self.parser = grammar_provider(self.grammar)()

    def test_error_propagation(self):
        testcode1 = "halloB"   # matches: alpha §beta succeeds
        testcode2 = "XYZ"      # falls back to 'anything' without error
        testcode3 = "hallo "   # §beta fails -> mandatory-continuation error
        cst = self.parser(testcode1)
        assert not cst.error_flag, str(cst.errors_sorted)
        cst = self.parser(testcode2)
        assert not cst.error_flag
        cst = self.parser(testcode3)
        assert cst.error_flag
class TestBorderlineCases:
    """Edge cases: empty documents and parsers that match the empty string."""

    def test_not_matching(self):
        minilang = """parser = /X/\n"""
        gr = grammar_provider(minilang)()
        cst = gr('X', 'parser')
        assert not cst.error_flag
        cst = gr(' ', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_DID_NOT_MATCH
        cst = gr('', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_DID_NOT_MATCH

    def test_matching(self):
        minilang = """parser = /.?/"""
        gr = grammar_provider(minilang)()
        cst = gr(' ', 'parser')
        assert not cst.error_flag
        # /.?/ consumes at most one character; two characters leave a rest.
        cst = gr('  ', 'parser')
        assert cst.error_flag and cst.errors_sorted[0].code == PARSER_STOPPED_BEFORE_END
        cst = gr('', 'parser')
        assert not cst.error_flag


eckhart's avatar
eckhart committed
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
EBNF_with_Errors = r"""# Test code with errors. All places marked by a "$" should yield and error

@ comment    = /#.*(?:\n|$)/
@ whitespace = /\s*/
@ literalws  = right
@ anonymous  = pure_elem, EOF
@ drop       = whitespace, EOF


# re-entry-rules for resuming after parsing-error
@ definition_resume = /\n\s*(?=@|\w+\w*\s*=)/
@ directive_resume  = /\n\s*(?=@|\w+\w*\s*=)/

# specialized error messages for certain cases

@ definition_error  = /,/, 'Delimiter "," not expected in definition!\nEither this was meant to '
                           'be a directive and the directive symbol @ is missing\nor the error is '
                           'due to inconsistent use of the comma as a delimiter\nfor the elements '
                           'of a sequence.'

#: top-level

syntax     = [~//] { definition | directive } §EOF
definition = symbol §:DEF~ expression :ENDL~
directive  = "@" §symbol "="
             (regexp | literals | symbol)
             { "," (regexp | literals | symbol) }

#: components

expression = sequence { :OR~ sequence }
sequence   = ["§"] ( interleave | lookaround )
             { :AND~ ["§"] ( interleave | lookaround ) }
interleave = difference { "°" ["§"] difference }
lookaround = flowmarker § (oneormore | pure_elem)
difference = term ["-" § (oneormore $ pure_elem)]               # <- ERROR
term       = oneormore | repetition | option | pure_elem        # resuming expected her