#!/usr/bin/python3

"""test_ebnf.py - tests of the ebnf module of DHParser 

Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import sys
from multiprocessing import Pool

sys.path.extend(['../', './'])

from DHParser.toolkit import compile_python_object, re
from DHParser.preprocess import nil_preprocessor
from DHParser import compile_source
from DHParser.error import has_errors, Error
from DHParser.syntaxtree import WHITESPACE_PTYPE
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, EBNFTransform, get_ebnf_compiler
from DHParser.dsl import CompilationError, compileDSL, DHPARSER_IMPORTS, grammar_provider
from DHParser.testing import grammar_unit


class TestDirectives:
    mini_language = """
        expression =  term  { ("+" | "-") term }
        term       =  factor  { ("*" | "/") factor }
        factor     =  constant | "("  expression  ")"
        constant   =  digit { digit } [ //~ ]
        digit      = /0/ | /1/ | /2/ | /3/ | /4/ | /5/ | /6/ | /7/ | /8/ | /9/ 
        """

    def test_whitespace_linefeed(self):
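        # 'linefeed' whitespace may contain at most a single line feed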
        lang = "@ whitespace = linefeed\n" + self.mini_language
        MinilangParser = grammar_provider(lang)
        parser = MinilangParser()
        assert parser
        syntax_tree = parser("3 + 4 * 12")
        # parser.log_parsing_history("WSP")
        assert not syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n * 12")
        # parser.log_parsing_history("WSPLF")
        assert not syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n \n * 12")
        assert syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n\n * 12")
        assert syntax_tree.collect_errors()

    def test_whitespace_vertical(self):
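        # 'vertical' whitespace may span any number of line feeds and blank lines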
        lang = "@ whitespace = vertical\n" + self.mini_language
        parser = grammar_provider(lang)()
        assert parser
        syntax_tree = parser("3 + 4 * 12")
        assert not syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n * 12")
        assert not syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n \n * 12")
        assert not syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n\n * 12")
        assert not syntax_tree.collect_errors()

    def test_whitespace_horizontal(self):
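        # 'horizontal' whitespace must not contain any line feeds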
        lang = "@ whitespace = horizontal\n" + self.mini_language
        parser = grammar_provider(lang)()
        assert parser
        syntax_tree = parser("3 + 4 * 12")
        assert not syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n * 12")
        assert syntax_tree.collect_errors()


class TestReservedSymbols:
    def test_comment_usage(self):
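        # COMMENT__ is implicitly defined by the @comment directive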
        lang = r"""
        @comment = /#.*(?:\n|$)/
        document = text [ COMMENT__ ]
        text = /[^#]+/
        """
        parser = grammar_provider(lang)()

    def test_whitespace(self):
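        # WSP_RE__ is implicitly defined by the @whitespace directive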
        lang = r"""
        @whitespace = /\s*/
        document = WSP_RE__ { word WSP_RE__ }
        word = /\w+/ 
        """
        parser = grammar_provider(lang)()

    def test_mixin(self):
        lang = r"""
        @comment = /#.*(?:\n|$)/
        @whitespace = /\s*/
        document = WSP_RE__ { word WSP_RE__ }
        word = /\w+/ 
        """
        parser = grammar_provider(lang)()
        result = parser("test # kommentar")
        assert not result.error_flag, str(result.as_sxpr())


class TestEBNFParser:
    cases = {
        "list_": {
            "match": {
                1: "hund",
                2: "hund, katze,maus",
                3: "hund , katze"
            },
            "fail": {
                4: "123",
                5: '"literal"',
                6: "/regexp/"
            }
        }
    }

    def setup(self):
        self.EBNF = get_ebnf_grammar()

    def test_RE(self):
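        # the regexp parser must stop at the closing '/' and not swallow the rest of the input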
        gr = get_ebnf_grammar()
        m = gr.regexp.parsers[0].regexp.match(r'/[\\\\]/ xxx /')
        rs = m.group()
        assert rs.find('x') < 0, rs
        rx = re.compile(rs[1:-1])
        assert rx.match(r'\\')

    def test_literal(self):
        snippet = '"text" '
        result = self.EBNF(snippet, 'literal')
        assert not result.error_flag
        assert str(result) == snippet
        assert any(result.select(lambda node: node.parser.ptype == WHITESPACE_PTYPE))

        result = self.EBNF('"text" ', 'literal')
        assert not result.error_flag
        result = self.EBNF(' "text"', 'literal')
        assert result.error_flag  # literals catch following, but not leading whitespace

    def test_plaintext(self):
        result = self.EBNF('`plain`', 'plaintext')
        assert not result.error_flag

    def test_list(self):
        grammar_unit(self.cases, get_ebnf_grammar, get_ebnf_transformer)


class TestParserNameOverwriteBug:
    def test_term_bug(self):
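        # regression test: compiling grammars that nest the mandatory marker '§'
        # inside options or series must not produce spurious errors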
        grammar = get_ebnf_grammar()
        st = grammar('impossible = [§"an optional requirement"]')
        # print(st.as_sxpr())
        get_ebnf_transformer()(st)
        # print(st.as_sxpr())
        lang = """series = "A" "B" §"C" "D"
        """
        parser = get_ebnf_grammar()
        st = parser(lang)
        # print(st.as_sxpr())
        get_ebnf_transformer()(st)
        # print(st.as_sxpr())
        result = get_ebnf_compiler()(st)
        messages = st.collect_errors()
        assert not has_errors(messages), str(messages)


class TestSemanticValidation:
    def check(self, minilang, bool_filter=lambda x: x):
        grammar = get_ebnf_grammar()
        st = grammar(minilang)
        assert not st.collect_errors()
        EBNFTransform()(st)
        assert bool_filter(st.collect_errors())

    def test_illegal_nesting(self):
        self.check('impossible = { [ "an optional requirement" ] }')

    def test_illegal_nesting_option_required(self):
        self.check('impossible = [ §"an optional requirement" ]')

    def test_illegal_nesting_oneormore_option(self):
        self.check('impossible = { [ "no use"] }+')

    def test_legal_nesting(self):
        self.check('possible = { [ "+" ] "1" }', lambda x: not x)


class TestCompilerErrors:
    def test_error_propagation(self):
        ebnf = "@ literalws = wrongvalue  # testing error propagation\n"
        result, messages, st = compile_source(ebnf, None, get_ebnf_grammar(),
            get_ebnf_transformer(), get_ebnf_compiler('ErrorPropagationTest'))
        assert messages

    def test_undefined_symbols(self):
        """Use of undefined symbols should be reported.
        """
        ebnf = """syntax = { intermediary }
                  intermediary = "This symbol is " [ badly_spelled ] "!"
                  bedly_spilled = "wrong" """
        result, messages, st = compile_source(ebnf, None, get_ebnf_grammar(),
            get_ebnf_transformer(), get_ebnf_compiler('UndefinedSymbols'))
        assert messages

    def test_no_error(self):
        """But reserved symbols should not be reported as undefined.
        """
        ebnf = """nothing =  WSP_RE__ | COMMENT__\n"""
        result, messages, st = compile_source(ebnf, None, get_ebnf_grammar(),
            get_ebnf_transformer(), get_ebnf_compiler('UndefinedSymbols'))
        assert not bool(messages), messages


class TestSelfHosting:
    grammar = r"""
        # EBNF-Grammar in EBNF

        @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
        @ whitespace =  /\s*/                            # whitespace includes linefeed
        @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

        syntax     =  [~//] { definition | directive } §EOF
        definition =  symbol §"=" expression
        directive  =  "@" §symbol "=" ( regexp | literal | list_ )

        expression =  term { "|" term }
        term       =  { factor }+
        factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                    | [flowmarker] literal
                    | [flowmarker] regexp
                    | [flowmarker] group
                    | [flowmarker] regexchain
                    | [flowmarker] oneormore
                    | repetition
                    | option

        flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                      "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
        retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

        group      =  "(" expression §")"
        regexchain =  ">" expression §"<"                # compiles "expression" into a singular regular expression
        oneormore  =  "{" expression "}+"
        repetition =  "{" expression §"}"
        option     =  "[" expression §"]"

        symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
        literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                    | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
        regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                         # '~' is a whitespace-marker, if present leading or trailing
                                                         # whitespace of a regular expression will be ignored tacitly.
        list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                         # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
        EOF =  !/./
        """

    def test_self(self):
        compiler_name = "EBNF"
        compiler = get_ebnf_compiler(compiler_name, self.grammar)
        parser = get_ebnf_grammar()
        result, errors, syntax_tree = compile_source(self.grammar, None, parser,
                                            get_ebnf_transformer(), compiler)
        assert not errors, str(errors)
        # compile the grammar again using the result of the previous
        # compilation as parser
        compileDSL(self.grammar, nil_preprocessor, result, get_ebnf_transformer(), compiler)

    def multiprocessing_task(self):
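        # helper compiling this class's EBNF grammar; executed in worker processes by test_multiprocessing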
        compiler_name = "EBNF"
        compiler = get_ebnf_compiler(compiler_name, self.grammar)
        parser = get_ebnf_grammar()
        result, errors, syntax_tree = compile_source(self.grammar, None, parser,
                                            get_ebnf_transformer(), compiler)
        return errors

    def test_multiprocessing(self):
        with Pool() as pool:
            res = [pool.apply_async(self.multiprocessing_task, ()) for i in range(4)]
            errors = [r.get(timeout=10) for r in res]
        for i, e in enumerate(errors):
            assert not e, ("%i: " % i) + str(e)


class TestBoundaryCases:
    def setup(self):
        self.gr = get_ebnf_grammar()
        self.tr = get_ebnf_transformer()
        self.cp = get_ebnf_compiler()

    def test_empty_grammar(self):
        t = self.gr("")
        self.tr(t)
        r = self.cp(t)
        assert r

    def test_single_statement_grammar(self):
        t = self.gr("i = /i/")
        self.tr(t)
        r = self.cp(t)
        assert r

    def test_two_statement_grammar(self):
        t = self.gr("i = k {k}\nk = /k/")
        self.tr(t)
        r = self.cp(t)
        assert r

    def test_unconnected_symbols(self):
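        # 'unconnected' is not reachable from 'root' and should merely trigger a warning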
        ebnf = """root = /.*/
                  unconnected = /.*/
        """
        result, messages, AST = compile_source(ebnf, nil_preprocessor,
                                               get_ebnf_grammar(),
                                               get_ebnf_transformer(),
                                               get_ebnf_compiler())
        if messages:
            assert not has_errors(messages), "Unconnected rules should result in a warning, " \
                "not an error: " + str(messages)
            grammar_src = result
            grammar = compile_python_object(DHPARSER_IMPORTS + grammar_src,
                                            r'get_(?:\w+_)?grammar$')()
        else:
            assert False, "EBNF compiler should warn about unconnected rules."

        assert grammar['root'], "Grammar objects should be subscriptable by parser names!"
        try:
            unconnected = grammar['unconnected']
        except KeyError:
            assert False, "Grammar objects should be able to cope with unconnected parsers!"
        try:
            nonexistant = grammar['nonexistant']
            assert False, "Grammar objects should raise a KeyError if subscripted by " \
                          "a non-existent parser name!"
        except KeyError:
            pass


class TestSynonymDetection:
    def test_synonym_detection(self):
        ebnf = """a = b
                  b = /b/
        """
        grammar = grammar_provider(ebnf)()
        assert grammar['a'].name == 'a', grammar['a'].name
        assert grammar['b'].name == 'b', grammar['b'].name
        assert grammar('b').as_sxpr().count('b') == 2


class TestFlowControlOperators:
    def setup(self):
        self.t1 = """
        All work and no play 
        makes Jack a dull boy
        END
        """
        self.t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind_indirect(self):
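        # '-&' is a positive lookbehind; SUCC_LB is resolved indirectly to a regexp
        # that requires 'END' to be preceded by a line feed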
        lang = r"""
            document = ws sequence doc_end ws         
            sequence = { !end word ws }+
            doc_end  = -&SUCC_LB end        
            ws       = /\s*/
            end      = /END/
            word     = /\w+/
            SUCC_LB  = indirection
            indirection = /\s*?\n/
        """
        parser = grammar_provider(lang)()
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        # this should fail, because 'END' is not preceded by a line feed
        assert cst.error_flag, cst.as_sxpr()

    def test_required_error_reporting(self):
        """Tests whether failures to comply with the required operator '§'
        are correctly reported as such.
        """
        lang1 = r"nonsense == /\w+/~  # wrong_equal_sign"
        lang2 = "nonsense = [^{}%]+  # someone forgot the '/'-delimiters for regular expressions"
        try:
            parser_class = grammar_provider(lang1)
            assert False, "Compilation error expected."
        except CompilationError as error:
            pass
        try:
            parser_class = grammar_provider(lang2)
            assert False, "Compilation error expected."
        except CompilationError as error:
            pass


class TestWhitespace:
    def test_whitespace(self):
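        # "..." literals implicitly consume optional trailing whitespace; `...` plain-text
        # tokens do not, unless an explicit '~' follows them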
        tail = r"""
            WORD     =  /\w+/~
            EOF      =  !/./
        """
        lang1 = r'document = "DOC" { WORD } EOF' + tail
        parser = grammar_provider(lang1)()
        cst = parser("DOC Wörter Wörter Wörter")
        assert not cst.error_flag
        cst = parser("DOCWörter Wörter Wörter")
        assert not cst.error_flag

        lang2 = r'document = `DOC` { WORD } EOF' + tail
        parser = grammar_provider(lang2)()
        cst = parser("DOC Wörter Wörter Wörter")
        assert cst.error_flag
        cst = parser("DOCWörter Wörter Wörter")
        assert not cst.error_flag

        lang3 = r'document = `DOC` ~ { WORD } EOF' + tail
        parser = grammar_provider(lang3)()
        cst = parser("DOC Wörter Wörter Wörter")
        assert not cst.error_flag
        cst = parser("DOCWörter Wörter Wörter")
        assert not cst.error_flag


class TestAllSome:
    def test_all(self):
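        # <...> requires all enclosed parsers to match, in any order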
        ebnf = 'prefix = <"A" "B">'
        grammar = grammar_provider(ebnf)()
        assert grammar('B A').content == 'B A'

    def test_some(self):
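        # <a | b> matches one or more of the alternatives, in any order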
        ebnf = 'prefix = <"A" | "B">'
        grammar = grammar_provider(ebnf)()
        assert grammar('B A').content == 'B A'
        assert grammar('B').content == 'B'


class TestCuratedErrors:
    """
    Curated errors replace existing errors with alternative
    error codes and messages that are more helpful to the user.
    """
    def test_user_error_declaration(self):
        lang = """
            document = series | /.*/
            series = "X" | head §"C" "D"
            head = "A" "B"
            @series_error = "a user defined error message"
            """
        try:
            parser = grammar_provider(lang)()
            assert False, "Error definition after symbol definition should fail!"
        except CompilationError as e:
            pass

    def test_curated_mandatory_continuation(self):
        lang = """
            document = series | /.*/
            @series_error = "a user defined error message"
            series = "X" | head §"C" "D"
            head = "A" "B"
            """
        # from DHParser.dsl import compileDSL
        # from DHParser.preprocess import nil_preprocessor
        # from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler
        # grammar_src = compileDSL(lang, nil_preprocessor, get_ebnf_grammar(),
        #                          get_ebnf_transformer(), get_ebnf_compiler("test", lang))
        # print(grammar_src)
        parser = grammar_provider(lang)()
        st = parser("X");  assert not st.error_flag
        st = parser("ABCD");  assert not st.error_flag
        st = parser("A_CD");  assert not st.error_flag
        st = parser("AB_D");  assert st.error_flag
        assert st.collect_errors()[0].code == Error.MANDATORY_CONTINUATION
        assert st.collect_errors()[0].message == "a user defined error message"
        # transitivity of mandatory-operator
        st = parser("ABC_");  assert st.error_flag
        assert st.collect_errors()[0].code == Error.MANDATORY_CONTINUATION
        assert st.collect_errors()[0].message == "a user defined error message"

    def test_curated_error_message_case_sensitive(self):
        lang = """
            document = Series | /.*/
            @Series_error = "a user defined error message"
            Series = "X" | head §"C" "D"
            head = "A" "B"
            """
        # from DHParser.dsl import compileDSL
        # from DHParser.preprocess import nil_preprocessor
        # from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler
        # grammar_src = compileDSL(lang, nil_preprocessor, get_ebnf_grammar(),
        #                          get_ebnf_transformer(), get_ebnf_compiler("test", lang))
        # print(grammar_src)
        parser = grammar_provider(lang)()
        st = parser("ABC_");  assert st.error_flag
        assert st.collect_errors()[0].code == Error.MANDATORY_CONTINUATION
        assert st.collect_errors()[0].message == "a user defined error message"


class TestCustomizedResumeParsing:
    def setup(self):
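        # after a failure of the mandatory marker '§', parsing resumes at the
        # locations given by the corresponding @..._resume directives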
        lang = r"""
        @ alpha_resume = 'BETA', 'GAMMA'
        @ beta_resume = 'GAMMA'
        @ bac_resume = /GA\w+/
        document = alpha [beta] gamma "."
          alpha = "ALPHA" abc
            abc = §"a" "b" "c"
          beta = "BETA" (bac | bca)
            bac = "b" "a" §"c"
            bca = "b" "c" §"a"
          gamma = "GAMMA" §(cab | cba)
            cab = "c" "a" §"b"
            cba = "c" "b" §"a"
        """
        try:
            self.gr = grammar_provider(lang)()
        except CompilationError as ce:
            print(ce)

    def test_several_resume_rules_innermost_rule_matching(self):
        gr = self.gr
        content = 'ALPHA abc BETA bad GAMMA cab .'
        cst = gr(content)
        # print(cst.as_sxpr())
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only one error message
        assert len(cst.collect_errors()) == 1
        # multiple failures
        content = 'ALPHA acb BETA bad GAMMA cab .'
        cst = gr(content)
        # print(cst.as_sxpr())
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only one error message per failure
        assert len(cst.collect_errors()) == 2


if __name__ == "__main__":
    from DHParser.testing import runner

    runner("", globals())