#!/usr/bin/python3

"""test_ebnf.py - tests of the ebnf module of DHParser 

Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import sys
from functools import partial
from multiprocessing import Pool

sys.path.extend(['../', './'])

from DHParser.toolkit import is_logging, compile_python_object, supress_warnings
from DHParser.parsers import compile_source, Retrieve, WHITESPACE_PTYPE, nil_scanner
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, EBNFTransformer, get_ebnf_compiler
from DHParser.dsl import CompilationError, compileDSL, DHPARSER_IMPORTS, parser_factory


class TestDirectives:
    mini_language = """
        expression =  term  { ("+" | "-") term }
        term       =  factor  { ("*" | "/") factor }
        factor     =  constant | "("  expression  ")"
        constant   =  digit { digit } [ //~ ]
        digit      = /0/ | /1/ | /2/ | /3/ | /4/ | /5/ | /6/ | /7/ | /8/ | /9/ 
        """

    def test_whitespace_linefeed(self):
        lang = "@ whitespace = linefeed\n" + self.mini_language
        MinilangParser = parser_factory(lang)
        parser = MinilangParser()
        assert parser
        syntax_tree = parser("3 + 4 * 12")
        # parser.log_parsing_history("WSP")
        assert not syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n * 12")
        # parser.log_parsing_history("WSPLF")
        assert not syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n \n * 12")
        assert syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n\n * 12")
        assert syntax_tree.collect_errors()

    def test_whitespace_vertical(self):
        lang = "@ whitespace = vertical\n" + self.mini_language
        parser = parser_factory(lang)()
        assert parser
        syntax_tree = parser("3 + 4 * 12")
        assert not syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n * 12")
        assert not syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n \n * 12")
        assert not syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n\n * 12")
        assert not syntax_tree.collect_errors()

    def test_whitespace_horizontal(self):
        lang = "@ whitespace = horizontal\n" + self.mini_language
        parser = parser_factory(lang)()
        assert parser
        syntax_tree = parser("3 + 4 * 12")
        assert not syntax_tree.collect_errors()
        syntax_tree = parser("3 + 4 \n * 12")
        assert syntax_tree.collect_errors()


class TestEBNFParser:
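    # Single-rule test cases for the EBNF grammar: for each rule name, the
    # strings under "match" are expected to be parsable by that rule, the
    # strings under "fail" are not.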
    cases = {
        "list_": {
            "match": {
                1: "hund",
                2: "hund, katze,maus",
                3: "hund , katze"
            },
            "fail": {
                1: "123",
                2: '"literal"',
                3: "/regexp/"
            }
        }
    }

    def setup(self):
        self.EBNF = get_ebnf_grammar()

    def test_literal(self):
        snippet = '"literal" '
        result = self.EBNF(snippet, 'literal')
        assert not result.error_flag
        assert str(result) == snippet
        assert result.find(lambda node: node.parser.ptype == WHITESPACE_PTYPE)

        result = self.EBNF(' "literal"', 'literal')
        assert result.error_flag  # literals catch following, but not leading whitespace


class TestPopRetrieve:
    mini_language = """
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
        delimiter      = delimiter_sign  # never use delimiter between capture and retrieve!!!
        delimiter_sign = /`+/
        text           = /[^`]+/ 
        """
    mini_lang2 = """
        @braces_filter=counterpart_filter
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """

    def setup(self):
        self.minilang_parser = parser_factory(self.mini_language)()
        self.minilang_parser2 = parser_factory(self.mini_lang2)()

    @staticmethod
    def opening_delimiter(node, name):
        return node.tag_name == name and not isinstance(node.parser, Retrieve)

    @staticmethod
    def closing_delimiter(node):
        return isinstance(node.parser, Retrieve)

    def test_compile_mini_language(self):
        assert self.minilang_parser
        assert self.minilang_parser2

    def test_single_line(self):
        teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.collect_errors()
        delim = str(next(syntax_tree.find(partial(self.opening_delimiter, name="delimiter"))))
        pop = str(next(syntax_tree.find(self.closing_delimiter)))
        assert delim == pop
        if is_logging():
            syntax_tree.log("test_PopRetrieve_single_line.cst")

    def test_multi_line(self):
        teststr = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.collect_errors()
        delim = str(next(syntax_tree.find(partial(self.opening_delimiter, name="delimiter"))))
        pop = str(next(syntax_tree.find(self.closing_delimiter)))
        assert delim == pop
        if is_logging():
            syntax_tree.log("test_PopRetrieve_multi_line.cst")

    def test_single_line_complement(self):
        teststr = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.collect_errors()
        delim = str(next(syntax_tree.find(partial(self.opening_delimiter, name="braces"))))
        pop = str(next(syntax_tree.find(self.closing_delimiter)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            syntax_tree.log("test_PopRetrieve_single_line.cst")

    def test_multi_line_complement(self):
        teststr = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.collect_errors()
        delim = str(next(syntax_tree.find(partial(self.opening_delimiter, name="braces"))))
        pop = str(next(syntax_tree.find(self.closing_delimiter)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            syntax_tree.log("test_PopRetrieve_multi_line.cst")


class TestSemanticValidation:
    def check(self, minilang, bool_filter=lambda x: x):
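        # Parse 'minilang' with the EBNF grammar, run the EBNF transformer on the
        # resulting tree and check its error list through 'bool_filter' (the
        # identity by default, i.e. semantic errors are expected).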
        grammar = get_ebnf_grammar()
        st = grammar(minilang)
        assert not st.collect_errors()
        EBNFTransformer(st)
        assert bool_filter(st.collect_errors())

    def test_illegal_nesting(self):
        self.check('impossible = { [ "an optional requirement" ] }')

    def test_illegal_nesting_option_required(self):
        self.check('impossible = [ §"an optional requirement" ]')

    def test_illegal_nesting_oneormore_option(self):
        self.check('impossible = { [ "no use"] }+')

    def test_legal_nesting(self):
        self.check('possible = { [ "+" ] "1" }', lambda x: not x)


class TestCompilerErrors:
    def test_error_propagation(self):
        ebnf = "@ literalws = wrongvalue  # testing error propagation\n"
        result, messages, st = compile_source(ebnf, None, get_ebnf_grammar(),
            get_ebnf_transformer(), get_ebnf_compiler('ErrorPropagationTest'))
        assert messages

    def test_undefined_symbols(self):
        """Use of undefined symbols should be reported.
        """
        ebnf = """syntax = { intermediary }
                  intermediary = "This symbol is " [ badly_spelled ] "!"
                  bedly_spilled = "wrong" """
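        # note: "bedly_spilled" is deliberately misspelled, so that the symbol
        # "badly_spelled" used in "intermediary" remains undefined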
        result, messages, st = compile_source(ebnf, None, get_ebnf_grammar(),
            get_ebnf_transformer(), get_ebnf_compiler('UndefinedSymbols'))
        assert messages

    def test_no_error(self):
        """But reserved symbols should not be repoted as undefined.
        """
        ebnf = """nothing =  WSP__ | COMMENT__\n"""
        result, messages, st = compile_source(ebnf, None, get_ebnf_grammar(),
            get_ebnf_transformer(), get_ebnf_compiler('UndefinedSymbols'))
        assert not bool(messages), messages


class TestSelfHosting:
    grammar = r"""
        # EBNF-Grammar in EBNF

        @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
        @ whitespace =  /\s*/                            # whitespace includes linefeed
        @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

        syntax     =  [~//] { definition | directive } §EOF
        definition =  symbol §"=" expression
        directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

        expression =  term { "|" term }
        term       =  { factor }+
        factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                    | [flowmarker] literal
                    | [flowmarker] regexp
                    | [flowmarker] group
                    | [flowmarker] regexchain
                    | [flowmarker] oneormore
                    | repetition
                    | option

        flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                      "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
        retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

        group      =  "(" expression §")"
        regexchain =  ">" expression §"<"                # compiles "expression" into a singular regular expression
        oneormore  =  "{" expression "}+"
        repetition =  "{" expression §"}"
        option     =  "[" expression §"]"

        symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
        literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                    | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
        regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                         # '~' is a whitespace-marker, if present leading or trailing
                                                         # whitespace of a regular expression will be ignored tacitly.
        list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                         # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
        EOF =  !/./        
        """

    def test_self(self):
        compiler_name = "EBNF"
        compiler = get_ebnf_compiler(compiler_name, self.grammar)
        parser = get_ebnf_grammar()
        result, errors, syntax_tree = compile_source(self.grammar, None, parser,
                                            get_ebnf_transformer(), compiler)
        assert not errors, str(errors)
        # compile the grammar again using the result of the previous
        # compilation as parser
        compileDSL(self.grammar, nil_scanner, result, get_ebnf_transformer(), compiler)

    def multiprocessing_task(self):
        compiler_name = "EBNF"
        compiler = get_ebnf_compiler(compiler_name, self.grammar)
        parser = get_ebnf_grammar()
        result, errors, syntax_tree = compile_source(self.grammar, None, parser,
                                            get_ebnf_transformer(), compiler)
        return errors

    def test_multiprocessing(self):
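        # Run the self-compilation in several worker processes; besides the
        # compilation itself this checks that the grammar, transformer and
        # compiler factories can be used from separate processes. Errors are
        # reported together with the index of the worker that produced them.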
        with Pool(processes=2) as pool:
            res = [pool.apply_async(self.multiprocessing_task, ()) for i in range(4)]
            errors = [r.get(timeout=10) for r in res]
        for i, e in enumerate(errors):
            assert not e, ("%i: " % i) + str(e)


class TestBoundaryCases:
    def setup(self):
        self.gr = get_ebnf_grammar()
        self.tr = get_ebnf_transformer()
        self.cp = get_ebnf_compiler()

    def test_empty_grammar(self):
        t = self.gr("")
        self.tr(t)
        r = self.cp(t)
        assert r

    def test_single_statement_grammar(self):
        t = self.gr("i = /i/")
        self.tr(t)
        r = self.cp(t)
        assert r

    def test_two_statement_grammar(self):
        t = self.gr("i = k {k}\nk = /k/")
        self.tr(t)
        r = self.cp(t)
        assert r

    def test_unconnected_symbols(self):
        ebnf = """root = /.*/
                  unconnected = /.*/
        """
        try:
            with supress_warnings(False):
                grammar = parser_factory(ebnf)()
            assert False, "EBNF compiler should complain about unconnected rules."
        except CompilationError as err:
            grammar_src = err.result
            grammar = compile_python_object(DHPARSER_IMPORTS + grammar_src,
                                            'get_(?:\w+_)?grammar$')()
        assert grammar['root'], "Grammar objects should be subscriptable by parser names!"
        try:
            unconnected = grammar['unconnected']
        except KeyError:
            assert False, "Grammar objects should be able to cope with unconnected parsers!"
        try:
            nonexistant = grammar['nonexistant']
            assert False, "Grammar object shoul raise a KeyError if subscripted by " \
                          "a non-existant parser name!"
        except KeyError:
            pass


class TestSynonymDetection:
    def test_synonym_detection(self):
        ebnf = """a = b
                  b = /b/
        """
        grammar = parser_factory(ebnf)()
        assert grammar['a'].name == 'a', grammar['a'].name
        assert grammar['b'].name == 'b', grammar['b'].name
        assert grammar('b').as_sexpr().count('b') == 2


if __name__ == "__main__":
    from DHParser.testing import runner
    runner("", globals())