#!/usr/bin/python3

"""test_parse.py - tests of the parsers-module of DHParser

Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import sys

from functools import partial


sys.path.extend(['../', './'])

from DHParser.toolkit import compile_python_object, get_config_value, set_config_value
from DHParser.log import logging, is_logging, log_ST, log_parsing_history
from DHParser.error import Error, is_error
from DHParser.parse import ParserError, Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
    RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative, AllOf, SomeOf, \
    UnknownParserError, MetaParser, GrammarError, EMPTY_NODE
from DHParser import compile_source
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, DHPARSER_IMPORTS
from DHParser.dsl import grammar_provider, CompilationError
from DHParser.syntaxtree import Node
from DHParser.stringview import StringView


class TestParserError:
    def test_parser_error_str(self):
        """The string representation of a ParserError mentions both the
        rest of the text and the tag name of the node where parsing failed."""
        node = Node('TAG', 'test').with_pos(0)
        pe = ParserError(node, StringView('Beispiel'), None, True)
        message = str(pe)
        assert 'Beispiel' in message and 'TAG' in message


class TestParserClass:
    def test_apply(self):
        """`apply()` must visit the parsers of a grammar in the same
        (deterministic) order on every invocation."""
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        gr = grammar_provider(minilang)()

        def collect_names() -> str:
            # gather "pname + ptype" of every visited parser in visiting order
            names = []
            gr.root__.apply(lambda p: names.append(p.pname + p.ptype))
            return ", ".join(names)

        runs = [collect_names() for _ in range(3)]
        assert runs[0] == runs[1] == runs[2]


class TestInfiLoopsAndRecursion:
    """Tests that (direct and indirect) left recursion can be parsed and
    that infinite parsing loops are broken automatically."""

    def test_direct_left_recursion1(self):
        # "expr" refers to itself as the first item of its own definition
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), str(syntax_tree.errors_sorted)
        assert snippet == syntax_tree.content, str(syntax_tree)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct")

    def test_direct_left_recursion2(self):
        # like above, but with one additional indirection through "ex"
        minilang = """
            expr = ex
            ex   = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        with logging():
            syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content

    def test_indirect_left_recursion1(self):
        # "Expr" is left-recursive only through "Product" and "Sum"
        minilang = """
            Expr    = //~ (Product | Sum | Value)
            Product = Expr { ('*' | '/') Expr }+
            Sum     = Expr { ('+' | '-') Expr }+
            Value   = /[0-9.]+/~ | '(' §Expr ')'
            """
        parser = grammar_provider(minilang)()
        assert parser
        snippet = "8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "7 + 8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "9 + 8 * (4 + 3)"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect")

    # # BEWARE: EXPERIMENTAL TEST can be long running
    # def test_indirect_left_recursion2(self):
    #     arithmetic_syntax = """
    #         expression     = addition | subtraction
    #         addition       = (expression | term) "+" (expression | term)
    #         subtraction    = (expression | term) "-" (expression | term)
    #         term           = multiplication | division
    #         multiplication = (term | factor) "*" (term | factor)
    #         division       = (term | factor) "/" (term | factor)
    #         factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
    #         group          = "(" expression ")"
    #         SIGN           = /[+-]/
    #         NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
    #         VARIABLE       = /[A-Za-z]/~
    #         """
    #     arithmetic = grammar_provider(arithmetic_syntax)()
    #     arithmetic.left_recursion_depth__ = 2
    #     assert arithmetic
    #     syntax_tree = arithmetic("(a + b) * (a - b)")
    #     assert syntax_tree.errors

    # NOTE(review): the following two methods were named
    # "test_break_inifnite_loop_..." — typo fixed to "infinite".
    def test_break_infinite_loop_ZeroOrMore(self):
        forever = ZeroOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(:EMPTY__, )", repr(result)

    def test_break_infinite_loop_OneOrMore(self):
        forever = OneOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        # fixed: failure message used str(result) while the twin test used repr()
        assert repr(result) == "Node(:EMPTY__, )", repr(result)

    # def test_infinite_loops(self):
    #     minilang = """forever = { // } \n"""
    #     try:
    #         parser_class = grammar_provider(minilang)
    #     except CompilationError as error:
    #         assert all(e.code == Error.INFINITE_LOOP for e in error.errors)
    #     save = get_config_value('static_analysis')
    #     set_config_value('static_analysis', 'late')
    #     provider = grammar_provider(minilang)
    #     try:
    #         parser = provider()
    #     except GrammarError as error:
    #         assert error.errors[0][2].code == Error.INFINITE_LOOP
    #     set_config_value('static_analysis', 'none')
    #     parser = provider()
    #     snippet = " "
    #     syntax_tree = parser(snippet)
    #     assert any(e.code == Error.INFINITE_LOOP for e in syntax_tree.errors)
    #     res = parser.static_analysis()
    #     assert res and res[0][2].code == Error.INFINITE_LOOP
    #     minilang = """not_forever = { / / } \n"""
    #     parser = grammar_provider(minilang)()
    #     res = parser.static_analysis()
    #     assert not res
    #     set_config_value('static_analysis', save)


class TestFlowControl:
    # Tests of parsers that direct the flow of matching without consuming
    # text themselves: Lookbehind and NegativeLookahead.

    def setup(self):
        # t1 has "END" on a line of its own (i.e. preceded by a newline);
        # t2 has "END" on the same line as the preceding words.
        self.t1 = """
        All work and no play
        makes Jack a dull boy
        END
        """
        self.t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind(self):
        # doc_end only matches "END" if the preceding text ends with a
        # line break (checked by the Lookbehind parser)
        ws = RegExp(r'\s*')
        end = RegExp("END")
        doc_end = Lookbehind(RegExp('\\s*?\\n')) + end
        word = RegExp(r'\w+')
        sequence = OneOrMore(NegativeLookahead(end) + word + ws)
        document = ws + sequence + doc_end + ws

        parser = Grammar(document)
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()

    def test_lookbehind_indirect(self):
        # same test as test_lookbehind, but with the parsers declared as
        # class attributes of a Grammar-subclass instead of being combined
        # with overloaded operators
        class LookbehindTestGrammar(Grammar):
            parser_initialization__ = ["upon instantiation"]
            ws = RegExp(r'\s*')
            end = RegExp('END')
            SUCC_LB = RegExp('\\s*?\\n')
            doc_end = Series(Lookbehind(SUCC_LB), end)
            word = RegExp(r'\w+')
            sequence = OneOrMore(Series(NegativeLookahead(end), word, ws))
            document = Series(ws, sequence, doc_end, ws)
            root__ = document

        parser = LookbehindTestGrammar()
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()


class TestRegex:
    def test_multilineRegex(self):
        """Regular expressions in the grammar may span several lines;
        whitespace and #-comments inside them are ignored (verbose-style)."""
        mlregex = r"""
        regex =  /\w+    # one or more alphabetical characters including the underscore
                  [+]    # followed by a plus sign
                  \w*    # possibly followed by more alpha chracters/
        """
        result, messages, syntax_tree = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_multilineRegex_wo_Comments(self):
        """Multiline regexes also work without any comments inside them."""
        mlregex = r"""
        regex =  /\w+
                  [+]
                  \w* /
        """
        result, messages, syntax_tree = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_ignore_case(self):
        """The @ignorecase-directive toggles case-insensitive matching.

        NOTE(review): this method was named ``text_ignore_case`` and was
        therefore never collected by the test-runner; renamed so that it
        actually runs.
        """
        mlregex = r"""
        @ ignorecase = True
        regex = /alpha/
        """
        result, messages, syntax_tree = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex('Alpha')
        assert node
        assert not node.error_flag
        assert rest == ''
        assert node.tag_name == "regex"
        assert str(node) == 'Alpha'

        mlregex = r"""
        @ ignorecase = False
        regex = /alpha/
        """
        result, messages, syntax_tree = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex('Alpha')
        assert node.error_flag

    def test_token(self):
        """String-tokens may contain backslashes (e.g. LaTeX-commands)."""
        tokenlang = r"""
            @whitespace = linefeed
            lang        = "" begin_token {/\w+/ ""} end_token
            begin_token = "\begin{document}"
            end_token   = "\end{document}"
            """
        testdoc = r"""
            \begin{document}
            test
            \end{document}
            """
        result, messages, syntax_tree = compile_source(
            tokenlang, None, get_ebnf_grammar(), get_ebnf_transformer(),
            get_ebnf_compiler("TokenTest"))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        result = parser(testdoc)
        # log_parsing_history(parser, "test.log")
        assert not result.error_flag


class TestGrammar:
    def setup(self):
        # compile an EBNF-grammar to python parser code once for all tests
        grammar = r"""@whitespace = horizontal
        haupt        = textzeile LEERZEILE
        textzeile    = { WORT }+
        WORT         = /[^ \t]+/~
        LEERZEILE    = /\n[ \t]*(?=\n)/~
        """
        self.pyparser, messages, syntax_tree = compile_source(grammar, None, get_ebnf_grammar(),
                                                              get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
        assert self.pyparser
        assert not messages

    def test_pos_values_initialized(self):
        # checks whether pos values in the parsing result and in the
        # history record have been initialized
        with logging("LOGS"):
            grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
            grammar("no_file_name*")
        for record in grammar.history__:
            assert not record.node or record.node.pos >= 0

    def test_select_parsing(self):
        # parsing can start at any rule of the grammar, selected by name
        # via the second argument of the grammar-call
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("wort", "WORT")
        grammar("eine Zeile", "textzeile")
        grammar("kein Haupt", "haupt")
        grammar("so ist es richtig", "haupt")

    def test_grammar_subclassing(self):
        # a grammar can also be written by hand as a Grammar-subclass with
        # parser objects as class attributes; root__ designates the parser
        # where parsing starts
        class Arithmetic(Grammar):
            r'''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
            root__ = expression

        grammar = Arithmetic()
        CST = grammar('3+4')
        assert not CST.error_flag, CST.as_sxpr()


class TestSeries:
    def test_non_mandatory(self):
        """Without a mandatory marker a failing series just backtracks and
        the document falls through to the catch-all alternative."""
        lang = """
        document = series | /.*/
        series = "A" "B" "C" "D"
        """
        parser = grammar_provider(lang)()
        for probe in ("ABCD", "A_CD", "AB_D"):
            st = parser(probe)
            assert not st.error_flag

    def test_mandatory(self):
        """Test for the §-operator. The Series-parser should raise an
        error for any non-match that occurs after the mandatory-operator.
        """
        lang = """
        document = series | /.*/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD")
        assert not st.error_flag
        st = parser("A_CD")
        assert not st.error_flag
        st = parser("AB_D")
        assert st.error_flag
        assert st.errors_sorted[0].code == Error.MANDATORY_CONTINUATION
        # transitivity of mandatory-operator
        st = parser("ABC_")
        assert st.error_flag
        assert st.errors_sorted[0].code == Error.MANDATORY_CONTINUATION

    def test_series_composition(self):
        """Composing series with '+' keeps the mandatory marker at its
        position relative to the composed series."""
        tok_a, tok_b, tok_c, tok_d, tok_e = (TKN(ch) for ch in "ABCDE")
        with_mandatory = Series(tok_a, tok_b, tok_c, mandatory=2)
        plain = Series(tok_d, tok_e)

        # mandatory series first
        parser = Grammar(Alternative(with_mandatory + plain, RegExp('.*')))
        st = parser("ABCDE")
        assert not st.error_flag
        st = parser("A_CDE")
        assert not st.error_flag
        st = parser("AB_DE")
        assert st.error_flag
        assert st.errors_sorted[0].code == Error.MANDATORY_CONTINUATION
        st = parser("ABC_E")
        assert st.error_flag
        assert st.errors_sorted[0].code == Error.MANDATORY_CONTINUATION

        # plain series first
        parser = Grammar(Alternative(plain + with_mandatory, RegExp('.*')))
        for harmless in ("DEABC", "_EABC", "D_ABC", "DE_BC", "DEA_C"):
            st = parser(harmless)
            assert not st.error_flag
        st = parser("DEAB_")
        assert st.error_flag
        assert st.errors_sorted[0].code == Error.MANDATORY_CONTINUATION

    # def test_boundary_cases(self):
    #     lang = """
    #     document = series | §!single | /.*/
    #     series = "A" "B" §"C" "D"
    #     single = "E"
    #     """
    #     parser_class = grammar_provider(lang)
    #     parser = parser_class()
    #     print(parser.python_src__)
    #     print(parser_class.python_src__)


class TestAllOfSomeOf:
    def test_allOf_order(self):
        """AllOf matches all of its parsers, in whatever order they occur
        in the text."""
        both = AllOf(TKN("A"), TKN("B"))
        for probe in ('A B', 'B A'):
            assert Grammar(both)(probe).content == probe
        # alternative form: the parsers can also be handed over wrapped
        # in a single Series
        wrapped = AllOf(Series(TKN("B"), TKN("A")))
        assert Grammar(wrapped)('A B').content == 'A B'

    def test_allOf_completeness(self):
        """AllOf fails unless every one of its parsers has matched."""
        both = AllOf(TKN("A"), TKN("B"))
        assert Grammar(both)('B').error_flag

    def test_allOf_redundance(self):
        """A parser listed several times in an AllOf must also match the
        same number of times."""
        twice_a = AllOf(TKN("A"), TKN("B"), TKN("A"))
        for probe in ('A A B', 'A B A', 'B A A'):
            assert Grammar(twice_a)(probe).content == probe
        assert Grammar(twice_a)('A B B').error_flag

    def test_someOf_order(self):
        """SomeOf matches its parsers in arbitrary order, but - unlike
        AllOf - does not require all of them to match."""
        some = SomeOf(TKN("A"), TKN("B"))
        for probe in ('A B', 'B A'):
            assert Grammar(some)(probe).content == probe
        # alternative form: the parsers can also be handed over wrapped
        # in a single Alternative
        wrapped = SomeOf(Alternative(TKN("B"), TKN("A")))
        assert Grammar(wrapped)('A B').content == 'A B'
        assert Grammar(wrapped)('B').content == 'B'

    def test_someOf_redundance(self):
        """A parser listed several times in a SomeOf may also match the
        same number of times."""
        twice_a = SomeOf(TKN("A"), TKN("B"), TKN("A"))
        for probe in ('A A B', 'A B A', 'B A A'):
            assert Grammar(twice_a)(probe).content == probe
        assert Grammar(twice_a)('A B B').error_flag


class TestPopRetrieve:
    """Tests of the variable-storing parsers, i.e. Capture, Retrieve
    (":symbol" / "!:symbol") and Pop ("::symbol")."""

    # code blocks are fenced by runs of backticks; "::delimiter" pops the
    # captured fence so the closing fence must have the same length
    mini_language = r"""
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
        delimiter      = delimiter_sign  # never use delimiter between capture and pop except for retrival!
        delimiter_sign = /`+/
        text           = /[^`]+/
        """
    # like mini_language, but the closing fence consists of the
    # counterpart-braces of the opening fence (see @braces_filter)
    mini_lang2 = r"""
        @braces_filter=counterpart
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """
    # XML-like tags where the closing tag pops the captured tag name
    mini_lang3 = r"""
        document       = { text | env }
        env            = (specialtag | opentag) text [closespecial | closetag]
        opentag        = "<" name ">"
        specialtag     = "<" /ABC/ !name ">"
        closetag       = close_slash | close_star
        close_slash    = "<" ::name "/>"
        close_star     = "<" ::name "*>"
        closespecial   = "<" /ABC/~ ">"
        name           = /\w+/~
        text           = /[^<>]+/
        """

    def setup(self):
        # one parser per mini-language, rebuilt for every test
        self.minilang_parser = grammar_provider(self.mini_language)()
        self.minilang_parser2 = grammar_provider(self.mini_lang2)()
        self.minilang_parser3 = grammar_provider(self.mini_lang3)()

    @staticmethod
    def opening_delimiter(node, name):
        # select nodes tagged with the given rule name (opening fences)
        return node.tag_name == name # and not isinstance(node.parser, Retrieve)

    @staticmethod
    def closing_delimiter(node):
        # Pop- and Retrieve-parsers yield anonymous ':Pop'/':Retrieve' nodes
        return node.tag_name in {':Pop', ':Retrieve'}
        # return isinstance(node.parser, Retrieve)

    def test_compile_mini_language(self):
        assert self.minilang_parser
        assert self.minilang_parser2
        assert self.minilang_parser3

    def test_stackhandling(self):
        # "<ABCnormal>" could be parsed as specialtag or opentag; only
        # opentag captures a name that the closing tag can pop
        ambigous_opening = "<ABCnormal> normal tag <ABCnormal*>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        ambigous_opening = "<ABCnormal> normal tag <ABCnormal/>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        forgot_closing_tag = "<em> where is the closing tag?"
        syntax_tree = self.minilang_parser3(forgot_closing_tag)
        assert syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em/>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em*>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

    def test_cache_neutrality(self):
        """Test that packrat-caching does not interfere with the variable-
        changing parsers: Capture and Retrieve."""
        lang = r"""
            text = opening closing
            opening = (unmarked_package | marked_package)
            closing = ::variable
            unmarked_package = package "."
            marked_package = package "*" "."
            package = "(" variable ")"
            variable = /\w+/~
            """
        case = "(secret)*. secret"
        gr = grammar_provider(lang)()
        st = gr(case)
        assert not st.error_flag, str(st.errors_sorted)

    def test_single_line(self):
        teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        # the popped closing fence must equal the captured opening fence
        delim = str(next(syntax_tree.select_if(partial(self.opening_delimiter, name="delimiter"))))
        pop = str(next(syntax_tree.select_if(self.closing_delimiter)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line(self):
        teststr = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        delim = str(next(syntax_tree.select_if(partial(self.opening_delimiter, name="delimiter"))))
        pop = str(next(syntax_tree.select_if(self.closing_delimiter)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_single_line_complement(self):
        teststr = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        # with the counterpart-filter the closing fence mirrors the opening
        # one: same length, but different (complementary) characters
        delim = str(next(syntax_tree.select_if(partial(self.opening_delimiter, name="braces"))))
        pop = str(next(syntax_tree.select_if(self.closing_delimiter)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line_complement(self):
        teststr = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        delim = str(next(syntax_tree.select_if(partial(self.opening_delimiter, name="braces"))))
        pop = str(next(syntax_tree.select_if(self.closing_delimiter)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")


class TestWhitespaceHandling:
    """String-tokens absorb the grammar's insignificant whitespace,
    plain regular expressions do not."""

    minilang = """
        doc = A B
        A = "A"
        B = "B"
        Rdoc = ar br
        ar = /A/
        br = /B/
        """

    def setup(self):
        self.gr = grammar_provider(self.minilang)()

    def test_token_whitespace(self):
        # "doc" is built from string-tokens: whitespace in between is allowed
        for probe in ("AB", "A B"):
            st = self.gr(probe, 'doc')
            assert not st.error_flag

    def test_regexp_whitespace(self):
        # "Rdoc" is built from bare regexes: whitespace in between fails
        st = self.gr("AB", 'Rdoc')
        assert not st.error_flag
        st = self.gr("A B", 'Rdoc')
        assert st.error_flag


class TestErrorReporting:
    """Tests error reporting for grammars containing a mandatory (§) item."""

    grammar = """
        root      = series alpha | anything
        series    = subseries &alpha
        subseries = alpha §beta
        alpha     = /[a-z]+/
        beta      = /[A-Z]+/
        anything  = /.*/
        """

    def setup(self):
        self.parser = grammar_provider(self.grammar)()

    def test_error_propagation(self):
        cst = self.parser("halloB")
        assert not cst.error_flag, str(cst.errors_sorted)
        assert not self.parser("XYZ").error_flag
        assert self.parser("hallo ").error_flag


class TestBorderlineCases:
    def test_not_matching(self):
        """A parser that does not match must flag PARSER_DID_NOT_MATCH,
        also on empty input."""
        gr = grammar_provider("""parser = /X/\n""")()
        assert not gr('X', 'parser').error_flag
        for failing in (' ', ''):
            cst = gr(failing, 'parser')
            assert cst.error_flag
            assert cst.errors_sorted[0].code == Error.PARSER_DID_NOT_MATCH

    def test_matching(self):
        """A match that leaves a rest of the document unconsumed must flag
        PARSER_STOPPED_BEFORE_END; consuming everything - including the
        empty document - is no error."""
        gr = grammar_provider("""parser = /.?/""")()
        assert not gr(' ', 'parser').error_flag
        cst = gr('  ', 'parser')
        assert cst.error_flag
        assert cst.errors_sorted[0].code == Error.PARSER_STOPPED_BEFORE_END
        assert not gr('', 'parser').error_flag


Eckhart Arnold's avatar
Eckhart Arnold committed
685
class TestReentryAfterError:
    """Tests resuming the parsing process after a mandatory (§) failure
    by means of the grammar's resume rules."""

    def setup(self):
        lang = """
        document = alpha [beta] gamma "."
          alpha = "ALPHA" abc
            abc = §"a" "b" "c"
          beta = "BETA" (bac | bca)
            bac = "b" "a" §"c"
            bca = "b" "c" §"a"
          gamma = "GAMMA" §(cab | cba)
            cab = "c" "a" §"b"
            cba = "c" "b" §"a"
        """
        self.gr = grammar_provider(lang)()

    def test_no_resume_rules(self):
        gr = self.gr
        # fix: clear the grammar's actual rule set `resume_rules__`;
        # assigning to `gr.resume_rules` merely created a stray, unused
        # attribute and never cleared anything
        gr.resume_rules__ = dict()
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')

    def test_no_resume_rules_partial_parsing(self):
        gr = self.gr
        gr.resume_rules__ = dict()
        content = 'ALPHA acb'
        cst = gr(content, 'alpha')
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')

    def test_simple_resume_rule(self):
        gr = self.gr
        gr.resume_rules__ = dict()
        gr.resume_rules__['alpha'] = ['BETA']
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only one error message
        assert len(cst.errors_sorted) == 1

    def test_failing_resume_rule(self):
        gr = self.gr
        gr.resume_rules__ = dict()
        gr.resume_rules__['alpha'] = ['XXX']  # reentry point never matches
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        # assert cst.pick('alpha').content.startswith('ALPHA')

    def test_several_reentry_points(self):  # fix: was "test_severl_..."
        gr = self.gr
        gr.resume_rules__ = dict()
        gr.resume_rules__['alpha'] = ['BETA', 'GAMMA']
        content = 'ALPHA acb BETA bac GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only one error message
        assert len(cst.errors_sorted) == 1

    def test_several_reentry_points_second_point_matching(self):
        gr = self.gr
        gr.resume_rules__ = dict()
        gr.resume_rules__['alpha'] = ['BETA', 'GAMMA']
        # beta is optional, so resumption at GAMMA must work, too
        content = 'ALPHA acb GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only one error message
        assert len(cst.errors_sorted) == 1

    def test_several_resume_rules_innermost_rule_matching(self):
        gr = self.gr
        gr.resume_rules__ = dict()
        gr.resume_rules__['alpha'] = ['BETA', 'GAMMA']
        gr.resume_rules__['beta'] = ['GAMMA']
        gr.resume_rules__['bac'] = ['GAMMA']
        content = 'ALPHA abc BETA bad GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be only one error message
        assert len(cst.errors_sorted) == 1
        # multiple failures: one error per failed-and-resumed section
        content = 'ALPHA acb BETA bad GAMMA cab .'
        cst = gr(content)
        assert cst.error_flag
        assert cst.content == content
        assert cst.pick('alpha').content.startswith('ALPHA')
        # because of resuming, there should be exactly two error messages
        assert len(cst.errors_sorted) == 2


class TestConfiguredErrorMessages:
    """Tests custom error messages configured via the @..._error directive."""

    def test_configured_error_message(self):
        lang = """
            document = series | /.*/
            @series_error = "a badly configured error message {5}"
            series = "X" | head §"C" "D"
            head = "A" "B"
            """
        parser = grammar_provider(lang)()
        st = parser("AB_D")
        assert st.error_flag
        # the broken format string "{5}" yields MALFORMED_ERROR_STRING on
        # top of the MANDATORY_CONTINUATION error it was meant to describe
        codes = [error.code for error in st.errors_sorted]
        assert codes[0] == Error.MALFORMED_ERROR_STRING
        assert codes[1] == Error.MANDATORY_CONTINUATION


class TestUnknownParserError:
    def test_unknown_parser_error(self):
        """Calling a grammar with a non-existing start parser must raise."""
        try:
            Grammar()("", "NonExistantParser")
        except UnknownParserError:
            return
        assert False, "UnknownParserError expected!"


class TestEarlyTokenWhitespaceDrop:
    """Tests the @drop-directive for anonymous tokens and whitespace."""

    def setup(self):
        self.lang = r"""
            @ drop = token, whitespace
            expression = term  { ("+" | "-") term}
            term       = factor  { ("*"|"/") factor}
            factor     = number | variable | "("  expression  ")"
                       | constant | fixed
            variable   = /[a-z]/~
            number     = /\d+/~
            constant   = "A" | "B"
            fixed      = "X"
            """
        self.gr = grammar_provider(self.lang)()

    def test_drop(self):
        # anonymous tokens and whitespace must not show up in the tree
        tree = self.gr('4 + 3 * 5')
        assert not tree.pick(':Token')
        assert not tree.pick(':Whitespace')
        # tokens inside a compound named parser ("A" | "B") are dropped, too
        tree = self.gr('A + B')
        token_a = tree.select_if(lambda nd: nd.content == 'A')
        try:
            _ = next(token_a)
            assert False, "Tokens in compound expressions should be dropped!"
        except StopIteration:
            pass
        # ... but a token making up a named parser by itself (fixed = "X")
        # is preserved
        tree = self.gr('X * y')
        assert next(tree.select_if(lambda nd: nd.content == 'X'))


class TestMetaParser:
    """Tests the node post-processing done by MetaParser._return_value()
    and MetaParser._return_values() with tree-flattening turned on."""

    def setup(self):
        self.mp = MetaParser()
        self.mp.grammar = Grammar()  # override placeholder warning
        # give the meta-parser a (non-anonymous) name
        self.mp.pname = "named"
        self.mp.tag_name = self.mp.pname

    def test_return_value(self):
        # run with tree-flattening enabled; the original setting is
        # restored at the end of the test
        save = get_config_value('flatten_tree_while_parsing')
        set_config_value('flatten_tree_while_parsing', True)
        # named parser + named, non-empty child: child is kept and
        # wrapped in a 'named' node
        nd = self.mp._return_value(Node('tagged', 'non-empty'))
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert len(nd.children) == 1
        assert nd.children[0].tag_name == 'tagged'
        assert nd.children[0].result == "non-empty"
        # named parser + named, empty child: still wrapped, child kept
        nd = self.mp._return_value(Node('tagged', ''))
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert len(nd.children) == 1
        assert nd.children[0].tag_name == 'tagged'
        assert not nd.children[0].result
        # named parser + anonymous child: child is flattened away,
        # only its content survives in the 'named' node
        nd = self.mp._return_value(Node(':anonymous', 'content'))
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert not nd.children
        assert nd.result == 'content'
        # named parser + anonymous empty child: flattened to an empty node
        nd = self.mp._return_value(Node(':anonymous', ''))
        assert nd.tag_name == 'named', nd.as_sxpr()
        assert not nd.children
        assert not nd.content
        # EMPTY_NODE is replaced by a childless 'named' node
        nd = self.mp._return_value(EMPTY_NODE)
        assert nd.tag_name == 'named' and not nd.children, nd.as_sxpr()
        # now switch to an anonymous (unnamed) meta-parser
        self.mp.pname = ''
        self.mp.tag_name = ':unnamed'
        # anonymous parser + named child: child is passed through unchanged
        nd = self.mp._return_value(Node('tagged', 'non-empty'))
        assert nd.tag_name == 'tagged', nd.as_sxpr()
        assert len(nd.children) == 0
        assert nd.content == 'non-empty'
        nd = self.mp._return_value(Node('tagged', ''))
        assert nd.tag_name == 'tagged', nd.as_sxpr()
        assert len(nd.children) == 0
        assert not nd.content
        # anonymous parser + anonymous child: also passed through unchanged
        nd = self.mp._return_value(Node(':anonymous', 'content'))
        assert nd.tag_name == ':anonymous', nd.as_sxpr()
        assert not nd.children
        assert nd.result == 'content'
        nd = self.mp._return_value(Node('', ''))
        assert nd.tag_name == '', nd.as_sxpr()
        assert not nd.children
        assert not nd.content
        # None and EMPTY_NODE both map to EMPTY_NODE
        assert self.mp._return_value(None) == EMPTY_NODE
        assert self.mp._return_value(EMPTY_NODE) == EMPTY_NODE
        set_config_value('flatten_tree_while_parsing', save)

    def test_return_values(self):
        self.mp.pname = "named"
        self.mp.tag_name = self.mp.pname
        # EMPTY_NODE entries in a result tuple must not survive as-is
        rv = self.mp._return_values((Node('tag', 'content'), EMPTY_NODE))
        assert rv[-1].tag_name != EMPTY_NODE.tag_name, rv[-1].tag_name

    def test_in_context(self):
        # MUL can succeed with an empty match (via the &factor lookahead);
        # such named empty nodes must still appear in the tree
        minilang = """
            term       = factor  { (DIV|MUL) factor}
            factor     = NUMBER | VARIABLE
            MUL        = "*" | &factor
            DIV        = "/"
            NUMBER     = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
            VARIABLE   = /[A-Za-z]/~
            """
        gr = grammar_provider(minilang)()
        cst = gr("2x")
        assert bool(cst.pick('MUL')), "Named empty nodes should not be dropped!!!"


if __name__ == "__main__":
    from DHParser.testing import runner
    # run all Test* classes in this module with parser logging disabled
    with logging(False):
        runner("", globals())