test_parse.py 57.8 KB
Newer Older
1
#!/usr/bin/env python3
2

3
"""test_parse.py - tests of the parsers-module of DHParser
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21

Author: Eckhart Arnold <arnold@badw.de>

Copyright 2017 Bavarian Academy of Sciences and Humanities

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

eckhart's avatar
eckhart committed
22
import copy
23
import os
24
import sys
25
from functools import partial
26
from typing import List, Tuple
27

28 29
scriptpath = os.path.dirname(__file__) or '.'
sys.path.append(os.path.abspath(os.path.join(scriptpath, '..')))
30

eckhart's avatar
eckhart committed
31
from DHParser.configuration import get_config_value, set_config_value
32
from DHParser.toolkit import compile_python_object, re
33
from DHParser.log import is_logging, log_ST, log_parsing_history, start_logging
eckhart's avatar
eckhart committed
34 35 36
from DHParser.error import Error, is_error, adjust_error_locations, MANDATORY_CONTINUATION, \
    MALFORMED_ERROR_STRING, MANDATORY_CONTINUATION_AT_EOF, RESUME_NOTICE, PARSER_STOPPED_BEFORE_END, \
    PARSER_NEVER_TOUCHES_DOCUMENT
37
from DHParser.parse import ParserError, Parser, Grammar, Forward, TKN, ZeroOrMore, RE, \
eckhart's avatar
eckhart committed
38
    RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative, \
39
    Interleave, UnknownParserError, CombinedParser, Text, EMPTY_NODE, Capture, Drop, Whitespace, \
40
    GrammarError, Counted, Always, INFINITE
41
from DHParser import compile_source
42
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, \
43
    parse_ebnf, DHPARSER_IMPORTS, compile_ebnf
eckhart's avatar
eckhart committed
44
from DHParser.dsl import grammar_provider, create_parser, raw_compileEBNF
45
from DHParser.syntaxtree import Node, parse_sxpr
46
from DHParser.stringview import StringView
47
from DHParser.trace import set_tracer, trace_history, resume_notices_on
48 49


50 51 52 53 54 55

class TestWhitespace:
    """Placeholder for whitespace/comment handling tests.

    NOTE(review): both tests are still stubs; the scraped original
    contained embedded blame-view artifacts which have been removed here.
    """
    # TODO: add test cases here
    def test_whitespace_comment_mangling(self):
        pass

    def test_non_empty_derivation(self):
        pass

class TestParserError:
    """Tests concerning the ParserError exception and lookahead-related
    error reporting."""

    def test_parser_error_str(self):
        # The string representation of a ParserError must mention both the
        # node's tag name and the remaining text.
        pe = ParserError(Node('TAG', 'test').with_pos(0), StringView('Beispiel'), None, True)
        assert str(pe).find('Beispiel') >= 0 and str(pe).find('TAG') >= 0

    def test_false_lookahead_only_message(self):
        """PARSER_LOOKAHEAD_*_ONLY errors must not be reported if there
        is no lookahead parser in the history!"""
        lang = """
        word = letters { letters | `-` letters }
        letters = /[A-Za-z]+/
        """
        gr = grammar_provider(lang)()
        set_tracer(gr, trace_history)
        st = gr('hard-time')
        assert not st.errors
        st = gr('hard-')
        # errors are expected, but none of them may be a lookahead-only
        # failure (error code 1045), because the grammar has no lookaheads
        assert st.errors and not any(e.code == 1045 for e in st.errors)

class TestParserClass:
    """Tests for generic features of the Parser base class."""

    def test_apply(self):
        """`Parser.apply()` must traverse the grammar's parsers in a
        deterministic order: repeated traversals yield the same sequence."""
        minilang = """
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        gr = grammar_provider(minilang)()
        l = []
        def visitor(context: List[Parser]):
            # record name and type of the parser at the tip of the context
            p = context[-1]
            l.append(p.pname + p.ptype)
        gr.root__.apply(visitor)
        s1 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s2 = ", ".join(l)
        l = []
        gr.root__.apply(visitor)
        s3 = ", ".join(l)
        assert s1 == s2 == s3

    def test_symbol(self):
        """`Grammar.associated_symbol()` must yield the named symbol to
        which an anonymous sub-parser belongs."""
        class MyGrammar(Grammar):
            wrong = Text('wrong')
            word = OneOrMore(wrong) + Whitespace(r'\s*') + OneOrMore(RegExp(r'\w+'))
            root__ = word
        gr = MyGrammar()
        # the last element of `word` is an anonymous RegExp-parser
        regex = gr['word'].parsers[-1].parser
        result = gr.associated_symbol(regex).symbol
        assert result == 'word', result

class TestInfiLoopsAndRecursion:
    """Tests that (direct and indirect) left recursion is parsed correctly
    and that potentially infinite repetitions are broken off gracefully."""

    def setup(self):
        # uncomment the lines below to debug with history tracking and logs
        pass
        # set_config_value('history_tracking', True)
        # set_config_value('resume_notices', True)
        # start_logging('LOGS')

    def test_very_simple(self):
        """Direct left recursion in a single rule (term)."""
        minilang = """
            term = term (`*`|`/`) factor | factor
            factor = /[0-9]+/
            """
        grammar_factory = grammar_provider(minilang)
        parser = grammar_factory()
        snippet = "5*4*3*2"
        # set_tracer(parser, trace_history)
        st = parser(snippet)
        if is_logging():
            log_ST(st, 'test_LeftRecursion_very_simple.cst')
            log_parsing_history(parser, 'test_LeftRecursion_very_simple')
        assert not is_error(st.error_flag), str(st.errors)
        st = parser("1*2*3*4*5*6*7*8*9")
        # if is_logging():
        #     log_ST(st, 'test_LeftRecursion_very_simple_2.cst')
        #     log_parsing_history(parser, 'test_LeftRecursion_very_simple_2')
        assert not is_error(st.error_flag)

    def test_direct_left_recursion1(self):
        """Direct left recursion in two rules; the parsed content must be
        preserved verbatim in the syntax tree."""
        minilang = """@literalws = right
            expr = expr ("+"|"-") term | term
            term = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        # print(raw_compileEBNF(minilang).result)
        assert parser
        syntax_tree = parser(snippet)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct1.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct1")
        assert not is_error(syntax_tree.error_flag), str(syntax_tree.errors_sorted)
        assert snippet == syntax_tree.content, str(syntax_tree)

    def test_direct_left_recursion2(self):
        """Like test_direct_left_recursion1, but with the recursive rules
        hidden behind synonym rules (expr -> ex, term -> tr)."""
        minilang = """@literalws = right
            expr = ex
            ex   = expr ("+"|"-") term | term
            term = tr
            tr   = term ("*"|"/") factor | factor
            factor = /[0-9]+/~
            """
        snippet = "9 + 8 + 7 + 6 + 5 + 3 * 4"
        parser = grammar_provider(minilang)()
        assert parser
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_direct2.cst")
            log_parsing_history(parser, "test_LeftRecursion_direct2")

    def test_indirect_left_recursion1(self):
        """Indirect left recursion: Product and Sum recurse through Expr."""
        minilang = """@literalws = right
            Expr    = //~ (Product | Sum | Value)
            Product = Expr { ('*' | '/') Expr }+
            Sum     = Expr { ('+' | '-') Expr }+
            Value   = /[0-9.]+/~ | '(' §Expr ')'
            """
        # print(raw_compileEBNF(minilang).result)
        parser = grammar_provider(minilang)()
        snippet = "8 * 4"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "7 + 8 * 4"
        syntax_tree = parser(snippet)
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect1.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect1")
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        snippet = "9 + 8 * (4 + 3)"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        snippet = "9 + 8 * (4 - 3 / (5 - 1))"
        syntax_tree = parser(snippet)
        assert not is_error(syntax_tree.error_flag), syntax_tree.errors_sorted
        assert snippet == syntax_tree.content
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect1.cst")
            log_parsing_history(parser, "test_LeftRecursion_indirect1")

    # BEWARE: EXPERIMENTAL TEST can be long running
    def test_indirect_left_recursion2(self):
        """Grammar with incomplete alternatives (term/factor commented out):
        parsing is expected to FAIL with errors."""
        arithmetic_syntax = r"""@literalws = right
            expression     = addition | subtraction  # | term
            addition       = (expression | term) "+" (expression | term)
            subtraction    = (expression | term) "-" (expression | term)
            term           = multiplication | division  # | factor
            multiplication = (term | factor) "*" (term | factor)
            division       = (term | factor) "/" (term | factor)
            factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
            group          = "(" expression ")"
            SIGN           = /[+-]/
            NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
            VARIABLE       = /[A-Za-z]/~
            """
        arithmetic = grammar_provider(arithmetic_syntax)()
        assert arithmetic
        syntax_tree = arithmetic("(a + b) * (a - b)")
        assert syntax_tree.errors
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect2.cst")
            log_parsing_history(arithmetic, "test_LeftRecursion_indirect2")

    def test_indirect_left_recursion3(self):
        """Same grammar as test_indirect_left_recursion2, but with the
        fall-back alternatives included: parsing must now succeed."""
        arithmetic_syntax = r"""@literalws = right
            expression     = addition | subtraction | term
            addition       = (expression | term) "+" (expression | term)
            subtraction    = (expression | term) "-" (expression | term)
            term           = multiplication | division | factor
            multiplication = (term | factor) "*" (term | factor)
            division       = (term | factor) "/" (term | factor)
            factor         = [SIGN] ( NUMBER | VARIABLE | group ) { VARIABLE | group }
            group          = "(" expression ")"
            SIGN           = /[+-]/
            NUMBER         = /(?:0|(?:[1-9]\d*))(?:\.\d+)?/~
            VARIABLE       = /[A-Za-z]/~
            """
        arithmetic = grammar_provider(arithmetic_syntax)()
        assert arithmetic
        syntax_tree = arithmetic("(a + b) * (a - b)")
        assert not syntax_tree.errors
        if is_logging():
            log_ST(syntax_tree, "test_LeftRecursion_indirect3.cst")
            log_parsing_history(arithmetic, "test_LeftRecursion_indirect3")


    def test_break_inifnite_loop_ZeroOrMore(self):
        """A ZeroOrMore over an empty match must terminate."""
        forever = ZeroOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_inifnite_loop_OneOrMore(self):
        """A OneOrMore over an empty match must terminate."""
        forever = OneOrMore(RegExp(''))
        result = Grammar(forever)('')  # infinite loops will automatically be broken
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Counted(self):
        """Counted repetitions with INFINITE bounds must terminate."""
        forever = Counted(Always(), (0, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (5, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (INFINITE, INFINITE))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Counted(Always(), (1000, INFINITE - 1))
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    def test_break_infinite_loop_Interleave(self):
        """Interleave repetitions with INFINITE bounds must terminate."""
        forever = Interleave(Always(), repetitions = [(0, INFINITE)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Interleave(Always(), Always(),
                             repetitions = [(5, INFINITE), (INFINITE, INFINITE)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)
        forever = Interleave(Always(), repetitions = [(1000, INFINITE - 1)])
        result = Grammar(forever)('')  # if this takes very long, something is wrong
        assert repr(result) == "Node(':EMPTY', '')", repr(result)

    # def test_infinite_loops(self):
    #     minilang = """forever = { // } \n"""
    #     try:
    #         parser_class = grammar_provider(minilang)
    #     except CompilationError as error:
    #         assert all(e.code == INFINITE_LOOP for e in error.errors)
    #     save = get_config_value('static_analysis')
    #     set_config_value('static_analysis', 'late')
    #     provider = grammar_provider(minilang)
    #     try:
    #         parser = provider()
    #     except GrammarError as error:
    #         assert error.errors[0][2].code == INFINITE_LOOP
    #     set_config_value('static_analysis', 'none')
    #     parser = provider()
    #     snippet = " "
    #     syntax_tree = parser(snippet)
    #     assert any(e.code == INFINITE_LOOP for e in syntax_tree.errors)
    #     res = parser.static_analysis()
    #     assert res and res[0][2].code == INFINITE_LOOP
    #     minilang = """not_forever = { / / } \n"""
    #     parser = grammar_provider(minilang)()
    #     res = parser.static_analysis()
    #     assert not res
    #     set_config_value('static_analysis', save)
312

eckhart's avatar
eckhart committed
313

314 315 316 317 318 319
# class TestStaticAnalysis:
#     def test_alternative(self):
#         lang = 'doc = "A" | "AB"'
#         parser = create_parser(lang)


Eckhart Arnold's avatar
Eckhart Arnold committed
320
class TestFlowControl:
    """Tests for flow-control parsers (Lookbehind, NegativeLookahead) and
    for partial matching via `complete_match=False`."""

    # t1 ends with END on its own line (lookbehind on \n succeeds);
    # t2 has END on the same line (lookbehind must fail).
    t1 = """
         All work and no play
         makes Jack a dull boy
         END
         """
    t2 = "All word and not play makes Jack a dull boy END\n"

    def test_lookbehind(self):
        ws = RegExp(r'\s*');  ws.pname = "ws"
        end = RegExp("END");  end.pname = "end"
        # END only counts as document end if preceded by a line break
        doc_end = Lookbehind(RegExp('\\s*?\\n')) + end
        word = RegExp(r'\w+');  word.pname = "word"
        sequence = OneOrMore(NegativeLookahead(end) + word + ws)
        document = ws + sequence + doc_end + ws
        parser = Grammar(document)
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()

        # partial matching: parse with a sub-parser and allow leftovers
        cst = parser(self.t2, parser['ws'], complete_match=False)
        assert cst.did_match() and len(cst) == 0 and not cst.errors
        cst = parser(self.t2, parser['word'], complete_match=False)
        assert cst.did_match() and cst.content == "All" and not cst.errors
        cst = parser(self.t2, parser['end'], complete_match=False)
        assert not cst.did_match()

    def test_lookbehind_indirect(self):
        """Same scenario as test_lookbehind, but with the grammar defined
        declaratively as a Grammar subclass."""
        class LookbehindTestGrammar(Grammar):
            parser_initialization__ = ["upon instantiation"]
            ws = RegExp(r'\s*')
            end = RegExp('END')
            SUCC_LB = RegExp('\\s*?\\n')
            doc_end = Series(Lookbehind(SUCC_LB), end)
            word = RegExp(r'\w+')
            sequence = OneOrMore(Series(NegativeLookahead(end), word, ws))
            document = Series(ws, sequence, doc_end, ws)
            root__ = document

        parser = LookbehindTestGrammar()
        cst = parser(self.t1)
        assert not cst.error_flag, cst.as_sxpr()
        cst = parser(self.t2)
        assert cst.error_flag, cst.as_sxpr()

class TestRegex:
    """Tests for regular-expression parsers compiled from EBNF sources,
    including multi-line regexes, the @ignorecase directive and tokens."""

    def test_multilineRegex(self):
        # a verbose multi-line regex with comments inside /.../
        mlregex = r"""
        regex =  /\w+    # one or more alphabetical characters including the underscore
                  [+]    # followed by a plus sign
                  \w*    # possibly followed by more alpha chracters/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_multilineRegex_wo_Comments(self):
        # the same multi-line regex, but without inline comments
        mlregex = r"""
        regex =  /\w+
                  [+]
                  \w* /
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node = parser('abc+def', parser.regex)
        assert not node.error_flag
        assert node.tag_name == "regex"
        assert str(node) == 'abc+def'

    def test_ignore_case(self):
        # with @ignorecase = True, /alpha/ must also match 'Alpha'
        mlregex = r"""
        @ ignorecase = True
        regex = /alpha/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex(StringView('Alpha'))
        assert node
        assert rest == ''
        assert node.tag_name == "regex"
        assert str(node) == 'Alpha'

        # with @ignorecase = False, matching 'Alpha' must fail
        mlregex = r"""
        @ ignorecase = False
        regex = /alpha/
        """
        result, messages, _ = compile_source(mlregex, None, get_ebnf_grammar(),
                        get_ebnf_transformer(), get_ebnf_compiler('MultilineRegexTest'))
        assert result
        assert not messages
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        node, rest = parser.regex(StringView('Alpha'))
        assert node is None

    def test_token(self):
        # string tokens (here: LaTeX-like delimiters) in a compiled grammar
        tokenlang = r"""@literalws = right
            @whitespace = linefeed
            lang        = "" begin_token {/\w+/ ""} end_token
            begin_token = "\begin{document}"
            end_token   = "\end{document}"
            """
        testdoc = r"""
            \begin{document}
            test
            \end{document}
            """
        result, messages, _ = compile_source(
            tokenlang, None, get_ebnf_grammar(), get_ebnf_transformer(),
            get_ebnf_compiler("TokenTest"))
        assert result
        assert not messages, str(messages)
        parser = compile_python_object(DHPARSER_IMPORTS + result, r'\w+Grammar$')()
        result = parser(testdoc)
        # log_parsing_history(parser, "test.log")
        assert not result.error_flag, str(result.errors_sorted)

class TestGrammar:
    """Tests for Grammar objects: position initialization, selecting the
    start symbol, subclassing, partial matching and synonym rules."""

    # NOTE: the grammar below is compiled once at class-definition time;
    # the class-level asserts fail early if the EBNF compiler is broken.
    grammar = r"""@whitespace = horizontal
    haupt        = textzeile LEERZEILE
    textzeile    = { WORT }+
    WORT         = /[^ \t]+/~
    LEERZEILE    = /\n[ \t]*(?=\n)/~
    """
    pyparser, messages, _ = compile_source(grammar, None, get_ebnf_grammar(),
                                           get_ebnf_transformer(), get_ebnf_compiler("PosTest"))
    assert pyparser, str(messages)
    assert not messages, str(messages)

    def test_pos_values_initialized(self):
        # checks whether pos values in the parsing result and in the
        # history record have been initialized
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("no_file_name*")
        for record in grammar.history__:
            assert not record.node or record.node.pos >= 0

    def test_select_parsing(self):
        # a grammar object can be called with any of its symbols as start rule
        grammar = compile_python_object(DHPARSER_IMPORTS + self.pyparser, r'\w+Grammar$')()
        grammar("wort", "WORT")
        grammar("eine Zeile", "textzeile")
        grammar("kein Haupt", "haupt")
        grammar("so ist es richtig", "haupt")

    def test_grammar_subclassing(self):
        """A grammar can also be defined programmatically as a subclass of
        Grammar, with parser combinators as class attributes."""
        class Arithmetic(Grammar):
            r'''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
            root__ = expression

        grammar = Arithmetic()
        CST = grammar('3+4')
        assert not CST.error_flag, CST.as_sxpr()

    def test_incomplete_matching(self):
        """Tests whether the flag `complete_match` works as expected when
        calling a grammar object in order to parse a document."""
        gr = grammar_provider('word = ~/\\w+/\n')()
        st = gr('eins')
        assert not st.errors
        st = gr('eins zwei')
        assert st.errors[0].code == PARSER_STOPPED_BEFORE_END
        st = gr('eins zwei', complete_match=False)
        assert not st.errors

    def test_synonym(self):
        """A rule that consists of nothing but another symbol (here: S = ~)
        acts as a synonym and must serialize back unchanged."""
        lang = r"""
            doc  = { word | number }
            word = /\w+/ S
            number = [VZ] /\d+/ S 
            S    = ~        # let S by a synonym for anonymous whitespace
            VZ   = "-"
        """
        gr = grammar_provider(lang)()
        st = gr('eins 1 zwei2drei 3')
        # set_config_value('compiled_EBNF_log', 'grammar.log')
        gr = grammar_provider("@drop = whitespace, strings" + lang)()
        st = gr('eins 1 zwei2drei 3')
        st = gr('-3')
        assert str(gr['S']) == "S = ~", str(gr['S'])

class TestSeries:
    """Tests for the Series parser, in particular the mandatory-marker §."""

    def test_non_mandatory(self):
        # without §, a failed series simply falls through to the regex
        lang = """
        document = series | /.*/
        series = "A" "B" "C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD")
        assert not st.error_flag
        st = parser("A_CD")
        assert not st.error_flag
        st = parser("AB_D")
        assert not st.error_flag

    def test_mandatory(self):
        """Test for the §-operator. The Series-parser should raise an
        error for any non-match that occurs after the mandatory-operator.
        """
        lang = """
        document = series | /.*/
        series = "A" "B" §"C" "D"
        """
        parser = grammar_provider(lang)()
        st = parser("ABCD");  assert not st.error_flag
        st = parser("A_CD");  assert not st.error_flag
        st = parser("AB_D");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION
        # transitivity of mandatory-operator
        st = parser("ABC_");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

    def test_series_composition(self):
        """Composing series with `+` must preserve the mandatory marker of
        the constituent series."""
        TA, TB, TC, TD, TE = (TKN(b) for b in "ABCDE")
        s1 = Series(TA, TB, TC, mandatory=2)
        s2 = Series(TD, TE)

        combined = Alternative(s1 + s2, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("ABCDE");  assert not st.error_flag
        st = parser("A_CDE");  assert not st.error_flag
        st = parser("AB_DE");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION
        st = parser("ABC_E");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

        combined = Alternative(s2 + s1, RegExp('.*'))
        parser = Grammar(combined)
        st = parser("DEABC");  assert not st.error_flag
        st = parser("_EABC");  assert not st.error_flag
        st = parser("D_ABC");  assert not st.error_flag
        st = parser("DE_BC");  assert not st.error_flag
        st = parser("DEA_C");  assert not st.error_flag
        st = parser("DEAB_");  assert st.error_flag
        assert st.errors_sorted[0].code == MANDATORY_CONTINUATION

    # def test_boundary_cases(self):
    #     lang = """
    #     document = series | §!single | /.*/
    #     series = "A" "B" §"C" "D"
    #     single = "E"
    #     """
    #     parser_class = grammar_provider(lang)
    #     parser = parser_class()
    #     print(parser.python_src__)
    #     print(parser_class.python_src__)

    def test_ebnf_serialization(self):
        ebnf_grammar = get_ebnf_grammar()
        # TODO: Add test here
        ebnf = ebnf_grammar.as_ebnf()
        # print(ebnf)

class TestAllOfSomeOf:
    """Tests for AllOf/SomeOf-style parsing realized via Interleave."""

    def test_allOf_order(self):
        """Test that parsers of an AllOf-List can match in arbitrary order."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'

    def test_allOf_completeness(self):
        """Test that an error is raised if not all parsers of an AllOf-List
        match."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('B').error_flag

    def test_allOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag

    def test_someOf_order(self):
        """Test that parsers of a SomeOf-List can match in arbitrary order
        and that (0, 1)-repetitions make constituents optional."""
        prefixes = Interleave(TKN("A"), TKN("B"))
        assert Grammar(prefixes)('A B').content == 'A B'
        assert Grammar(prefixes)('B A').content == 'B A'
        st = Grammar(prefixes)('B')
        assert st.error_flag
        prefixes = Interleave(TKN("B"), TKN("A"), repetitions=((0, 1), (0, 1)))
        assert Grammar(prefixes)('A B').content == 'A B'
        st = Grammar(prefixes)('B')
        assert not st.error_flag
        assert st.content == 'B'

    def test_someOf_redundance(self):
        """Test that one and the same parser may be listed several times
        and must be matched several times accordingly."""
        prefixes = Interleave(TKN("A"), TKN("B"), TKN("A"))
        assert Grammar(prefixes)('A A B').content == 'A A B'
        assert Grammar(prefixes)('A B A').content == 'A B A'
        assert Grammar(prefixes)('B A A').content == 'B A A'
        assert Grammar(prefixes)('A B B').error_flag

class TestInterleave:
    """Tests for the Interleave parser with and without explicit
    repetition ranges."""

    def test_interleave_most_simple(self):
        # default repetitions: each constituent exactly once, any order
        letterset = Interleave(Text("A"), Text("B"), Text("C"))
        gr = Grammar(letterset)
        st = gr('ABC')
        assert not st.errors, str(st.errors)
        assert st.content == "ABC"
        st = gr('BCA')
        assert not st.errors
        assert st.content == "BCA"
        st = gr('BCBA')
        assert st.errors
        st = gr('AB')
        assert st.errors

    def test_interleave(self):
        # A: 1..1000 times, B: optional, C: exactly once
        letterset = Interleave(Text("A"), Text("B"), Text("C"),
                               repetitions=[(1, 1000), (0, 1), (1, 1)])
        gr = Grammar(letterset)
        st = gr('AABC')
        assert not st.errors
        st = gr('BACAAA')
        assert not st.errors
        st = gr('ABCC')
        assert st.errors
        st = gr('AAACAAA')
        assert not st.errors
        st = gr('AAABAAA')
        assert st.errors

class TestErrorRecovery:
    """Tests of the @..._skip-directives that let the parser resume
    after a mandatory (§) item failed to match."""

    def test_series_skip(self):
        lang = """
        document = series | /.*/
        @series_skip = /(?=[A-Z])/
        series = "A" "B" §"C" "D"
        """
        gr = grammar_provider(lang)()
        tree = gr('AB_D')
        # skipping worked, so there is no additional "stopped before end"-error
        assert len(tree.errors) == 1
        resume_notices_on(gr)
        tree = gr('AB_D')
        # with resume notices turned on, a second, notice-level "error" appears
        assert len(tree.errors) == 2 and any(err.code == RESUME_NOTICE for err in tree.errors)
        assert 'Skipping' in str(tree.errors_sorted[1])

    def test_Interleave_skip(self):
        lang = """
        document = allof | /.*/
        @allof_skip = /[A-Z]/
        allof = "A" ° §"B" ° "C" ° "D"
        """
        gr = grammar_provider(lang)()
        tree = gr('CADB')
        assert 'allof' in tree and tree['allof'].content == "CADB"
        # when skipping cannot recover, the fallback /.*/ captures the text
        tree = gr('_BCD')
        assert tree.equals(parse_sxpr('(document "_BCD")'))
        tree = gr('_ABC')
        assert tree.equals(parse_sxpr('(document "_ABC")'))
        # malformed letters inside the interleave are skipped over,
        # the "allof"-node still covers the whole sample
        for sample in ('A_CD', 'AB_D', 'A__D', 'CA_D', 'A_CB'):
            assert gr(sample)['allof'].content == sample
        tree = gr('BC_A')
        assert 'allof' not in tree
715 716


717
class TestPopRetrieve:
    """Tests of the Capture/Retrieve/Pop-mechanism, i.e. parsers that
    store matched text in grammar variables (Capture) and match the
    stored value again later (Retrieve "::" / optional retrieve ":?")."""

    mini_language = r"""
        document       = { text | codeblock }
        codeblock      = delimiter { text | (!:delimiter delimiter_sign) } ::delimiter
        delimiter      = delimiter_sign  # never use delimiter between capture and pop except for retrieval!
        delimiter_sign = /`+/
        text           = /[^`]+/
        """
    mini_lang2 = r"""
        @braces_filter = matching_bracket()
        document       = { text | codeblock }
        codeblock      = braces { text | opening_braces | (!:braces closing_braces) } ::braces
        braces         = opening_braces
        opening_braces = /\{+/
        closing_braces = /\}+/
        text           = /[^{}]+/
        """
    mini_lang3 = r"""@literalws = right
        document       = { text | env }
        env            = (specialtag | opentag) text [ closespecial | closetag ]
        opentag        = "<" name ">"
        specialtag     = "<" /ABC/ !name ">"
        closetag       = close_slash | close_star
        close_slash    = "<" ::name "/>"
        close_star     = "<" ::name "*>"
        closespecial   = "<" /ABC/~ ">"
        name           = /\w+/~
        text           = /[^<>]+/
        """
    mini_lang4 = r"""@literalws = right
        document       = { text | env }
        env            = opentag document closetag
        opentag        = "<" name ">"
        closetag       = "</" :?name ">"
        name           = /\w+/~
        text           = /[^<>]+/        
    """

    def setup(self):
        # xunit-style setup: compile one parser per mini-language before each test
        self.minilang_parser = grammar_provider(self.mini_language)()
        self.minilang_parser2 = grammar_provider(self.mini_lang2)()
        self.minilang_parser3 = grammar_provider(self.mini_lang3)()
        self.minilang_parser4 = grammar_provider(self.mini_lang4)()

    @staticmethod
    def has_tag_name(node, name):
        """Match-function: True if `node` carries the tag-name `name`."""
        return node.tag_name == name # and not isinstance(node.parser, Retrieve)

    def test_capture_assertions(self):
        """Capture must reject content-less (dropped) parsers and parsers
        that are not named symbols; a named Capture is accepted."""
        try:
            _ = Grammar(Capture(Drop(Whitespace(r'\s*'))))
            assert False, "GrammarError expected!"
        except GrammarError:
            pass
        try:
            _ = Grammar(Capture(Series(Text(' '), Drop(Whitespace(r'\s*')))))
            # fixed message: a GrammarError (not ValueError) is what is caught here
            assert False, "GrammarError expected!"
        except GrammarError:
            pass
        cp = Capture(RegExp(r'\w+'))
        cp.pname = "capture"
        _ = Grammar(cp)  # named capture of a content-carrying parser is legal

    def test_compile_mini_language(self):
        """All four mini-language grammars compile to parsers."""
        assert self.minilang_parser
        assert self.minilang_parser2
        assert self.minilang_parser3
        assert self.minilang_parser4

    def test_stackhandling(self):
        """Captured names must be popped correctly even when the opening
        tag is ambiguous (specialtag vs. opentag)."""
        ambigous_opening = "<ABCnormal> normal tag <ABCnormal*>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        ambigous_opening = "<ABCnormal> normal tag <ABCnormal/>"
        syntax_tree = self.minilang_parser3(ambigous_opening)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        forgot_closing_tag = "<em> where is the closing tag?"
        syntax_tree = self.minilang_parser3(forgot_closing_tag)
        assert syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em/>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

        proper = "<em> has closing tag <em*>"
        syntax_tree = self.minilang_parser3(proper)
        assert not syntax_tree.error_flag, str(syntax_tree.errors_sorted)

    def test_optional_match(self):
        """The optional retrieve ":?name" allows anonymous closing tags "</>"."""
        # from DHParser.dsl import compileEBNF
        # src = compileEBNF(self.mini_lang4)
        # print(src)
        # return
        test1 = '<info>Hey, you</info>'
        st = self.minilang_parser4(test1)
        assert not st.error_flag, str(st.errors_sorted)
        test12 = '<info>Hey, <emph>you</emph></info>'
        # bugfix: this used to re-parse test1, leaving test12 untested
        st = self.minilang_parser4(test12)
        assert not st.error_flag
        test2 = '<info>Hey, you</>'
        # set_config_value('history_tracking', True)
        # set_tracer(self.minilang_parser4, trace_history)
        # start_logging('LOGS')
        st = self.minilang_parser4(test2)
        # log_parsing_history(self.minilang_parser4, "optional_match")
        # print(st.as_sxpr())
        assert not st.error_flag
        test3 = '<info>Hey, <emph>you</></>'
        st = self.minilang_parser4(test3)
        assert not st.error_flag
        test4 = '<info>Hey, <emph>you</></info>'
        st = self.minilang_parser4(test4)
        assert not st.error_flag

    def test_rollback_behaviour_of_optional_match(self):
        """On failure, an optional retrieve must roll back the variable stack."""
        test1 = '<info>Hey, you</info*>'
        st = self.minilang_parser4(test1)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag
        test2 = '<info>Hey, you</*>'
        st = self.minilang_parser4(test2)
        assert not self.minilang_parser4.variables__['name']
        assert st.error_flag

    def test_cache_neutrality_1(self):
        """Test that packrat-caching does not interfere with the variable-
        changing parsers: Capture and Retrieve."""
        lang = r"""@literalws = right
            text = opening closing
            opening = (unmarked_package | marked_package)
            closing = ::variable
            unmarked_package = package "."
            marked_package = package "*" "."
            package = "(" variable ")"
            variable = /\w+/~
            """
        case = "(secret)*. secret"
        gr = grammar_provider(lang)()
        st = gr(case)
        assert not st.error_flag, str(st.errors_sorted)

    def test_cache_neutrality_2(self):
        """Caching must stay neutral when the same delimiter-symbol is
        captured on different alternative branches."""
        lang = r'''document = variantA | variantB
            variantA  = delimiter `X` ::delimiter `!` 
            variantB  = `A` delimiter ::delimiter `!` 
            delimiter = `A` | `X`
        '''
        gr = grammar_provider(lang)()
        case = 'AXA!'
        st = gr(case)
        assert not st.errors
        case = 'AXX!'
        st = gr(case)
        # NOTE(review): leftover debug scaffolding removed (it flipped the
        # global 'history_tracking' config and wrote a LOGS directory on
        # every run); the 'AXX!' case still lacks an assertion — TODO
        # determine and assert its expected outcome.
        # set_config_value('history_tracking', True)
        # start_logging('LOGS')
        # set_tracer(gr, trace_history)
        # log_parsing_history(gr, 'test_cache_neutrality_2')
        # print(st.as_sxpr())

    def test_single_line(self):
        """Backquote-delimiters of equal length open and close a code block."""
        teststr = "Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted, \
            ''.join(str(error) for error in syntax_tree.errors_sorted)
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert delim == pop  # popped delimiter equals the captured one
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line(self):
        """Same as test_single_line, but across several lines/paragraphs."""
        teststr = """
            Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ebde

            Absatz ohne ``` codeblock, aber
            das stellt sich erst am Ende herause...

            Mehrzeliger ```code block
            """
        syntax_tree = self.minilang_parser(teststr)
        assert not syntax_tree.errors_sorted
        matchf = partial(self.has_tag_name, name="delimiter")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert delim == pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_single_line_complement(self):
        """With @braces_filter, the popped value is the *matching* bracket,
        i.e. same length but different characters."""
        teststr = "Anfang {{{code block }} <- keine Ende-Zeichen ! }}} Ende"
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert len(delim) == len(pop)
        assert delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_single_line.cst")

    def test_multi_line_complement(self):
        """Same as test_single_line_complement, across several lines."""
        teststr = """
            Anfang {{{code block {{ <- keine Ende-Zeichen ! }}} Ende

            Absatz ohne {{{ codeblock, aber
            das stellt sich erst am Ende heraus...

            Mehrzeliger }}}code block
            """
        syntax_tree = self.minilang_parser2(teststr)
        assert not syntax_tree.errors_sorted
        matchf = partial(self.has_tag_name, name="braces")
        delim = str(next(syntax_tree.select_if(matchf)))
        pop = str(next(syntax_tree.select_if(matchf, reverse=True)))
        assert len(delim) == len(pop) and delim != pop
        if is_logging():
            log_ST(syntax_tree, "test_PopRetrieve_multi_line.cst")

    def test_autoretrieve(self):
        """Definition order of a symbol containing an optional retrieve
        must not affect the parsing result."""
        lang = r"""@literalws = right
            document   = { definition } § EOF
            definition = symbol :defsign value
            symbol     = /\w+/~                      
            defsign    = "=" | ":="
            value      = /\d+/~
            EOF        = !/./ [ :?defsign ]   # eat up captured defsigns
        """
        # print(raw_compileEBNF(lang).result)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.error_flag, str(st.errors)
        st1 = st
        st = parser("")
        assert not st.error_flag

        # moving the EOF-definition to a different position in the grammar
        # (the line-juggling below relies on the exact layout of `lang`)
        lines = [line for line in lang.split('\n') if line.strip()]
        eof_line = lines.pop()
        lines.insert(2, eof_line)
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors, str(st.errors)
        assert st.equals(st1)

        del lines[2]
        lines.insert(3, eof_line)
        lang = '\n'.join(lines)
        parser = grammar_provider(lang)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)

        # and, finally...
        lang_variant = r"""@literalws = right
            document   = { definition } § EOF
            symbol     = /\w+/~                      
            defsign    = "=" | ":="
            value      = /\d+/~
            EOF        = !/./ :?defsign   # eat up captured defsign, only if it has been retrieved
            definition = symbol :defsign value
        """
        parser = grammar_provider(lang_variant)()
        st = parser("X := 1")
        assert not st.errors
        assert st.equals(st1)
        st = parser('')
        assert "EOF expected" in str(st.errors)
di68kap's avatar
di68kap committed
987

988

989
class TestWhitespaceHandling:
    """Tests that string-literals ("tokens") absorb insignificant
    whitespace (@literalws = right), while plain regexes do not."""

    minilang = """@literalws = right
        doc = A B
        A = "A"
        B = "B"
        Rdoc = ar br
        ar = /A/
        br = /B/
        """
    gr = grammar_provider(minilang)()

    def test_token_whitespace(self):
        # tokens tolerate whitespace between them
        for sample in ("AB", "A B"):
            assert not self.gr(sample, 'doc').error_flag

    def test_regexp_whitespace(self):
        # bare regexes match adjacent input only
        assert not self.gr("AB", 'Rdoc').error_flag
        assert self.gr("A B", 'Rdoc').error_flag
di68kap's avatar
di68kap committed
1011 1012 1013 1014 1015


class TestErrorReporting:
    grammar = """
        root      = series alpha | anything
eckhart's avatar
eckhart committed
1016
        series    = subseries &alpha
di68kap's avatar
di68kap committed
1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030
        subseries = alpha §beta
        alpha     = /[a-z]+/
        beta      = /[A-Z]+/
        anything  = /.*/
        """

    def setup(self):
        self.parser = grammar_provider(self.grammar)()

    def test_error_propagation(self):
        testcode1 = "halloB"
        testcode2 = "XYZ"
        testcode3 = "hallo "
        cst = self.parser(testcode1)
1031
        assert not cst.error_flag, str(cst.errors_sorted)
di68kap's avatar
di68kap committed
1032 1033 1034 1035
        cst = self.parser(testcode2)
        assert not cst.error_flag
        cst = self.parser(testcode3)
        assert cst.error_flag