Starting from 2021-07-01, all LRZ GitLab users will be required to explicitly accept the GitLab Terms of Service. Please see the detailed information at https://doku.lrz.de/display/PUBLIC/GitLab and make sure that your projects conform to the requirements.

parsers.py 54.6 KB
Newer Older
1
"""parsers.py - parser combinators for for DHParser
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences an Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
Eckhart Arnold's avatar
Eckhart Arnold committed
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48

Module ``parsers.py`` contains a number of classes that together
make up parser combinators for left-recursive grammers. For each
element of the extended Backus-Naur-Form as well as for a regular
expression token a class is defined. The set of classes can be used to
define a parser for (ambiguous) left-recursive grammers.


References and Acknowledgements:

Dominikus Herzberg: Objekt-orientierte Parser-Kombinatoren in Python,
Blog-Post, September, 18th 2008 on denkspuren. gedanken, ideen,
anregungen und links rund um informatik-themen, URL:
http://denkspuren.blogspot.de/2008/09/objekt-orientierte-parser-kombinatoren.html

Dominikus Herzberg: Eine einfache Grammatik für LaTeX, Blog-Post,
September, 18th 2008 on denkspuren. gedanken, ideen, anregungen und
links rund um informatik-themen, URL:
http://denkspuren.blogspot.de/2008/09/eine-einfache-grammatik-fr-latex.html

Dominikus Herzberg: Uniform Syntax, Blog-Post, February, 27th 2007 on
denkspuren. gedanken, ideen, anregungen und links rund um
informatik-themen, URL:
http://denkspuren.blogspot.de/2007/02/uniform-syntax.html

Richard A. Frost, Rahmatullah Hafiz and Paul Callaghan: Parser
Combinators for Ambiguous Left-Recursive Grammars, in: P. Hudak and
D.S. Warren (Eds.): PADL 2008, LNCS 4902, pp. 167–181, Springer-Verlag
Berlin Heidelberg 2008.

Juancarlo Añez: grako, a PEG parser generator in Python,
https://bitbucket.org/apalala/grako
49 50
"""

51

Eckhart Arnold's avatar
Eckhart Arnold committed
52
import abc
53
import copy
54 55
import os
import platform
Eckhart Arnold's avatar
Eckhart Arnold committed
56 57
from functools import partial

58 59 60 61
try:
    import regex as re
except ImportError:
    import re
62
try:
63 64 65 66 67
    from typing import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union
    try:
        from typing import Collection
    except ImportError:
        pass
68
except ImportError:
69
    from .typing34 import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union
70

71
from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name
Eckhart Arnold's avatar
Eckhart Arnold committed
72 73
from DHParser.syntaxtree import WHITESPACE_PTYPE, TOKEN_PTYPE, ZOMBIE_PARSER, ParserBase, \
    Node, TransformationFunc
74
from DHParser.toolkit import load_if_file, error_messages
Eckhart Arnold's avatar
Eckhart Arnold committed
75

76

77 78
# Explicit public API of this module (names exported by `from ... import *`).
__all__ = ['ScannerFunc',
           'HistoryRecord',
           'Parser',
           'Grammar',
           'RX_SCANNER_TOKEN',
           'BEGIN_SCANNER_TOKEN',
           'END_SCANNER_TOKEN',
           'make_token',
           'nil_scanner',
           'ScannerToken',
           'RegExp',
           'RE',
           'Token',
           'mixin_comment',
           # 'UnaryOperator',
           # 'NaryOperator',
           'Synonym',
           'Optional',
           'ZeroOrMore',
           'OneOrMore',
           'Series',
           'Alternative',
           'FlowOperator',
           'Required',
           'Lookahead',
           'NegativeLookahead',
           'Lookbehind',
           'NegativeLookbehind',
           'last_value',
           'counterpart',
           'accumulate',
           'Capture',
           'Retrieve',
           'Pop',
           'Forward',
           'Compiler',
           'compile_source']
Eckhart Arnold's avatar
Eckhart Arnold committed
114

115

116

117 118 119 120 121 122 123 124 125 126
########################################################################
#
# Grammar and parsing infrastructure
#
########################################################################


# Type of a scanner (preprocessor) function: maps source text to scanned
# text.  `partial` is admitted so parameterized scanners built with
# `functools.partial` pass isinstance-style type checks.
ScannerFunc = Union[Callable[[str], str], partial]


LEFT_RECURSION_DEPTH = 20 if platform.python_implementation() == "PyPy" \
                          else 8 # type: int
# because of python's recursion depth limit, this value ought not to be set too high
MAX_DROPOUTS = 25  # type: int
# stop trying to recover parsing after so many errors
132

133

134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
class HistoryRecord:
    """
    Stores debugging information about one completed step in the
    parsing history.

    A parsing step is "completed" when the last one of a nested
    sequence of parser-calls returns. The call stack including
    the last parser call will be frozen in the ``HistoryRecord``-
    object. In addition a reference to the generated leaf node
    (if any) will be stored and the result status of the last
    parser call, which is either MATCH, FAIL (i.e. no match)
    or ERROR.
    """
    __slots__ = ('call_stack', 'node', 'remaining')

    MATCH = "MATCH"
    ERROR = "ERROR"
    FAIL = "FAIL"

    def __init__(self, call_stack: List['Parser'], node: Node, remaining: int) -> None:
        self.call_stack = call_stack    # type: List['Parser']
        self.node = node                # type: Node
        self.remaining = remaining      # type: int

    def err_msg(self) -> str:
        # Flatten the node's error list into one line.
        error_text = "; ".join(self.node._errors).replace('\n', '\\')
        return self.ERROR + ": " + error_text

    @property
    def stack(self) -> str:
        # Render the frozen call stack as "a->b->c"; anonymous RegExp
        # parsers are shown by their repr instead of a name.
        labels = []
        for p in self.call_stack:
            if p.ptype == ':RegExp':
                labels.append(repr(p))
            else:
                labels.append(p.name or p.ptype)
        return "->".join(labels)

    @property
    def status(self) -> str:
        if self.node is None:
            return self.FAIL
        if self.node._errors:
            return self.err_msg()
        return self.MATCH

    @property
    def extent(self) -> slice:
        # Slice of the original document covered by this step, expressed
        # with negative indices counted from the document's end.
        if self.node:
            return slice(-self.remaining - self.node.len, -self.remaining)
        return slice(-self.remaining, None)
175 176 177


def add_parser_guard(parser_func):
    """
    Wraps ``parser_func`` (a Parser's ``__call__``-method) with guard
    code that adds memoizing, left-recursion breaking, rollback of
    variable changes and optional history tracking. Returns the
    wrapped callable.
    """
    def guarded_call(parser: 'Parser', text: str) -> Tuple[Node, str]:
        try:
            # position is identified by the length of the remaining text
            location = len(text)
            grammar = parser.grammar  # grammar may be 'None' for unconnected parsers!

            if not grammar.moving_forward__:
                # rollback variable changes from discarded parser passes
                if grammar.last_rb__loc__ <= location:
                    grammar.rollback_to__(location)
                grammar.moving_forward__ = True
                grammar.left_recursion_encountered__ = False

            if grammar.history_tracking__:
                grammar.call_stack__.append(parser)

            # if location has already been visited by the current parser,
            # return saved result
            if location in parser.visited:
                return parser.visited[location]
            # break left recursion at the maximum allowed depth
            if parser.recursion_counter.setdefault(location, 0) > LEFT_RECURSION_DEPTH:
                grammar.left_recursion_encountered__ = True
                return None, text

            parser.recursion_counter[location] += 1

            # run original __call__ method
            node, rest = parser_func(parser, text)

            if node is None:
                # retrieve an earlier match result (from left recursion)
                # if it exists
                node, rest = parser.visited.get(location, (None, rest))
                # don't overwrite any positive match (i.e. node not None) in the cache
                # and don't add empty entries for parsers returning from left recursive calls!
                if node is None and not grammar.left_recursion_encountered__:
                    # otherwise also cache None-results
                    parser.visited[location] = None, rest
            else:
                # variable manipulating parsers will be excluded, though,
                # because caching would interfere with changes of variable state
                if grammar.last_rb__loc__ > location:
                    # in case of left recursion, the first recursive step that
                    # matches will store its result in the cache
                    parser.visited[location] = (node, rest)
                grammar.last_node__ = node   # store last node for Lookbehind parser

            parser.recursion_counter[location] -= 1

            if grammar.history_tracking__:
                # don't track returning parsers except in case an error has occurred
                if grammar.moving_forward__ or (node and node._errors):
                    record = HistoryRecord(grammar.call_stack__.copy(), node, len(rest))
                    grammar.history__.append(record)
                    # print(record.stack, record.status, rest[:20].replace('\n', '|'))
                grammar.call_stack__.pop()
            grammar.moving_forward__ = False

        except RecursionError:
            # Python's own recursion limit was hit before the parser's
            # LEFT_RECURSION_DEPTH guard; report a synthetic error node.
            node = Node(None, text[:min(10, max(1, text.find("\n")))] + " ...")
            node.add_error("maximum recursion depth of parser reached; "
                           "potentially due to too many errors!")
            rest = ''

        return node, rest

    return guarded_call


Eckhart Arnold's avatar
Eckhart Arnold committed
247
class ParserMetaClass(abc.ABCMeta):
    """
    Metaclass for Parser classes: wraps each class's ``__call__``-method
    with ``add_parser_guard`` so that memoizing, left-recursion breaking
    and history tracking are applied uniformly to all parsers.
    """
    def __init__(cls, name, bases, attrs):
        # The following condition is necessary for classes that don't override
        # the __call__() method, because in these cases the non-overridden
        # __call__()-method would be substituted a second time!
        guarded_parser_call = add_parser_guard(cls.__call__)
        if cls.__call__.__code__ != guarded_parser_call.__code__:
            cls.__call__ = guarded_parser_call
        super(ParserMetaClass, cls).__init__(name, bases, attrs)


Eckhart Arnold's avatar
Eckhart Arnold committed
258
class Parser(ParserBase, metaclass=ParserMetaClass):
    """
    Base class for all parsers. A parser is called with the text to be
    parsed and returns a pair ``(node, rest)``, where ``node`` is the
    resulting parse-tree node (``None`` if the parser did not match)
    and ``rest`` is the remaining unparsed text. Parsers can be
    combined with ``+`` (Series) and ``|`` (Alternative).
    """

    # Type of functions that can be applied to a parser tree via `apply()`.
    ApplyFunc = Callable[['Parser'], None]

    def __init__(self, name: str = '') -> None:
        # assert isinstance(name, str), str(name)
        super(Parser, self).__init__(name)
        self._grammar = None  # type: 'Grammar'
        self.reset()

    def __deepcopy__(self, memo):
        # Copies only the name; grammar assignment and per-run state are
        # re-established by the copy's owner.
        return self.__class__(self.name)

    def reset(self):
        """Resets the per-parsing-run state (memoization cache, recursion
        counters and cycle detection) of this parser."""
        self.visited = dict()            # type: Dict[int, Tuple[Node, str]]
        self.recursion_counter = dict()  # type: Dict[int, int]
        self.cycle_detection = set()     # type: Set[Callable]
        return self

    def __call__(self, text: str) -> Tuple[Node, str]:
        return None, text  # default behaviour: don't match

    def __add__(self, other: 'Parser') -> 'Series':
        return Series(self, other)

    def __or__(self, other: 'Parser') -> 'Alternative':
        return Alternative(self, other)

    @property
    def grammar(self) -> 'Grammar':
        # Grammar object this parser (copy) is connected to; may be None
        # for unconnected parsers.
        return self._grammar

    @grammar.setter
    def grammar(self, grammar: 'Grammar'):
        self._grammar = grammar
        self._grammar_assigned_notifier()

    def _grammar_assigned_notifier(self):
        # Hook for subclasses that need to react when connected to a grammar.
        pass

    def apply(self, func: ApplyFunc):
        """
        Applies function `func(parser)` recursively to this parser and all
        descendants of the tree of parsers. The same function can never
        be applied twice between calls of the ``reset()``-method!
        """
        if func in self.cycle_detection:
            return False
        else:
            self.cycle_detection.add(func)
            func(self)
            return True


311
def mixin_comment(whitespace: str, comment: str) -> str:
    """
    Returns a regular expression that merges comment and whitespace
    regexps. Thus comments can occur wherever whitespace is allowed
    and will be skipped just as implicit whitespace.

    Note, that because this works on the level of regular expressions,
    nesting comments is not possible. It also makes it much harder to
    use directives inside comments (which isn't recommended, anyway).
    """
    # pattern: whitespace, then any number of (comment, whitespace) pairs
    parts = ['(?:', whitespace, '(?:', comment, whitespace, ')*)']
    return ''.join(parts)


325
class Grammar:
    """
    Base class for grammar classes. A concrete grammar defines its
    parsers as class attributes (see ``_assign_parser_names()``).
    Instantiating the class deep-copies the parser tree from the class
    to the instance; calling the instance parses a document and returns
    the root node of the resulting parse tree (see ``__call__()``).
    """

    root__ = None  # type: Union[Parser, None]
    # root__ must be overwritten with the root-parser by grammar subclass
    parser_initialization__ = "pending"  # type: str
    # some default values
    COMMENT__ = r''  # r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'[\t ]*', comment=COMMENT__)
    wspL__ = ''
    wspR__ = WSP__


    @classmethod
    def _assign_parser_names(cls):
        """
        Initializes the `parser.name` fields of those
        Parser objects that are directly assigned to a class field with
        the field's name, e.g.
            class Grammar(Grammar):
                ...
                symbol = RE('(?!\\d)\\w+')
        After the call of this method symbol.name == "symbol"
        holds. Names assigned via the ``name``-parameter of the
        constructor will not be overwritten. Parser names starting or
        ending with a double underscore like ``root__`` will be
        ignored. See ``toolkit.sane_parser_name()``

        This is done only once, upon the first instantiation of the
        grammar class!

        Attention: If there exists more than one reference to the same
        parser, only the first one will be chosen for python versions
        greater or equal 3.6.  For python version <= 3.5 an arbitrarily
        selected reference will be chosen. See PEP 520
        (www.python.org/dev/peps/pep-0520/) for an explanation of why.
        """
        if cls.parser_initialization__ == "done":
            return
        cdict = cls.__dict__
        for entry, parser in cdict.items():
            if isinstance(parser, Parser) and sane_parser_name(entry):
                if not parser.name:
                    parser.name = entry
                if (isinstance(parser, Forward) and (not parser.parser.name)):
                    parser.parser.name = entry
        cls.parser_initialization__ = "done"


    def __init__(self, root: Parser=None) -> None:
        # if not hasattr(self.__class__, 'parser_initialization__'):
        #     self.__class__.parser_initialization__ = "pending"
        if not hasattr(self.__class__, 'wspL__'):
            self.wspL__ = ''
        if not hasattr(self.__class__, 'wspR__'):
            self.wspR__ = ''
        self.all_parsers__ = set()  # type: Set[Parser]
        self.dirty_flag__ = False
        self.history_tracking__ = False
        self._reset__()

        # prepare parsers in the class, first
        self._assign_parser_names()

        # then deep-copy the parser tree from class to instance;
        # parsers not connected to the root object will be copied later
        # on demand (see Grammar.__getitem__()). Usually, the need to
        # do so only arises during testing.
        self.root__ = root if root else copy.deepcopy(self.__class__.root__)

        if self.wspL__:
            self.wsp_left_parser__ = Whitespace(self.wspL__)  # type: ParserBase
            self.wsp_left_parser__.grammar = self
            self.all_parsers__.add(self.wsp_left_parser__)  # don't you forget about me...
        else:
            self.wsp_left_parser__ = ZOMBIE_PARSER
        if self.wspR__:
            self.wsp_right_parser__ = Whitespace(self.wspR__)  # type: ParserBase
            self.wsp_right_parser__.grammar = self
            self.all_parsers__.add(self.wsp_right_parser__)  # don't you forget about me...
        else:
            self.wsp_right_parser__ = ZOMBIE_PARSER
        self.root__.apply(self._add_parser__)


    def __getitem__(self, key):
        """Returns the parser named ``key``, deep-copying it from the
        class and connecting it to this grammar instance on demand."""
        try:
            return self.__dict__[key]
        except KeyError:
            parser_template = getattr(self, key, None)
            if parser_template:
                # add parser to grammar object on the fly...
                parser = copy.deepcopy(parser_template)
                parser.apply(self._add_parser__)
                # assert self[key] == parser
                return self[key]
            raise KeyError('Unknown parser "%s" !' % key)


    def _reset__(self):
        """Resets the per-parsing-run state of the grammar object."""
        self.document__ = ""          # type: str
        # variables stored and recalled by Capture and Retrieve parsers
        self.variables__ = dict()     # type: Dict[str, List[str]]
        self.rollback__ = []          # type: List[Tuple[int, Callable]]
        self.last_rb__loc__ = -1  # type: int
        # previously parsed node, needed by Lookbehind parser
        self.last_node__ = None       # type: Node
        # support for call stack tracing
        self.call_stack__ = []        # type: List[Parser]
        # snapshots of call stacks
        self.history__ = []           # type: List[HistoryRecord]
        # also needed for call stack tracing
        self.moving_forward__ = True  # type: bool
        self.left_recursion_encountered__ = False  # type: bool


    def _add_parser__(self, parser: Parser) -> None:
        """
        Adds the particular copy of the parser object to this
        particular instance of Grammar.
        """
        if parser.name:
            # prevent overwriting instance variables or parsers of a different class
            assert parser.name not in self.__dict__ or \
                   isinstance(self.__dict__[parser.name], parser.__class__), \
                ('Cannot add parser "%s" because a field with the same name '
                 'already exists in grammar object!' % parser.name)
            setattr(self, parser.name, parser)
        self.all_parsers__.add(parser)
        parser.grammar = self


    def __call__(self, document: str, start_parser="root__") -> Node:
        """
        Parses a document with parser-combinators.

        Args:
            document (str): The source text to be parsed.
            start_parser (str): The name of the parser with which to
                start. This is useful for testing particular parsers
                (i.e. particular parts of the EBNF-Grammar.)
        Returns:
            Node: The root node ot the parse tree.
        """
        # assert isinstance(document, str), type(document)
        if self.root__ is None:
            raise NotImplementedError()
        if self.dirty_flag__:
            self._reset__()
            for parser in self.all_parsers__:
                parser.reset()
        else:
            self.dirty_flag__ = True
        self.history_tracking__ = is_logging()
        self.document__ = document
        self.last_rb__loc__ = len(document) + 1  # rollback location
        parser = self[start_parser] if isinstance(start_parser, str) else start_parser
        assert parser.grammar == self, "Cannot run parsers from a different grammar object!" \
                                       " %s vs. %s" % (str(self), str(parser.grammar))
        stitches = []  # type: List[Node]
        rest = document
        if not rest:
            # make sure `result` is defined even for an empty document
            result, ignore = parser(rest)
        while rest and len(stitches) < MAX_DROPOUTS:
            result, rest = parser(rest)
            if rest:
                # skip to the next line and try to resume parsing there
                fwd = rest.find("\n") + 1 or len(rest)
                skip, rest = rest[:fwd], rest[fwd:]
                if result is None:
                    error_msg = "Parser did not match! Invalid source file?"
                else:
                    stitches.append(result)
                    error_msg = "Parser stopped before end" + \
                                (("! trying to recover" +
                                  (" but stopping history recording at this point."
                                   if self.history_tracking__ else "..."))
                                 if len(stitches) < MAX_DROPOUTS
                                 else " too often! Terminating parser.")
                stitches.append(Node(None, skip))
                stitches[-1].add_error(error_msg)
                if self.history_tracking__:
                    # some parsers may have matched and left history records with nodes != None.
                    # Because these are not connected to the stiched root node, their pos
                    # properties will not be initialized by setting the root node's pos property
                    # to zero. Therefore, their pos properties need to be initialized here
                    for record in self.history__:
                        if record.node and record.node._pos < 0:
                            record.node.pos = 0
                    record = HistoryRecord(self.call_stack__.copy(), stitches[-1], len(rest))
                    self.history__.append(record)
                    self.history_tracking__ = False
        if stitches:
            if rest:
                stitches.append(Node(None, rest))
            result = Node(None, tuple(stitches))
        if any(self.variables__.values()):
            result.add_error("Capture-retrieve-stack not empty after end of parsing: "
                             + str(self.variables__))
        result.pos = 0  # calculate all positions
        return result


    def push_rollback__(self, location, func):
        """
        Adds a rollback function that either removes or re-adds
        values on the variable stack (`self.variables__`) that have been
        added (or removed) by Capture or Pop Parsers, the results of
        which have been dismissed.
        """
        self.rollback__.append((location, func))
        self.last_rb__loc__ = location


    def rollback_to__(self, location):
        """
        Rolls back the variable stacks (`self.variables__`) to their
        state at an earlier location in the parsed document.
        """
        while self.rollback__ and self.rollback__[-1][0] <= location:
            loc, rollback_func = self.rollback__.pop()
            assert not loc > self.last_rb__loc__
            rollback_func()
        # BUG FIX: the original statement used '==' (a no-op comparison)
        # instead of '=', so last_rb__loc__ was never updated after
        # unwinding the rollback stack.
        self.last_rb__loc__ = self.rollback__[-1][0] if self.rollback__ \
            else (len(self.document__) + 1)


    def log_parsing_history__(self, log_file_name: str = '') -> None:
        """
        Writes a log of the parsing history of the most recently parsed
        document.
        """
        def prepare_line(record):
            # stack, status and a short (<= 25 chars) excerpt of the document
            excerpt = self.document__.__getitem__(record.extent)[:25].replace('\n', '\\n')
            excerpt = "'%s'" % excerpt if len(excerpt) < 25 else "'%s...'" % excerpt
            return record.stack, record.status, excerpt

        def write_log(history, log_name):
            # writes the log lines or removes a stale log file
            path = os.path.join(log_dir(), log_name + "_parser.log")
            if history:
                with open(path, "w", encoding="utf-8") as f:
                    f.write("\n".join(history))
            elif os.path.exists(path):
                os.remove(path)

        if not log_file_name:
            name = self.__class__.__name__
            log_file_name = name[:-7] if name.lower().endswith('grammar') else name
        full_history, match_history, errors_only = [], [], []
        for record in self.history__:
            line = ";  ".join(prepare_line(record))
            full_history.append(line)
            if record.node and record.node.parser.ptype != WHITESPACE_PTYPE:
                match_history.append(line)
                if record.node.error_flag:
                    errors_only.append(line)
        write_log(full_history, log_file_name + '_full')
        write_log(match_history, log_file_name + '_match')
        write_log(errors_only, log_file_name + '_errors')
581 582


Eckhart Arnold's avatar
Eckhart Arnold committed
583
def dsl_error_msg(parser: Parser, error_str: str) -> str:
    """
    Returns an error message for errors in the parser configuration,
    e.g. errors that result in infinite loops.

    Args:
        parser (Parser):  The parser where the error was noticed. Note
            that this is not necessarily the parser that caused the
            error but only where the error became apparent.
        error_str (str):  A short string describing the error.
    Returns:
        str: An error message including the call stack if history
        tacking has been turned in the grammar object.
    """
    parts = ["DSL parser specification error:",
             error_str,
             'Caught by parser "%s".' % str(parser)]
    history = parser.grammar.history__
    if history:
        parts.append("\nCall stack:")
        parts.append(history[-1].stack)
    else:
        parts.append("\nEnable history tracking in Grammar object to display call stack.")
    return " ".join(parts)


605

606 607 608 609 610 611 612 613 614 615 616 617
########################################################################
#
# Token and Regular Expression parser classes (i.e. leaf classes)
#
########################################################################


# Regex for valid scanner-token names. FIX: use a raw string — the
# original '\w+' relied on an invalid escape sequence, which is a
# DeprecationWarning (and eventually an error) in modern Python.
RX_SCANNER_TOKEN = re.compile(r'\w+')
# Magic delimiter characters (ESC and FS control characters) that mark
# the beginning and end of a scanner token in the source text.
BEGIN_SCANNER_TOKEN = '\x1b'
END_SCANNER_TOKEN = '\x1c'


def make_token(token: str, argument: str = '') -> str:
    """
    Turns the ``token`` and ``argument`` into a special token that
    will be caught by the `ScannerToken`-parser.

    This function is a support function that should be used by scanners
    to inject scanner tokens into the source text.

    Args:
        token: the token name; must match ``RX_SCANNER_TOKEN``.
        argument: optional payload; must not contain the delimiter
            characters.
    Returns:
        str: the delimited token string.
    """
    assert RX_SCANNER_TOKEN.match(token)
    assert argument.find(BEGIN_SCANNER_TOKEN) < 0
    assert argument.find(END_SCANNER_TOKEN) < 0

    return BEGIN_SCANNER_TOKEN + token + argument + END_SCANNER_TOKEN


633
def nil_scanner(text: str) -> str:
    """Default scanner that does nothing: passes the text through unchanged."""
    return text
635 636 637


class ScannerToken(Parser):
    """
    Parses tokens that have been inserted by a Scanner.

    Scanners can generate Tokens with the ``make_token``-function.
    These tokens start and end with magic characters that can only be
    matched by the ScannerToken Parser. Scanner tokens can be used to
    insert BEGIN - END delimiters at the beginning or ending of an
    indented block. Otherwise indented block are difficult to handle
    with parsing expression grammars.
    """

    def __init__(self, scanner_token: str) -> None:
        # the token name doubles as the parser's name
        assert scanner_token and scanner_token.isupper()
        assert RX_SCANNER_TOKEN.match(scanner_token)
        super(ScannerToken, self).__init__(scanner_token)

    def __call__(self, text: str) -> Tuple[Node, str]:
        """Matches a delimited scanner token at the head of ``text``;
        emits error nodes for malformed tokens."""
        if text[0:1] == BEGIN_SCANNER_TOKEN:
            end = text.find(END_SCANNER_TOKEN, 1)
            if end < 0:
                node = Node(self, '').add_error(
                    'END_SCANNER_TOKEN delimiter missing from scanner token. '
                    '(Most likely due to a scanner bug!)')  # type: Node
                return node, text[1:]
            elif end == 0:
                # NOTE(review): find() with start index 1 can never return 0,
                # so this branch looks unreachable — confirm before removing.
                node = Node(self, '').add_error(
                    'Scanner token cannot have zero length. '
                    '(Most likely due to a scanner bug!)')
                return node, text[2:]
            elif text.find(BEGIN_SCANNER_TOKEN, 1, end) >= 0:
                node = Node(self, text[len(self.name) + 1:end])
                node.add_error(
                    'Scanner tokens must not be nested or contain '
                    'BEGIN_SCANNER_TOKEN delimiter as part of their argument. '
                    '(Most likely due to a scanner bug!)')
                return node, text[end:]
            if text[1:len(self.name) + 1] == self.name:
                # well-formed token matching this parser's name
                return Node(self, text[len(self.name) + 1:end]), \
                       text[end + 1:]
        return None, text


class RegExp(Parser):
    """
    Regular expression parser.

    The RegExp-parser parses text that matches a regular expression.
    RegExp can also be considered as the "atomic parser", because all
    other parsers delegate part of the parsing job to other parsers,
    but do not match text directly.
    """

    def __init__(self, regexp, name: str = '') -> None:
        super(RegExp, self).__init__(name)
        # accept either a pattern string or an already compiled regex object
        if isinstance(regexp, str):
            regexp = re.compile(regexp)
        self.regexp = regexp

    def __deepcopy__(self, memo):
        # the `regex` library supports deep copies, the stdlib `re` does
        # not; in the latter case fall back to the pattern string
        try:
            pattern = copy.deepcopy(self.regexp, memo)
        except TypeError:
            pattern = self.regexp.pattern
        return RegExp(pattern, self.name)

    def __call__(self, text: str) -> Tuple[Node, str]:
        if text[0:1] != BEGIN_SCANNER_TOKEN:  # ESC starts a scanner token.
            m = self.regexp.match(text)
            if m:
                pos = m.end()
                return Node(self, text[:pos]), text[pos:]
        return None, text

    def __repr__(self):
        return '/%s/' % self.regexp.pattern
711

712

713 714
class Whitespace(RegExp):
    # Specialization of RegExp used for (implicit) whitespace. The assert
    # documents the invariant that this class's parser type constant is
    # the string ":Whitespace".
    assert WHITESPACE_PTYPE == ":Whitespace"
715 716


717
class RE(Parser):
    r"""
    Regular Expressions with optional leading or trailing whitespace.

    The RE-parser parses pieces of text that match a given regular
    expression. Other than the ``RegExp``-Parser it can also skip
    "implicit whitespace" before or after the matched text.

    The whitespace is in turn defined by a regular expression. It
    should be made sure that this expression also matches the empty
    string, e.g. use r'\s*' or r'[\t ]+', but not r'\s+'. If the
    respective parameters in the constructor are set to ``None`` the
    default whitespace expression from the Grammar object will be used.
    """

    def __init__(self, regexp, wL=None, wR=None, name=''):
        r"""Constructor for class RE.

        Args:
            regexp (str or regex object):  The regular expression to be
                used for parsing.
            wL (str or regexp):  Left whitespace regular expression,
                i.e. either ``None``, the empty string or a regular
                expression (e.g. "\s*") that defines whitespace. An
                empty string means no whitespace will be skipped,
                ``None`` means that the default whitespace will be
                used.
            wR (str or regexp):  Right whitespace regular expression.
                See above.
            name:  The optional name of the parser.
        """
        super(RE, self).__init__(name)
        # remember the original arguments, so that they can be passed on
        # when copying and so that _grammar_assigned_notifier() can tell
        # whether defaults were requested (None) or whitespace disabled ('')
        self.wL = wL
        self.wR = wR
        self.wspLeft = Whitespace(wL) if wL else ZOMBIE_PARSER
        self.wspRight = Whitespace(wR) if wR else ZOMBIE_PARSER
        self.main = RegExp(regexp)

    def __deepcopy__(self, memo=None):
        # BUGFIX: the former mutable default argument ``memo={}`` would be
        # shared across all calls that omit ``memo``; ``copy.deepcopy``
        # accepts ``None`` and creates a fresh memo dict itself.
        try:
            regexp = copy.deepcopy(self.main.regexp, memo)
        except TypeError:
            # stdlib `re` patterns are not deep-copyable; fall back to
            # re-compiling from the pattern string
            regexp = self.main.regexp.pattern
        return self.__class__(regexp, self.wL, self.wR, self.name)

    def __call__(self, text: str) -> Tuple[Node, str]:
        """Parse optional left whitespace, the main expression and
        optional right whitespace; return a (node, rest)-tuple with
        ``None`` as node if the main expression does not match."""
        # assert self.main.regexp.pattern != "@"
        t = text    # type: str
        wL, t = self.wspLeft(t)
        main, t = self.main(t)
        if main:
            wR, t = self.wspRight(t)
            # drop empty/failed whitespace nodes from the result tuple
            result = tuple(nd for nd in (wL, main, wR)
                           if nd and nd.result != '')
            return Node(self, result), t
        return None, text

    def __repr__(self):
        wL = '~' if self.wspLeft else ''
        wR = '~' if self.wspRight else ''
        return wL + '/%s/' % self.main.regexp.pattern + wR

    def _grammar_assigned_notifier(self):
        if self.grammar:
            # use default whitespace parsers if not otherwise specified
            if self.wL is None:
                self.wspLeft = self.grammar.wsp_left_parser__
            if self.wR is None:
                self.wspRight = self.grammar.wsp_right_parser__

    def apply(self, func: Parser.ApplyFunc):
        if super(RE, self).apply(func):
            if self.wL:
                self.wspLeft.apply(func)
            if self.wR:
                self.wspRight.apply(func)
            self.main.apply(func)


795
class Token(RE):
    """
    Class Token parses simple strings. Any regular
    expression commands will be interpreted as simple sequence of
    characters.

    Other than that class Token is essentially a renamed version of
    class RE. Because tokens often have a particular semantic different
    from other REs, parsing them with a separate parser class allows to
    distinguish them by their parser type.
    """
    assert TOKEN_PTYPE == ":Token"

    def __init__(self, token: str, wL=None, wR=None, name: str = '') -> None:
        # keep the raw token so that copies and __repr__ can use it;
        # the escaped form is what actually gets matched
        self.token = token
        super(Token, self).__init__(escape_re(token), wL, wR, name)

    def __deepcopy__(self, memo=None):
        # BUGFIX: avoid the mutable default argument ``memo={}`` (shared
        # across calls); nothing here needs the memo dict, so ``None``
        # is a safe default.
        return self.__class__(self.token, self.wL, self.wR, self.name)

    def __repr__(self):
        # prefer double quotes; fall back to single quotes if the token
        # itself contains a double quote
        return '"%s"' % self.token if self.token.find('"') < 0 else "'%s'" % self.token
817

818 819 820 821 822 823 824 825 826

########################################################################
#
# Combinator parser classes (i.e. trunk classes of the parser tree)
#
########################################################################


class UnaryOperator(Parser):
    """Base class for parsers that wrap exactly one other parser."""

    def __init__(self, parser: Parser, name: str = '') -> None:
        super(UnaryOperator, self).__init__(name)
        # assert isinstance(parser, Parser)
        self.parser = parser  # type: Parser

    def __deepcopy__(self, memo):
        duplicate = copy.deepcopy(self.parser, memo)
        return self.__class__(duplicate, self.name)

    def apply(self, func: Parser.ApplyFunc):
        # propagate ``func`` to the wrapped parser as well
        if super(UnaryOperator, self).apply(func):
            self.parser.apply(func)


class NaryOperator(Parser):
    """Base class for parsers that combine several component parsers."""

    def __init__(self, *parsers: Parser, name: str = '') -> None:
        super(NaryOperator, self).__init__(name)
        # assert all([isinstance(parser, Parser) for parser in parsers]), str(parsers)
        self.parsers = parsers  # type: Tuple[Parser, ...]

    def __deepcopy__(self, memo):
        duplicates = copy.deepcopy(self.parsers, memo)
        return self.__class__(*duplicates, name=self.name)

    def apply(self, func: Parser.ApplyFunc):
        # propagate ``func`` to every component parser as well
        if super(NaryOperator, self).apply(func):
            for component in self.parsers:
                component.apply(func)


class Optional(UnaryOperator):
858
    def __init__(self, parser: Parser, name: str = '') -> None:
859
        super(Optional, self).__init__(parser, name)
860
        # assert isinstance(parser, Parser)
861
        assert not isinstance(parser, Optional), \