parser.py 58.6 KB
Newer Older
1
"""parsers.py - parser combinators for for DHParser
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
Eckhart Arnold's avatar
Eckhart Arnold committed
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

Module ``parsers.py`` contains a number of classes that together
make up parser combinators for left-recursive grammars. For each
element of the extended Backus-Naur-Form as well as for a regular
expression token a class is defined. The set of classes can be used to
define a parser for (ambiguous) left-recursive grammars.


References and Acknowledgements:

Dominikus Herzberg: Objekt-orientierte Parser-Kombinatoren in Python,
Blog-Post, September, 18th 2008 on denkspuren. gedanken, ideen,
anregungen und links rund um informatik-themen, URL:
http://denkspuren.blogspot.de/2008/09/objekt-orientierte-parser-kombinatoren.html

Dominikus Herzberg: Eine einfache Grammatik für LaTeX, Blog-Post,
September, 18th 2008 on denkspuren. gedanken, ideen, anregungen und
links rund um informatik-themen, URL:
http://denkspuren.blogspot.de/2008/09/eine-einfache-grammatik-fr-latex.html

Dominikus Herzberg: Uniform Syntax, Blog-Post, February, 27th 2007 on
denkspuren. gedanken, ideen, anregungen und links rund um
informatik-themen, URL:
http://denkspuren.blogspot.de/2007/02/uniform-syntax.html

Richard A. Frost, Rahmatullah Hafiz and Paul Callaghan: Parser
Combinators for Ambiguous Left-Recursive Grammars, in: P. Hudak and
D.S. Warren (Eds.): PADL 2008, LNCS 4902, pp. 167–181, Springer-Verlag
Berlin Heidelberg 2008.

47
48
49
50
Elizabeth Scott and Adrian Johnstone, GLL Parsing,
in: Electronic Notes in Theoretical Computer Science 253 (2010) 177–189,
http://dotat.at/tmp/gll.pdf

Eckhart Arnold's avatar
Eckhart Arnold committed
51
52
Juancarlo Añez: grako, a PEG parser generator in Python,
https://bitbucket.org/apalala/grako
53
54
55

Vegard Øye: General Parser Combinators in Racket, 2012,
https://epsil.github.io/gll/
56
57
"""

58

Eckhart Arnold's avatar
Eckhart Arnold committed
59
import abc
60
import copy
61
62
import os
import platform
Eckhart Arnold's avatar
Eckhart Arnold committed
63
64
from functools import partial

65
66
67
68
try:
    import regex as re
except ImportError:
    import re
69
try:
70
    from typing import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union
71
72
73
74
    # try:
    #     from typing import Collection
    # except ImportError:
    #     pass
75
except ImportError:
76
    from .typing34 import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union
77

78
from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name
Eckhart Arnold's avatar
Eckhart Arnold committed
79
80
from DHParser.syntaxtree import WHITESPACE_PTYPE, TOKEN_PTYPE, ZOMBIE_PARSER, ParserBase, \
    Node, TransformationFunc
81
from DHParser.toolkit import load_if_file, error_messages
Eckhart Arnold's avatar
Eckhart Arnold committed
82

83
# Public API of this module: only the following names are exported via
# ``from DHParser.parser import *``.
__all__ = ('PreprocessorFunc',
           'HistoryRecord',
           'Parser',
           'Grammar',
           'RX_PREPROCESSOR_TOKEN',
           'BEGIN_TOKEN',
           'END_TOKEN',
           'make_token',
           'nil_preprocessor',
           'PreprocessorToken',
           'RegExp',
           'RE',
           'Token',
           'mixin_comment',
           # 'UnaryOperator',
           # 'NaryOperator',
           'Synonym',
           'Optional',
           'ZeroOrMore',
           'OneOrMore',
           'Series',
           'Alternative',
           'FlowOperator',
           'Required',
           'Lookahead',
           'NegativeLookahead',
           'Lookbehind',
           'NegativeLookbehind',
           'last_value',
           'counterpart',
           'accumulate',
           'Capture',
           'Retrieve',
           'Pop',
           'Forward',
           'Compiler',
           'compile_source')
Eckhart Arnold's avatar
Eckhart Arnold committed
120

121

122

123
124
125
126
127
128
129
########################################################################
#
# Grammar and parsing infrastructure
#
########################################################################


130
PreprocessorFunc = Union[Callable[[str], str], partial]
131
132


133
134
135
136
137
LEFT_RECURSION_DEPTH = 20 if platform.python_implementation() == "PyPy" \
                          else 8 # type: int
# because of python's recursion depth limit, this value ought not to be set too high
MAX_DROPOUTS = 25  # type: int
# stop trying to recover parsing after so many errors
138

139

140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
class HistoryRecord:
    """
    Records debugging information about one completed step in the
    parsing history.

    A parsing step counts as "completed" once the last one of a nested
    sequence of parser calls has returned. The call stack, including
    that last parser call, is frozen in the ``HistoryRecord`` object.
    In addition, a reference to the generated leaf node (if any) is
    stored, as well as the result status of the last parser call,
    which is one of MATCH, FAIL (i.e. no match) or ERROR.
    """
    __slots__ = ('call_stack', 'node', 'remaining')

    MATCH = "MATCH"
    ERROR = "ERROR"
    FAIL = "FAIL"

    def __init__(self, call_stack: List['Parser'], node: Node, remaining: int) -> None:
        self.call_stack = call_stack    # type: List['Parser']
        self.node = node                # type: Node
        self.remaining = remaining      # type: int

    def err_msg(self) -> str:
        """Returns the node's error messages folded into a single line."""
        flattened = "; ".join(self.node._errors).replace('\n', '\\')
        return "%s: %s" % (self.ERROR, flattened)

    @property
    def stack(self) -> str:
        """Renders the call stack as an arrow-separated chain of parser names."""
        names = []
        for p in self.call_stack:
            names.append(repr(p) if p.ptype == ':RegExp' else p.name or p.ptype)
        return "->".join(names)

    @property
    def status(self) -> str:
        """Result status of the recorded step: FAIL, ERROR message or MATCH."""
        if self.node is None:
            return self.FAIL
        if self.node._errors:
            return self.err_msg()
        return self.MATCH

    @property
    def extent(self) -> slice:
        # Slice of the document covered by this step. Positions are counted
        # from the end of the document, hence the negative indices.
        if self.node:
            return slice(-self.remaining - self.node.len, -self.remaining)
        return slice(-self.remaining, None)
181
182
183


def add_parser_guard(parser_func):
    """
    Add a wrapper function to a parser functions (i.e. Parser.__call__ method)
    that takes care of memoizing, left recursion and optionally tracing
    (aka "history tracking") of parser calls. Returns the wrapped call.
    """
    def guarded_call(parser: 'Parser', text: str) -> Tuple[Node, str]:
        # Wrapped version of Parser.__call__. Returns (node, rest-of-text);
        # node is None signals a failed match.
        try:
            # positions are identified by the length of the remaining text,
            # i.e. they are counted from the end of the document
            location = len(text)
            grammar = parser.grammar  # grammar may be 'None' for unconnected parsers!

            if not grammar.moving_forward__:
                # rollback variable changes from discarded parser passes
                if grammar.last_rb__loc__ <= location:
                    grammar.rollback_to__(location)
                grammar.moving_forward__ = True
                grammar.left_recursion_encountered__ = False

            if grammar.history_tracking__:
                grammar.call_stack__.append(parser)

            # if location has already been visited by the current parser,
            # return saved result
            if location in parser.visited:
                return parser.visited[location]

            # break left recursion at the maximum allowed depth
            if parser.recursion_counter.setdefault(location, 0) > LEFT_RECURSION_DEPTH:
                grammar.left_recursion_encountered__ = True
                return None, text

            parser.recursion_counter[location] += 1

            # run original __call__ method
            node, rest = parser_func(parser, text)

            if node is None:
                # retrieve an earlier match result (from left recursion)
                # if it exists
                node, rest = parser.visited.get(location, (None, rest))
                # don't overwrite any positive match (i.e. node not None) in the cache
                # and don't add empty entries for parsers returning from left recursive calls!
                if node is None and not grammar.left_recursion_encountered__:
                    # otherwise also cache None-results
                    parser.visited[location] = None, rest
            else:
                # variable manipulating parsers will be excluded, though,
                # because caching would interfere with changes of variable state
                if grammar.last_rb__loc__ > location:
                    # in case of left recursion, the first recursive step that
                    # matches will store its result in the cache
                    parser.visited[location] = (node, rest)
                grammar.last_node__ = node   # store last node for Lookbehind parser

            parser.recursion_counter[location] -= 1

            if grammar.history_tracking__:
                # don't track returning parsers except in case an error has occurred
                if grammar.moving_forward__ or (node and node._errors):
                    record = HistoryRecord(grammar.call_stack__.copy(), node, len(rest))
                    grammar.history__.append(record)
                grammar.call_stack__.pop()
            grammar.moving_forward__ = False

        except RecursionError:
            # Python's own recursion limit was hit: report a parser error on
            # a short excerpt of the remaining text instead of crashing.
            node = Node(None, text[:min(10, max(1, text.find("\n")))] + " ...")
            node.add_error("maximum recursion depth of parser reached; "
                           "potentially due to too many errors!")
            rest = ''

        return node, rest

    return guarded_call


Eckhart Arnold's avatar
Eckhart Arnold committed
259
class ParserMetaClass(abc.ABCMeta):
    """
    Metaclass for parser classes. During class initialization it wraps
    the ``__call__`` method with a guard that takes care of memoizing,
    left recursion and tracing.
    """
    def __init__(cls, name, bases, attrs):
        wrapped = add_parser_guard(cls.__call__)
        # Guard against double-wrapping: a subclass that inherits
        # __call__ unchanged already carries the wrapped method, which
        # must not be substituted a second time.
        if cls.__call__.__code__ != wrapped.__code__:
            cls.__call__ = wrapped
        super().__init__(name, bases, attrs)


Eckhart Arnold's avatar
Eckhart Arnold committed
275
class Parser(ParserBase, metaclass=ParserMetaClass):
    """
    (Abstract) base class for parser-combinator parsers. Any parser
    object that is actually used for parsing (i.e. no mock parsers)
    should be derived from this class.
    """

    ApplyFunc = Callable[['Parser'], None]

    def __init__(self, name: str = '') -> None:
        super().__init__(name)
        self._grammar = None  # type: 'Grammar'
        self.reset()

    def __deepcopy__(self, memo):
        # a fresh, unconnected parser of the same class and name serves
        # as the deep copy
        return self.__class__(self.name)

    def reset(self):
        """Clears the memoization cache, the recursion counters and the
        cycle-detection set. Returns self."""
        self.visited = {}                # type: Dict[int, Tuple[Node, str]]
        self.recursion_counter = {}      # type: Dict[int, int]
        self.cycle_detection = set()     # type: Set[Callable]
        return self

    def __call__(self, text: str) -> Tuple[Node, str]:
        # default behaviour: don't match
        return None, text

    def __add__(self, other: 'Parser') -> 'Series':
        """The + operator generates a series-parser that applies two
        parsers in sequence."""
        return Series(self, other)

    def __or__(self, other: 'Parser') -> 'Alternative':
        """The | operator generates an alternative parser that applies
        the first parser and, if that does not match, the second parser.
        """
        return Alternative(self, other)

    @property
    def grammar(self) -> 'Grammar':
        return self._grammar

    @grammar.setter
    def grammar(self, grammar: 'Grammar'):
        assert self._grammar is None or self._grammar == grammar, \
            "Parser has already been assigned to a Grammar object!"
        self._grammar = grammar
        self._grammar_assigned_notifier()

    def _grammar_assigned_notifier(self):
        """Hook that notifies the parser object that it has been
        assigned to a grammar. Does nothing by default."""
        pass

    def apply(self, func: ApplyFunc):
        """
        Applies function `func(parser)` recursively to this parser and all
        descendants of the tree of parsers. The same function can never
        be applied twice between calls of the ``reset()``-method!
        """
        if func in self.cycle_detection:
            return False
        self.cycle_detection.add(func)
        func(self)
        return True


343
def mixin_comment(whitespace: str, comment: str) -> str:
    """
    Returns a regular expression that merges comment and whitespace
    regexps. Thus comments can occur wherever whitespace is allowed
    and will be skipped just as implicit whitespace.

    Note, that because this works on the level of regular expressions,
    nesting comments is not possible. It also makes it much harder to
    use directives inside comments (which isn't recommended, anyway).
    """
    return '(?:{ws}(?:{cmt}{ws})*)'.format(ws=whitespace, cmt=comment)


357
class Grammar:
    """
    Class Grammar is the (abstract) base class for grammars. Grammars
    are collections of parser objects that are assigned to class
    variables of a descendant of class Grammar. Except for testing
    purposes all parser objects should be rooted in a single 'root'
    parser which should be assigned to the class variable 'root__'

    Collecting the parsers that define a grammar in a class and
    assigning the named parsers to class variables rather than
    global variables keeps the namespace clean. As a consequence,
    though, it is highly recommended that a Grammar class should not
    define any other variables or methods with names that are legal
    parser names. A name ending with a double underscore '__' is *not*
    a legal parser name and can safely be used.

    Example:
        class Arithmetic(Grammar):
            # special fields for implicit whitespace and comment configuration
            COMMENT__ = r'#.*(?:\n|$)'  # Python style comments
            wspR__ = mixin_comment(whitespace=r'[\t ]*', comment=COMMENT__)

            # parsers
            expression = Forward()
            INTEGER = RE('\\d+')
            factor = INTEGER | Token("(") + expression + Token(")")
            term = factor + ZeroOrMore((Token("*") | Token("/")) + factor)
            expression.set(term + ZeroOrMore((Token("+") | Token("-")) + term))
            root__ = expression

    Upon instantiation the parser objects are deep-copied to the
    Grammar object and assigned to object variables of the same name.
    Any parser that is directly assigned to a class variable is a
    'named' parser and its field `parser.name` contains the variable
    name after instantiation of the Grammar class. All other parsers,
    i.e. parsers that are defined within a `named` parser, are
    "anonymous parsers" where `parser.name` is the empty string.
    If one and the same parser is assigned to several class variables
    such as, for example the parser `expression` in the example above,
    the first name sticks.

    Grammar objects are callable. Calling a grammar object with a UTF-8
    encoded document, initiates the parsing of the document with the
    root parser. The return value is the concrete syntax tree. Grammar
    objects can be reused (i.e. called again) after parsing. Thus, it
    is not necessary to instantiate more than one Grammar object per
    thread.

    Grammar objects contain a few special fields for implicit
    whitespace and comments that should be overwritten, if the defaults
    (no comments, horizontal right aligned whitespace) don't fit:
    COMMENT__   - regular expression string for matching comments
    wspL__      - regular expression string for left aligned whitespace
    wspR__      - regular expression string for right aligned whitespace
    """

    root__ = None  # type: Union[Parser, None]
    # root__ must be overwritten with the root-parser by grammar subclass
    parser_initialization__ = "pending"  # type: str
    # some default values
    COMMENT__ = r''  # r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'[\t ]*', comment=COMMENT__)
    wspL__ = ''
    wspR__ = WSP__

    @classmethod
    def _assign_parser_names__(cls):
        """
        Initializes the `parser.name` fields of those
        Parser objects that are directly assigned to a class field with
        the field's name, e.g.
            class Grammar(Grammar):
                ...
                symbol = RE('(?!\\d)\\w+')
        After the call of this method symbol.name == "symbol"
        holds. Names assigned via the ``name``-parameter of the
        constructor will not be overwritten. Parser names starting or
        ending with a double underscore like ``root__`` will be
        ignored. See ``toolkit.sane_parser_name()``

        This is done only once, upon the first instantiation of the
        grammar class!

        Attention: If there exists more than one reference to the same
        parser, only the first one will be chosen for python versions
        greater or equal 3.6.  For python version <= 3.5 an arbitrarily
        selected reference will be chosen. See PEP 520
        (www.python.org/dev/peps/pep-0520/) for an explanation of why.
        """
        if cls.parser_initialization__ == "done":
            return
        cdict = cls.__dict__
        for entry, parser in cdict.items():
            if isinstance(parser, Parser) and sane_parser_name(entry):
                if not parser.name:
                    parser.name = entry
                if isinstance(parser, Forward) and (not parser.parser.name):
                    # propagate the name to the parser wrapped by Forward
                    parser.parser.name = entry
        cls.parser_initialization__ = "done"

    def __init__(self, root: Parser = None) -> None:
        """
        Creates a grammar object by deep-copying the parser tree from
        the class level to the instance.

        Args:
            root (Parser): An alternative root parser. If not given,
                a deep copy of the class variable `root__` is used.
        """
        self.all_parsers__ = set()  # type: Set[Parser]
        self.dirty_flag__ = False
        self.history_tracking__ = False
        self._reset__()

        # prepare parsers in the class, first
        self._assign_parser_names__()

        # then deep-copy the parser tree from class to instance;
        # parsers not connected to the root object will be copied later
        # on demand (see Grammar.__getitem__()). Usually, the need to
        # do so only arises during testing.
        self.root__ = root if root else copy.deepcopy(self.__class__.root__)

        if self.wspL__:
            self.wsp_left_parser__ = Whitespace(self.wspL__)  # type: ParserBase
            self.wsp_left_parser__.grammar = self
            self.all_parsers__.add(self.wsp_left_parser__)  # don't you forget about me...
        else:
            self.wsp_left_parser__ = ZOMBIE_PARSER
        if self.wspR__:
            self.wsp_right_parser__ = Whitespace(self.wspR__)  # type: ParserBase
            self.wsp_right_parser__.grammar = self
            self.all_parsers__.add(self.wsp_right_parser__)  # don't you forget about me...
        else:
            self.wsp_right_parser__ = ZOMBIE_PARSER
        self.root__.apply(self._add_parser__)

    def __getitem__(self, key):
        """
        Returns the parser of the given name, deep-copying it to the
        grammar object on the fly if it has not been connected to the
        root parser. Raises a KeyError for unknown parser names.
        """
        try:
            return self.__dict__[key]
        except KeyError:
            parser_template = getattr(self, key, None)
            if parser_template:
                # add parser to grammar object on the fly...
                parser = copy.deepcopy(parser_template)
                parser.apply(self._add_parser__)
                # assert self[key] == parser
                return self[key]
            raise KeyError('Unknown parser "%s" !' % key)

    def _reset__(self):
        """Resets the state that a single run of the parser accumulates."""
        self.document__ = ""          # type: str
        # variables stored and recalled by Capture and Retrieve parsers
        self.variables__ = dict()     # type: Dict[str, List[str]]
        self.rollback__ = []          # type: List[Tuple[int, Callable]]
        self.last_rb__loc__ = -1  # type: int
        # previously parsed node, needed by Lookbehind parser
        self.last_node__ = None       # type: Node
        # support for call stack tracing
        self.call_stack__ = []        # type: List[Parser]
        # snapshots of call stacks
        self.history__ = []           # type: List[HistoryRecord]
        # also needed for call stack tracing
        self.moving_forward__ = True  # type: bool
        self.left_recursion_encountered__ = False  # type: bool

    def _add_parser__(self, parser: Parser) -> None:
        """
        Adds the particular copy of the parser object to this
        particular instance of Grammar.
        """
        if parser.name:
            # prevent overwriting instance variables or parsers of a different class
            assert parser.name not in self.__dict__ or \
                   isinstance(self.__dict__[parser.name], parser.__class__), \
                ('Cannot add parser "%s" because a field with the same name '
                 'already exists in grammar object!' % parser.name)
            setattr(self, parser.name, parser)
        self.all_parsers__.add(parser)
        parser.grammar = self

    def __call__(self, document: str, start_parser="root__") -> Node:
        """
        Parses a document with parser-combinators.

        Args:
            document (str): The source text to be parsed.
            start_parser (str): The name of the parser with which to
                start. This is useful for testing particular parsers
                (i.e. particular parts of the EBNF-Grammar.)
        Returns:
            Node: The root node of the parse tree.
        """
        if self.root__ is None:
            raise NotImplementedError()
        if self.dirty_flag__:
            self._reset__()
            for parser in self.all_parsers__:
                parser.reset()
        else:
            self.dirty_flag__ = True
        self.history_tracking__ = is_logging()
        self.document__ = document
        self.last_rb__loc__ = len(document) + 1  # rollback location
        parser = self[start_parser] if isinstance(start_parser, str) else start_parser
        assert parser.grammar == self, "Cannot run parsers from a different grammar object!" \
                                       " %s vs. %s" % (str(self), str(parser.grammar))
        stitches = []  # type: List[Node]
        rest = document
        if not rest:
            # even an empty document must be offered to the parser once
            result, ignore = parser(rest)
        while rest and len(stitches) < MAX_DROPOUTS:
            result, rest = parser(rest)
            if rest:
                # the parser did not consume the whole document: skip to the
                # next line and try to resume parsing from there ("stitching")
                fwd = rest.find("\n") + 1 or len(rest)
                skip, rest = rest[:fwd], rest[fwd:]
                if result is None:
                    error_msg = "Parser did not match! Invalid source file?"
                else:
                    stitches.append(result)
                    error_msg = "Parser stopped before end" + \
                                (("! trying to recover" +
                                  (" but stopping history recording at this point."
                                   if self.history_tracking__ else "..."))
                                 if len(stitches) < MAX_DROPOUTS
                                 else " too often! Terminating parser.")
                stitches.append(Node(None, skip))
                stitches[-1].add_error(error_msg)
                if self.history_tracking__:
                    # some parsers may have matched and left history records with nodes != None.
                    # Because these are not connected to the stitched root node, their pos
                    # properties will not be initialized by setting the root node's pos property
                    # to zero. Therefore, their pos properties need to be initialized here
                    for record in self.history__:
                        if record.node and record.node._pos < 0:
                            record.node.pos = 0
                    record = HistoryRecord(self.call_stack__.copy(), stitches[-1], len(rest))
                    self.history__.append(record)
                    self.history_tracking__ = False
        if stitches:
            if rest:
                stitches.append(Node(None, rest))
            result = Node(None, tuple(stitches))
        if any(self.variables__.values()):
            result.add_error("Capture-retrieve-stack not empty after end of parsing: "
                             + str(self.variables__))
        result.pos = 0  # calculate all positions
        return result

    def push_rollback__(self, location, func):
        """
        Adds a rollback function that either removes or re-adds
        values on the variable stack (`self.variables__`) that have been
        added (or removed) by Capture or Pop Parsers, the results of
        which have been dismissed.
        """
        self.rollback__.append((location, func))
        self.last_rb__loc__ = location

    def rollback_to__(self, location):
        """
        Rolls back the variable stacks (`self.variables__`) to their
        state at an earlier location in the parsed document.
        """
        while self.rollback__ and self.rollback__[-1][0] <= location:
            loc, rollback_func = self.rollback__.pop()
            assert not loc > self.last_rb__loc__
            rollback_func()
        # BUG FIX: the original line used '==' (a no-op comparison), so
        # last_rb__loc__ was never updated after rolling back; it must be
        # re-assigned to the next pending rollback location.
        self.last_rb__loc__ = self.rollback__[-1][0] if self.rollback__ \
            else (len(self.document__) + 1)

    def log_parsing_history__(self, log_file_name: str = '') -> None:
        """
        Writes a log of the parsing history of the most recently parsed
        document.
        """
        def prepare_line(record):
            # one log line: call stack, status, short excerpt of the source
            excerpt = self.document__.__getitem__(record.extent)[:25].replace('\n', '\\n')
            excerpt = "'%s'" % excerpt if len(excerpt) < 25 else "'%s...'" % excerpt
            return record.stack, record.status, excerpt

        def write_log(history, log_name):
            # writes the log lines; removes a stale log file if nothing to log
            path = os.path.join(log_dir(), log_name + "_parser.log")
            if history:
                with open(path, "w", encoding="utf-8") as f:
                    f.write("\n".join(history))
            elif os.path.exists(path):
                os.remove(path)

        if not log_file_name:
            name = self.__class__.__name__
            log_file_name = name[:-7] if name.lower().endswith('grammar') else name
        full_history, match_history, errors_only = [], [], []
        for record in self.history__:
            line = ";  ".join(prepare_line(record))
            full_history.append(line)
            if record.node and record.node.parser.ptype != WHITESPACE_PTYPE:
                match_history.append(line)
                if record.node.error_flag:
                    errors_only.append(line)
        write_log(full_history, log_file_name + '_full')
        write_log(match_history, log_file_name + '_match')
        write_log(errors_only, log_file_name + '_errors')
668
669


Eckhart Arnold's avatar
Eckhart Arnold committed
670
def dsl_error_msg(parser: Parser, error_str: str) -> str:
    """
    Returns an error message for errors in the parser configuration,
    e.g. errors that result in infinite loops.

    Args:
        parser (Parser):  The parser where the error was noticed. Note
            that this is not necessarily the parser that caused the
            error but only where the error became apparent.
        error_str (str):  A short string describing the error.
    Returns:
        str: An error message including the call stack if history
        tracking has been turned on in the grammar object.
    """
    msg = ["DSL parser specification error:", error_str, 'Caught by parser "%s".' % str(parser)]
    # Include the most recent parser call stack when history tracking is active.
    if parser.grammar.history__:
        msg.extend(["\nCall stack:", parser.grammar.history__[-1].stack])
    else:
        msg.extend(["\nEnable history tracking in Grammar object to display call stack."])
    return " ".join(msg)


692

693
694
695
696
697
698
699
########################################################################
#
# Token and Regular Expression parser classes (i.e. leaf classes)
#
########################################################################


700
701
702
# Raw string: '\w' in a plain string literal is an invalid escape sequence
# (SyntaxWarning in modern Python).  Token names consist of word characters.
RX_PREPROCESSOR_TOKEN = re.compile(r'\w+')
# Magic delimiters (ESC / FS control characters) that bracket tokens which
# a preprocessor injects into the source text; see ``make_token`` below.
BEGIN_TOKEN = '\x1b'
END_TOKEN = '\x1c'
703
704


705
def make_token(token: str, argument: str = '') -> str:
    """
    Turns the ``token`` and ``argument`` into a special token that
    will be caught by the `PreprocessorToken`-parser.

    This function is a support function that should be used by
    preprocessors to inject preprocessor tokens into the source text.
    """
    assert RX_PREPROCESSOR_TOKEN.match(token)
    assert BEGIN_TOKEN not in argument
    assert END_TOKEN not in argument
    return ''.join((BEGIN_TOKEN, token, argument, END_TOKEN))
718
719


720
def nil_preprocessor(text: str) -> str:
    """A no-op preprocessor: returns the source text unchanged."""
    return text
722
723


724
class PreprocessorToken(Parser):
    """
    Parses tokens that have been inserted by a preprocessor.

    Preprocessors can generate Tokens with the ``make_token``-function.
    These tokens start and end with magic characters that can only be
    matched by the PreprocessorToken Parser. Such tokens can be used to
    insert BEGIN - END delimiters at the beginning or ending of a
    quoted block, for example.
    """

    def __init__(self, token: str) -> None:
        # Token names are upper-case identifiers by convention.
        assert token and token.isupper()
        assert RX_PREPROCESSOR_TOKEN.match(token)
        super(PreprocessorToken, self).__init__(token)

    def __call__(self, text: str) -> Tuple[Node, str]:
        if text[0:1] == BEGIN_TOKEN:
            end = text.find(END_TOKEN, 1)
            if end < 0:
                node = Node(self, '').add_error(
                    'END_TOKEN delimiter missing from preprocessor token. '
                    '(Most likely due to a preprocessor bug!)')  # type: Node
                return node, text[1:]
            elif end == 1:
                # BUGFIX: a zero-length token means END_TOKEN immediately
                # follows BEGIN_TOKEN, i.e. ``end == 1``.  The previous
                # check ``end == 0`` was unreachable, because
                # ``text.find(END_TOKEN, 1)`` can only return -1 or >= 1,
                # so this diagnostic never fired.
                node = Node(self, '').add_error(
                    'Preprocessor-token cannot have zero length. '
                    '(Most likely due to a preprocessor bug!)')
                return node, text[2:]
            elif text.find(BEGIN_TOKEN, 1, end) >= 0:
                node = Node(self, text[len(self.name) + 1:end])
                node.add_error(
                    'Preprocessor-tokens must not be nested or contain '
                    'BEGIN_TOKEN delimiter as part of their argument. '
                    '(Most likely due to a preprocessor bug!)')
                # NOTE(review): the END_TOKEN at ``end`` is deliberately left
                # in the rest string here (unlike the success case below).
                return node, text[end:]
            if text[1:len(self.name) + 1] == self.name:
                # Token name matches: the node's content is the argument;
                # parsing resumes after the END_TOKEN delimiter.
                return Node(self, text[len(self.name) + 1:end]), \
                       text[end + 1:]
        return None, text


class RegExp(Parser):
    """
    Regular expression parser.

    The RegExp-parser parses text that matches a regular expression.
    RegExp can also be considered as the "atomic parser", because all
    other parsers delegate part of the parsing job to other parsers,
    but do not match text directly.
    """

    def __init__(self, regexp, name: str = '') -> None:
        super(RegExp, self).__init__(name)
        # Accept either a pattern string or an already compiled regex object.
        if isinstance(regexp, str):
            self.regexp = re.compile(regexp)
        else:
            self.regexp = regexp

    def __deepcopy__(self, memo):
        # Compiled objects of the third-party `regex` module support deep
        # copying, those of the stdlib `re` module do not; for the latter,
        # fall back to re-compiling from the pattern string.
        try:
            rx = copy.deepcopy(self.regexp, memo)
        except TypeError:
            rx = self.regexp.pattern
        return RegExp(rx, self.name)

    def __call__(self, text: str) -> Tuple[Node, str]:
        # ESC (BEGIN_TOKEN) starts a preprocessor token, which must never
        # be consumed by an ordinary regular expression.
        if text[0:1] != BEGIN_TOKEN:
            match = self.regexp.match(text)
            if match:
                pos = match.end()
                return Node(self, text[:pos]), text[pos:]
        return None, text

    def __repr__(self):
        return '/%s/' % self.regexp.pattern
797

798

799
800
class Whitespace(RegExp):
    # Dedicated parser class for implicit whitespace, so that whitespace
    # nodes can be told apart from ordinary RegExp matches by their parser
    # type (compared against WHITESPACE_PTYPE elsewhere in this module).
    assert WHITESPACE_PTYPE == ":Whitespace"
801
802


803
class RE(Parser):
    r"""
    Regular Expressions with optional leading or trailing whitespace.

    The RE-parser parses pieces of text that match a given regular
    expression. Other than the ``RegExp``-Parser it can also skip
    "implicit whitespace" before or after the matched text.

    The whitespace is in turn defined by a regular expression. It
    should be made sure that this expression also matches the empty
    string, e.g. use r'\s*' or r'[\t ]+', but not r'\s+'. If the
    respective parameters in the constructor are set to ``None`` the
    default whitespace expression from the Grammar object will be used.
    """
    def __init__(self, regexp, wL=None, wR=None, name=''):
        r"""Constructor for class RE.

        Args:
            regexp (str or regex object):  The regular expression to be
                used for parsing.
            wL (str or regexp):  Left whitespace regular expression,
                i.e. either ``None``, the empty string or a regular
                expression (e.g. "\s*") that defines whitespace. An
                empty string means no whitespace will be skipped,
                ``None`` means that the default whitespace will be
                used.
            wR (str or regexp):  Right whitespace regular expression.
                See above.
            name:  The optional name of the parser.
        """
        super(RE, self).__init__(name)
        # Keep the raw parameters so that deep copies and the grammar-
        # assignment hook below can distinguish ``None`` (use defaults)
        # from an explicit whitespace expression.
        self.wL = wL
        self.wR = wR
        self.wspLeft = Whitespace(wL) if wL else ZOMBIE_PARSER
        self.wspRight = Whitespace(wR) if wR else ZOMBIE_PARSER
        self.main = RegExp(regexp)

    def __deepcopy__(self, memo=None):
        # BUGFIX: the former signature ``__deepcopy__(self, memo={})`` used
        # a mutable default argument; the dict would be shared across all
        # calls that omit ``memo``. Use the None-sentinel idiom instead.
        if memo is None:
            memo = {}
        try:
            regexp = copy.deepcopy(self.main.regexp, memo)
        except TypeError:
            # stdlib `re` patterns cannot be deep-copied; re-compile instead.
            regexp = self.main.regexp.pattern
        return self.__class__(regexp, self.wL, self.wR, self.name)

    def __call__(self, text: str) -> Tuple[Node, str]:
        t = text    # type: str
        wL, t = self.wspLeft(t)
        main, t = self.main(t)
        if main:
            wR, t = self.wspRight(t)
            # Drop missing or empty child nodes from the result tuple.
            result = tuple(nd for nd in (wL, main, wR)
                           if nd and nd.result != '')
            return Node(self, result), t
        return None, text

    def __repr__(self):
        # '~' marks a whitespace parser on either side (ZOMBIE_PARSER is
        # falsy here -- it stands for "no whitespace parser assigned").
        wL = '~' if self.wspLeft else ''
        wR = '~' if self.wspRight else ''
        return wL + '/%s/' % self.main.regexp.pattern + wR

    def _grammar_assigned_notifier(self):
        if self.grammar:
            # use default whitespace parsers if not otherwise specified
            if self.wL is None:
                self.wspLeft = self.grammar.wsp_left_parser__
            if self.wR is None:
                self.wspRight = self.grammar.wsp_right_parser__

    def apply(self, func: Parser.ApplyFunc):
        # Propagate ``func`` to the sub-parsers when the base class
        # signals that this parser has not been processed yet.
        if super(RE, self).apply(func):
            if self.wL:
                self.wspLeft.apply(func)
            if self.wR:
                self.wspRight.apply(func)
            self.main.apply(func)


881
class Token(RE):
    """
    Class Token parses simple strings. Any regular
    expression commands will be interpreted as simple sequence of
    characters.

    Other than that class Token is essentially a renamed version of
    class RE. Because tokens often have a particular semantic different
    from other REs, parsing them with a separate parser class allows to
    distinguish them by their parser type.
    """
    assert TOKEN_PTYPE == ":Token"

    def __init__(self, token: str, wL=None, wR=None, name: str = '') -> None:
        self.token = token
        # escape_re turns the literal token into a regex-safe pattern.
        super(Token, self).__init__(escape_re(token), wL, wR, name)

    def __deepcopy__(self, memo=None):
        # BUGFIX: replaced the mutable default argument ``memo={}`` (a dict
        # shared across calls) with the None-sentinel idiom.
        if memo is None:
            memo = {}
        return self.__class__(self.token, self.wL, self.wR, self.name)

    def __repr__(self):
        # Quote with '"' unless the token itself contains one.
        return '"%s"' % self.token if self.token.find('"') < 0 else "'%s'" % self.token
903

904
905
906
907
908
909
910
911
912

########################################################################
#
# Combinator parser classes (i.e. trunk classes of the parser tree)
#
########################################################################


class UnaryOperator(Parser):
    """
    Base class for all parsers that contain exactly one other parser,
    such as ``Optional`` or ``OneOrMore``.
    """

    def __init__(self, parser: Parser, name: str = '') -> None:
        super(UnaryOperator, self).__init__(name)
        self.parser = parser  # type: Parser

    def __deepcopy__(self, memo):
        return self.__class__(copy.deepcopy(self.parser, memo), self.name)

    def apply(self, func: Parser.ApplyFunc):
        # Recurse into the contained parser when the base class signals
        # that this parser has not been processed yet.
        if super(UnaryOperator, self).apply(func):
            self.parser.apply(func)


class NaryOperator(Parser):
    """
    Base class for all parsers that contain an arbitrary number of
    other parsers, such as sequences or alternatives.
    """

    def __init__(self, *parsers: Parser, name: str = '') -> None:
        super(NaryOperator, self).__init__(name)
        self.parsers = parsers  # type: Tuple[Parser, ...]

    def __deepcopy__(self, memo):
        duplicates = copy.deepcopy(self.parsers, memo)
        return self.__class__(*duplicates, name=self.name)

    def apply(self, func: Parser.ApplyFunc):
        # Recurse into all contained parsers when the base class signals
        # that this parser has not been processed yet.
        if super(NaryOperator, self).apply(func):
            for sub_parser in self.parsers:
                sub_parser.apply(func)


class Optional(UnaryOperator):
    """
    Matches its contained parser zero or one times ('[...]' in EBNF
    notation).  An Optional parser therefore never fails.
    """

    def __init__(self, parser: Parser, name: str = '') -> None:
        super(Optional, self).__init__(parser, name)
        assert not isinstance(parser, Optional), \
            "Redundant nesting of options: %s(%s)" % \
            (str(name), str(parser.name))
        assert not isinstance(parser, Required), \
            "Nesting options with required elements is contradictory: " \
            "%s(%s)" % (str(name), str(parser.name))

    def __call__(self, text: str) -> Tuple[Node, str]:
        node, rest = self.parser(text)
        if node:
            return Node(self, node), rest
        # The contained parser failed: succeed with an empty node.
        return Node(self, ()), rest

    def __repr__(self):
        inner = self.parser.repr
        # Strip the delimiters of an anonymous alternative to avoid
        # doubling them up inside the brackets.
        if isinstance(self.parser, Alternative) and not self.parser.name:
            inner = inner[1:-1]
        return '[' + inner + ']'
963
964

class ZeroOrMore(Optional):
    """
    Matches its contained parser zero or more times ('{...}' in EBNF
    notation).  Like ``Optional``, this parser never fails.
    """
    def __call__(self, text: str) -> Tuple[Node, str]:
        results = ()  # type: Tuple[Node, ...]
        # ``n`` remembers the remaining text length of the previous round;
        # the loop also stops once a round consumed nothing (len(text) == n).
        n = len(text) + 1
        while text and len(text) < n:
            n = len(text)
            node, text = self.parser(text)
            if not node:
                break
            if len(text) == n:
                # The contained parser matched the empty string: record a
                # grammar-specification error instead of looping forever.
                node.add_error(dsl_error_msg(self, 'Infinite Loop detected.'))
            results += (node,)
        return Node(self, results), text

    def __repr__(self):
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.name else self.parser.repr) + '}'
981

982
983

class OneOrMore(UnaryOperator):
984
    def __init__(self, parser: Parser, name: str = '') -> None:
985
        super(OneOrMore, self).__init__(parser, name)
986
987
988
989
        assert not isinstance(pars