parser.py 54.5 KB
Newer Older
1
"""parsers.py - parser combinators for DHParser
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
Eckhart Arnold's avatar
Eckhart Arnold committed
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48

Module ``parsers.py`` contains a number of classes that together
make up parser combinators for left-recursive grammars. For each
element of the extended Backus-Naur-Form as well as for a regular
expression token a class is defined. The set of classes can be used to
define a parser for (ambiguous) left-recursive grammars.


References and Acknowledgements:

Dominikus Herzberg: Objekt-orientierte Parser-Kombinatoren in Python,
Blog-Post, September, 18th 2008 on denkspuren. gedanken, ideen,
anregungen und links rund um informatik-themen, URL:
http://denkspuren.blogspot.de/2008/09/objekt-orientierte-parser-kombinatoren.html

Dominikus Herzberg: Eine einfache Grammatik für LaTeX, Blog-Post,
September, 18th 2008 on denkspuren. gedanken, ideen, anregungen und
links rund um informatik-themen, URL:
http://denkspuren.blogspot.de/2008/09/eine-einfache-grammatik-fr-latex.html

Dominikus Herzberg: Uniform Syntax, Blog-Post, February, 27th 2007 on
denkspuren. gedanken, ideen, anregungen und links rund um
informatik-themen, URL:
http://denkspuren.blogspot.de/2007/02/uniform-syntax.html

Richard A. Frost, Rahmatullah Hafiz and Paul Callaghan: Parser
Combinators for Ambiguous Left-Recursive Grammars, in: P. Hudak and
D.S. Warren (Eds.): PADL 2008, LNCS 4902, pp. 167–181, Springer-Verlag
Berlin Heidelberg 2008.

Juancarlo Añez: grako, a PEG parser generator in Python,
https://bitbucket.org/apalala/grako
49
50
"""

51

Eckhart Arnold's avatar
Eckhart Arnold committed
52
import abc
53
import copy
54
55
import os
import platform
Eckhart Arnold's avatar
Eckhart Arnold committed
56
57
from functools import partial

58
59
60
61
try:
    import regex as re
except ImportError:
    import re
62
try:
63
64
65
66
67
    from typing import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union
    try:
        from typing import Collection
    except ImportError:
        pass
68
except ImportError:
69
    from .typing34 import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union
70

71
from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name
Eckhart Arnold's avatar
Eckhart Arnold committed
72
73
from DHParser.syntaxtree import WHITESPACE_PTYPE, TOKEN_PTYPE, ZOMBIE_PARSER, ParserBase, \
    Node, TransformationFunc
74
from DHParser.toolkit import load_if_file, error_messages
Eckhart Arnold's avatar
Eckhart Arnold committed
75

76
# Explicit export list: the public API of this module.  Names that are not
# listed here (e.g. the internal helper ``add_parser_guard``) are considered
# implementation details.
__all__ = ['PreprocessorFunc',
           'HistoryRecord',
           'Parser',
           'Grammar',
           'RX_PREPROCESSOR_TOKEN',
           'BEGIN_TOKEN',
           'END_TOKEN',
           'make_token',
           'nil_preprocessor',
           'PreprocessorToken',
           'RegExp',
           'RE',
           'Token',
           'mixin_comment',
           # 'UnaryOperator',
           # 'NaryOperator',
           'Synonym',
           'Optional',
           'ZeroOrMore',
           'OneOrMore',
           'Series',
           'Alternative',
           'FlowOperator',
           'Required',
           'Lookahead',
           'NegativeLookahead',
           'Lookbehind',
           'NegativeLookbehind',
           'last_value',
           'counterpart',
           'accumulate',
           'Capture',
           'Retrieve',
           'Pop',
           'Forward',
           'Compiler',
           'compile_source']
Eckhart Arnold's avatar
Eckhart Arnold committed
113

114

115

116
117
118
119
120
121
122
########################################################################
#
# Grammar and parsing infrastructure
#
########################################################################


123
PreprocessorFunc = Union[Callable[[str], str], partial]
124
125


126
127
128
129
130
LEFT_RECURSION_DEPTH = 20 if platform.python_implementation() == "PyPy" \
                          else 8 # type: int
# because of python's recursion depth limit, this value ought not to be set too high
MAX_DROPOUTS = 25  # type: int
# stop trying to recover parsing after so many errors
131

132

133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
class HistoryRecord:
    """
    Stores debugging information about one completed step in the
    parsing history.

    A parsing step is "completed" when the last one of a nested
    sequence of parser-calls returns. The call stack including
    the last parser call will be frozen in the ``HistoryRecord``-
    object. In addition a reference to the generated leaf node
    (if any) will be stored and the result status of the last
    parser call, which is either MATCH, FAIL (i.e. no match)
    or ERROR.
    """
    __slots__ = ('call_stack', 'node', 'remaining')

    MATCH = "MATCH"
    ERROR = "ERROR"
    FAIL = "FAIL"

    def __init__(self, call_stack: List['Parser'], node: Node, remaining: int) -> None:
        self.call_stack = call_stack    # type: List['Parser']
        self.node = node                # type: Node
        self.remaining = remaining      # type: int

    def err_msg(self) -> str:
        """Returns all error messages of the node, joined on one line."""
        joined = "; ".join(self.node._errors)
        return self.ERROR + ": " + joined.replace('\n', '\\')

    @property
    def stack(self) -> str:
        """Renders the call stack as "a->b->c"; anonymous RegExp parsers
        are represented by their regular expression."""
        labels = (repr(p) if p.ptype == ':RegExp' else p.name or p.ptype
                  for p in self.call_stack)
        return "->".join(labels)

    @property
    def status(self) -> str:
        """One of MATCH, FAIL or an ERROR message for this step."""
        if self.node is None:
            return self.FAIL
        if self.node._errors:
            return self.err_msg()
        return self.MATCH

    @property
    def extent(self) -> slice:
        """Slice of the document covered by this step (negative indices,
        counted from the end of the document)."""
        if self.node:
            return slice(-self.remaining - self.node.len, -self.remaining)
        return slice(-self.remaining, None)
174
175
176


def add_parser_guard(parser_func):
    """
    Wraps ``parser_func`` (a Parser's ``__call__``-method) with a guard
    that adds memoization of parsing results, breaking of left recursion,
    rollback of captured variables for discarded passes and (optional)
    history tracking.  Returns the wrapped function.
    """
    def guarded_call(parser: 'Parser', text: str) -> Tuple[Node, str]:
        try:
            # positions are encoded as the length of the remaining text,
            # i.e. they count backwards from the end of the document
            location = len(text)
            grammar = parser.grammar  # grammar may be 'None' for unconnected parsers!

            if not grammar.moving_forward__:
                # rollback variable changes from discarded parser passes
                if grammar.last_rb__loc__ <= location:
                    grammar.rollback_to__(location)
                grammar.moving_forward__ = True
                grammar.left_recursion_encountered__ = False

            if grammar.history_tracking__:
                grammar.call_stack__.append(parser)

            # if location has already been visited by the current parser,
            # return saved result
            if location in parser.visited:
                return parser.visited[location]
            # break left recursion at the maximum allowed depth
            if parser.recursion_counter.setdefault(location, 0) > LEFT_RECURSION_DEPTH:
                grammar.left_recursion_encountered__ = True
                return None, text

            parser.recursion_counter[location] += 1

            # run original __call__ method
            node, rest = parser_func(parser, text)

            if node is None:
                # retrieve an earlier match result (from left recursion)
                # if it exists
                node, rest = parser.visited.get(location, (None, rest))
                # don't overwrite any positive match (i.e. node not None) in the cache
                # and don't add empty entries for parsers returning from left recursive calls!
                if node is None and not grammar.left_recursion_encountered__:
                    # otherwise also cache None-results
                    parser.visited[location] = None, rest
            else:
                # variable manipulating parsers will be excluded, though,
                # because caching would interfere with changes of variable state
                if grammar.last_rb__loc__ > location:
                    # in case of left recursion, the first recursive step that
                    # matches will store its result in the cache
                    parser.visited[location] = (node, rest)
                grammar.last_node__ = node   # store last node for Lookbehind parser

            parser.recursion_counter[location] -= 1

            if grammar.history_tracking__:
                # don't track returning parsers except in case an error has occurred
                if grammar.moving_forward__ or (node and node._errors):
                    record = HistoryRecord(grammar.call_stack__.copy(), node, len(rest))
                    grammar.history__.append(record)
                    # print(record.stack, record.status, rest[:20].replace('\n', '|'))
                grammar.call_stack__.pop()
            grammar.moving_forward__ = False

        except RecursionError:
            # python's recursion limit was hit: synthesize an error node that
            # covers (at most) the first line of the remaining text
            node = Node(None, text[:min(10, max(1, text.find("\n")))] + " ...")
            node.add_error("maximum recursion depth of parser reached; "
                           "potentially due to too many errors!")
            rest = ''

        return node, rest

    return guarded_call


Eckhart Arnold's avatar
Eckhart Arnold committed
246
class ParserMetaClass(abc.ABCMeta):
    """
    Metaclass that transparently wraps the ``__call__``-method of every
    Parser class with the guard returned by ``add_parser_guard()``
    (memoization, left-recursion protection, history tracking).
    """
    def __init__(cls, name, bases, attrs):
        # The following condition is necessary for classes that don't override
        # the __call__() method, because in these cases the non-overridden
        # __call__()-method would be substituted a second time!
        guarded_parser_call = add_parser_guard(cls.__call__)
        if cls.__call__.__code__ != guarded_parser_call.__code__:
            cls.__call__ = guarded_parser_call
        super(ParserMetaClass, cls).__init__(name, bases, attrs)


Eckhart Arnold's avatar
Eckhart Arnold committed
257
class Parser(ParserBase, metaclass=ParserMetaClass):
    """
    Base class for all parsers.  Concrete parsers override the
    ``__call__``-method; through ``ParserMetaClass`` every
    ``__call__``-method is automatically wrapped by
    ``add_parser_guard()``.
    """

    # signature of functions that can be applied to a tree of parsers
    ApplyFunc = Callable[['Parser'], None]

    def __init__(self, name: str = '') -> None:
        # assert isinstance(name, str), str(name)
        super(Parser, self).__init__(name)
        self._grammar = None  # type: 'Grammar'
        self.reset()

    def __deepcopy__(self, memo):
        # NOTE(review): only the name is carried over; subclasses with
        # additional state must override __deepcopy__ accordingly.
        return self.__class__(self.name)

    def reset(self):
        """Resets the mutable, per-parse state (memoization cache,
        recursion counters, cycle detection) of the parser."""
        self.visited = dict()            # type: Dict[int, Tuple[Node, str]]
        self.recursion_counter = dict()  # type: Dict[int, int]
        self.cycle_detection = set()     # type: Set[Callable]
        return self

    def __call__(self, text: str) -> Tuple[Node, str]:
        return None, text  # default behaviour: don't match

    def __add__(self, other: 'Parser') -> 'Series':
        # operator syntax: p1 + p2  ==  Series(p1, p2)
        return Series(self, other)

    def __or__(self, other: 'Parser') -> 'Alternative':
        # operator syntax: p1 | p2  ==  Alternative(p1, p2)
        return Alternative(self, other)

    @property
    def grammar(self) -> 'Grammar':
        # the Grammar object this parser (copy) is connected to, or None
        return self._grammar

    @grammar.setter
    def grammar(self, grammar: 'Grammar'):
        self._grammar = grammar
        self._grammar_assigned_notifier()

    def _grammar_assigned_notifier(self):
        # hook for subclasses that need to react when connected to a grammar
        pass

    def apply(self, func: ApplyFunc):
        """
        Applies function `func(parser)` recursively to this parser and all
        descendants of the tree of parsers. The same function can never
        be applied twice between calls of the ``reset()``-method!
        """
        if func in self.cycle_detection:
            return False
        else:
            self.cycle_detection.add(func)
            func(self)
            return True


310
def mixin_comment(whitespace: str, comment: str) -> str:
    """
    Returns a regular expression that merges comment and whitespace
    regexps, so that comments can occur wherever whitespace is allowed
    and will be skipped just as implicit whitespace.

    Note, that because this works on the level of regular expressions,
    nesting comments is not possible. It also makes it much harder to
    use directives inside comments (which isn't recommended, anyway).
    """
    return '(?:{ws}(?:{cmt}{ws})*)'.format(ws=whitespace, cmt=comment)


324
class Grammar:
    """
    Central class for executing a tree of parsers rooted in ``root__``.

    Concrete grammars are defined by subclassing ``Grammar`` and
    assigning ``Parser``-objects to class attributes.  Upon the first
    instantiation the parsers' names are derived from the attribute
    names (see ``_assign_parser_names()``); each instance then receives
    a deep copy of the class' parser tree, so that instances do not
    share mutable parser state.
    """

    root__ = None  # type: Union[Parser, None]
    # root__ must be overwritten with the root-parser by grammar subclass
    parser_initialization__ = "pending"  # type: str
    # some default values
    COMMENT__ = r''  # r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'[\t ]*', comment=COMMENT__)
    wspL__ = ''
    wspR__ = WSP__

    @classmethod
    def _assign_parser_names(cls):
        """
        Initializes the `parser.name` fields of those
        Parser objects that are directly assigned to a class field with
        the field's name, e.g.
            class Grammar(Grammar):
                ...
                symbol = RE('(?!\\d)\\w+')
        After the call of this method symbol.name == "symbol"
        holds. Names assigned via the ``name``-parameter of the
        constructor will not be overwritten. Parser names starting or
        ending with a double underscore like ``root__`` will be
        ignored. See ``toolkit.sane_parser_name()``

        This is done only once, upon the first instantiation of the
        grammar class!

        Attention: If there exists more than one reference to the same
        parser, only the first one will be chosen for python versions
        greater or equal 3.6.  For python version <= 3.5 an arbitrarily
        selected reference will be chosen. See PEP 520
        (www.python.org/dev/peps/pep-0520/) for an explanation of why.
        """
        if cls.parser_initialization__ == "done":
            return
        cdict = cls.__dict__
        for entry, parser in cdict.items():
            if isinstance(parser, Parser) and sane_parser_name(entry):
                if not parser.name:
                    parser.name = entry
                if isinstance(parser, Forward) and (not parser.parser.name):
                    # propagate the name to the parser wrapped by Forward
                    parser.parser.name = entry
        cls.parser_initialization__ = "done"

    def __init__(self, root: Parser = None) -> None:
        # if not hasattr(self.__class__, 'parser_initialization__'):
        #     self.__class__.parser_initialization__ = "pending"
        if not hasattr(self.__class__, 'wspL__'):
            self.wspL__ = ''
        if not hasattr(self.__class__, 'wspR__'):
            self.wspR__ = ''
        self.all_parsers__ = set()  # type: Set[Parser]
        self.dirty_flag__ = False   # True after the first parser run
        self.history_tracking__ = False
        self._reset__()

        # prepare parsers in the class, first
        self._assign_parser_names()

        # then deep-copy the parser tree from class to instance;
        # parsers not connected to the root object will be copied later
        # on demand (see Grammar.__getitem__()). Usually, the need to
        # do so only arises during testing.
        self.root__ = root if root else copy.deepcopy(self.__class__.root__)

        if self.wspL__:
            self.wsp_left_parser__ = Whitespace(self.wspL__)  # type: ParserBase
            self.wsp_left_parser__.grammar = self
            self.all_parsers__.add(self.wsp_left_parser__)  # don't you forget about me...
        else:
            self.wsp_left_parser__ = ZOMBIE_PARSER
        if self.wspR__:
            self.wsp_right_parser__ = Whitespace(self.wspR__)  # type: ParserBase
            self.wsp_right_parser__.grammar = self
            self.all_parsers__.add(self.wsp_right_parser__)  # don't you forget about me...
        else:
            self.wsp_right_parser__ = ZOMBIE_PARSER
        self.root__.apply(self._add_parser__)

    def __getitem__(self, key):
        """
        Returns the parser named ``key``.  Parsers that have not yet
        been copied onto this grammar instance (because they are not
        connected to the root parser) are deep-copied on the fly.
        """
        try:
            return self.__dict__[key]
        except KeyError:
            parser_template = getattr(self, key, None)
            if parser_template:
                # add parser to grammar object on the fly...
                parser = copy.deepcopy(parser_template)
                parser.apply(self._add_parser__)
                # assert self[key] == parser
                return self[key]
            raise KeyError('Unknown parser "%s" !' % key)

    def _reset__(self):
        """Resets the per-run state of the grammar object."""
        self.document__ = ""          # type: str
        # variables stored and recalled by Capture and Retrieve parsers
        self.variables__ = dict()     # type: Dict[str, List[str]]
        self.rollback__ = []          # type: List[Tuple[int, Callable]]
        self.last_rb__loc__ = -1  # type: int
        # previously parsed node, needed by Lookbehind parser
        self.last_node__ = None       # type: Node
        # support for call stack tracing
        self.call_stack__ = []        # type: List[Parser]
        # snapshots of call stacks
        self.history__ = []           # type: List[HistoryRecord]
        # also needed for call stack tracing
        self.moving_forward__ = True  # type: bool
        self.left_recursion_encountered__ = False  # type: bool

    def _add_parser__(self, parser: Parser) -> None:
        """
        Adds the particular copy of the parser object to this
        particular instance of Grammar.
        """
        if parser.name:
            # prevent overwriting instance variables or parsers of a different class
            assert parser.name not in self.__dict__ or \
                   isinstance(self.__dict__[parser.name], parser.__class__), \
                ('Cannot add parser "%s" because a field with the same name '
                 'already exists in grammar object!' % parser.name)
            setattr(self, parser.name, parser)
        self.all_parsers__.add(parser)
        parser.grammar = self

    def __call__(self, document: str, start_parser="root__") -> Node:
        """
        Parses a document with with parser-combinators.

        Args:
            document (str): The source text to be parsed.
            start_parser (str): The name of the parser with which to
                start. This is useful for testing particular parsers
                (i.e. particular parts of the EBNF-Grammar.)
        Returns:
            Node: The root node ot the parse tree.
        """
        # assert isinstance(document, str), type(document)
        if self.root__ is None:
            raise NotImplementedError()
        if self.dirty_flag__:
            self._reset__()
            for parser in self.all_parsers__:
                parser.reset()
        else:
            self.dirty_flag__ = True
        self.history_tracking__ = is_logging()
        self.document__ = document
        self.last_rb__loc__ = len(document) + 1  # rollback location
        parser = self[start_parser] if isinstance(start_parser, str) else start_parser
        assert parser.grammar == self, "Cannot run parsers from a different grammar object!" \
                                       " %s vs. %s" % (str(self), str(parser.grammar))
        stitches = []  # type: List[Node]
        rest = document
        if not rest:
            result, ignore = parser(rest)
        # after a failure, skip to the next line and try again ("stitching"),
        # but give up after MAX_DROPOUTS failures
        while rest and len(stitches) < MAX_DROPOUTS:
            result, rest = parser(rest)
            if rest:
                fwd = rest.find("\n") + 1 or len(rest)
                skip, rest = rest[:fwd], rest[fwd:]
                if result is None:
                    error_msg = "Parser did not match! Invalid source file?"
                else:
                    stitches.append(result)
                    error_msg = "Parser stopped before end" + \
                                (("! trying to recover" +
                                  (" but stopping history recording at this point."
                                   if self.history_tracking__ else "..."))
                                 if len(stitches) < MAX_DROPOUTS
                                 else " too often! Terminating parser.")
                stitches.append(Node(None, skip))
                stitches[-1].add_error(error_msg)
                if self.history_tracking__:
                    # some parsers may have matched and left history records with nodes != None.
                    # Because these are not connected to the stiched root node, their pos
                    # properties will not be initialized by setting the root node's pos property
                    # to zero. Therefore, their pos properties need to be initialized here
                    for record in self.history__:
                        if record.node and record.node._pos < 0:
                            record.node.pos = 0
                    record = HistoryRecord(self.call_stack__.copy(), stitches[-1], len(rest))
                    self.history__.append(record)
                    self.history_tracking__ = False
        if stitches:
            if rest:
                stitches.append(Node(None, rest))
            result = Node(None, tuple(stitches))
        if any(self.variables__.values()):
            result.add_error("Capture-retrieve-stack not empty after end of parsing: "
                             + str(self.variables__))
        result.pos = 0  # calculate all positions
        return result

    def push_rollback__(self, location, func):
        """
        Adds a rollback function that either removes or re-adds
        values on the variable stack (`self.variables`) that have been
        added (or removed) by Capture or Pop Parsers, the results of
        which have been dismissed.
        """
        self.rollback__.append((location, func))
        self.last_rb__loc__ = location

    def rollback_to__(self, location):
        """
        Rolls back the variable stacks (`self.variables`) to its
        state at an earlier location in the parsed document.
        """
        while self.rollback__ and self.rollback__[-1][0] <= location:
            loc, rollback_func = self.rollback__.pop()
            assert not loc > self.last_rb__loc__
            rollback_func()
        # BUG FIX: the original line used '==' (a no-op comparison) instead
        # of '=', so last_rb__loc__ was never updated after a rollback.
        self.last_rb__loc__ = self.rollback__[-1][0] if self.rollback__ \
            else (len(self.document__) + 1)

    def log_parsing_history__(self, log_file_name: str = '') -> None:
        """
        Writes a log of the parsing history of the most recently parsed
        document.
        """
        def prepare_line(record):
            # renders one history record as (call stack, status, text excerpt)
            excerpt = self.document__.__getitem__(record.extent)[:25].replace('\n', '\\n')
            excerpt = "'%s'" % excerpt if len(excerpt) < 25 else "'%s...'" % excerpt
            return record.stack, record.status, excerpt

        def write_log(history, log_name):
            # writes the log lines, or removes a stale log file if empty
            path = os.path.join(log_dir(), log_name + "_parser.log")
            if history:
                with open(path, "w", encoding="utf-8") as f:
                    f.write("\n".join(history))
            elif os.path.exists(path):
                os.remove(path)

        if not log_file_name:
            name = self.__class__.__name__
            log_file_name = name[:-7] if name.lower().endswith('grammar') else name
        full_history, match_history, errors_only = [], [], []
        for record in self.history__:
            line = ";  ".join(prepare_line(record))
            full_history.append(line)
            if record.node and record.node.parser.ptype != WHITESPACE_PTYPE:
                match_history.append(line)
                if record.node.error_flag:
                    errors_only.append(line)
        write_log(full_history, log_file_name + '_full')
        write_log(match_history, log_file_name + '_match')
        write_log(errors_only, log_file_name + '_errors')
580
581


Eckhart Arnold's avatar
Eckhart Arnold committed
582
def dsl_error_msg(parser: Parser, error_str: str) -> str:
    """
    Returns an error message for errors in the parser configuration,
    e.g. errors that result in infinite loops.

    Args:
        parser (Parser):  The parser where the error was noticed. Note
            that this is not necessarily the parser that caused the
            error but only where the error became apparent.
        error_str (str):  A short string describing the error.
    Returns:
        str: An error message including the call stack if history
        tracking has been turned on in the grammar object.
    """
    parts = ["DSL parser specification error:", error_str,
             'Caught by parser "%s".' % str(parser)]
    history = parser.grammar.history__
    if history:
        parts.append("\nCall stack:")
        parts.append(history[-1].stack)
    else:
        parts.append("\nEnable history tracking in Grammar object to display call stack.")
    return " ".join(parts)


604

605
606
607
608
609
610
611
########################################################################
#
# Token and Regular Expression parser classes (i.e. leaf classes)
#
########################################################################


612
613
614
# FIX: use a raw string for the regex -- '\w' in a plain string literal is an
# invalid escape sequence (DeprecationWarning, a SyntaxError in future Python).
RX_PREPROCESSOR_TOKEN = re.compile(r'\w+')
# Magic delimiters for preprocessor tokens: ESC (0x1b) opens, FS (0x1c) closes.
BEGIN_TOKEN = '\x1b'
END_TOKEN = '\x1c'
615
616


617
def make_token(token: str, argument: str = '') -> str:
    """
    Turns the ``token`` and ``argument`` into a special token that
    will be caught by the `PreprocessorToken`-parser.

    This function is a support function that should be used by
    preprocessors to inject preprocessor tokens into the source text.
    """
    assert RX_PREPROCESSOR_TOKEN.match(token)
    assert BEGIN_TOKEN not in argument
    assert END_TOKEN not in argument
    return ''.join((BEGIN_TOKEN, token, argument, END_TOKEN))
630
631


632
def nil_preprocessor(text: str) -> str:
    """A preprocessor that does nothing, i.e. returns the text unchanged."""
    return text
634
635


636
class PreprocessorToken(Parser):
    """
    Parses tokens that have been inserted by a preprocessor.

    Preprocessors can generate Tokens with the ``make_token``-function.
    These tokens start and end with magic characters that can only be
    matched by the PreprocessorToken Parser. Such tokens can be used to
    insert BEGIN - END delimiters at the beginning or ending of a
    quoted block, for example.
    """

    def __init__(self, token: str) -> None:
        assert token and token.isupper()
        assert RX_PREPROCESSOR_TOKEN.match(token)
        super(PreprocessorToken, self).__init__(token)

    def __call__(self, text: str) -> Tuple[Node, str]:
        # a preprocessor token is BEGIN_TOKEN + name + argument + END_TOKEN;
        # anything malformed yields an error node rather than a hard failure
        if text[0:1] == BEGIN_TOKEN:
            end = text.find(END_TOKEN, 1)
            if end < 0:
                # no closing delimiter found in the rest of the text
                node = Node(self, '').add_error(
                    'END_TOKEN delimiter missing from preprocessor token. '
                    '(Most likely due to a preprocessor bug!)')  # type: Node
                return node, text[1:]
            elif end == 0:
                node = Node(self, '').add_error(
                    'Preprocessor-token cannot have zero length. '
                    '(Most likely due to a preprocessor bug!)')
                return node, text[2:]
            elif text.find(BEGIN_TOKEN, 1, end) >= 0:
                # a second BEGIN_TOKEN before the END_TOKEN: nested or broken token
                node = Node(self, text[len(self.name) + 1:end])
                node.add_error(
                    'Preprocessor-tokens must not be nested or contain '
                    'BEGIN_TOKEN delimiter as part of their argument. '
                    '(Most likely due to a preprocessor bug!)')
                return node, text[end:]
            if text[1:len(self.name) + 1] == self.name:
                # well-formed token with matching name: capture the argument
                return Node(self, text[len(self.name) + 1:end]), \
                       text[end + 1:]
        return None, text


class RegExp(Parser):
    """
    Regular expression parser.

    The RegExp-parser parses text that matches a regular expression.
    RegExp can also be considered as the "atomic parser", because all
    other parsers delegate part of the parsing job to other parsers,
    but do not match text directly.
    """

    def __init__(self, regexp, name: str = '') -> None:
        super(RegExp, self).__init__(name)
        # accept either a pattern string or an already compiled regex object
        if isinstance(regexp, str):
            self.regexp = re.compile(regexp)
        else:
            self.regexp = regexp

    def __deepcopy__(self, memo):
        # objects from the third-party `regex` module support deep copies,
        # objects from `re` do not; in the latter case fall back to
        # re-compiling from the pattern string
        try:
            duplicate = copy.deepcopy(self.regexp, memo)
        except TypeError:
            duplicate = self.regexp.pattern
        return RegExp(duplicate, self.name)

    def __call__(self, text: str) -> Tuple[Node, str]:
        # ESC starts a preprocessor token, which a plain regular
        # expression must never match
        if text[0:1] == BEGIN_TOKEN:
            return None, text
        match = self.regexp.match(text)
        if not match:
            return None, text
        pos = match.end()
        return Node(self, text[:pos]), text[pos:]

    def __repr__(self):
        return '/%s/' % self.regexp.pattern
709

710

711
712
class Whitespace(RegExp):
    """Dedicated parser class for insignificant whitespace."""
    # NOTE(review): the class name appears to have to stay in sync with
    # the WHITESPACE_PTYPE constant (parser types presumably derive from
    # the class name) — this assert guards that invariant; confirm
    # against the definition of ptype in the Parser base class.
    assert WHITESPACE_PTYPE == ":Whitespace"
713
714


715
class RE(Parser):
    """
    Regular Expressions with optional leading or trailing whitespace.

    The RE-parser parses pieces of text that match a given regular
    expression. Other than the ``RegExp``-Parser it can also skip
    "implicit whitespace" before or after the matched text.

    The whitespace is in turn defined by a regular expression. It
    should be made sure that this expression also matches the empty
    string, e.g. use r'\s*' or r'[\t ]+', but not r'\s+'. If the
    respective parameters in the constructor are set to ``None`` the
    default whitespace expression from the Grammar object will be used.
    """

    def __init__(self, regexp, wL=None, wR=None, name=''):
        """Constructor for class RE.

        Args:
            regexp (str or regex object):  The regular expression to be
                used for parsing.
            wL (str or regexp):  Left whitespace regular expression,
                i.e. either ``None``, the empty string or a regular
                expression (e.g. "\s*") that defines whitespace. An
                empty string means no whitespace will be skipped,
                ``None`` means that the default whitespace will be
                used.
            wR (str or regexp):  Right whitespace regular expression.
                See above.
            name:  The optional name of the parser.
        """
        super(RE, self).__init__(name)
        # the raw wL/wR arguments are remembered, because ``None`` means
        # "replace by the grammar's default whitespace" as soon as the
        # parser is assigned to a grammar (see _grammar_assigned_notifier)
        self.wL = wL
        self.wR = wR
        self.wspLeft = Whitespace(wL) if wL else ZOMBIE_PARSER
        self.wspRight = Whitespace(wR) if wR else ZOMBIE_PARSER
        self.main = RegExp(regexp)

    def __deepcopy__(self, memo=None):
        # BUGFIX: the former mutable default argument ``memo={}`` was
        # shared between all calls without an explicit memo dictionary
        # and could therefore deliver stale, already-copied objects.
        # ``copy.deepcopy`` accepts ``None`` as memo argument.
        try:
            regexp = copy.deepcopy(self.main.regexp, memo)
        except TypeError:
            # `re`-objects cannot be deep-copied; re-compile from pattern
            regexp = self.main.regexp.pattern
        return self.__class__(regexp, self.wL, self.wR, self.name)

    def __call__(self, text: str) -> Tuple[Node, str]:
        """Parses leading whitespace, the main expression and trailing
        whitespace; returns ``(None, text)`` if the main expression
        does not match."""
        t = text    # type: str
        wL, t = self.wspLeft(t)
        main, t = self.main(t)
        if main:
            wR, t = self.wspRight(t)
            # empty (whitespace) nodes are dropped from the result
            result = tuple(nd for nd in (wL, main, wR)
                           if nd and nd.result != '')
            return Node(self, result), t
        return None, text

    def __repr__(self):
        wL = '~' if self.wspLeft else ''
        wR = '~' if self.wspRight else ''
        return wL + '/%s/' % self.main.regexp.pattern + wR

    def _grammar_assigned_notifier(self):
        # use default whitespace parsers if not otherwise specified
        if self.grammar:
            if self.wL is None:
                self.wspLeft = self.grammar.wsp_left_parser__
            if self.wR is None:
                self.wspRight = self.grammar.wsp_right_parser__

    def apply(self, func: Parser.ApplyFunc):
        # propagate ``func`` to the whitespace parsers only if they were
        # explicitly configured (otherwise they are shared defaults)
        if super(RE, self).apply(func):
            if self.wL:
                self.wspLeft.apply(func)
            if self.wR:
                self.wspRight.apply(func)
            self.main.apply(func)


793
class Token(RE):
    """
    Class Token parses simple strings. Any regular expression commands
    will be interpreted as a simple sequence of characters.

    Other than that class Token is essentially a renamed version of
    class RE. Because tokens often have a particular semantic different
    from other REs, parsing them with a separate parser class allows to
    distinguish them by their parser type.
    """
    assert TOKEN_PTYPE == ":Token"

    def __init__(self, token: str, wL=None, wR=None, name: str = '') -> None:
        # remember the plain token string; the regular expression is
        # derived from it by escaping all regex meta-characters
        self.token = token
        super(Token, self).__init__(escape_re(token), wL, wR, name)

    def __deepcopy__(self, memo=None):
        # BUGFIX: replaced the mutable default argument ``memo={}``,
        # which was shared across calls. The memo dict is not needed
        # here, since only plain strings are passed on.
        return self.__class__(self.token, self.wL, self.wR, self.name)

    def __repr__(self):
        return '"%s"' % self.token if self.token.find('"') < 0 else "'%s'" % self.token
815

816
817
818
819
820
821
822
823
824

########################################################################
#
# Combinator parser classes (i.e. trunk classes of the parser tree)
#
########################################################################


class UnaryOperator(Parser):
    """
    Base class for all parsers that wrap exactly one other parser.
    """

    def __init__(self, parser: Parser, name: str = '') -> None:
        super(UnaryOperator, self).__init__(name)
        self.parser = parser  # type: Parser

    def __deepcopy__(self, memo):
        inner = copy.deepcopy(self.parser, memo)
        return self.__class__(inner, self.name)

    def apply(self, func: Parser.ApplyFunc):
        # descend into the wrapped parser, unless ``func`` has already
        # been applied to this parser before
        if super(UnaryOperator, self).apply(func):
            self.parser.apply(func)


class NaryOperator(Parser):
    """
    Base class for all parsers that combine an arbitrary number of
    other parsers.
    """

    def __init__(self, *parsers: Parser, name: str = '') -> None:
        super(NaryOperator, self).__init__(name)
        self.parsers = parsers  # type: Tuple[Parser, ...]

    def __deepcopy__(self, memo):
        duplicates = copy.deepcopy(self.parsers, memo)
        return self.__class__(*duplicates, name=self.name)

    def apply(self, func: Parser.ApplyFunc):
        # descend into all contained parsers, unless ``func`` has
        # already been applied to this parser before
        if super(NaryOperator, self).apply(func):
            for contained in self.parsers:
                contained.apply(func)


class Optional(UnaryOperator):
    """
    Matches the wrapped parser zero or one time; an Optional parser
    therefore always succeeds, possibly with an empty result.
    """

    def __init__(self, parser: Parser, name: str = '') -> None:
        super(Optional, self).__init__(parser, name)
        assert not isinstance(parser, Optional), \
            "Redundant nesting of options: %s(%s)" % \
            (str(name), str(parser.name))
        assert not isinstance(parser, Required), \
            "Nesting options with required elements is contradictory: " \
            "%s(%s)" % (str(name), str(parser.name))

    def __call__(self, text: str) -> Tuple[Node, str]:
        node, rest = self.parser(text)
        if not node:
            # no match: succeed anyway with an empty result
            return Node(self, ()), rest
        return Node(self, node), rest

    def __repr__(self):
        inner = self.parser.repr
        if isinstance(self.parser, Alternative) and not self.parser.name:
            # strip the brackets of an anonymous alternative
            inner = inner[1:-1]
        return '[' + inner + ']'
875
876

class ZeroOrMore(Optional):
    """
    Matches the wrapped parser any number of times, including zero.
    """

    def __call__(self, text: str) -> Tuple[Node, str]:
        children = ()  # type: Tuple[Node, ...]
        last_length = len(text) + 1
        # stop as soon as the text is exhausted or no progress is made
        while text and len(text) < last_length:
            last_length = len(text)
            node, text = self.parser(text)
            if not node:
                break
            if len(text) == last_length:
                # the wrapped parser matched the empty string, which
                # would loop forever without this guard
                node.add_error(dsl_error_msg(self, 'Infinite Loop detected.'))
            children += (node,)
        return Node(self, children), text

    def __repr__(self):
        inner = self.parser.repr
        if isinstance(self.parser, Alternative) and not self.parser.name:
            # strip the brackets of an anonymous alternative
            inner = inner[1:-1]
        return '{' + inner + '}'
893

894
895

class OneOrMore(UnaryOperator):
    """
    Matches the wrapped parser one or more times; fails if the wrapped
    parser does not match at least once.
    """

    def __init__(self, parser: Parser, name: str = '') -> None:
        super(OneOrMore, self).__init__(parser, name)
        assert not isinstance(parser, Optional), \
            "Use ZeroOrMore instead of nesting OneOrMore and Optional: " \
            "%s(%s)" % (str(name), str(parser.name))

    def __call__(self, text: str) -> Tuple[Node, str]:
        children = ()  # type: Tuple[Node, ...]
        remaining = text  # type: str
        last_length = len(text) + 1
        # stop as soon as the text is exhausted or no progress is made
        while remaining and len(remaining) < last_length:
            last_length = len(remaining)
            node, remaining = self.parser(remaining)
            if not node:
                break
            if len(remaining) == last_length:
                # the wrapped parser matched the empty string, which
                # would loop forever without this guard
                node.add_error(dsl_error_msg(self, 'Infinite Loop detected.'))
            children += (node,)
        if not children:
            # at least one match is required
            return None, text
        return Node(self, children), remaining

    def __repr__(self):
        inner = self.parser.repr
        if isinstance(self.parser, Alternative) and not self.parser.name:
            # strip the brackets of an anonymous alternative
            inner = inner[1:-1]
        return '{' + inner + '}+'
921

922

923
class Series(NaryOperator):
924
    def __init__(self, *parsers: Parser, name: str = '') -> None:
        """Constructor for class Series; requires at least one parser."""
        super(Series, self).__init__(*parsers, name=name)
        # an empty series would match nothing and makes no sense
        assert len(self.parsers) >= 1

928
929
930
    def __call__(self, text: str) -> Tuple[Node, str]:
        """Matches all contained parsers one after another.

        Fails (returns ``(None, text)``) as soon as any contained
        parser fails; on success returns a node containing the results
        of all parsers in order, together with the remaining text.
        """
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: str
        for parser in self.parsers:
            node, text_ = parser(text_)
            if not node:
                return None, text
            results += (node,)
            # stop extending the series after the first error, so that
            # the error is reported as close to its cause as possible
            if node.error_flag:
                break
        assert len(results) <= len(self.parsers)
        return Node(self, results), text_

941
942
943
    def __repr__(self):
        # EBNF-like notation: the contained parsers separated by blanks
        return " ".join(parser.repr for parser in self.parsers)

944
    def __add__(self, other: Parser) -> 'Series':
945
946
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
                        else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]