parse.py 173 KB
Newer Older
1
# parse.py - parser combinators for DHParser
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.
Eckhart Arnold's avatar
Eckhart Arnold committed
17

18
19

"""
20
Module ``parse`` contains the python classes and functions for
21
22
DHParser's packrat-parser. It's central class is the
``Grammar``-class, which is the base class for any concrete
23
24
25
26
27
28
29
30
Grammar. Grammar-objects are callable and parsing is done by
calling a Grammar-object with a source text as argument.

The different parsing functions are callable descendants of class
``Parser``. Usually, they are organized in a tree and defined
within the namespace of a grammar-class. See ``ebnf.EBNFGrammar``
for an example.
"""
31
32

from bisect import bisect_left
di68kap's avatar
di68kap committed
33
import functools
Eckhart Arnold's avatar
Eckhart Arnold committed
34
from collections import defaultdict, namedtuple
35
import copy
36
from typing import Callable, cast, List, Tuple, Set, AbstractSet, Dict, \
37
    DefaultDict, Sequence, Union, Optional, Iterator, Hashable, NamedTuple
38

eckhart's avatar
eckhart committed
39
from DHParser.configuration import get_config_value
Eckhart Arnold's avatar
Eckhart Arnold committed
40
from DHParser.error import Error, ErrorCode, MANDATORY_CONTINUATION, \
41
    UNDEFINED_RETRIEVE, PARSER_LOOKAHEAD_FAILURE_ONLY, \
eckhart's avatar
eckhart committed
42
    PARSER_LOOKAHEAD_MATCH_ONLY, PARSER_STOPPED_BEFORE_END, PARSER_NEVER_TOUCHES_DOCUMENT, \
43
44
45
    MALFORMED_ERROR_STRING, MANDATORY_CONTINUATION_AT_EOF, DUPLICATE_PARSERS_IN_ALTERNATIVE, \
    CAPTURE_WITHOUT_PARSERNAME, CAPTURE_DROPPED_CONTENT_WARNING, LOOKAHEAD_WITH_OPTIONAL_PARSER, \
    BADLY_NESTED_OPTIONAL_PARSER, BAD_ORDER_OF_ALTERNATIVES, BAD_MANDATORY_SETUP, \
46
    OPTIONAL_REDUNDANTLY_NESTED_WARNING, CAPTURE_STACK_NOT_EMPTY, BAD_REPETITION_COUNT, \
47
    AUTORETRIEVED_SYMBOL_NOT_CLEARED, RECURSION_DEPTH_LIMIT_HIT, SourceMapFunc
eckhart's avatar
eckhart committed
48
from DHParser.log import CallItem, HistoryRecord
49
from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME
50
from DHParser.stringview import StringView, EMPTY_STRING_VIEW
eckhart's avatar
eckhart committed
51
from DHParser.syntaxtree import ChildrenType, Node, RootNode, WHITESPACE_PTYPE, \
di68kap's avatar
di68kap committed
52
    TOKEN_PTYPE, ZOMBIE_TAG, EMPTY_NODE, EMPTY_PTYPE, ResultType
53
from DHParser.toolkit import sane_parser_name, escape_ctrl_chars, re, cython, \
54
    abbreviate_middle, RX_NEVER_MATCH, RxPatternType, linebreaks, line_col, identity
55

56

57
# Public API of this module: error types, the Parser/Grammar base classes,
# the concrete parser combinators, and helper functions.
__all__ = ('ParserError',
           'ApplyFunc',
           'FlagFunc',
           'ParseFunc',
           'Parser',
           'AnalysisError',
           'GrammarError',
           'Grammar',
           'Always',
           'Never',
           'AnyChar',
           'PreprocessorToken',
           'Text',
           'DropText',
           'RegExp',
           'update_scanner',
           'RE',
           'TKN',
           'Whitespace',
           'DropRegExp',
           'mixin_comment',
           'mixin_nonempty',
           'CombinedParser',
           'TreeReduction',
           'UnaryParser',
           'NaryParser',
           'Drop',
           'Synonym',
           'Option',
           'ZeroOrMore',
           'OneOrMore',
           'NO_MANDATORY',
           'MandatoryNary',
           'Series',
           'Alternative',
           'longest_match',
           'INFINITE',
           'Counted',
           'Interleave',
           'Required',
           'Lookahead',
           'NegativeLookahead',
           'Lookbehind',
           'NegativeLookbehind',
           'last_value',
           'optional_last_value',
           'matching_bracket',
           'Capture',
           'Retrieve',
           'Pop',
           'Forward')
108
109
110
111


########################################################################
#
112
# ParserError class
113
114
115
116
#
########################################################################


Eckhart Arnold's avatar
Eckhart Arnold committed
117
118
119
class ParserError(Exception):
    """
    A `ParserError` is thrown for those parser errors that allow the
    controlled re-entrance of the parsing process after the error occurred.
    If a reentry-rule has been configured for the parser where the error
    occurred, the parser guard can resume the parsing process.

    Currently, the only case when a `ParserError` is thrown (and not some
    different kind of error like `UnknownParserError`) is when a `Series`-
    or `Interleave`-parser detects a missing mandatory element.
    """

    def __init__(self,
                 parser: 'Parser',
                 node: Node,
                 rest: StringView,
                 error: Error, *,
                 first_throw: bool):
        assert node is not None
        self.parser = parser  # type: 'Parser'
        self.node = node      # type: Node
        self.rest = rest      # type: StringView
        self.error = error    # type: Error
        self.first_throw = first_throw   # type: bool
        # From here on the core attributes are locked against reassignment;
        # __setattr__ consults this set for every subsequent assignment.
        self.attributes_locked = frozenset({'parser', 'node', 'rest', 'error', 'first_throw'})
        self.frozen_callstack = tuple()  # type: Tuple[CallItem, ...]  # tag_name, location

    def __setattr__(self, name, value):
        # Guard clause: reject any write to a locked attribute; everything
        # else (including the lock-set itself) goes straight to __dict__.
        locked = self.__dict__.get('attributes_locked')
        if name != "attributes_locked" and locked is not None and name in locked:
            raise TypeError('Attribute %s of ParserError-object must not be reassigned!' % name)
        self.__dict__[name] = value

    def __str__(self):
        return (f"{self.node.pos}: {str(self.rest[:25])}    "
                f"{repr(self.node)} ({str(self.error)})")

    def new_PE(self, **kwargs):
        """Returns a new ParserError object with the same attribute values
        as `self`, except those that are reassigned in `**kwargs`.

        >>> pe = ParserError(Parser(), Node('test', ""), StringView(""), Error("", 0), first_throw=True)
        >>> pe_derived = pe.new_PE(first_throw = False)
        >>> pe.first_throw
        True
        >>> pe_derived.first_throw
        False
        """
        args = {"parser": self.parser,
                "node": self.node,
                "rest": self.rest,
                "error": self.error,
                "first_throw": self.first_throw}
        unknown = kwargs.keys() - args.keys()
        assert not unknown, str(unknown)
        args.update(kwargs)
        duplicate = ParserError(**args)
        duplicate.frozen_callstack = self.frozen_callstack
        return duplicate


180
PatternMatchType = Union[RxPatternType, str, Callable, 'Parser']
181
182
ErrorMessagesType = List[Tuple[PatternMatchType, str]]
ResumeList = List[PatternMatchType]  # list of strings or regular expressions
183
184
ReentryPointAlgorithm = Callable[[StringView, int, int], Tuple[int, int]]
# (text, start point, end point) => (reentry point, match length)
185
# A return value of (-1, x) means that no reentry point before the end of the document was found
Eckhart Arnold's avatar
Eckhart Arnold committed
186

187

di68kap's avatar
di68kap committed
188
189
# @cython.returns(cython.int)
# NOTE(review): lru_cache assumes that `rest` and `rules` are hashable
# (callers appear to pass a tuple of rules) and that the returned Node
# is not mutated by callers — TODO confirm both assumptions.
@functools.lru_cache()
@cython.locals(upper_limit=cython.int, closest_match=cython.int, pos=cython.int)
def reentry_point(rest: StringView,
                  rules: ResumeList,
                  comment_regex,
                  search_window: int = -1) -> Tuple[int, Node]:
    """
    Finds the point where parsing should resume after a ParserError has been caught.
    The algorithm makes sure that this reentry-point does not lie inside a comment.
    The re-entry point is always the point after the end of the match of the regular
    expression defining the re-entry point. (Use look ahead, if you want to define
    the re-entry point by what follows rather than by what text precedes the point.)

    REMARK: The algorithm assumes that any stretch of the document that matches
    `comment_regex` is actually a comment. It is possible to define grammars,
    where the use of comments is restricted to certain areas and that allow to
    use constructs that look like comments (i.e. will be matched by `comment_regex`)
    but are none in other areas. For example::

            my_string = "# This is not a comment"; foo()  # This is a comment
            bar()

    Here the reentry-algorithm would overlook `foo()` and jump directly to `bar()`.
    However, since the reentry-algorithm only needs to be good enough to do its
    work, this seems acceptable.

    :param rest:  The rest of the parsed text or, in other words, the point where
        a ParserError was thrown.
    :param rules: A list of strings, regular expressions or search functions.
        The rest of the text is searched for each of these. The closest match
        is the point where parsing will be resumed.
    :param comment_regex: A regular expression object that matches comments.
    :param search_window: The maximum size of the search window for finding the
        reentry-point. A value smaller than zero means that the complete remaining
        text will be searched. A value of zero effectively turns off resuming after
        error.
    :return: A tuple of integer index of the closest reentry point and a Node
        capturing all text from ``rest`` up to this point or ``(-1, None)`` if no
        reentry-point was found.
    """
    upper_limit = len(rest) + 1
    closest_match = upper_limit
    skip_node = None
    comments = None  # type: Optional[Iterator]
    if search_window < 0:
        search_window = len(rest)

    @cython.locals(a=cython.int, b=cython.int)
    def next_comment() -> Tuple[int, int]:
        """Returns the [start, end[ interval of the next comment in the text.
        The comment-iterator starts at the beginning of the `rest` of the
        document and is reset for each search rule.
        """
        nonlocal rest, comments
        if comments:
            try:
                m = next(comments)
                a, b = m.span()
                return rest.index(a), rest.index(b)
            except StopIteration:
                # exhausted: disable further comment lookups for this rule
                comments = None
        return -1, -2

    @cython.locals(start=cython.int)
    def str_search(s, start: int = 0) -> Tuple[int, int]:
        """Returns the starting position of the next occurrence of `s` in
        the `rest` of the document beginning with `start` and the length
        of the match, which in this case is always the length of `s` itself.
        If there is no match, the returned starting position will be -1.
        """
        nonlocal rest
        return rest.find(s, start, start + search_window), len(s)

    @cython.locals(start=cython.int, end=cython.int)
    def rx_search(rx, start: int = 0) -> Tuple[int, int]:
        """Returns the starting position and the length of the next match of
        the regular expression `rx` in the `rest` of the document, starting
        with `start`.
        If there is no match, the returned starting position will be -1.
        """
        nonlocal rest
        m = rest.search(rx, start, start + search_window)
        if m:
            begin, end = m.span()
            return rest.index(begin), end - begin
        return -1, 0

    def algorithm_search(func: ReentryPointAlgorithm, start: int = 0):
        """Returns the next match as a tuple of position and length that
        the reentry-point-search-function `func` yields.
        """
        nonlocal rest
        return func(rest, start, start + search_window)

    @cython.returns(cython.int)
    @cython.locals(a=cython.int, b=cython.int, k=cython.int, length=cython.int)
    def entry_point(search_func, search_rule) -> int:
        """Returns the next reentry-point outside a comment that `search_func`
        yields. If no reentry point is found, the first position after the
        end of the text ("upper limit") is returned."""
        a, b = next_comment()
        k, length = search_func(search_rule)
        # skip comments that lie entirely before the current match
        while a < b <= k + length:
            a, b = next_comment()
        # find next as long as start or end point of resume regex are inside a comment
        while (a < k < b) or (a < k + length < b):
            k, length = search_func(search_rule, b)
            while a < b <= k:
                a, b = next_comment()
        return k + length if k >= 0 else upper_limit

    # find closest match
    for rule in rules:
        # the comment iterator must be reset for every rule
        comments = rest.finditer(comment_regex)
        if isinstance(rule, Parser):
            # a rule may itself be a parser: reentry point is where it matches
            _node, _text = cast(Parser, rule)(rest)
            if _node:
                pos = len(rest) - len(_text)
                if pos < closest_match:
                    closest_match = pos
                    skip_node = _node
        else:
            # dispatch on the rule's type: search function, string or regex
            if callable(rule):
                search_func = algorithm_search
            elif isinstance(rule, str):
                search_func = str_search
            else:
                search_func = rx_search
            pos = entry_point(search_func, rule)
            if pos < closest_match:
                skip_node = None
                closest_match = pos

    # in case no rule matched return -1
    if closest_match == upper_limit:
        closest_match = -1
    if skip_node is None:
        # capture the skipped stretch of text in a zombie node
        skip_node = Node(ZOMBIE_TAG, rest[:max(closest_match,0)])
    return closest_match, skip_node
Eckhart Arnold's avatar
Eckhart Arnold committed
328
329


330
331
332
333
334
335
336
########################################################################
#
# Parser base class
#
########################################################################


337
338
339
ParsingResult = Tuple[Optional[Node], StringView]  # (matched node or None, remaining text)
MemoizationDict = Dict[int, ParsingResult]         # location -> memoized parsing result

ApplyFunc = Callable[[List['Parser']], Optional[bool]]
# The return value of `True` stops any further application
FlagFunc = Callable[[ApplyFunc, Set[ApplyFunc]], bool]
ParseFunc = Callable[[StringView], ParsingResult]
eckhart's avatar
eckhart committed
344
345


346
class Parser:
    """
    (Abstract) Base class for Parser combinator parsers. Any parser
    object that is actually used for parsing (i.e. no mock parsers)
    should be derived from this class.

    Since parsers can contain other parsers (see classes UnaryOperator
    and NaryOperator) they form a cyclical directed graph. A root
    parser is a parser from which all other parsers can be reached.
    Usually, there is one root parser which serves as the starting
    point of the parsing process. When speaking of "the root parser"
    it is this root parser object that is meant.

    There are two different types of parsers:

    1. *Named parsers* for which a name is set in field `parser.pname`.
       The results produced by these parsers can later be retrieved in
       the AST by the parser name.

    2. *Disposable parsers* where the name-field just contains the empty
       string. AST-transformation of disposable parsers can be hooked
       only to their class name, and not to the individual parser.

    Parser objects are callable and parsing is done by calling a parser
    object with the text to parse.

    If the parser matches it returns a tuple consisting of a node
    representing the root of the concrete syntax tree resulting from the
    match as well as the substring `text[i:]` where i is the length of
    matched text (which can be zero in the case of parsers like
    `ZeroOrMore` or `Option`). If `i > 0` then the parser has "moved
    forward".

    If the parser does not match it returns `(None, text)`. **Note** that
    this is not the same as an empty match `("", text)`. Any empty match
    can for example be returned by the `ZeroOrMore`-parser in case the
    contained parser is repeated zero times.

    Attributes and Properties:

        pname:  The parser's name.

        disposable: A property indicating that the parser returns
                anonymous nodes. For performance
                reasons this is implemented as an object variable rather
                than a property. This property should always be equal to
                `self.tag_name[0] == ":"`.

        drop_content: A property (for performance reasons implemented as
                simple field) that, if set, induces the parser not to return
                the parsed content or sub-tree if it has matched but the
                dummy `EMPTY_NODE`. In effect the parsed content will be
                dropped from the concrete syntax tree already. Only
                anonymous (or pseudo-anonymous) parsers are allowed to
                drop content.

        tag_name: The tag_name for the nodes that are created by
                the parser. If the parser is named, this is the same as
                `pname`, otherwise it is the name of the parser's type
                prefixed with a colon ":".

        eq_class: A unique number for the class of functionally
                equivalent parsers that this parser belongs to.
                (This serves the purpose of optimizing memoization,
                by tying memoization dictionaries to the classes
                of functionally equivalent parsers, rather than to
                the individual parsers themselves.)

        visited:  Mapping of places this parser has already been to
                during the current parsing process onto the results the
                parser returned at the respective place. This dictionary
                is used to implement memoizing.

        cycle_detection:  The apply()-method uses this variable to make
                sure that one and the same function will not be applied
                (recursively) a second time, if it has already been
                applied to this parser.

        proxied: The original `_parse()`-method is stored here, if a
                proxy (e.g. a tracing debugger) is installed via the
                `set_proxy()`-method.

        _grammar:  A reference to the Grammar object to which the parser
                is attached.

        _symbol:  The name of the closest named parser to which this
                parser is connected in a grammar. If pname is not the
                empty string, this will become the same as pname, when
                the property `symbol` is read for the first time.
    """

437
    def __init__(self) -> None:
        """Initializes the parser with default values: empty name,
        disposable, not dropping content, not yet attached to a grammar."""
        # assert isinstance(name, str), str(name)
        self.pname = ''               # type: str
        self.disposable = True        # type: bool
        self.drop_content = False     # type: bool
        self.tag_name = self.ptype    # type: str
        self.eq_class = id(self)      # type: int
        self.cycle_detection = set()  # type: Set[ApplyFunc]
        # this indirection is required for Cython-compatibility
        self._parse_proxy = self._parse  # type: ParseFunc
        # self.proxied = None           # type: Optional[ParseFunc]
        try:
            # attach to the global grammar placeholder until the parser is
            # adopted by a concrete Grammar object
            self._grammar = get_grammar_placeholder()  # type: Grammar
        except NameError:
            pass                      # ensures Cython-compatibility
        self._symbol = ''             # type: str
        self.reset()

    def __deepcopy__(self, memo):
        """Deepcopy method of the parser. Upon instantiation of a Grammar-
        object, parsers will be deep-copied to the Grammar object. If a
        derived parser-class changes the signature of the `__init__`-constructor,
        the `__deepcopy__`-method must be replaced (i.e. overridden without
        calling the same method from the superclass) by the derived class.
        """
        clone = type(self)()
        copy_parser_base_attrs(self, clone)
        return clone
465

466
    def __repr__(self):
di68kap's avatar
di68kap committed
467
        return self.pname + self.ptype
468
469

    def __str__(self):
di68kap's avatar
di68kap committed
470
        return self.pname + (' = ' if self.pname else '') + repr(self)
471

472
473
474
475
476
477
    @property
    def ptype(self) -> str:
        """Returns a type name for the parser. By default this is the name of
        the parser class with an added leading colon ':'. """
        return ':' + self.__class__.__name__

478
479
480
481
482
    @property
    def symbol(self) -> str:
        """Returns the symbol with which the parser is associated in a grammar.
        This is the closest parser with a pname that contains this parser."""
        if not self._symbol:
eckhart's avatar
eckhart committed
483
            try:
484
                self._symbol = self.grammar.associated_symbol__(self).pname
eckhart's avatar
eckhart committed
485
486
487
488
            except AttributeError:
                # return an empty string, if parser is not connected to grammar,
                # but be sure not to save the empty string in self._symbol
                return ''
489
490
        return self._symbol

491
492
493
    @property
    def repr(self) -> str:
        """Returns the parser's name if it has a name and self.__repr___() otherwise."""
di68kap's avatar
di68kap committed
494
        return self.pname if self.pname else self.__repr__()
495

496
497
498
499
    def reset(self):
        """Initializes or resets any parser variables. If overwritten,
        the `reset()`-method of the parent class must be called from the
        `reset()`-method of the derived class."""
500
501
        global _GRAMMAR_PLACEHOLDER
        grammar = self._grammar
502
        self.visited: MemoizationDict = grammar.get_memoization_dict__(self)
503

eckhart's avatar
eckhart committed
504
    @cython.locals(location=cython.int, gap=cython.int, i=cython.int)
    def __call__(self: 'Parser', text: StringView) -> ParsingResult:
        """Applies the parser to the given text. This is a wrapper method that adds
        the business intelligence that is common to all parsers. The actual parsing is
        done in the overridden method `_parse()`. This wrapper-method can be thought of
        as a "parser guard", because it guards the parsing process.

        :param text: the (remaining) text to which the parser is applied.
        :return: a tuple (node, rest) where node is the parsed result (or None
            in case of failure) and rest is the not yet parsed remainder of `text`.
        """
        grammar = self._grammar
        location = grammar.document_length__ - text._len  # faster then len(text)?

        try:
            # rollback variable changing operation if parser backtracks to a position
            # before or at the location where the variable changing operation occurred
            if location <= grammar.last_rb__loc__:
                grammar.rollback_to__(location)

            # if location has already been visited by the current parser, return saved result
            visited = self.visited  # using local variable for better performance
            if location in visited:
                # no history recording in case of memoized results!
                return visited[location]

            # memoization is suspended during the parser call and restored
            # afterwards, see the block following the try/except below
            memoization_state = grammar.suspend_memoization__
            grammar.suspend_memoization__ = False

            # now, the actual parser call!
            try:
                node, rest = self._parse_proxy(text)
            except ParserError as pe:
                # catching up with parsing after an error occurred
                gap = len(text) - len(pe.rest)
                rules = tuple(grammar.resume_rules__.get(
                    self.pname or grammar.associated_symbol__(self).pname, []))
                rest = pe.rest[len(pe.node):]
                i, skip_node = reentry_point(rest, rules, grammar.comment_rx__,
                                             grammar.reentry_search_window__)
                if i >= 0 or self == grammar.start_parser__:
                    # either a reentry point was found or the
                    # error has fallen through to the first level
                    assert pe.node._children or (not pe.node.result)
                    # apply reentry-rule or catch error at root-parser
                    if i < 0:  i = 0
                    try:
                        zombie = pe.node.pick_child(ZOMBIE_TAG)  # type: Optional[Node]
                    except (KeyError, ValueError):
                        zombie = None
                    if zombie and not zombie.result:
                        # reuse the empty zombie node to capture the skipped text
                        zombie.result = rest[:i]
                        tail = tuple()  # type: ChildrenType
                    else:
                        # nd.attr['err'] = pe.error.message
                        tail = (skip_node,)
                    rest = rest[i:]
                    if pe.first_throw:
                        node = pe.node
                        node.result = node._children + tail
                    else:
                        node = Node(
                            self.tag_name,
                            (Node(ZOMBIE_TAG, text[:gap]).with_pos(location), pe.node) + tail) \
                            .with_pos(location)
                # if no re-entry point was found, do any of the following:
                elif pe.first_throw:
                    # just fall through
                    # TODO: Is this case still needed with module "trace"?
                    raise pe.new_PE(first_throw=False)
                elif grammar.tree__.errors[-1].code == MANDATORY_CONTINUATION_AT_EOF:
                    # try to create tree as faithful as possible
                    node = Node(self.tag_name, pe.node).with_pos(location)
                else:
                    # fall through but skip the gap
                    result = (Node(ZOMBIE_TAG, text[:gap]).with_pos(location), pe.node) if gap \
                        else pe.node  # type: ResultType
                    raise pe.new_PE(node=Node(self.tag_name, result).with_pos(location),
                                    rest=text, first_throw=False)

            if node is None:
                # record the farthest position at which a parser failed,
                # presumably used for error reporting — see ff_parser__ usage
                if location > grammar.ff_pos__:
                    grammar.ff_pos__ = location
                    grammar.ff_parser__ = self
            else:
                node._pos = location
            if not grammar.suspend_memoization__:
                visited[location] = (node, rest)
                grammar.suspend_memoization__ = memoization_state

        except RecursionError:
            # Python's recursion limit was hit (e.g. deeply nested or
            # left-recursive grammars): emit a zombie node with an error
            # attached instead of crashing the whole parsing run
            node = Node(ZOMBIE_TAG, str(text[:min(10, max(1, text.find("\n")))]) + " ...")
            node._pos = location
            error = Error("maximum recursion depth of parser reached; potentially due to too many "
                          "errors or left recursion!", location, RECURSION_DEPTH_LIMIT_HIT)
            grammar.tree__.add_error(node, error)
            grammar.most_recent_error__ = ParserError(self, node, text, error, first_throw=False)
            rest = EMPTY_STRING_VIEW

        return node, rest
600
601
602
603

    def __add__(self, other: 'Parser') -> 'Series':
        """The + operator generates a series-parser that applies two
        parsers in sequence. If `other` is itself a Series, this parser
        is prepended to it instead of nesting two Series objects."""
        if not isinstance(other, Series):
            return Series(self, other)
        return cast('Series', other).__radd__(self)

    def __or__(self, other: 'Parser') -> 'Alternative':
        """The | operator generates an alternative parser that applies
        the first parser and, if that does not match, the second parser.
        If `other` is itself an Alternative, this parser is prepended to
        it instead of nesting two Alternative objects."""
        if not isinstance(other, Alternative):
            return Alternative(self, other)
        return cast('Alternative', other).__ror__(self)

616
    def __mul__(self, other: 'Parser') -> 'Interleave':
        """The * operator generates an interleave parser that applies
        the first parser and the second parser in any possible order
        until both match. If `other` is itself an Interleave, this parser
        is merged into it instead of nesting two Interleave objects."""
        if not isinstance(other, Interleave):
            return Interleave(self, other)
        return cast(Interleave, other).__rmul__(self)

625
    def _parse(self, text: StringView) -> ParsingResult:
        """Applies the parser to the given `text`. Returns the resulting
        node (or None, if the parser did not match) together with the
        text remaining behind the matched part. Concrete parser classes
        must override this method."""
        raise NotImplementedError

eckhart's avatar
eckhart committed
631
    def is_optional(self) -> Optional[bool]:
        """Reports whether this parser can fail: `True` means the parser
        never fails, i.e. never yields `None` instead of a node; `False`
        means the parser can fail; `None` means it is not known whether
        the parser can fail."""
        return None

638
    def set_proxy(self, proxy: Optional[ParseFunc]):
        """Installs `proxy` as a replacement of the `_parse()`-method,
        e.g. in order to hook in a tracing debugger (see module `trace`).
        Calling `set_proxy` with `None` removes a previously installed
        proxy and restores the plain `_parse()`-method."""
        if proxy is None:
            self._parse_proxy = self._parse
            return
        if isinstance(proxy, type(self._parse)):
            # a bound method must be a method of this very parser
            assert proxy.__self__ == self
        else:
            # a plain function: bind it to self so it acts like a method
            proxy = proxy.__get__(self, type(self))
        self._parse_proxy = cast(ParseFunc, proxy)
653

Eckhart Arnold's avatar
Eckhart Arnold committed
654
    def name(self, pname: str, disposable: bool = False) -> 'Parser':
        """Assigns the name `pname` to the parser, derives the tag-name
        from it (falling back to `ptype` for unnamed parsers), marks the
        parser as disposable if requested and returns `self`."""
        self.pname = pname
        self.disposable = disposable
        self.tag_name = pname if pname else self.ptype
        return self

661
    @property
    def grammar(self) -> 'Grammar':
        """The Grammar-object this parser is connected to. Accessing the
        property on a parser placeholder raises an AttributeError."""
        try:
            return self._grammar
        except (AttributeError, NameError):
            raise AttributeError('Parser placeholder does not have a grammar!')

    @grammar.setter
    def grammar(self, grammar: 'Grammar'):
        try:
            if is_grammar_placeholder(self._grammar):
                self._grammar = grammar
                # self._grammar_assigned_notifier()
            elif self._grammar != grammar:
                # fix: message previously lacked a space ("assignedto")
                raise AssertionError("Parser has already been assigned "
                                     "to a different Grammar object!")
        except AttributeError:
            pass  # ignore setting of grammar attribute for placeholder parser
        except NameError:  # Cython: No access to _GRAMMAR_PLACEHOLDER, yet :-(
            self._grammar = grammar
685

eckhart's avatar
eckhart committed
686
    def sub_parsers(self) -> Tuple['Parser', ...]:
        """Returns the tuple of immediate sub-parsers, if there are any.
        The base implementation has none; Unary, Nary and Forward
        override this method."""
        return ()
691

692
    def _apply(self, func: ApplyFunc, context: List['Parser'], flip: FlagFunc) -> bool:
        """
        Recursively applies `func(parser)` to this parser and all of its
        descendants (pre-order) for as long as `func()` returns `None` or
        `False`. As soon as `func` returns a truthy value, traversal stops
        and `True` is returned; otherwise `False` is returned.

        Cycles are broken by means of `flip`, which reports whether this
        parser has already been visited and, if not, toggles its
        cycle-detection flag.

        This is a protected method and should not be called from outside
        class Parser or its descendants; external callers use `apply()`.
        """
        if flip(func, self.cycle_detection):
            return False  # already visited: break the cycle here
        if func(context + [self]):
            return True
        return any(child._apply(func, context + [self], flip)
                   for child in self.sub_parsers())
715

716
    def apply(self, func: ApplyFunc) -> Optional[bool]:
        """
        Applies function `func(parser)` recursively to this parser and all
        descendant parsers in pre-order, for as long as `func()` returns
        `None` or `False`. As soon as `func` returns `True`, traversal
        stops and `True` is returned. If every parser has been visited
        without `func` signalling a stop, `False` is returned.

        This return-value convention allows `apply` to serve two purposes:
        running a test over the descendant parsers (including self) that
        may already be decided after visiting only some of them, and
        simply applying a procedure to every descendant parser (including
        self), since a procedure that returns `None` just means "carry on".
        """
        def mark(f: ApplyFunc, flagged: AbstractSet[ApplyFunc]) -> bool:
            """Positive direction of cycle-detection: `f in flagged` is
            read as "f has already been applied to this parser"."""
            if f in flagged:
                return True
            flagged.add(f)
            return False

        def unmark(f: ApplyFunc, flagged: AbstractSet[ApplyFunc]) -> bool:
            """Negative direction of cycle-detection: `f not in flagged`
            is read as "f has already been applied to this parser"."""
            if f not in flagged:
                return True
            flagged.remove(f)
            return False

        # alternate the direction of the cycle-detection flags so that
        # they need not be reset between successive calls of apply()
        if func in self.cycle_detection:
            return self._apply(func, [], unmark)
        return self._apply(func, [], mark)
758

759
760
761
762
763
764
765
766
767
768
769
770
    def _signature(self) -> Hashable:
        """This method should be implemented by any non-abstract descendant
        parser class. The implementation must make sure that all instances
        that have the same signature always yield the same parsing result
        for the same piece of text.

        It does not hurt, but only wastes an opportunity for optimization,
        if two functionally equivalent parsers provide different signatures.
        It is a serious mistake, though, if two functionally non-equivalent
        parsers have the same signature.

        By returning `id(self)` this mistake will become impossible, but
Eckhart Arnold's avatar
Eckhart Arnold committed
771
        it will turn signature-based memoization-optimization off for
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
        this parser.
        """
        return id(self)

    def signature(self) -> Hashable:
        """Returns a value that is identical for two distinct parser
        objects whenever they are functionally equivalent, i.e. yield the
        same result for the same call parameters. For example, two
        separately created `Text('[')` instances share a signature, while
        `Text('[')` and `Text(']')` do not.

        Parser-signatures serve to optimize memoization in cases of code
        repetition within the grammar. Named parsers are identified by
        their parser-name; unnamed parsers delegate to `_signature()`.

        DO NOT OVERRIDE THIS METHOD. Override the protected method
        `_signature` instead in order to implement a signature function.
        """
        return self.pname or self._signature()


801
    def static_error(self, msg: str, code: ErrorCode) -> 'AnalysisError':
        """Convenience method: wraps `msg` and `code` into an
        AnalysisError attributed to this parser and its symbol."""
        return AnalysisError(self.symbol, self, Error(msg, 0, code))
803

eckhart's avatar
eckhart committed
804
    def static_analysis(self) -> List['AnalysisError']:
        """Hook for checking the parser for logical errors after the
        grammar has been instantiated. The base implementation finds
        none and returns an empty list."""
        return []
eckhart's avatar
eckhart committed
808

809

810
def copy_parser_base_attrs(src: Parser, duplicate: Parser):
    """Copies all base attributes of the Parser-class from `src` to
    `duplicate`: the parser name, disposability, content-dropping flag,
    tag-name and equivalence class."""
    for attr in ('pname', 'disposable', 'drop_content', 'tag_name', 'eq_class'):
        setattr(duplicate, attr, getattr(src, attr))


def determine_eq_classes(root: Parser):
    """Partitions the parsers originating in `root` (imperfectly) into
    equivalence classes and stores the class identifier in the
    `eq_class`-field of each parser. Parsers with the same signature fall
    into the same class; the id of the first parser encountered with a
    given signature serves as the class identifier."""
    classes: Dict[Hashable, int] = {}

    def tag_with_class(parser_stack: List[Parser]) -> bool:
        current = parser_stack[-1]
        current.eq_class = classes.setdefault(current.signature(), id(current))
        return False  # never stop early: visit every parser

    root.apply(tag_with_class)
Eckhart Arnold's avatar
Eckhart Arnold committed
833

di68kap's avatar
di68kap committed
834

eckhart's avatar
eckhart committed
835
836
def Drop(parser: Parser) -> Parser:
    """Returns `parser` with its `parser.drop_content`-property set to
    `True`, so that the content it matches will be dropped. For a
    Forward-parser the flag is set on the delegated parser as well.
    Only anonymous (disposable) parsers may drop their content."""
    # fix: typo in the assertion message ("ist" -> "its")
    assert parser.disposable, "Parser must be anonymous to be allowed to drop its content."
    if isinstance(parser, Forward):
        cast(Forward, parser).parser.drop_content = True
    parser.drop_content = True
    return parser
eckhart's avatar
eckhart committed
842
843


844
PARSER_PLACEHOLDER = None  # type: Optional[Parser]
# Never read PARSER_PLACEHOLDER directly; always go through
# get_parser_placeholder(), which creates the singleton lazily.


def get_parser_placeholder() -> Parser:
    """Returns the singleton placeholder-parser, creating it on first
    use. The placeholder is constructed via `Parser.__new__` so that
    `Parser.__init__` and its grammar-bookkeeping are bypassed."""
    global PARSER_PLACEHOLDER
    if PARSER_PLACEHOLDER is None:
        placeholder = Parser.__new__(Parser)  # i.e. Parser() without __init__
        placeholder.pname = ''
        placeholder.disposable = False
        placeholder.drop_content = False
        placeholder.tag_name = ':PLACEHOLDER__'
        placeholder.eq_class = id(placeholder)
        PARSER_PLACEHOLDER = placeholder
    return cast(Parser, PARSER_PLACEHOLDER)
858
859


eckhart's avatar
eckhart committed
860
861
862
863
864
def is_parser_placeholder(parser: Optional[Parser]) -> bool:
    """Returns True, if `parser` is `None` or merely a placeholder for a parser."""
    if not parser:
        return True
    return parser.ptype == ":Parser"


eckhart's avatar
eckhart committed
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
# functions for analysing the parser tree/graph ###


def has_non_autocaptured_symbols(context: List[Parser]) -> Optional[bool]:
    """Returns True, if `context` contains a Capture-parser that is not
    shielded by a Retrieve-parser, i.e. a captured symbol that is not
    "auto-captured" by a Retrieve-parser. Returns None otherwise."""
    for parser in context:
        ptype = parser.ptype
        if ptype == ":Retrieve":
            break  # everything further down is shielded by the retrieve
        if ptype == ":Capture":
            inner = cast(UnaryParser, parser).parser
            # skip over transparent wrappers to reach the effective parser
            while inner.ptype in (":Synonym", ":Forward"):
                inner = cast(UnaryParser, inner).parser
            if not isinstance(inner, Retrieve):
                return True
    return None


885
886
887
888
889
890
########################################################################
#
# Grammar class, central administration of all parser of a grammar
#
########################################################################

891
892
def mixin_comment(whitespace: str, comment: str) -> str:
    """
    Merges the comment- and whitespace-regexps into a single regular
    expression pattern, so that comments may occur wherever whitespace
    is allowed and will be skipped just like implicit whitespace.

    Note, that because this works on the level of regular expressions,
    nesting comments is not possible. It also makes it much harder to
    use directives inside comments (which isn't recommended, anyway).
    """
    if not comment:
        return whitespace
    ws_group = '(?:' + whitespace + ')'
    comment_group = '(?:' + comment + ')'
    return '(?:' + ws_group + '(?:' + comment_group + ws_group + ')*)'
906
907


di68kap's avatar
di68kap committed
908
def mixin_nonempty(whitespace: str) -> str:
    r"""
    Returns a regular expression pattern that matches only if the regular
    expression pattern `whitespace` matches AND if the match is not empty.
    If `whitespace` cannot match the empty string '' anyway, it is
    returned unaltered.

    WARNING: `mixin_nonempty` does not work for regular expressions the
    matched strings of which can be followed by a symbol that can also
    occur at the start of the regular expression.

    In particular, it does not work for fixed size regular expressions,
    that is / / or /   / or /\t/ won't work, but / */ or /\s*/ or /\s+/
    do work. There is no test for this. Fixed-size regular expressions
    run through `mixin_nonempty` will not match any more if they are
    applied to the beginning or the middle of a sequence of whitespaces!

    In order to be safe, your whitespace regular expressions should follow
    the rule: "Whitespace cannot be followed by whitespace" or "Either
    grab it all or leave it all".

    :param whitespace: a regular expression pattern
    :return: new regular expression pattern that does not match the empty
        string '' any more.
    """
    if not re.match(whitespace, ''):
        return whitespace
    return r'(?:(?=(.|\n))' + whitespace + r'(?!\1))'


Eckhart Arnold's avatar
Eckhart Arnold committed
939
940
941
942
943
944
945
946
947
948
949
950
# Result record of static grammar analysis. collections.namedtuple is
# used instead of typing.NamedTuple for Cython compatibility.
AnalysisError = namedtuple('AnalysisError',
                           ['symbol',   # type: str    # grammar symbol concerned
                            'parser',   # type: Parser # the parser that is flagged
                            'error'],   # type: Error  # the error that was found
                           module=__name__)
951
952
953
954
955
956


class GrammarError(Exception):
    """GrammarError will be raised if static analysis reveals errors
    in the grammar.
    """
    def __init__(self, static_analysis_result: List[AnalysisError]):
        assert static_analysis_result  # must not be empty
        self.errors = static_analysis_result

    def __str__(self):
        # a single error is rendered plainly, several errors are numbered
        if len(self.errors) == 1:
            return str(self.errors[0][2])
        numbered = ("%i. " % (i + 1) + str(entry[2])
                    for i, entry in enumerate(self.errors))
        return '\n' + '\n'.join(numbered)


968
969
970
# Attribute names used by the Grammar-machinery itself; they must not be
# used as symbol (i.e. parser) names within a concrete grammar.
RESERVED_PARSER_NAMES = ('root__', 'dwsp__', 'wsp__', 'comment__', 'root_parser__', 'ff_parser__')


971
972
973
974
975
976
977
978
979
980
981
982
983
class Grammar:
    r"""
    Class Grammar directs the parsing process and stores global state
    information of the parsers, i.e. state information that is shared
    accross parsers.

    Grammars are basically collections of parser objects, which are
    connected to an instance object of class Grammar. There exist two
    ways of connecting parsers to grammar objects: Either by passing
    the root parser object to the constructor of a Grammar object
    ("direct instantiation"), or by assigning the root parser to the
    class variable "root__" of a descendant class of class Grammar.

eckhart's avatar
eckhart committed
984
    Example for direct instantiation of a grammar::
985

986
        >>> number = RE(r'\d+') + RE(r'\.') + RE(r'\d+') | RE(r'\d+')
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
        >>> number_parser = Grammar(number)
        >>> number_parser("3.1416").content
        '3.1416'

    Collecting the parsers that define a grammar in a descendant class of</