ParserCombinators.py 79.3 KB
Newer Older
Eckhart Arnold's avatar
Eckhart Arnold committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#!/usr/bin/python3

"""ParserCombinators.py - parser combinators for left-recursive grammers

Copyright 2016  by Eckhart Arnold

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
15
16
17
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
Eckhart Arnold's avatar
Eckhart Arnold committed
18
19


20
21
22
23
24
Module parser_combinators contains a number of classes that together
make up parser combinators for left-recursive grammers. For each
element of the extended Backus-Naur-Form as well as for a regular
expression token a class is defined. The set of classes can be used to
define a parser for (ambiguous) left-recursive grammers.
Eckhart Arnold's avatar
Eckhart Arnold committed
25
26
27
28
29


References and Acknowledgements:

Dominikus Herzberg: Objekt-orientierte Parser-Kombinatoren in Python,
30
31
Blog-Post, September, 18th 2008 on denkspuren. gedanken, ideen,
anregungen und links rund um informatik-themen, URL:
Eckhart Arnold's avatar
Eckhart Arnold committed
32
33
http://denkspuren.blogspot.de/2008/09/objekt-orientierte-parser-kombinatoren.html

34
35
36
Dominikus Herzberg: Eine einfache Grammatik für LaTeX, Blog-Post,
September, 18th 2008 on denkspuren. gedanken, ideen, anregungen und
links rund um informatik-themen, URL:
Eckhart Arnold's avatar
Eckhart Arnold committed
37
38
39
http://denkspuren.blogspot.de/2008/09/eine-einfache-grammatik-fr-latex.html

Dominikus Herzberg: Uniform Syntax, Blog-Post, February, 27th 2007 on
40
41
denkspuren. gedanken, ideen, anregungen und links rund um
informatik-themen, URL:
Eckhart Arnold's avatar
Eckhart Arnold committed
42
43
http://denkspuren.blogspot.de/2007/02/uniform-syntax.html

44
45
46
47
Richard A. Frost, Rahmatullah Hafiz and Paul Callaghan: Parser
Combinators for Ambiguous Left-Recursive Grammars, in: P. Hudak and
D.S. Warren (Eds.): PADL 2008, LNCS 4902, pp. 167–181, Springer-Verlag
Berlin Heidelberg 2008.
Eckhart Arnold's avatar
Eckhart Arnold committed
48
49
50

Juancarlo Añez: grako, a PEG parser generator in Python,
https://bitbucket.org/apalala/grako
51

Eckhart Arnold's avatar
Eckhart Arnold committed
52
53
"""

54
# TODO: Replace copy.deepcopy() call in GrammarBase class by custom copy()-methods in the Parser classes. Is that really better?
Eckhart Arnold's avatar
Eckhart Arnold committed
55
56
57
58


import collections
import copy
59
60
import enum
from functools import partial
61
import hashlib
Eckhart Arnold's avatar
Eckhart Arnold committed
62
63
import keyword
import os
64
from typing import NamedTuple
Eckhart Arnold's avatar
Eckhart Arnold committed
65
66
67
68
69
70
71
72
73

# try:
#     import regex as re
# except ImportError:
#     import re
import re       # as of now use `re` - even hough `regex` appears to be better
import sys


74
__version__ = '0.5.2' + '_dev' + str(os.stat(__file__).st_mtime)
75
76


77
DEBUG = "DEBUG"
Eckhart Arnold's avatar
Eckhart Arnold committed
78
79


80
81
82
def DEBUG_DIR():
    global DEBUG
    dirname = ""
Eckhart Arnold's avatar
Eckhart Arnold committed
83
    if DEBUG:
84
85
        if os.path.exists(DEBUG):
            dirname = DEBUG if os.path.isdir(DEBUG) else ""
Eckhart Arnold's avatar
Eckhart Arnold committed
86
        else:
87
88
89
            os.mkdir(DEBUG)
            dirname = DEBUG
    return dirname
Eckhart Arnold's avatar
Eckhart Arnold committed
90
91


92
93
94
def DEBUG_FILE_NAME(grammar_base):
    name = grammar_base.__class__.__name__
    return name[:-7] if name.endswith('Grammar') else name
Eckhart Arnold's avatar
Eckhart Arnold committed
95
96


97
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
98
99
100
#
# Scanner / Preprocessor support
#
101
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
102
103
104
105
106
107
108
109


RX_SCANNER_TOKEN = re.compile('\w+')
BEGIN_SCANNER_TOKEN = '\x1b'
END_SCANNER_TOKEN = '\x1c'


def make_token(token, argument=''):
110
111
112
113
114
115
116
117
118
119
120
121
    """
    Turns the ``token`` and ``argument`` into a special token that
    will be caught by the `ScannerToken`-parser.

    Args:
        token (str): The token to be wrapped into a scanner token
        argument (str): The optional argument string added to the token

    Returns (str):
        The scanner token, starting with the `BEGIN_SCANNER_TOKEN`-
        character and ending with the `END_SCANNER_TOKEN`-character.
    """
Eckhart Arnold's avatar
Eckhart Arnold committed
122
123
124
    assert RX_SCANNER_TOKEN.match(token)
    assert argument.find(BEGIN_SCANNER_TOKEN) < 0
    assert argument.find(END_SCANNER_TOKEN) < 0
125

Eckhart Arnold's avatar
Eckhart Arnold committed
126
127
128
129
130
131
    return BEGIN_SCANNER_TOKEN + token + argument + END_SCANNER_TOKEN


nil_scanner = lambda text: text


132
133

########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
134
135
136
#
# Parser tree
#
137
138
########################################################################

Eckhart Arnold's avatar
Eckhart Arnold committed
139
140
141

def line_col(text, pos):
    """
142
143
144
145
146
147
148
149
150
151
152
153
    Returns the position within a text as (line, column)-tuple.

    Args:
        text (str):  The text for which the line and column of the
                position ``pos`` shall be returned
        pos (int):  The position in the text starting with 0, e.g. 2536
                means the 2537th character. Must be lower than the
                text's length
    Returns (tuple):
        The line and column of the given position
    """
    assert pos < len(text)
Eckhart Arnold's avatar
Eckhart Arnold committed
154
155
156
157
158
    line = text.count("\n", 0, pos) + 1
    column = pos - text.rfind("\n", 0, pos)
    return line, column


159
class ZombieParser:
160
    """Serves as a substitute for a Parser instance. Required by
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
    `Node`-objects, for example. The ZOMBIE_PARSER has a name and can
    be called, but it never matches.
    """
    alive = False
    def __init__(self):
        assert not self.__class__.alive, "There can be only one!"
        assert self.__class__ == ZombieParser, "No derivatives, please!"
        self.name = "ZOMBIE"
        self.__class__.alive = True

    def __str__(self):
        return self.name

    def __call__(self, text):
        """Better call Saul ;-)"""
        return None, text


ZOMBIE_PARSER = ZombieParser()


class Error(NamedTuple):
    pos: int
    msg: str
185
186


Eckhart Arnold's avatar
Eckhart Arnold committed
187
class Node:
188
189
190
191
    """
    Represents a node in the concrete or abstract syntax tree.

    Attributes:
192
        tag (str):  The name of the node, which is either its parser's
193
194
195
196
197
                name or, if that is empty, the parser's class name
        result (str or tuple):  The result of the parser which
                generated this node, which can be either a string or a
                tuple of child nodes.
        children (tuple):  The tuple of child nodes or an empty tuple
Eckhart Arnold's avatar
Eckhart Arnold committed
198
                if there are no child nodes. READ ONLY!
199
        parser (Parser):  The parser which generated this node.
200
201
        errors (list):  A list of parser- or compiler-errors:
                tuple(position, string) attached to this node
Eckhart Arnold's avatar
Eckhart Arnold committed
202
203
204
205
206
207
        len (int):  The full length of the node's string result if the
                node is a leaf node or, otherwise, the concatenated
                string result's of its descendants. The figure always
                represents the length before AST-transformation and
                will never change through AST-transformation.
                READ ONLY!
208
        pos (int):  the position of the node within the parsed text.
209

210
211
212
                The value of ``pos`` is zero by default and is will be
                updated if the node or any of its parents is attached
                to a new parent.
Eckhart Arnold's avatar
Eckhart Arnold committed
213
214
215
216
217

                From the point of view of a client, this value should
                be considered READ ONLY. At any rate, it should never
                be reassigned only during parsing stage and never
                during or after AST-transformation.
218
219
    """

Eckhart Arnold's avatar
Eckhart Arnold committed
220
    def __init__(self, parser, result):
221
222
223
224
225
226
227
228
229
230
        """
        Initializes the ``Node``-object.

        Args:
            parser (Parser):  The `Parser`-instance which generated
                    this node
            result (Union):  The parsing result, which can either be a
                    string or a tuple of child nodes
        """
        self.result = result
Eckhart Arnold's avatar
Eckhart Arnold committed
231
        self.parser = parser or ZOMBIE_PARSER
232
        self._errors = []
233
        self.error_flag = any(r.error_flag for r in self.result) if self.children else False
Eckhart Arnold's avatar
Eckhart Arnold committed
234
235
        self._len = len(self.result) if not self.children else \
            sum(child._len for child in self.result)
Eckhart Arnold's avatar
Eckhart Arnold committed
236
237
238
239
240
241
242
        self.pos = 0

    def __str__(self):
        if self.children:
            return "".join([str(child) for child in self.result])
        return str(self.result)

243
    @property
244
    def tag(self):
245
246
        return self.parser.name or self.parser.__class__.__name__

Eckhart Arnold's avatar
Eckhart Arnold committed
247
248
249
250
251
252
    @property
    def result(self):
        return self._result

    @result.setter
    def result(self, result):
253
254
255
256
257
258
259
260
261
        assert ((isinstance(result, tuple) and all(isinstance(child, Node) for child in result))
                or isinstance(result, Node)
                or isinstance(result, str)), str(result)
        self._result = (result,) if isinstance(result, Node) else result or ''
        self._children = self._result if isinstance(self._result, tuple) else ()

    @property
    def children(self):
        return self._children
Eckhart Arnold's avatar
Eckhart Arnold committed
262

Eckhart Arnold's avatar
Eckhart Arnold committed
263
264
265
266
    @property
    def len(self):
        return self._len

Eckhart Arnold's avatar
Eckhart Arnold committed
267
268
269
270
271
272
273
274
275
276
    @property
    def pos(self):
        return self._pos

    @pos.setter
    def pos(self, pos):
        self._pos = pos
        offset = 0
        for child in self.children:
            child.pos = pos + offset
Eckhart Arnold's avatar
Eckhart Arnold committed
277
            offset += child.len
Eckhart Arnold's avatar
Eckhart Arnold committed
278

279
280
281
282
    @property
    def errors(self):
        return [Error(self.pos, err) for err in self._errors]

283
284
285
286
287
288
    def _tree_repr(self, tab, openF, closeF, dataF=lambda s: s):
        """
        Generates a tree representation of this node and its children
        in string from.

        This could be an XML-representation or a lisp-like
Eckhart Arnold's avatar
Eckhart Arnold committed
289
290
291
292
        S-expression. Exactly which form the tree representation takes is
        defined by the parameters of the function.

        Args:
293
294
295
296
297
298
299
300
301
            tab (str): The indentation string, e.g. '\t' or '    '
            openF (Node->str):  A function that returns an opening
                    string (e.g. an XML-tag) for a given node
            closeF (Node->str): A function that returns a closeF string
                    (e.g. an XML-tag) for a given node.
            dataF (str->str): A function that filters the data string
                    before printing, e.g. to add quotation marks
        Returns (str):
            A string that contains a (serialized) tree representation of the
Eckhart Arnold's avatar
Eckhart Arnold committed
302
303
304
305
306
307
308
309
310
311
312
313
314
315
            node and its children.
        """
        head = openF(self)
        tail = closeF(self)

        if not self.result:
            return head + tail

        head = head + '\n'        # place the head, tail and content
        tail = '\n' + tail        # of the node on different lines

        if self.children:
            content = []
            for child in self.result:
316
                subtree = child._tree_repr(tab, openF, closeF, dataF).split('\n')
Eckhart Arnold's avatar
Eckhart Arnold committed
317
318
319
320
321
322
323
                content.append('\n'.join((tab + s) for s in subtree))
            return head + '\n'.join(content) + tail

        return head + '\n'.join([tab + dataF(s)
                                 for s in str(self.result).split('\n')]) + tail

    def as_sexpr(self, src=None):
324
325
        """
        Returns content as S-expression, i.e. in lisp-like form.
Eckhart Arnold's avatar
Eckhart Arnold committed
326
327

        Args:
328
329
330
            src (str or None):  The source text or `None`. In case the
                    source text is given the position of the element
                    in the text will be reported as line and column.
Eckhart Arnold's avatar
Eckhart Arnold committed
331
332
        """
        def opening(node):
333
            s = '(' + node.tag
Eckhart Arnold's avatar
Eckhart Arnold committed
334
335
336
337
338
339
340
            # s += " '(pos %i)" % node.pos
            if src:
                s += " '(pos %i  %i %i)" % (node.pos, *line_col(src, node.pos))
            if node.errors:
                s += " '(err '(%s))" % ' '.join(str(err).replace('"', r'\"')
                                                for err in node.errors)
            return s
341
342
343
344
        def pretty(s):
            return '"%s"' % s if s.find('"') < 0 \
                else "'%s'" % s if s.find("'") < 0 \
                else '"%s"' % s.replace('"', r'\"')
345
        return self._tree_repr('    ', opening, lambda node: ')', pretty)
346

Eckhart Arnold's avatar
Eckhart Arnold committed
347
    def as_xml(self, src=None):
348
349
        """
        Returns content as XML-tree.
Eckhart Arnold's avatar
Eckhart Arnold committed
350
351

        Args:
352
353
354
            src (str):  The source text or `None`. In case the source
                    text is given the position will also be reported
                    as line and column.
Eckhart Arnold's avatar
Eckhart Arnold committed
355
356
357
        """

        def opening(node):
358
            s = '<' + node.tag
Eckhart Arnold's avatar
Eckhart Arnold committed
359
360
361
362
            # s += ' pos="%i"' % node.pos
            if src:
                s += ' line="%i" col="%i"' % line_col(src, node.pos)
            if node.errors:
363
                s += ' err="%s"' % ''.join(str(err).replace('"', r'\"') for err in node.errors)
Eckhart Arnold's avatar
Eckhart Arnold committed
364
365
366
367
            s += ">"
            return s

        def closing(node):
368
            s = '</' + node.tag + '>'
Eckhart Arnold's avatar
Eckhart Arnold committed
369
370
            return s

371
        return self._tree_repr('    ', opening, closing)
Eckhart Arnold's avatar
Eckhart Arnold committed
372

Eckhart Arnold's avatar
Eckhart Arnold committed
373
    def add_error(self, error_str):
374
        self._errors.append(error_str)
375
        self.error_flag = True
Eckhart Arnold's avatar
Eckhart Arnold committed
376
377
        return self

378
    def collect_errors(self, clear_errors=False):
379
380
        """
        Returns all errors of this node or any child node in the form
381
382
        of a set of tuples (position, error_message), where position
        is always relative to this node.
Eckhart Arnold's avatar
Eckhart Arnold committed
383
        """
384
385
386
387
388
389
390
391
392
393
        if self.error_flag:
            errors = self.errors
            if clear_errors:
                self._errors = []
                self.error_flag = False
            if self.children:
                for child in self.result:
                    errors.extend(child.collect_errors(clear_errors))
            return errors
        return []
Eckhart Arnold's avatar
Eckhart Arnold committed
394

395
    def navigate(self, path):
396
397
        """EXPERIMENTAL! NOT YET TESTED!!!
        Returns the first descendant element matched by `path`, e.g.
398
399
400
        'd/s' returns 'l' from (d (s l)(e (r x1) (r x2))
        'e/r' returns 'x2'
        'e'   returns (r x1)(r x2)
401
402
403
404
405
406
407

        Args:
            path (str): The path of the object, e.g. 'a/b/c'

        Returns:
            The object at the path, either a string or a Node or
            `None`, if the path did not match.
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
        """
        pl = path.strip('')
        assert pl[0] != '/', 'Path must noch start with "/"!'
        nd = self
        for p in pl:
            if isinstance(nd.result, str):
                return p if (p == nd.result) and (p == pl[-1]) else None
            for child in nd.result:
                if str(child) == p:
                    nd = child
                    break
            else:
                return None
        return child

Eckhart Arnold's avatar
Eckhart Arnold committed
423
424

def error_messages(text, errors):
425
426
427
428
429
    """
    Converts the list of ``errors`` collected from the root node of the
    parse tree of `text` into a human readable (and IDE or editor
    parsable text) with line an column numbers. Error messages are
    separated by an empty line.
Eckhart Arnold's avatar
Eckhart Arnold committed
430
431
    """
    return "\n\n".join("line: %i, column: %i, error: %s" %
432
                       (*line_col(text, err.pos), err.msg)
Eckhart Arnold's avatar
Eckhart Arnold committed
433
                       for err in sorted(list(errors)))
434
435
436
    # return "\n\n".join("line: %i, column: %i, error: %s" %
    #                    (*line_col(text, err[0]), " and ".join(err[1]))
    #                    for err in sorted(list(errors)))
Eckhart Arnold's avatar
Eckhart Arnold committed
437
438


439
440
441
# lambda compact_sexpr s : re.sub('\s(?=\))', '', re.sub('\s+', ' ', s)).strip()


442

Eckhart Arnold's avatar
Eckhart Arnold committed
443
444
445
446
447
448
449
########################################################################
#
# Abstract syntax tree support
#
########################################################################


450
451
452
453
454
455
456
457
def DEBUG_DUMP_SYNTAX_TREE(grammar_base, syntax_tree, ext):
    global DEBUG
    if DEBUG:
        st_file_name = DEBUG_FILE_NAME(grammar_base) + ext
        with open(os.path.join(DEBUG_DIR(), st_file_name), "w",  encoding="utf-8") as f:
            f.write(syntax_tree.as_sexpr())


Eckhart Arnold's avatar
Eckhart Arnold committed
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
def expand_table(compact_table):
    """Expands a table by separating keywords that are tuples or strings
    containing comma separated words into single keyword entries with
    the same values. Returns the expanded table.
    Example:
    expand_table({"a, b": 1, "b": 1, ('d','e','f'):5, "c":3}) yields
    {'a': 1, 'b': 1, 'c': 3, 'd': 5, 'e': 5, 'f': 5}
    """
    expanded_table = {}
    keys = list(compact_table.keys())
    for key in keys:
        value = compact_table[key]
        if isinstance(key, str):
            parts = (s.strip() for s in key.split(','))
        else:
            assert isinstance(key, collections.abc.Iterable)
            parts = key
        for p in parts:
            expanded_table[p] = value
    return expanded_table


def ASTTransform(node, transtable):
    """Transforms the parse tree starting with the given `node` into an
    abstract syntax tree by calling transformation functions registered in a
    transformation table.

    Args:
        node (Node): The root node of the parse tree (or sub-tree) to be
                transformed into the abstract syntax tree.
        transtable (dict): A dictionary that assigns a transformation
                transformation functions to parser name strings.
    """
    # normalize transformation entries by turning single transformations
    # into lists with a single item
    table = {name: transformation
             if isinstance(transformation, collections.abc.Sequence)
             else [transformation]
             for name, transformation in list(transtable.items())}
    table = expand_table(table)

    def recursive_ASTTransform(node):
        if node.children:
            for child in node.result:
                recursive_ASTTransform(child)
        transformation = table.get(node.parser.name,
                            table.get('~', [])) + table.get('*', [])
        for transform in transformation:
            transform(node)

    recursive_ASTTransform(node)


511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
# def preserve_errors(transformation):
#     """Wrapper that moves errors of child nodes that have been removed
#     after the application of function ``transformation()`` to the root
#     node. As an optimization, ``transformation()`` will only be called
#     if its ``node``-argument (i.e. the root-node) has children at all.
#     """
#     # requires nd.collect_errors() to return a set
#     @functools.wraps(transformation)
#     def preserve_errors_wrapper(*args, **kwds):
#         nd = kwds['node'] if 'node' in kwds else args[0]
#         if nd.children:
#             errors = nd.collect_errors()
#             transformation(*args, **kwds)
#             for err in errors - nd.collect_errors():
#                 nd.add_error(err[1], err[0])
#     return preserve_errors_wrapper


def no_transformation(node):
    pass


Eckhart Arnold's avatar
Eckhart Arnold committed
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
# ------------------------------------------------
#
# rearranging transformations:
#     - tree may be rearranged (flattened)
#     - order is preserved
#     - all leaves are kept
#
# ------------------------------------------------


def replace_by_single_child(node):
    """Remove single branch node, replacing it by its immediate descendant.
    (In case the descendant's name is empty (i.e. anonymous) the
    name of this node's parser is kept.)
    """
    if node.children and len(node.result) == 1:
        if not node.result[0].parser.name:
            node.result[0].parser.name = node.parser.name
        node.parser = node.result[0].parser
552
        node._errors.extend(node.result[0].errors)
Eckhart Arnold's avatar
Eckhart Arnold committed
553
554
555
556
557
558
559
560
        node.result = node.result[0].result


def reduce_single_child(node):
    """Reduce a single branch node, by transferring the result of its
    immediate descendant to this node, but keeping this node's parser entry.
    """
    if node.children and len(node.result) == 1:
561
        node._errors.extend(node.result[0].errors)
Eckhart Arnold's avatar
Eckhart Arnold committed
562
563
564
565
566
567
568
569
        node.result = node.result[0].result


# ------------------------------------------------
#
# destructive transformations:
#     - tree may be rearranged (flattened),
#     - order is preserved
570
571
#     - but (irrelevant) leaves may be dropped
#     - errors of dropped leaves will be lost
Eckhart Arnold's avatar
Eckhart Arnold committed
572
573
574
#
# ------------------------------------------------

575
576
577
578
579
580

def is_whitespace(node):
    return not node.result or (isinstance(node.result, str) and not node.result.strip())


def is_comment(node):
581
    return node.parser.name == WHITESPACE_KEYWORD
582
583
584
585
586
587
588
589


def is_scanner_token(node):
    return isinstance(node.parser, ScannerToken)


def is_expendable(node):
    return is_whitespace(node) or is_comment(node) or is_scanner_token(node)
Eckhart Arnold's avatar
Eckhart Arnold committed
590
591
592
593
594
595


def remove_children_if(node, condition):
    """Removes all nodes from the result field if the function `condition` evaluates
    to `True`."""
    if node.children:
596
        node.result = tuple(r for r in node.result if not condition(r))
Eckhart Arnold's avatar
Eckhart Arnold committed
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613


remove_whitespace = partial(remove_children_if, condition=is_whitespace)
remove_comments = partial(remove_children_if, condition=is_comment)
remove_scanner_tokens = partial(remove_children_if, condition=is_scanner_token)
remove_expendables = partial(remove_children_if, condition=is_expendable)


def flatten(node):
    """Recursively flattens all unnamed sub-nodes, in case there is more
    than one sub-node present. Flattening means that
    wherever a node has child nodes, the child nodes are inserted in place
    of the node. In other words, all leaves of this node and its child nodes
    are collected in-order as direct children of this node.
    This is meant to achieve the following structural transformation:
    X (+ Y + Z)  ->   X + Y + Z
    """
614
    if node.children:
Eckhart Arnold's avatar
Eckhart Arnold committed
615
        new_result = []
616
        for child in node.children:
Eckhart Arnold's avatar
Eckhart Arnold committed
617
618
619
620
621
622
623
624
625
            if not child.parser.name and child.children:
                assert child.children, node.as_sexpr()
                flatten(child)
                new_result.extend(child.result)
            else:
                new_result.append(child)
        node.result = tuple(new_result)


626
def remove_tokens(node, tokens=set()):
Eckhart Arnold's avatar
Eckhart Arnold committed
627
    """Reomoves any among a particular set of tokens from the immediate
628
629
    descendants of a node. If ``tokens`` is the empty set, all tokens
    are removed.
Eckhart Arnold's avatar
Eckhart Arnold committed
630
631
632
    """
    if node.children:
        if tokens:
633
            node.result = tuple(child for child in node.children
Eckhart Arnold's avatar
Eckhart Arnold committed
634
635
636
                                if child.parser.name != TOKEN_KEYWORD or
                                child.result not in tokens)
        else:
637
            node.result = tuple(child for child in node.children
Eckhart Arnold's avatar
Eckhart Arnold committed
638
639
640
641
642
643
644
                                if child.parser.name != TOKEN_KEYWORD)


def remove_enclosing_delimiters(node):
    """Removes the enclosing delimiters from a structure (e.g. quotation marks
    from a literal or braces from a group).
    """
645
646
647
    if len(node.children) >= 3:
        assert isinstance(node.children[0].result, str) and \
               isinstance(node.children[-1].result, str), node.as_sexpr()
Eckhart Arnold's avatar
Eckhart Arnold committed
648
649
650
651
652
653
654
655
656
657
658
659
660
        node.result = node.result[1:-1]


AST_SYMBOLS = {'replace_by_single_child', 'reduce_single_child',
               'no_transformation', 'remove_children_if', 'is_whitespace',
               'is_comment', 'is_scanner_token', 'is_expendable',
               'remove_whitespace', 'remove_comments',
               'remove_scanner_tokens', 'remove_expendables', 'flatten',
               'remove_tokens', 'remove_enclosing_delimiters',
               'TOKEN_KEYWORD', 'WHITESPACE_KEYWORD', 'partial'}



661
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
662
663
664
#
# Parser base classes
#
665
666
########################################################################

Eckhart Arnold's avatar
Eckhart Arnold committed
667

668
669
LEFT_RECURSION_DEPTH = 10   # because of pythons recursion depth limit, this
                            # value ought not to be set too high
670
MAX_DROPOUTS = 25   # stop trying to recover parsing after so many errors
Eckhart Arnold's avatar
Eckhart Arnold committed
671

672
WHITESPACE_KEYWORD = 'wsp__'
Eckhart Arnold's avatar
Eckhart Arnold committed
673
674
675
TOKEN_KEYWORD = 'token__'


676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
class HistoryRecord:
    __slots__ = ('call_stack', 'node', 'remaining')

    MATCH = "MATCH"
    ERROR = "ERROR"
    FAIL = "FAIL"

    def __init__(self, call_stack, node, remaining):
        self.call_stack = call_stack
        self.node = node
        self.remaining = remaining

    @property
    def stack(self):
        return "->".join(str(parser) for parser in self.call_stack)

    @property
    def status(self):
        return self.FAIL if self.node is None else self.ERROR if self.node._errors else self.MATCH

    @property
    def extent(self):
        return ((-self.remaining - self.node.len, -self.remaining) if self.node
                else (-self.remaining, None))


def DEBUG_DUMP_PARSING_HISTORY(grammar_base, document):
    def prepare_line(record):
        excerpt = document.__getitem__(slice(*record.extent))[:25].replace('\n','\\n')
        excerpt = "'%s'" % excerpt if len(excerpt) < 25 else "'%s...'" % excerpt
        return (record.stack, record.status, excerpt)

    def write_log(history, log_name):
        path = os.path.join(DEBUG_DIR(), DEBUG_FILE_NAME(grammar_base) + log_name + "_parser.log")
        if history:
            with open(path, "w", encoding="utf-8") as f:
                f.write("\n".join(history))
        elif os.path.exists(path):
            os.remove(path)

    global DEBUG
    if DEBUG:
        full_history, match_history, errors_only = [], [], []
        for record in grammar_base.history:
            line = ";  ".join(prepare_line(record))
            full_history.append(line)
722
            if record.node and record.node.parser.name != WHITESPACE_KEYWORD:
723
724
725
726
727
728
729
730
731
732
733
734
735
736
                match_history.append(line)
                if record.node.errors:
                    errors_only.append(line)
        write_log(full_history, '_full')
        write_log(match_history, '_match')
        write_log(errors_only, '_errors')
        # hist = [";  ".join(prepare_line(r)) for r in grammar_base.history]
        # lines = [prepare_line(r) for r in grammar_base.history]
        # n = max(len(line[0]) for line in lines)
        # hist = ["    ".join((l[0] + ' ' * (n - len(l[0])), l[1], l[2])) for l in lines]


def add_parser_guard(parser_func):
    def guarded_call(parser, text):
Eckhart Arnold's avatar
Eckhart Arnold committed
737
738
739
740
741
742
743
        try:
            location = len(text)
            # if location has already been visited by the current parser,
            # return saved result
            if location in parser.visited:
                return parser.visited[location]
            # break left recursion at the maximum allowed depth
744
            if parser.recursion_counter.setdefault(location, 0) > LEFT_RECURSION_DEPTH:
Eckhart Arnold's avatar
Eckhart Arnold committed
745
746
747
                return None, text

            parser.recursion_counter[location] += 1
748
            grammar = parser.grammar
Eckhart Arnold's avatar
Eckhart Arnold committed
749

750
751
752
            if grammar.track_history:
                grammar.call_stack.append(parser)
                grammar.moving_forward = True
Eckhart Arnold's avatar
Eckhart Arnold committed
753
754

            # run original __call__ method
755
756
757
758
759
760
761
762
763
764
            node, rest = parser_func(parser, text)

            if grammar.track_history:
                if grammar.moving_forward:  # and result[0] == None
                    grammar.moving_forward = False
                    record = HistoryRecord(grammar.call_stack.copy(), node, len(rest))
                    grammar.history.append(record)
                grammar.call_stack.pop()

            if node is not None:
Eckhart Arnold's avatar
Eckhart Arnold committed
765
766
                # in case of a recursive call saves the result of the first
                # (or left-most) call that matches
767
768
                parser.visited[location] = (node, rest)
                grammar.last_node = node
Eckhart Arnold's avatar
Eckhart Arnold committed
769
770
771
            elif location in parser.visited:
                # if parser did non match but a saved result exits, assume
                # left recursion and use the saved result
772
                node, rest = parser.visited[location]
Eckhart Arnold's avatar
Eckhart Arnold committed
773
774
775
776
777
778
779

            parser.recursion_counter[location] -= 1

        except RecursionError:
            node = Node(None, text[:min(10, max(1, text.find("\n")))] + " ...")
            node.add_error("maximum recursion depth of parser reached; "
                           "potentially due to too many errors!")
780
            node.error_flag = True
781
            rest = ''
Eckhart Arnold's avatar
Eckhart Arnold committed
782

783
        return node, rest
Eckhart Arnold's avatar
Eckhart Arnold committed
784

785
    return guarded_call
Eckhart Arnold's avatar
Eckhart Arnold committed
786
787
788


class ParserMetaClass(type):
789
    def __init__(cls, name, bases, attrs):
Eckhart Arnold's avatar
Eckhart Arnold committed
790
791
792
        # The following condition is necessary for classes that don't override
        # the __call__() method, because in these cases the non-overridden
        # __call__()-method would be substituted a second time!
793
794
795
        guarded_parser_call = add_parser_guard(cls.__call__)
        if cls.__call__.__code__ != guarded_parser_call.__code__:
            cls.__call__ = guarded_parser_call
796
        super(ParserMetaClass, cls).__init__(name, bases, attrs)
Eckhart Arnold's avatar
Eckhart Arnold committed
797

798
799

def sane_parser_name(name):
800
    """Checks whether given name is an acceptable parser name. Parser names
801
802
803
804
805
    must not be preceeded or succeeded by a double underscore '__'!
    """
    return name and name[:2] != '__' and name[-2:] != '__'


Eckhart Arnold's avatar
Eckhart Arnold committed
806
class Parser(metaclass=ParserMetaClass):
807
808
809
    def __init__(self, name=None):
        assert name is None or isinstance(name, str), str(name)
        self.name = name or ''
810
        self.grammar = None          # center for global variables etc.
811
812
813
        self.reset()

    def reset(self):
Eckhart Arnold's avatar
Eckhart Arnold committed
814
815
816
817
818
        self.visited = dict()
        self.recursion_counter = dict()
        self.cycle_detection = set()

    def __call__(self, text):
819
        return None, text               # default behaviour: don't match
Eckhart Arnold's avatar
Eckhart Arnold committed
820
821

    def __str__(self):
822
        return self.name or self.__class__.__name__
Eckhart Arnold's avatar
Eckhart Arnold committed
823

824
    @property
825
826
    def grammar(self):
        return self._grammar
827

828
829
830
831
    @grammar.setter
    def grammar(self, grammar_base):
        self._grammar = grammar_base
        self._grammar_assigned_notifier()
832

833
    def _grammar_assigned_notifier(self):
834
835
        pass

Eckhart Arnold's avatar
Eckhart Arnold committed
836
837
    def apply(self, func):
        """Applies function `func(parser)` recursively to this parser and all
838
839
        descendendants of the tree of parsers. The same function can never
        be applied twice between calls of the ``reset()``-method!
Eckhart Arnold's avatar
Eckhart Arnold committed
840
841
842
843
844
845
846
847
848
        """
        if func in self.cycle_detection:
            return False
        else:
            self.cycle_detection.add(func)
            func(self)
            return True


849
class GrammarBase:
850
851
    root__ = None   # should be overwritten by grammar subclass

852
    @classmethod
853
    def _assign_parser_names(cls):
854
        """Initializes the `parser.name` fields of those
855
856
        Parser objects that are directly assigned to a class field with
        the field's name, e.g.
857
            class Grammar(GrammarBase):
858
859
                ...
                symbol = RE('(?!\\d)\\w+')
860
        After the call of this method symbol.name == "symbol"
861
862
        holds. Names assigned via the `name`-parameter of the
        constructor will not be overwritten.
863
        """
864
        if cls.parser_initialization__ == "done":
865
            return
866
        cdict = cls.__dict__
867
868
869
        for entry, parser in cdict.items():
            if isinstance(parser, Parser):
                if not parser.name or parser.name == TOKEN_KEYWORD:
Eckhart Arnold's avatar
Eckhart Arnold committed
870
                    parser.name = entry
871
872
                if (isinstance(parser, Forward) and (not parser.parser.name
                    or parser.parser.name == TOKEN_KEYWORD)):
873
                    parser.parser.name = entry
874
        cls.parser_initialization__ = "done"
875
876

    def __init__(self):
877
878
        self.all_parsers = set()
        self.dirty_flag = False
879
        self.track_history = DEBUG
880
        self._reset()
881
        self._assign_parser_names()
882
        self.root__ = copy.deepcopy(self.__class__.root__)
883
884
        if self.wspL__:
            self.wsp_left_parser__ = RegExp(self.wspL__, WHITESPACE_KEYWORD)
885
            self.wsp_left_parser__.grammar = self
886
        else:
887
            self.wsp_left_parser__ = ZOMBIE_PARSER
888
889
        if self.wspR__:
            self.wsp_right_parser__ = RegExp(self.wspR__, WHITESPACE_KEYWORD)
890
            self.wsp_right_parser__.grammar = self
891
        else:
892
            self.wsp_right_parser__ = ZOMBIE_PARSER
Eckhart Arnold's avatar
Eckhart Arnold committed
893
        self.root__.apply(self._add_parser)
894
895

    def _reset(self):
896
        self.variables = dict()                 # support for Pop and Retrieve operators
Eckhart Arnold's avatar
Eckhart Arnold committed
897
        self.last_node = None
898
        self.call_stack = []                    # support for call stack tracing
899
        self.history = []                       # snapshots of call stacks
900
        self.moving_forward = True              # also needed for call stack tracing
Eckhart Arnold's avatar
Eckhart Arnold committed
901
902

    def _add_parser(self, parser):
903
        """Adds the copy of the classes parser object to this
904
        particular instance of GrammarBase.
Eckhart Arnold's avatar
Eckhart Arnold committed
905
        """
906
907
        setattr(self, parser.name, parser)
        self.all_parsers.add(parser)
908
        parser.grammar = self
Eckhart Arnold's avatar
Eckhart Arnold committed
909
910
911
912
913
914
915
916
917

    def parse(self, document):
        """Parses a document with with parser-combinators.

        Args:
            document (str): The source text to be parsed.
        Returns:
            Node: The root node ot the parse tree.
        """
918
919
        if self.root__ is None:
            raise NotImplementedError()
920
        if self.dirty_flag:
921
            self._reset()
922
923
924
925
            for parser in self.all_parsers:
                parser.reset()
        else:
            self.dirty_flag = True
Eckhart Arnold's avatar
Eckhart Arnold committed
926
        parser = self.root__
927
        result = ""
928
        stitches = []
Eckhart Arnold's avatar
Eckhart Arnold committed
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
        rest = document
        while rest and len(stitches) < MAX_DROPOUTS:
            result, rest = parser(rest)
            if rest:
                fwd = rest.find("\n") + 1 or len(rest)
                skip, rest = rest[:fwd], rest[fwd:]
                if result is None:
                    error_msg = "Parser did not match! Invalid source file?"
                else:
                    stitches.append(result)
                    error_msg = "Parser stopped before end" + \
                                ("! trying to recover..."
                                 if len(stitches) < MAX_DROPOUTS
                                 else " too often! Terminating parser.")
                stitches.append(Node(None, skip))
                stitches[-1].add_error(error_msg)
        if stitches:
            if result and stitches[-1] != result:
                stitches.append(result)
            if rest:
                stitches.append(Node(None, rest))
        return result if not stitches else Node(None, tuple(stitches))


953

954
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
955
956
957
#
# Token and Regular Expression parser classes (i.e. leaf classes)
#
958
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981


class ScannerToken(Parser):
    def __init__(self, scanner_token):
        assert isinstance(scanner_token, str) and scanner_token and \
               scanner_token.isupper()
        assert RX_SCANNER_TOKEN.match(scanner_token)
        super(ScannerToken, self).__init__(scanner_token)

    def __call__(self, text):
        if text[0:1] == BEGIN_SCANNER_TOKEN:
            end = text.find(END_SCANNER_TOKEN, 1)
            if end < 0:
                node = Node(self, '').add_error(
                    'END_SCANNER_TOKEN delimiter missing from scanner token. '
                    '(Most likely due to a scanner bug!)')
                return node, text[1:]
            elif end == 0:
                node = Node(self, '').add_error(
                    'Scanner token cannot have zero length. '
                    '(Most likely due to a scanner bug!)')
                return node, text[2:]
            elif text.find(BEGIN_SCANNER_TOKEN, 1, end) >= 0:
982
                node = Node(self, text[len(self.name) + 1:end])
Eckhart Arnold's avatar
Eckhart Arnold committed
983
984
985
986
987
                node.add_error(
                    'Scanner tokens must not be nested or contain '
                    'BEGIN_SCANNER_TOKEN delimiter as part of their argument. '
                    '(Most likely due to a scanner bug!)')
                return node, text[end:]
988
989
            if text[1:len(self.name) + 1] == self.name:
                return Node(self, text[len(self.name) + 1:end]), \
Eckhart Arnold's avatar
Eckhart Arnold committed
990
991
992
993
994
                       text[end + 1:]
        return None, text


class RegExp(Parser):
995
    def __init__(self, regexp, name=None):
996
        super(RegExp, self).__init__(name)
Eckhart Arnold's avatar
Eckhart Arnold committed
997
998
999
1000
1001
1002
1003
1004
        self.regexp = re.compile(regexp) if isinstance(regexp, str) else regexp

    def __deepcopy__(self, memo):
        # this method is obsolete with the new `regex` module!
        try:
            regexp = copy.deepcopy(self.regexp)
        except TypeError:
            regexp = self.regexp.pattern
1005
        duplicate = RegExp(self.name, regexp)
1006
        duplicate.name = self.name  # this ist needed!!!!
1007
        duplicate.regexp = self.regexp
1008
        duplicate.grammar = self.grammar
Eckhart Arnold's avatar
Eckhart Arnold committed
1009
1010
1011
1012
1013
1014
1015
1016
1017
        duplicate.visited = copy.deepcopy(self.visited, memo)
        duplicate.recursion_counter = copy.deepcopy(self.recursion_counter,
                                                    memo)
        return duplicate

    def __call__(self, text):
        match = text[0:1] != BEGIN_SCANNER_TOKEN and self.regexp.match(text)  # ESC starts a scanner token.
        if match:
            end = match.end()
1018
1019
1020
1021
1022
            return Node(self, text[:end]), text[end:]
        return None, text


class RE(Parser):
Eckhart Arnold's avatar
Eckhart Arnold committed
1023
1024
    """Regular Expressions with optional leading or trailing whitespace.
    """
1025
    def __init__(self, regexp, wL=None, wR=None, name=None):
1026
        super(RE, self).__init__(name)
1027
        # assert wR or regexp == '.' or isinstance(self, Token)
1028
1029
        self.wL = wL
        self.wR = wR
1030
1031
        self.wspLeft = RegExp(wL, WHITESPACE_KEYWORD) if wL else ZOMBIE_PARSER
        self.wspRight = RegExp(wR, WHITESPACE_KEYWORD) if wR else ZOMBIE_PARSER
1032
1033
1034
1035
1036
        self.main = RegExp(regexp)

    def __call__(self, text):
        # assert self.main.regexp.pattern != "@"
        t = text
1037
        wL, t = self.wspLeft(t)
1038
1039
        main, t = self.main(t)
        if main:
1040
            wR, t = self.wspRight(t)
1041
1042
            result = tuple(nd for nd in (wL, main, wR)
                           if nd and nd.result != '')
1043
            return Node(self, result), t
Eckhart Arnold's avatar
Eckhart Arnold committed
1044
1045
1046
        return None, text

    def __str__(self):
1047
1048
1049
1050
        if self.name == TOKEN_KEYWORD:
            return 'Token "%s"' % self.main.regexp.pattern.replace('\\', '')
        return self.name or ('RE ' + ('~' if self.wL else '')
                             + '/%s/' % self.main.regexp.pattern + ('~' if self.wR else ''))
1051

1052
1053
    def _grammar_assigned_notifier(self):
        if self.grammar:
1054
            if self.wL is None:
1055
                self.wspLeft = self.grammar.wsp_left_parser__
1056
            if self.wR is None:
1057
                self.wspRight = self.grammar.wsp_right_parser__
1058

1059
1060
1061
    def apply(self, func):
        if super(RE, self).apply(func):
            if self.wL:
1062
                self.wspLeft.apply(func)
1063
            if self.wR:
1064
                self.wspRight.apply(func)
1065
            self.main.apply(func)
Eckhart Arnold's avatar
Eckhart Arnold committed
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076


def escape_re(s):
    """Returns `s` with all regular expression special characters escaped.
    """
    assert isinstance(s, str)
    re_chars = r"\.^$*+?{}[]()#<>=|!"
    for esc_ch in re_chars:
        s = s.replace(esc_ch, '\\' + esc_ch)
    return s

1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
#
# class Token(RE):
#     def __init__(self, token, wL=None, wR=None, name=None):
#         super(Token, self).__init__(escape_re(token), wL, wR, name or TOKEN_KEYWORD)
#
#     def __str__(self):
#         return self.name or 'Token "%s"' % self.main.regexp.pattern.replace('\\', '')

def Token(token, wL=None, wR=None, name=None):
    return RE(escape_re(token), wL, wR, name or TOKEN_KEYWORD)

Eckhart Arnold's avatar
Eckhart Arnold committed
1088
1089
1090
1091
1092
1093
1094
1095
1096

def mixin_comment(whitespace, comment):
    """Mixes comment-regexp into whitespace regexp.
    """
    wspc = '(?:' + whitespace + '(?:' + comment + whitespace + ')*)'
    return wspc



1097
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
1098
1099
1100
#
# Combinator parser classes (i.e. trunk classes of the parser tree)
#
1101
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
1102
1103
1104


class UnaryOperator(Parser):
1105
1106
    def __init__(self, parser, name=None):
        super(UnaryOperator, self).__init__(name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1107
1108
1109
1110
1111
1112
1113
1114
1115
        assert isinstance(parser, Parser)
        self.parser = parser

    def apply(self, func):
        if super(UnaryOperator, self).apply(func):
            self.parser.apply(func)


class NaryOperator(Parser):
1116
1117
    def __init__(self, *parsers, name=None):
        super(NaryOperator, self).__init__(name)
1118
        assert all([isinstance(parser, Parser) for parser in parsers]), str(parsers)
Eckhart Arnold's avatar
Eckhart Arnold committed
1119
1120
1121
1122
1123
1124
1125
1126
1127
        self.parsers = parsers

    def apply(self, func):
        if super(NaryOperator, self).apply(func):
            for parser in self.parsers:
                parser.apply(func)


class Optional(UnaryOperator):
1128
1129
    def __init__(self, parser, name=None):
        super(Optional, self).__init__(parser, name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1130
1131
        assert isinstance(parser, Parser)
        assert not isinstance(parser, Optional), \
1132
            "Nesting options would be redundant: %s(%s)" % \
1133
            (str(name), str(parser.name))
Eckhart Arnold's avatar
Eckhart Arnold committed
1134
        assert not isinstance(parser, Required), \
1135
            "Nestion options with required elements is contradictory: " \
1136
            "%s(%s)" % (str(name), str(parser.name))
Eckhart Arnold's avatar
Eckhart Arnold committed
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156

    def __call__(self, text):
        node, text = self.parser(text)
        if node:
            return Node(self, node), text
        return Node(self, ()), text


class ZeroOrMore(Optional):
    def __call__(self, text):
        results = ()
        while text:
            node, text = self.parser(text)
            if not node:
                break
            results += (node,)
        return Node(self, results), text


class OneOrMore(UnaryOperator):
1157
1158
    def __init__(self, parser, name=None):
        super(OneOrMore, self).__init__(parser, name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1159
        assert not isinstance(parser, Optional), \
1160
            "Use ZeroOrMore instead of nesting OneOrMore and Optional: " \
1161
            "%s(%s)" % (str(name), str(parser.name))
Eckhart Arnold's avatar
Eckhart Arnold committed
1162
1163
1164
1165
1166
1167
1168
1169
1170

    def __call__(self, text):
        results = ()
        text_ = text
        while text_:
            node, text_ = self.parser(text_)
            if not node:
                break
            results += (node,)
1171
        if results == ():
Eckhart Arnold's avatar
Eckhart Arnold committed
1172
1173
1174
1175
1176
            return None, text
        return Node(self, results), text_


class Sequence(NaryOperator):
1177
1178
    def __init__(self, *parsers, name=None):
        super(Sequence, self).__init__(*parsers, name=name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
        assert len(self.parsers) >= 1
        # commented, because sequences can be empty:
        # assert not all(isinstance(p, Optional) for p in self.parsers)

    def __call__(self, text):
        results = ()
        text_ = text
        for parser in self.parsers:
            node, text_ = parser(text_)
            if not node:
                return node, text
            if node.result:  # Nodes with zero-length result are silently omitted
                results += (node,)
1192
            if node.error_flag:
Eckhart Arnold's avatar
Eckhart Arnold committed
1193
1194
1195
1196
1197
1198
                break
        assert len(results) <= len(self.parsers)
        return Node(self, results), text_


class Alternative(NaryOperator):
1199
1200
    def __init__(self, *parsers, name=None):
        super(Alternative, self).__init__(*parsers, name=name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
        assert len(self.parsers) >= 1
        assert all(not isinstance(p, Optional) for p in self.parsers)

    def __call__(self, text):
        for parser in self.parsers:
            node, text_ = parser(text)
            if node:
                return Node(self, node), text_
        return None, text


1212
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
1213
1214
1215
#
# Flow control operators
#
1216
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
1217
1218
1219


class FlowOperator(UnaryOperator):
1220
1221
    def __init__(self, parser, name=None):
        super(FlowOperator, self).__init__(parser, name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1222
1223
1224


class Required(FlowOperator):
1225
    # TODO: Add constructor that checks for logical errors, like `Required(Optional(...))` constructs
Eckhart Arnold's avatar
Eckhart Arnold committed
1226
1227
1228
1229
1230
1231
1232
    def __call__(self, text):
        node, text_ = self.parser(text)
        if not node:
            m = re.search(r'\s(\S)', text)
            i = max(1, m.regs[1][0]) if m else 1
            node = Node(self, text[:i])
            text_ = text[i:]
Eckhart Arnold's avatar
Eckhart Arnold committed
1233
            # assert False, "*"+text[:i]+"*"
Eckhart Arnold's avatar
Eckhart Arnold committed
1234
1235
1236
1237
1238
1239
            node.add_error('%s expected; "%s..." found!' %
                           (str(self.parser), text[:10]))
        return node, text_


class Lookahead(FlowOperator):
1240
1241
    def __init__(self, parser, name=None):
        super(Lookahead, self).__init__(parser, name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1242
1243
1244
1245
1246
1247
1248
1249

    def __call__(self, text):
        node, text_ = self.parser(text)
        if self.sign(node is not None):
            return Node(self, ''), text
        else:
            return None, text

1250
1251
    def sign(self, bool_value):
        return bool_value
Eckhart Arnold's avatar
Eckhart Arnold committed
1252
1253


1254
1255
1256
class NegativeLookahead(Lookahead):
    def sign(self, bool_value):
        return not bool_value
Eckhart Arnold's avatar
Eckhart Arnold committed
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271


def iter_right_branch(node):
    """Iterates over the right branch of `node` starting with node itself.
    Iteration is stopped if either there are no child nodes any more or
    if the parser of a node is a Lookahead parser. (Reason is: Since
    lookahead nodes do not advance the parser, it does not make sense
    to look back to them.)
    """
    while node and not isinstance(node.parser, Lookahead):  # the second condition should not be necessary
        yield node                                          # for well-formed EBNF code
        node = node.children[-1] if node.children else None


class Lookbehind(FlowOperator):
1272
1273
    def __init__(self, parser, name=None):
        super(Lookbehind, self).__init__(parser, name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1274
        print("WARNING: Lookbehind Operator is experimental!")
Eckhart Arnold's avatar
Eckhart Arnold committed
1275
1276

    def __call__(self, text):
1277
        if isinstance(self.grammar.last_node, Lookahead):
Eckhart Arnold's avatar
Eckhart Arnold committed
1278
1279
1280
1281
1282
1283
1284
            return Node(self, '').add_error('Lookbehind right after Lookahead '
                                            'does not make sense!'), text
        if self.sign(self.condition()):
            return Node(self, ''), text
        else:
            return None, text