ParserCombinators_obsolete.py 80.6 KB
Newer Older
Eckhart Arnold's avatar
Eckhart Arnold committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#!/usr/bin/python3

"""ParserCombinators.py - parser combinators for left-recursive grammers

Copyright 2016  by Eckhart Arnold

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
15
16
17
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
Eckhart Arnold's avatar
Eckhart Arnold committed
18
19


20
21
22
23
24
Module parser_combinators contains a number of classes that together
make up parser combinators for left-recursive grammers. For each
element of the extended Backus-Naur-Form as well as for a regular
expression token a class is defined. The set of classes can be used to
define a parser for (ambiguous) left-recursive grammers.
Eckhart Arnold's avatar
Eckhart Arnold committed
25
26
27
28
29


References and Acknowledgements:

Dominikus Herzberg: Objekt-orientierte Parser-Kombinatoren in Python,
30
31
Blog-Post, September, 18th 2008 on denkspuren. gedanken, ideen,
anregungen und links rund um informatik-themen, URL:
Eckhart Arnold's avatar
Eckhart Arnold committed
32
33
http://denkspuren.blogspot.de/2008/09/objekt-orientierte-parser-kombinatoren.html

34
35
36
Dominikus Herzberg: Eine einfache Grammatik für LaTeX, Blog-Post,
September, 18th 2008 on denkspuren. gedanken, ideen, anregungen und
links rund um informatik-themen, URL:
Eckhart Arnold's avatar
Eckhart Arnold committed
37
38
39
http://denkspuren.blogspot.de/2008/09/eine-einfache-grammatik-fr-latex.html

Dominikus Herzberg: Uniform Syntax, Blog-Post, February, 27th 2007 on
40
41
denkspuren. gedanken, ideen, anregungen und links rund um
informatik-themen, URL:
Eckhart Arnold's avatar
Eckhart Arnold committed
42
43
http://denkspuren.blogspot.de/2007/02/uniform-syntax.html

44
45
46
47
Richard A. Frost, Rahmatullah Hafiz and Paul Callaghan: Parser
Combinators for Ambiguous Left-Recursive Grammars, in: P. Hudak and
D.S. Warren (Eds.): PADL 2008, LNCS 4902, pp. 167–181, Springer-Verlag
Berlin Heidelberg 2008.
Eckhart Arnold's avatar
Eckhart Arnold committed
48
49
50

Juancarlo Añez: grako, a PEG parser generator in Python,
https://bitbucket.org/apalala/grako
51

Eckhart Arnold's avatar
Eckhart Arnold committed
52
53
"""

54
# TODO: Replace copy.deepcopy() call in Grammar class by custom copy()-methods in the Parser classes. Is that really better?
Eckhart Arnold's avatar
Eckhart Arnold committed
55
56
57
58


import collections
import copy
59
import hashlib
Eckhart Arnold's avatar
Eckhart Arnold committed
60
61
import keyword
import os
62
63
from functools import partial

64
from typing import NamedTuple
65

Eckhart Arnold's avatar
Eckhart Arnold committed
66
67
68
69
try:
    import regex as re
except ImportError:
    import re
Eckhart Arnold's avatar
Eckhart Arnold committed
70
71
72
import sys


Eckhart Arnold's avatar
Eckhart Arnold committed
73
__version__ = '0.5.3' + '_dev' + str(os.stat(__file__).st_mtime)
74

75
LOGGING = "LOGS"
76

Eckhart Arnold's avatar
Eckhart Arnold committed
77

78
79
80
def LOGS_DIR():
    """Returns a path of a directory where log files will be stored.
    Usually, this is just a sub-directory named 'LOGS'. The directory
Eckhart Arnold's avatar
Eckhart Arnold committed
81
82
    will be created if it does not exist.
    """
83
84
85
86
87
88
89
    global LOGGING
    if not LOGGING:
        raise AssertionError("Cannot use LOGGING_DIR() if LOGGINGging is turned off!")
    dirname = LOGGING
    if os.path.exists(LOGGING):
        if not os.path.isdir(LOGGING):
            raise IOError('"' + LOGGING + '" cannot be used as log directory, '
Eckhart Arnold's avatar
Eckhart Arnold committed
90
91
                          'because it is not a directory!')
    else:
92
        os.mkdir(LOGGING)
93
    return dirname
Eckhart Arnold's avatar
Eckhart Arnold committed
94
95


96
97

########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
98
#
99
# syntax tree
Eckhart Arnold's avatar
Eckhart Arnold committed
100
#
101
102
########################################################################

Eckhart Arnold's avatar
Eckhart Arnold committed
103
104

def line_col(text, pos):
Eckhart Arnold's avatar
Eckhart Arnold committed
105
    """Returns the position within a text as (line, column)-tuple.
106
    """
107
    assert pos < len(text), str(pos) + " >= " + str(len(text))
Eckhart Arnold's avatar
Eckhart Arnold committed
108
109
110
111
112
    line = text.count("\n", 0, pos) + 1
    column = pos - text.rfind("\n", 0, pos)
    return line, column


113
class ZombieParser:
Eckhart Arnold's avatar
Eckhart Arnold committed
114
115
116
117
118
119
120
121
    """
    Serves as a substitute for a Parser instance.

    ``ZombieParser`` is the class of the singelton object
    ``ZOMBIE_PARSER``. The  ``ZOMBIE_PARSER`` has a name and can be
    called, but it never matches. It serves as a substitute where only
    these (or one of these properties) is needed, but no real Parser-
    object is instantiated.
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
    """
    alive = False
    def __init__(self):
        assert not self.__class__.alive, "There can be only one!"
        assert self.__class__ == ZombieParser, "No derivatives, please!"
        self.name = "ZOMBIE"
        self.__class__.alive = True

    def __str__(self):
        return self.name

    def __call__(self, text):
        """Better call Saul ;-)"""
        return None, text


ZOMBIE_PARSER = ZombieParser()


class Error(NamedTuple):
    pos: int
    msg: str
144
145


Eckhart Arnold's avatar
Eckhart Arnold committed
146
class Node:
147
148
149
150
    """
    Represents a node in the concrete or abstract syntax tree.

    Attributes:
Eckhart Arnold's avatar
Eckhart Arnold committed
151
152
        tag_name (str):  The name of the node, which is either its
            parser's name or, if that is empty, the parser's class name
153
        result (str or tuple):  The result of the parser which
Eckhart Arnold's avatar
Eckhart Arnold committed
154
155
            generated this node, which can be either a string or a
            tuple of child nodes.
156
        children (tuple):  The tuple of child nodes or an empty tuple
Eckhart Arnold's avatar
Eckhart Arnold committed
157
            if there are no child nodes. READ ONLY!
158
        parser (Parser):  The parser which generated this node.
159
        errors (list):  A list of parser- or compiler-errors:
Eckhart Arnold's avatar
Eckhart Arnold committed
160
            tuple(position, string) attached to this node
Eckhart Arnold's avatar
Eckhart Arnold committed
161
        len (int):  The full length of the node's string result if the
Eckhart Arnold's avatar
Eckhart Arnold committed
162
163
164
165
            node is a leaf node or, otherwise, the concatenated string
            result's of its descendants. The figure always represents
            the length before AST-transformation ans will never change
            through AST-transformation. READ ONLY!
166
        pos (int):  the position of the node within the parsed text.
167

168
169
170
171
172
173
174
175
176
177
178
179
            The value of ``pos`` is -1 meaning invalid by default. 
            Setting this value will set the positions of all child
            nodes relative to this value.  

            To set the pos values of all nodes in a syntax tree, the
            pos value of the root node should be set to 0 right 
            after parsing.

            Other than that, this value should be considered READ ONLY. 
            At any rate, it should only be reassigned only during
            parsing stage and never during or after the
            AST-transformation.
180
181
    """

Eckhart Arnold's avatar
Eckhart Arnold committed
182
    def __init__(self, parser, result):
Eckhart Arnold's avatar
Eckhart Arnold committed
183
184
        """Initializes the ``Node``-object with the ``Parser``-Instance
        that generated the node and the parser's result.
185
186
        """
        self.result = result
Eckhart Arnold's avatar
Eckhart Arnold committed
187
        self.parser = parser or ZOMBIE_PARSER
188
        self._errors = []
189
        self.error_flag = any(r.error_flag for r in self.result) if self.children else False
Eckhart Arnold's avatar
Eckhart Arnold committed
190
        self._len = len(self.result) if not self.children else \
191
            sum(child._len for child in self.children)
192
        # self.pos = 0  # coninuous updating of pos values
193
        self._pos = -1
Eckhart Arnold's avatar
Eckhart Arnold committed
194
195
196
197
198
199

    def __str__(self):
        if self.children:
            return "".join([str(child) for child in self.result])
        return str(self.result)

200
    @property
Eckhart Arnold's avatar
Eckhart Arnold committed
201
    def tag_name(self):
202
203
        return self.parser.name or self.parser.__class__.__name__

Eckhart Arnold's avatar
Eckhart Arnold committed
204
205
206
207
208
209
    @property
    def result(self):
        return self._result

    @result.setter
    def result(self, result):
210
211
212
213
214
215
216
217
218
        assert ((isinstance(result, tuple) and all(isinstance(child, Node) for child in result))
                or isinstance(result, Node)
                or isinstance(result, str)), str(result)
        self._result = (result,) if isinstance(result, Node) else result or ''
        self._children = self._result if isinstance(self._result, tuple) else ()

    @property
    def children(self):
        return self._children
Eckhart Arnold's avatar
Eckhart Arnold committed
219

Eckhart Arnold's avatar
Eckhart Arnold committed
220
221
    @property
    def len(self):
di68kap's avatar
di68kap committed
222
        # DEBUGGING:  print(str(self.parser), str(self.pos), str(self._len), str(self)[:10].replace('\n','.'))
Eckhart Arnold's avatar
Eckhart Arnold committed
223
224
        return self._len

Eckhart Arnold's avatar
Eckhart Arnold committed
225
226
    @property
    def pos(self):
227
        assert self._pos >= 0, "position value not initialized!"
Eckhart Arnold's avatar
Eckhart Arnold committed
228
229
230
231
232
233
234
235
        return self._pos

    @pos.setter
    def pos(self, pos):
        self._pos = pos
        offset = 0
        for child in self.children:
            child.pos = pos + offset
Eckhart Arnold's avatar
Eckhart Arnold committed
236
            offset += child.len
Eckhart Arnold's avatar
Eckhart Arnold committed
237

238
239
240
241
    @property
    def errors(self):
        return [Error(self.pos, err) for err in self._errors]

242
243
244
245
246
    def _tree_repr(self, tab, openF, closeF, dataF=lambda s: s):
        """
        Generates a tree representation of this node and its children
        in string from.

Eckhart Arnold's avatar
Eckhart Arnold committed
247
248
249
250
251
252
253
254
255
256
257
258
        The kind ot tree-representation that is determined by several
        function parameters. This could be an XML-representation or a
        lisp-like S-expression.

        Parameters:
            tab (str):  The indentation string, e.g. '\t' or '    '
            openF:  (Node->str) A function that returns an opening
                string (e.g. an XML-tag_name) for a given node
            closeF:  (Node->str) A function that returns a closeF
                string (e.g. an XML-tag_name) for a given node.
            dataF:  (str->str) A function that filters the data string
                before printing, e.g. to add quotation marks
Eckhart Arnold's avatar
Eckhart Arnold committed
259

260
        Returns (str):
Eckhart Arnold's avatar
Eckhart Arnold committed
261
262
            A string that contains a (serialized) tree representation
            of the node and its children.
Eckhart Arnold's avatar
Eckhart Arnold committed
263
264
265
266
267
268
269
270
271
272
273
274
275
        """
        head = openF(self)
        tail = closeF(self)

        if not self.result:
            return head + tail

        head = head + '\n'        # place the head, tail and content
        tail = '\n' + tail        # of the node on different lines

        if self.children:
            content = []
            for child in self.result:
276
                subtree = child._tree_repr(tab, openF, closeF, dataF).split('\n')
Eckhart Arnold's avatar
Eckhart Arnold committed
277
278
279
280
281
282
283
                content.append('\n'.join((tab + s) for s in subtree))
            return head + '\n'.join(content) + tail

        return head + '\n'.join([tab + dataF(s)
                                 for s in str(self.result).split('\n')]) + tail

    def as_sexpr(self, src=None):
284
285
        """
        Returns content as S-expression, i.e. in lisp-like form.
Eckhart Arnold's avatar
Eckhart Arnold committed
286

Eckhart Arnold's avatar
Eckhart Arnold committed
287
288
289
290
        Parameters:
            src:  The source text or `None`. In case the source text is
                given the position of the element in the text will be
                reported as line and column.
Eckhart Arnold's avatar
Eckhart Arnold committed
291
292
        """
        def opening(node):
Eckhart Arnold's avatar
Eckhart Arnold committed
293
            s = '(' + node.tag_name
Eckhart Arnold's avatar
Eckhart Arnold committed
294
295
296
297
298
299
300
            # s += " '(pos %i)" % node.pos
            if src:
                s += " '(pos %i  %i %i)" % (node.pos, *line_col(src, node.pos))
            if node.errors:
                s += " '(err '(%s))" % ' '.join(str(err).replace('"', r'\"')
                                                for err in node.errors)
            return s
301
302
303
304
        def pretty(s):
            return '"%s"' % s if s.find('"') < 0 \
                else "'%s'" % s if s.find("'") < 0 \
                else '"%s"' % s.replace('"', r'\"')
305
        return self._tree_repr('    ', opening, lambda node: ')', pretty)
306

Eckhart Arnold's avatar
Eckhart Arnold committed
307
    def as_xml(self, src=None):
308
309
        """
        Returns content as XML-tree.
Eckhart Arnold's avatar
Eckhart Arnold committed
310

Eckhart Arnold's avatar
Eckhart Arnold committed
311
312
313
314
        Parameters:
            src:  The source text or `None`. In case the source text is
                given the position will also be reported as line and
                column.
Eckhart Arnold's avatar
Eckhart Arnold committed
315
316
317
        """

        def opening(node):
Eckhart Arnold's avatar
Eckhart Arnold committed
318
            s = '<' + node.tag_name
Eckhart Arnold's avatar
Eckhart Arnold committed
319
320
321
322
            # s += ' pos="%i"' % node.pos
            if src:
                s += ' line="%i" col="%i"' % line_col(src, node.pos)
            if node.errors:
323
                s += ' err="%s"' % ''.join(str(err).replace('"', r'\"') for err in node.errors)
Eckhart Arnold's avatar
Eckhart Arnold committed
324
325
326
327
            s += ">"
            return s

        def closing(node):
Eckhart Arnold's avatar
Eckhart Arnold committed
328
            s = '</' + node.tag_name + '>'
Eckhart Arnold's avatar
Eckhart Arnold committed
329
330
            return s

331
        return self._tree_repr('    ', opening, closing)
Eckhart Arnold's avatar
Eckhart Arnold committed
332

Eckhart Arnold's avatar
Eckhart Arnold committed
333
    def add_error(self, error_str):
334
        self._errors.append(error_str)
335
        self.error_flag = True
Eckhart Arnold's avatar
Eckhart Arnold committed
336
337
        return self

338
    def collect_errors(self, clear_errors=False):
339
340
        """
        Returns all errors of this node or any child node in the form
341
342
        of a set of tuples (position, error_message), where position
        is always relative to this node.
Eckhart Arnold's avatar
Eckhart Arnold committed
343
        """
344
345
346
347
348
349
350
351
352
353
        if self.error_flag:
            errors = self.errors
            if clear_errors:
                self._errors = []
                self.error_flag = False
            if self.children:
                for child in self.result:
                    errors.extend(child.collect_errors(clear_errors))
            return errors
        return []
Eckhart Arnold's avatar
Eckhart Arnold committed
354

355
356
357
358
359
360
361
    def log(self, log_file_name, ext):
        global LOGGING
        if LOGGING:
            st_file_name = log_file_name + ext
            with open(os.path.join(LOGS_DIR(), st_file_name), "w", encoding="utf-8") as f:
                f.write(self.as_sexpr())

362
    def navigate(self, path):
363
364
        """EXPERIMENTAL! NOT YET TESTED!!!
        Returns the first descendant element matched by `path`, e.g.
365
366
367
        'd/s' returns 'l' from (d (s l)(e (r x1) (r x2))
        'e/r' returns 'x2'
        'e'   returns (r x1)(r x2)
368

Eckhart Arnold's avatar
Eckhart Arnold committed
369
370
        Parameters:
            path (str):  The path of the object, e.g. 'a/b/c'
371
372
373

        Returns:
            The object at the path, either a string or a Node or
Eckhart Arnold's avatar
Eckhart Arnold committed
374
            ``None``, if the path did not match.
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
        """
        pl = path.strip('')
        assert pl[0] != '/', 'Path must noch start with "/"!'
        nd = self
        for p in pl:
            if isinstance(nd.result, str):
                return p if (p == nd.result) and (p == pl[-1]) else None
            for child in nd.result:
                if str(child) == p:
                    nd = child
                    break
            else:
                return None
        return child

Eckhart Arnold's avatar
Eckhart Arnold committed
390
391

def error_messages(text, errors):
392
393
394
395
396
    """
    Converts the list of ``errors`` collected from the root node of the
    parse tree of `text` into a human readable (and IDE or editor
    parsable text) with line an column numbers. Error messages are
    separated by an empty line.
Eckhart Arnold's avatar
Eckhart Arnold committed
397
398
    """
    return "\n\n".join("line: %i, column: %i, error: %s" %
399
                       (*line_col(text, err.pos), err.msg)
Eckhart Arnold's avatar
Eckhart Arnold committed
400
                       for err in sorted(list(errors)))
Eckhart Arnold's avatar
Eckhart Arnold committed
401
402


403
404
405
# lambda compact_sexpr s : re.sub('\s(?=\))', '', re.sub('\s+', ' ', s)).strip()


406

Eckhart Arnold's avatar
Eckhart Arnold committed
407
408
########################################################################
#
409
# syntax tree transformation
Eckhart Arnold's avatar
Eckhart Arnold committed
410
411
412
413
414
415
416
417
418
#
########################################################################


def expand_table(compact_table):
    """Expands a table by separating keywords that are tuples or strings
    containing comma separated words into single keyword entries with
    the same values. Returns the expanded table.
    Example:
Eckhart Arnold's avatar
Eckhart Arnold committed
419
    >>> expand_table({"a, b": 1, "b": 1, ('d','e','f'):5, "c":3})
Eckhart Arnold's avatar
Eckhart Arnold committed
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
    {'a': 1, 'b': 1, 'c': 3, 'd': 5, 'e': 5, 'f': 5}
    """
    expanded_table = {}
    keys = list(compact_table.keys())
    for key in keys:
        value = compact_table[key]
        if isinstance(key, str):
            parts = (s.strip() for s in key.split(','))
        else:
            assert isinstance(key, collections.abc.Iterable)
            parts = key
        for p in parts:
            expanded_table[p] = value
    return expanded_table


def ASTTransform(node, transtable):
Eckhart Arnold's avatar
Eckhart Arnold committed
437
438
439
    """Transforms the parse tree starting with the given ``node`` into
    an abstract syntax tree by calling transformation functions
    registered in the transformation dictionary ``transtable``.
Eckhart Arnold's avatar
Eckhart Arnold committed
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
    """
    # normalize transformation entries by turning single transformations
    # into lists with a single item
    table = {name: transformation
             if isinstance(transformation, collections.abc.Sequence)
             else [transformation]
             for name, transformation in list(transtable.items())}
    table = expand_table(table)

    def recursive_ASTTransform(node):
        if node.children:
            for child in node.result:
                recursive_ASTTransform(child)
        transformation = table.get(node.parser.name,
                            table.get('~', [])) + table.get('*', [])
        for transform in transformation:
            transform(node)

    recursive_ASTTransform(node)


461
462
463
464
def no_transformation(node):
    pass


Eckhart Arnold's avatar
Eckhart Arnold committed
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
# ------------------------------------------------
#
# rearranging transformations:
#     - tree may be rearranged (flattened)
#     - order is preserved
#     - all leaves are kept
#
# ------------------------------------------------


def replace_by_single_child(node):
    """Remove single branch node, replacing it by its immediate descendant.
    (In case the descendant's name is empty (i.e. anonymous) the
    name of this node's parser is kept.)
    """
    if node.children and len(node.result) == 1:
        if not node.result[0].parser.name:
            node.result[0].parser.name = node.parser.name
        node.parser = node.result[0].parser
484
        node._errors.extend(node.result[0].errors)
Eckhart Arnold's avatar
Eckhart Arnold committed
485
486
487
488
489
490
491
492
        node.result = node.result[0].result


def reduce_single_child(node):
    """Reduce a single branch node, by transferring the result of its
    immediate descendant to this node, but keeping this node's parser entry.
    """
    if node.children and len(node.result) == 1:
493
        node._errors.extend(node.result[0].errors)
Eckhart Arnold's avatar
Eckhart Arnold committed
494
495
496
497
498
499
500
501
        node.result = node.result[0].result


# ------------------------------------------------
#
# destructive transformations:
#     - tree may be rearranged (flattened),
#     - order is preserved
502
503
#     - but (irrelevant) leaves may be dropped
#     - errors of dropped leaves will be lost
Eckhart Arnold's avatar
Eckhart Arnold committed
504
505
506
#
# ------------------------------------------------

507
508

def is_whitespace(node):
509
510
    """Removes whitespace and comments defined with the
    ``@comment``-directive."""
511
    return node.parser.name == WHITESPACE_KEYWORD
512
513


514
515
# def is_scanner_token(node):
#     return isinstance(node.parser, ScannerToken)
516
517


518
519
520
521
def is_empty(node):
    return not node.result


522
def is_expendable(node):
523
    return is_empty(node) or is_whitespace(node)  # or is_scanner_token(node)
Eckhart Arnold's avatar
Eckhart Arnold committed
524
525


526
527
528
529
def is_token(node, token_set={}):
    return node.parser.name == TOKEN_KEYWORD and (not token_set or node.result in token_set)


Eckhart Arnold's avatar
Eckhart Arnold committed
530
def remove_children_if(node, condition):
531
532
    """Removes all nodes from the result field if the function 
    ``condition(child_node)`` evaluates to ``True``."""
Eckhart Arnold's avatar
Eckhart Arnold committed
533
    if node.children:
534
        node.result = tuple(c for c in node.children if not condition(c))
Eckhart Arnold's avatar
Eckhart Arnold committed
535
536
537


remove_whitespace = partial(remove_children_if, condition=is_whitespace)
538
# remove_scanner_tokens = partial(remove_children_if, condition=is_scanner_token)
Eckhart Arnold's avatar
Eckhart Arnold committed
539
540
541
remove_expendables = partial(remove_children_if, condition=is_expendable)


542
543
544
545
546
547
548
549
def remove_tokens(node, tokens=set()):
    """Reomoves any among a particular set of tokens from the immediate
    descendants of a node. If ``tokens`` is the empty set, all tokens
    are removed.
    """
    remove_children_if(node, partial(is_token, token_set=tokens))


Eckhart Arnold's avatar
Eckhart Arnold committed
550
551
552
553
554
555
def flatten(node):
    """Recursively flattens all unnamed sub-nodes, in case there is more
    than one sub-node present. Flattening means that
    wherever a node has child nodes, the child nodes are inserted in place
    of the node. In other words, all leaves of this node and its child nodes
    are collected in-order as direct children of this node.
Eckhart Arnold's avatar
Eckhart Arnold committed
556
557
558
559
560
561
    This is meant to achieve these kinds of structural transformation:
        (1 (+ 2) (+ 3)     ->   (1 + 2 + 3)
        (1 (+ (2 + (3))))  ->   (1 + 2 + 3)
        
    Warning: Use with care. Du tue its recursive nature, flattening can
    have unexpected side-effects.
Eckhart Arnold's avatar
Eckhart Arnold committed
562
    """
563
    if node.children:
Eckhart Arnold's avatar
Eckhart Arnold committed
564
        new_result = []
565
        for child in node.children:
Eckhart Arnold's avatar
Eckhart Arnold committed
566
567
568
569
570
571
572
573
574
            if not child.parser.name and child.children:
                assert child.children, node.as_sexpr()
                flatten(child)
                new_result.extend(child.result)
            else:
                new_result.append(child)
        node.result = tuple(new_result)


575
576
def remove_brackets(node):
    """Removes any enclosing delimiters from a structure (e.g. quotation marks
Eckhart Arnold's avatar
Eckhart Arnold committed
577
578
    from a literal or braces from a group).
    """
579
    if len(node.children) >= 3:
580
        assert not node.children[0].children and not node.children[-1].children, node.as_sexpr()
Eckhart Arnold's avatar
Eckhart Arnold committed
581
582
583
584
        node.result = node.result[1:-1]


AST_SYMBOLS = {'replace_by_single_child', 'reduce_single_child',
di68kap's avatar
di68kap committed
585
               'no_operation', 'remove_children_if',
586
587
               'is_whitespace', 'is_expendable', 'remove_whitespace',
               # 'remove_scanner_tokens', 'is_scanner_token',
588
               'remove_expendables', 'flatten', 'remove_tokens',
589
               'remove_enclosing_delimiters',
Eckhart Arnold's avatar
Eckhart Arnold committed
590
591
592
593
               'TOKEN_KEYWORD', 'WHITESPACE_KEYWORD', 'partial'}



594
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
595
596
597
#
# Parser base classes
#
598
599
########################################################################

Eckhart Arnold's avatar
Eckhart Arnold committed
600

601
602
LEFT_RECURSION_DEPTH = 10   # because of pythons recursion depth limit, this
                            # value ought not to be set too high
603
MAX_DROPOUTS = 25   # stop trying to recover parsing after so many errors
Eckhart Arnold's avatar
Eckhart Arnold committed
604

605
606
WHITESPACE_KEYWORD = 'WSP__'
TOKEN_KEYWORD = 'TOKEN__'
Eckhart Arnold's avatar
Eckhart Arnold committed
607
608


609
class HistoryRecord:
di68kap's avatar
di68kap committed
610
611
612
613
614
615
616
617
618
619
620
621
    """
    Stores debugging information about one completed step in the
    parsing history. 
    
    A parsing step is "completed" when the last one of a nested
    sequence of parser-calls returns. The call stack including
    the last parser call will be frozen in the ``HistoryRecord``-
    object. In addition a reference to the generated leaf node
    (if any) will be stored and the result status of the last
    parser call, which ist either MATCH, FAIL (i.e. no match)
    or ERROR.
    """
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
    __slots__ = ('call_stack', 'node', 'remaining')

    MATCH = "MATCH"
    ERROR = "ERROR"
    FAIL = "FAIL"

    def __init__(self, call_stack, node, remaining):
        self.call_stack = call_stack
        self.node = node
        self.remaining = remaining

    @property
    def stack(self):
        return "->".join(str(parser) for parser in self.call_stack)

    @property
    def status(self):
        return self.FAIL if self.node is None else self.ERROR if self.node._errors else self.MATCH

    @property
    def extent(self):
        return ((-self.remaining - self.node.len, -self.remaining) if self.node
                else (-self.remaining, None))


def add_parser_guard(parser_func):
    def guarded_call(parser, text):
Eckhart Arnold's avatar
Eckhart Arnold committed
649
650
651
652
653
654
655
        try:
            location = len(text)
            # if location has already been visited by the current parser,
            # return saved result
            if location in parser.visited:
                return parser.visited[location]
            # break left recursion at the maximum allowed depth
656
            if parser.recursion_counter.setdefault(location, 0) > LEFT_RECURSION_DEPTH:
Eckhart Arnold's avatar
Eckhart Arnold committed
657
658
659
                return None, text

            parser.recursion_counter[location] += 1
660
            grammar = parser.grammar
Eckhart Arnold's avatar
Eckhart Arnold committed
661

662
663
664
            if grammar.track_history:
                grammar.call_stack.append(parser)
                grammar.moving_forward = True
Eckhart Arnold's avatar
Eckhart Arnold committed
665
666

            # run original __call__ method
667
668
669
670
671
672
673
674
675
676
            node, rest = parser_func(parser, text)

            if grammar.track_history:
                if grammar.moving_forward:  # and result[0] == None
                    grammar.moving_forward = False
                    record = HistoryRecord(grammar.call_stack.copy(), node, len(rest))
                    grammar.history.append(record)
                grammar.call_stack.pop()

            if node is not None:
Eckhart Arnold's avatar
Eckhart Arnold committed
677
678
                # in case of a recursive call saves the result of the first
                # (or left-most) call that matches
679
680
                parser.visited[location] = (node, rest)
                grammar.last_node = node
Eckhart Arnold's avatar
Eckhart Arnold committed
681
682
683
            elif location in parser.visited:
                # if parser did non match but a saved result exits, assume
                # left recursion and use the saved result
684
                node, rest = parser.visited[location]
Eckhart Arnold's avatar
Eckhart Arnold committed
685
686
687
688
689
690
691

            parser.recursion_counter[location] -= 1

        except RecursionError:
            node = Node(None, text[:min(10, max(1, text.find("\n")))] + " ...")
            node.add_error("maximum recursion depth of parser reached; "
                           "potentially due to too many errors!")
692
            node.error_flag = True
693
            rest = ''
Eckhart Arnold's avatar
Eckhart Arnold committed
694

695
        return node, rest
Eckhart Arnold's avatar
Eckhart Arnold committed
696

697
    return guarded_call
Eckhart Arnold's avatar
Eckhart Arnold committed
698
699
700


class ParserMetaClass(type):
701
    def __init__(cls, name, bases, attrs):
Eckhart Arnold's avatar
Eckhart Arnold committed
702
703
704
        # The following condition is necessary for classes that don't override
        # the __call__() method, because in these cases the non-overridden
        # __call__()-method would be substituted a second time!
705
706
707
        guarded_parser_call = add_parser_guard(cls.__call__)
        if cls.__call__.__code__ != guarded_parser_call.__code__:
            cls.__call__ = guarded_parser_call
708
        super(ParserMetaClass, cls).__init__(name, bases, attrs)
Eckhart Arnold's avatar
Eckhart Arnold committed
709

710

Eckhart Arnold's avatar
Eckhart Arnold committed
711
class Parser(metaclass=ParserMetaClass):
712
713
714
    def __init__(self, name=None):
        assert name is None or isinstance(name, str), str(name)
        self.name = name or ''
715
        self.grammar = None          # center for global variables etc.
716
717
718
        self.reset()

    def reset(self):
Eckhart Arnold's avatar
Eckhart Arnold committed
719
720
721
722
723
        self.visited = dict()
        self.recursion_counter = dict()
        self.cycle_detection = set()

    def __call__(self, text):
724
        return None, text               # default behaviour: don't match
Eckhart Arnold's avatar
Eckhart Arnold committed
725
726

    def __str__(self):
727
        return self.name or self.__class__.__name__
Eckhart Arnold's avatar
Eckhart Arnold committed
728

729
    @property
730
731
    def grammar(self):
        return self._grammar
732

733
734
735
736
    @grammar.setter
    def grammar(self, grammar_base):
        self._grammar = grammar_base
        self._grammar_assigned_notifier()
737

738
    def _grammar_assigned_notifier(self):
739
740
        pass

Eckhart Arnold's avatar
Eckhart Arnold committed
741
742
    def apply(self, func):
        """Applies function `func(parser)` recursively to this parser and all
743
        descendants of the tree of parsers. The same function can never
744
        be applied twice between calls of the ``reset()``-method!
Eckhart Arnold's avatar
Eckhart Arnold committed
745
746
747
748
749
750
751
752
753
        """
        if func in self.cycle_detection:
            return False
        else:
            self.cycle_detection.add(func)
            func(self)
            return True


754
class GrammarBase:
755
756
    root__ = None   # should be overwritten by grammar subclass

757
    @classmethod
758
    def _assign_parser_names(cls):
759
        """Initializes the `parser.name` fields of those
760
761
        Parser objects that are directly assigned to a class field with
        the field's name, e.g.
762
            class Grammar(Grammar):
763
764
                ...
                symbol = RE('(?!\\d)\\w+')
765
        After the call of this method symbol.name == "symbol"
766
767
        holds. Names assigned via the `name`-parameter of the
        constructor will not be overwritten.
768
        """
769
        if cls.parser_initialization__ == "done":
770
            return
771
        cdict = cls.__dict__
772
773
774
        for entry, parser in cdict.items():
            if isinstance(parser, Parser):
                if not parser.name or parser.name == TOKEN_KEYWORD:
Eckhart Arnold's avatar
Eckhart Arnold committed
775
                    parser.name = entry
776
777
                if (isinstance(parser, Forward) and (not parser.parser.name
                    or parser.parser.name == TOKEN_KEYWORD)):
778
                    parser.parser.name = entry
779
        cls.parser_initialization__ = "done"
780
781

    def __init__(self):
782
783
        self.all_parsers = set()
        self.dirty_flag = False
784
785
786
        self.track_history = LOGGING
        name = self.__class__.__name__
        self.log_file_name = name[:-7] if name.lower().endswith('grammar') else name
787
        self._reset()
788
        self._assign_parser_names()
789
        self.root__ = copy.deepcopy(self.__class__.root__)
790
791
        if self.wspL__:
            self.wsp_left_parser__ = RegExp(self.wspL__, WHITESPACE_KEYWORD)
792
            self.wsp_left_parser__.grammar = self
793
        else:
794
            self.wsp_left_parser__ = ZOMBIE_PARSER
795
796
        if self.wspR__:
            self.wsp_right_parser__ = RegExp(self.wspR__, WHITESPACE_KEYWORD)
797
            self.wsp_right_parser__.grammar = self
798
        else:
799
            self.wsp_right_parser__ = ZOMBIE_PARSER
Eckhart Arnold's avatar
Eckhart Arnold committed
800
        self.root__.apply(self._add_parser)
801
802

    def _reset(self):
803
        self.variables = dict()                 # support for Pop and Retrieve operators
804
        self.document = ""  # source document
Eckhart Arnold's avatar
Eckhart Arnold committed
805
        self.last_node = None
806
        self.call_stack = []                    # support for call stack tracing
807
        self.history = []                       # snapshots of call stacks
808
        self.moving_forward = True              # also needed for call stack tracing
Eckhart Arnold's avatar
Eckhart Arnold committed
809
810

    def _add_parser(self, parser):
811
        """Adds the copy of the classes parser object to this
812
        particular instance of Grammar.
Eckhart Arnold's avatar
Eckhart Arnold committed
813
        """
814
815
        setattr(self, parser.name, parser)
        self.all_parsers.add(parser)
816
        parser.grammar = self
Eckhart Arnold's avatar
Eckhart Arnold committed
817
818
819
820
821
822
823
824
825

    def parse(self, document):
        """Parses a document with with parser-combinators.

        Args:
            document (str): The source text to be parsed.
        Returns:
            Node: The root node ot the parse tree.
        """
826
827
        if self.root__ is None:
            raise NotImplementedError()
828
        if self.dirty_flag:
829
            self._reset()
830
831
832
833
            for parser in self.all_parsers:
                parser.reset()
        else:
            self.dirty_flag = True
834
        self.document = document
Eckhart Arnold's avatar
Eckhart Arnold committed
835
        parser = self.root__
836
        result = ""
837
        stitches = []
Eckhart Arnold's avatar
Eckhart Arnold committed
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
        rest = document
        while rest and len(stitches) < MAX_DROPOUTS:
            result, rest = parser(rest)
            if rest:
                fwd = rest.find("\n") + 1 or len(rest)
                skip, rest = rest[:fwd], rest[fwd:]
                if result is None:
                    error_msg = "Parser did not match! Invalid source file?"
                else:
                    stitches.append(result)
                    error_msg = "Parser stopped before end" + \
                                ("! trying to recover..."
                                 if len(stitches) < MAX_DROPOUTS
                                 else " too often! Terminating parser.")
                stitches.append(Node(None, skip))
                stitches[-1].add_error(error_msg)
854
855
856
857
858
859
        if stitches:
            if rest:
                stitches.append(Node(None, rest))
            result = Node(None, tuple(stitches))
        result.pos = 0 # calculate all positions
        return result
Eckhart Arnold's avatar
Eckhart Arnold committed
860

861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
    def log_parsing_history(self):
        """Writes a log of the parsing history of the most recently parsed
        document. 
        """

        def prepare_line(record):
            excerpt = self.document.__getitem__(slice(*record.extent))[:25].replace('\n', '\\n')
            excerpt = "'%s'" % excerpt if len(excerpt) < 25 else "'%s...'" % excerpt
            return (record.stack, record.status, excerpt)

        def write_log(history, log_name):
            path = os.path.join(LOGS_DIR(), self.log_file_name + log_name + "_parser.log")
            if history:
                with open(path, "w", encoding="utf-8") as f:
                    f.write("\n".join(history))
            elif os.path.exists(path):
                os.remove(path)

        global LOGGING
        if LOGGING:
            assert self.history
            full_history, match_history, errors_only = [], [], []
            for record in self.history:
                line = ";  ".join(prepare_line(record))
                full_history.append(line)
                if record.node and record.node.parser.name != WHITESPACE_KEYWORD:
                    match_history.append(line)
                    if record.node.errors:
                        errors_only.append(line)
            write_log(full_history, '_full')
            write_log(match_history, '_match')
            write_log(errors_only, '_errors')
Eckhart Arnold's avatar
Eckhart Arnold committed
893

Eckhart Arnold's avatar
Eckhart Arnold committed
894

895

896
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
897
898
899
#
# Token and Regular Expression parser classes (i.e. leaf classes)
#
900
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
901
902


903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925

RX_SCANNER_TOKEN = re.compile('\w+')
BEGIN_SCANNER_TOKEN = '\x1b'
END_SCANNER_TOKEN = '\x1c'


def make_token(token, argument=''):
    """Turns the ``token`` and ``argument`` into a special token that
    will be caught by the `ScannerToken`-parser.
    
    This function is a support function that should be used by scanners
    to inject scanner tokens into the source text.
    """
    assert RX_SCANNER_TOKEN.match(token)
    assert argument.find(BEGIN_SCANNER_TOKEN) < 0
    assert argument.find(END_SCANNER_TOKEN) < 0

    return BEGIN_SCANNER_TOKEN + token + argument + END_SCANNER_TOKEN


nil_scanner = lambda text: text


Eckhart Arnold's avatar
Eckhart Arnold committed
926
927
928
929
930
class ScannerToken(Parser):
    def __init__(self, scanner_token):
        assert isinstance(scanner_token, str) and scanner_token and \
               scanner_token.isupper()
        assert RX_SCANNER_TOKEN.match(scanner_token)
931
        super(ScannerToken, self).__init__(scanner_token, name=TOKEN_KEYWORD)
Eckhart Arnold's avatar
Eckhart Arnold committed
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946

    def __call__(self, text):
        if text[0:1] == BEGIN_SCANNER_TOKEN:
            end = text.find(END_SCANNER_TOKEN, 1)
            if end < 0:
                node = Node(self, '').add_error(
                    'END_SCANNER_TOKEN delimiter missing from scanner token. '
                    '(Most likely due to a scanner bug!)')
                return node, text[1:]
            elif end == 0:
                node = Node(self, '').add_error(
                    'Scanner token cannot have zero length. '
                    '(Most likely due to a scanner bug!)')
                return node, text[2:]
            elif text.find(BEGIN_SCANNER_TOKEN, 1, end) >= 0:
947
                node = Node(self, text[len(self.name) + 1:end])
Eckhart Arnold's avatar
Eckhart Arnold committed
948
949
950
951
952
                node.add_error(
                    'Scanner tokens must not be nested or contain '
                    'BEGIN_SCANNER_TOKEN delimiter as part of their argument. '
                    '(Most likely due to a scanner bug!)')
                return node, text[end:]
953
954
            if text[1:len(self.name) + 1] == self.name:
                return Node(self, text[len(self.name) + 1:end]), \
Eckhart Arnold's avatar
Eckhart Arnold committed
955
956
957
958
959
                       text[end + 1:]
        return None, text


class RegExp(Parser):
960
    def __init__(self, regexp, name=None):
961
        super(RegExp, self).__init__(name)
Eckhart Arnold's avatar
Eckhart Arnold committed
962
963
964
        self.regexp = re.compile(regexp) if isinstance(regexp, str) else regexp

    def __deepcopy__(self, memo):
Eckhart Arnold's avatar
Eckhart Arnold committed
965
966
        # This method is obsolete with the new `regex` module! It's
        # being kept for compatibility with Python's standard library
Eckhart Arnold's avatar
Eckhart Arnold committed
967
968
969
970
        try:
            regexp = copy.deepcopy(self.regexp)
        except TypeError:
            regexp = self.regexp.pattern
di68kap's avatar
di68kap committed
971
        duplicate = RegExp(regexp, self.name)
972
        duplicate.name = self.name  # this ist needed!!!!
973
        duplicate.regexp = self.regexp
974
        duplicate.grammar = self.grammar
Eckhart Arnold's avatar
Eckhart Arnold committed
975
976
977
978
979
980
981
982
983
        duplicate.visited = copy.deepcopy(self.visited, memo)
        duplicate.recursion_counter = copy.deepcopy(self.recursion_counter,
                                                    memo)
        return duplicate

    def __call__(self, text):
        match = text[0:1] != BEGIN_SCANNER_TOKEN and self.regexp.match(text)  # ESC starts a scanner token.
        if match:
            end = match.end()
984
985
986
987
988
            return Node(self, text[:end]), text[end:]
        return None, text


class RE(Parser):
Eckhart Arnold's avatar
Eckhart Arnold committed
989
990
    """Regular Expressions with optional leading or trailing whitespace.
    """
991
    def __init__(self, regexp, wL=None, wR=None, name=None):
992
        super(RE, self).__init__(name)
993
        # assert wR or regexp == '.' or isinstance(self, Token)
994
995
        self.wL = wL
        self.wR = wR
996
997
        self.wspLeft = RegExp(wL, WHITESPACE_KEYWORD) if wL else ZOMBIE_PARSER
        self.wspRight = RegExp(wR, WHITESPACE_KEYWORD) if wR else ZOMBIE_PARSER
998
999
1000
1001
1002
        self.main = RegExp(regexp)

    def __call__(self, text):
        # assert self.main.regexp.pattern != "@"
        t = text
1003
        wL, t = self.wspLeft(t)
1004
1005
        main, t = self.main(t)
        if main:
1006
            wR, t = self.wspRight(t)
1007
1008
            result = tuple(nd for nd in (wL, main, wR)
                           if nd and nd.result != '')
1009
            return Node(self, result), t
Eckhart Arnold's avatar
Eckhart Arnold committed
1010
1011
1012
        return None, text

    def __str__(self):
1013
1014
1015
1016
        if self.name == TOKEN_KEYWORD:
            return 'Token "%s"' % self.main.regexp.pattern.replace('\\', '')
        return self.name or ('RE ' + ('~' if self.wL else '')
                             + '/%s/' % self.main.regexp.pattern + ('~' if self.wR else ''))
1017

1018
1019
    def _grammar_assigned_notifier(self):
        if self.grammar:
1020
            if self.wL is None:
1021
                self.wspLeft = self.grammar.wsp_left_parser__
1022
            if self.wR is None:
1023
                self.wspRight = self.grammar.wsp_right_parser__
1024

1025
1026
1027
    def apply(self, func):
        if super(RE, self).apply(func):
            if self.wL:
1028
                self.wspLeft.apply(func)
1029
            if self.wR:
1030
                self.wspRight.apply(func)
1031
            self.main.apply(func)
Eckhart Arnold's avatar
Eckhart Arnold committed
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042


def escape_re(s):
    """Returns `s` with all regular expression special characters escaped.
    """
    assert isinstance(s, str)
    re_chars = r"\.^$*+?{}[]()#<>=|!"
    for esc_ch in re_chars:
        s = s.replace(esc_ch, '\\' + esc_ch)
    return s

1043
1044
1045
1046

def Token(token, wL=None, wR=None, name=None):
    return RE(escape_re(token), wL, wR, name or TOKEN_KEYWORD)

Eckhart Arnold's avatar
Eckhart Arnold committed
1047
1048
1049
1050
1051
1052
1053
1054
1055

def mixin_comment(whitespace, comment):
    """Mixes comment-regexp into whitespace regexp.
    """
    wspc = '(?:' + whitespace + '(?:' + comment + whitespace + ')*)'
    return wspc



1056
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
1057
1058
1059
#
# Combinator parser classes (i.e. trunk classes of the parser tree)
#
1060
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
1061
1062
1063


class UnaryOperator(Parser):
1064
1065
    def __init__(self, parser, name=None):
        super(UnaryOperator, self).__init__(name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1066
1067
1068
1069
1070
1071
1072
1073
1074
        assert isinstance(parser, Parser)
        self.parser = parser

    def apply(self, func):
        if super(UnaryOperator, self).apply(func):
            self.parser.apply(func)


class NaryOperator(Parser):
1075
1076
    def __init__(self, *parsers, name=None):
        super(NaryOperator, self).__init__(name)
1077
        assert all([isinstance(parser, Parser) for parser in parsers]), str(parsers)
Eckhart Arnold's avatar
Eckhart Arnold committed
1078
1079
1080
1081
1082
1083
1084
1085
1086
        self.parsers = parsers

    def apply(self, func):
        if super(NaryOperator, self).apply(func):
            for parser in self.parsers:
                parser.apply(func)


class Optional(UnaryOperator):
1087
1088
    def __init__(self, parser, name=None):
        super(Optional, self).__init__(parser, name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1089
1090
        assert isinstance(parser, Parser)
        assert not isinstance(parser, Optional), \
1091
            "Nesting options would be redundant: %s(%s)" % \
1092
            (str(name), str(parser.name))
Eckhart Arnold's avatar
Eckhart Arnold committed
1093
        assert not isinstance(parser, Required), \
1094
            "Nestion options with required elements is contradictory: " \
1095
            "%s(%s)" % (str(name), str(parser.name))
Eckhart Arnold's avatar
Eckhart Arnold committed
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115

    def __call__(self, text):
        node, text = self.parser(text)
        if node:
            return Node(self, node), text
        return Node(self, ()), text


class ZeroOrMore(Optional):
    def __call__(self, text):
        results = ()
        while text:
            node, text = self.parser(text)
            if not node:
                break
            results += (node,)
        return Node(self, results), text


class OneOrMore(UnaryOperator):
1116
1117
    def __init__(self, parser, name=None):
        super(OneOrMore, self).__init__(parser, name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1118
        assert not isinstance(parser, Optional), \
1119
            "Use ZeroOrMore instead of nesting OneOrMore and Optional: " \
1120
            "%s(%s)" % (str(name), str(parser.name))
Eckhart Arnold's avatar
Eckhart Arnold committed
1121
1122
1123
1124
1125
1126
1127
1128
1129

    def __call__(self, text):
        results = ()
        text_ = text
        while text_:
            node, text_ = self.parser(text_)
            if not node:
                break
            results += (node,)
1130
        if results == ():
Eckhart Arnold's avatar
Eckhart Arnold committed
1131
1132
1133
1134
1135
            return None, text
        return Node(self, results), text_


class Sequence(NaryOperator):
1136
1137
    def __init__(self, *parsers, name=None):
        super(Sequence, self).__init__(*parsers, name=name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
        assert len(self.parsers) >= 1

    def __call__(self, text):
        results = ()
        text_ = text
        for parser in self.parsers:
            node, text_ = parser(text_)
            if not node:
                return node, text
            if node.result:  # Nodes with zero-length result are silently omitted
                results += (node,)
1149
            if node.error_flag:
Eckhart Arnold's avatar
Eckhart Arnold committed
1150
1151
1152
1153
1154
1155
                break
        assert len(results) <= len(self.parsers)
        return Node(self, results), text_


class Alternative(NaryOperator):
1156
1157
    def __init__(self, *parsers, name=None):
        super(Alternative, self).__init__(*parsers, name=name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
        assert len(self.parsers) >= 1
        assert all(not isinstance(p, Optional) for p in self.parsers)

    def __call__(self, text):
        for parser in self.parsers:
            node, text_ = parser(text)
            if node:
                return Node(self, node), text_
        return None, text


1169
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
1170
1171
1172
#
# Flow control operators
#
1173
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
1174
1175
1176


class FlowOperator(UnaryOperator):
1177
1178
    def __init__(self, parser, name=None):
        super(FlowOperator, self).__init__(parser, name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1179
1180
1181


class Required(FlowOperator):
1182
    # TODO: Add constructor that checks for logical errors, like `Required(Optional(...))` constructs
Eckhart Arnold's avatar
Eckhart Arnold committed
1183
1184
1185
1186
1187
1188
1189
    def __call__(self, text):
        node, text_ = self.parser(text)
        if not node:
            m = re.search(r'\s(\S)', text)
            i = max(1, m.regs[1][0]) if m else 1
            node = Node(self, text[:i])
            text_ = text[i:]
Eckhart Arnold's avatar
Eckhart Arnold committed
1190
            # assert False, "*"+text[:i]+"*"
Eckhart Arnold's avatar
Eckhart Arnold committed
1191
1192
1193
1194
1195
1196
            node.add_error('%s expected; "%s..." found!' %
                           (str(self.parser), text[:10]))
        return node, text_


class Lookahead(FlowOperator):