ParserCombinators_obsolete.py 80.7 KB
Newer Older
Eckhart Arnold's avatar
Eckhart Arnold committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14
#!/usr/bin/python3

"""ParserCombinators.py - parser combinators for left-recursive grammers

Copyright 2016  by Eckhart Arnold

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
15 16 17
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
Eckhart Arnold's avatar
Eckhart Arnold committed
18 19


20 21 22 23 24
Module parser_combinators contains a number of classes that together
make up parser combinators for left-recursive grammers. For each
element of the extended Backus-Naur-Form as well as for a regular
expression token a class is defined. The set of classes can be used to
define a parser for (ambiguous) left-recursive grammers.
Eckhart Arnold's avatar
Eckhart Arnold committed
25 26 27 28 29


References and Acknowledgements:

Dominikus Herzberg: Objekt-orientierte Parser-Kombinatoren in Python,
30 31
Blog-Post, September, 18th 2008 on denkspuren. gedanken, ideen,
anregungen und links rund um informatik-themen, URL:
Eckhart Arnold's avatar
Eckhart Arnold committed
32 33
http://denkspuren.blogspot.de/2008/09/objekt-orientierte-parser-kombinatoren.html

34 35 36
Dominikus Herzberg: Eine einfache Grammatik für LaTeX, Blog-Post,
September, 18th 2008 on denkspuren. gedanken, ideen, anregungen und
links rund um informatik-themen, URL:
Eckhart Arnold's avatar
Eckhart Arnold committed
37 38 39
http://denkspuren.blogspot.de/2008/09/eine-einfache-grammatik-fr-latex.html

Dominikus Herzberg: Uniform Syntax, Blog-Post, February, 27th 2007 on
40 41
denkspuren. gedanken, ideen, anregungen und links rund um
informatik-themen, URL:
Eckhart Arnold's avatar
Eckhart Arnold committed
42 43
http://denkspuren.blogspot.de/2007/02/uniform-syntax.html

44 45 46 47
Richard A. Frost, Rahmatullah Hafiz and Paul Callaghan: Parser
Combinators for Ambiguous Left-Recursive Grammars, in: P. Hudak and
D.S. Warren (Eds.): PADL 2008, LNCS 4902, pp. 167–181, Springer-Verlag
Berlin Heidelberg 2008.
Eckhart Arnold's avatar
Eckhart Arnold committed
48 49 50

Juancarlo Añez: grako, a PEG parser generator in Python,
https://bitbucket.org/apalala/grako
51

Eckhart Arnold's avatar
Eckhart Arnold committed
52 53
"""

54
# TODO: Replace copy.deepcopy() call in GrammarBase class by custom copy()-methods in the Parser classes. Is that really better?
Eckhart Arnold's avatar
Eckhart Arnold committed
55 56 57 58


import collections
import copy
59
import hashlib
Eckhart Arnold's avatar
Eckhart Arnold committed
60 61
import keyword
import os
62 63
from functools import partial

64
from typing import NamedTuple
65

Eckhart Arnold's avatar
Eckhart Arnold committed
66 67 68 69
try:
    import regex as re
except ImportError:
    import re
Eckhart Arnold's avatar
Eckhart Arnold committed
70 71 72
import sys


Eckhart Arnold's avatar
Eckhart Arnold committed
73
__version__ = '0.5.3' + '_dev' + str(os.stat(__file__).st_mtime)
74

75
LOGGING = "LOGS"
76

Eckhart Arnold's avatar
Eckhart Arnold committed
77

78 79 80
def LOGS_DIR():
    """Returns a path of a directory where log files will be stored.
    Usually, this is just a sub-directory named 'LOGS'. The directory
Eckhart Arnold's avatar
Eckhart Arnold committed
81 82
    will be created if it does not exist.
    """
83 84 85 86 87 88 89
    global LOGGING
    if not LOGGING:
        raise AssertionError("Cannot use LOGGING_DIR() if LOGGINGging is turned off!")
    dirname = LOGGING
    if os.path.exists(LOGGING):
        if not os.path.isdir(LOGGING):
            raise IOError('"' + LOGGING + '" cannot be used as log directory, '
Eckhart Arnold's avatar
Eckhart Arnold committed
90 91
                          'because it is not a directory!')
    else:
92
        os.mkdir(LOGGING)
93
    return dirname
Eckhart Arnold's avatar
Eckhart Arnold committed
94 95


96 97

########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
98
#
99
# syntax tree
Eckhart Arnold's avatar
Eckhart Arnold committed
100
#
101 102
########################################################################

Eckhart Arnold's avatar
Eckhart Arnold committed
103 104

def line_col(text, pos):
Eckhart Arnold's avatar
Eckhart Arnold committed
105
    """Returns the position within a text as (line, column)-tuple.
106
    """
107
    assert pos < len(text), str(pos) + " >= " + str(len(text))
Eckhart Arnold's avatar
Eckhart Arnold committed
108 109 110 111 112
    line = text.count("\n", 0, pos) + 1
    column = pos - text.rfind("\n", 0, pos)
    return line, column


113
class ZombieParser:
Eckhart Arnold's avatar
Eckhart Arnold committed
114 115 116 117 118 119 120 121
    """
    Serves as a substitute for a Parser instance.

    ``ZombieParser`` is the class of the singelton object
    ``ZOMBIE_PARSER``. The  ``ZOMBIE_PARSER`` has a name and can be
    called, but it never matches. It serves as a substitute where only
    these (or one of these properties) is needed, but no real Parser-
    object is instantiated.
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
    """
    alive = False
    def __init__(self):
        assert not self.__class__.alive, "There can be only one!"
        assert self.__class__ == ZombieParser, "No derivatives, please!"
        self.name = "ZOMBIE"
        self.__class__.alive = True

    def __str__(self):
        return self.name

    def __call__(self, text):
        """Better call Saul ;-)"""
        return None, text


ZOMBIE_PARSER = ZombieParser()


class Error(NamedTuple):
    pos: int
    msg: str
144 145


Eckhart Arnold's avatar
Eckhart Arnold committed
146
class Node:
147 148 149 150
    """
    Represents a node in the concrete or abstract syntax tree.

    Attributes:
Eckhart Arnold's avatar
Eckhart Arnold committed
151 152
        tag_name (str):  The name of the node, which is either its
            parser's name or, if that is empty, the parser's class name
153
        result (str or tuple):  The result of the parser which
Eckhart Arnold's avatar
Eckhart Arnold committed
154 155
            generated this node, which can be either a string or a
            tuple of child nodes.
156
        children (tuple):  The tuple of child nodes or an empty tuple
Eckhart Arnold's avatar
Eckhart Arnold committed
157
            if there are no child nodes. READ ONLY!
158
        parser (Parser):  The parser which generated this node.
159
        errors (list):  A list of parser- or compiler-errors:
Eckhart Arnold's avatar
Eckhart Arnold committed
160
            tuple(position, string) attached to this node
Eckhart Arnold's avatar
Eckhart Arnold committed
161
        len (int):  The full length of the node's string result if the
Eckhart Arnold's avatar
Eckhart Arnold committed
162 163 164 165
            node is a leaf node or, otherwise, the concatenated string
            result's of its descendants. The figure always represents
            the length before AST-transformation ans will never change
            through AST-transformation. READ ONLY!
166
        pos (int):  the position of the node within the parsed text.
167

168 169 170 171 172 173 174 175 176 177 178 179
            The value of ``pos`` is -1 meaning invalid by default. 
            Setting this value will set the positions of all child
            nodes relative to this value.  

            To set the pos values of all nodes in a syntax tree, the
            pos value of the root node should be set to 0 right 
            after parsing.

            Other than that, this value should be considered READ ONLY. 
            At any rate, it should only be reassigned only during
            parsing stage and never during or after the
            AST-transformation.
180 181
    """

Eckhart Arnold's avatar
Eckhart Arnold committed
182
    def __init__(self, parser, result):
Eckhart Arnold's avatar
Eckhart Arnold committed
183 184
        """Initializes the ``Node``-object with the ``Parser``-Instance
        that generated the node and the parser's result.
185 186
        """
        self.result = result
Eckhart Arnold's avatar
Eckhart Arnold committed
187
        self.parser = parser or ZOMBIE_PARSER
188
        self._errors = []
189
        self.error_flag = any(r.error_flag for r in self.result) if self.children else False
Eckhart Arnold's avatar
Eckhart Arnold committed
190
        self._len = len(self.result) if not self.children else \
191
            sum(child._len for child in self.children)
192
        # self.pos = 0  # coninuous updating of pos values
193
        self._pos = -1
Eckhart Arnold's avatar
Eckhart Arnold committed
194 195 196 197 198 199

    def __str__(self):
        if self.children:
            return "".join([str(child) for child in self.result])
        return str(self.result)

200
    @property
Eckhart Arnold's avatar
Eckhart Arnold committed
201
    def tag_name(self):
202 203
        return self.parser.name or self.parser.__class__.__name__

Eckhart Arnold's avatar
Eckhart Arnold committed
204 205 206 207 208 209
    @property
    def result(self):
        return self._result

    @result.setter
    def result(self, result):
210 211 212 213 214 215 216 217 218
        assert ((isinstance(result, tuple) and all(isinstance(child, Node) for child in result))
                or isinstance(result, Node)
                or isinstance(result, str)), str(result)
        self._result = (result,) if isinstance(result, Node) else result or ''
        self._children = self._result if isinstance(self._result, tuple) else ()

    @property
    def children(self):
        return self._children
Eckhart Arnold's avatar
Eckhart Arnold committed
219

Eckhart Arnold's avatar
Eckhart Arnold committed
220 221
    @property
    def len(self):
di68kap's avatar
di68kap committed
222
        # DEBUGGING:  print(str(self.parser), str(self.pos), str(self._len), str(self)[:10].replace('\n','.'))
Eckhart Arnold's avatar
Eckhart Arnold committed
223 224
        return self._len

Eckhart Arnold's avatar
Eckhart Arnold committed
225 226
    @property
    def pos(self):
227
        assert self._pos >= 0, "position value not initialized!"
Eckhart Arnold's avatar
Eckhart Arnold committed
228 229 230 231 232 233 234 235
        return self._pos

    @pos.setter
    def pos(self, pos):
        self._pos = pos
        offset = 0
        for child in self.children:
            child.pos = pos + offset
Eckhart Arnold's avatar
Eckhart Arnold committed
236
            offset += child.len
Eckhart Arnold's avatar
Eckhart Arnold committed
237

238 239 240 241
    @property
    def errors(self):
        return [Error(self.pos, err) for err in self._errors]

242 243 244 245 246
    def _tree_repr(self, tab, openF, closeF, dataF=lambda s: s):
        """
        Generates a tree representation of this node and its children
        in string from.

Eckhart Arnold's avatar
Eckhart Arnold committed
247 248 249 250 251 252 253 254 255 256 257 258
        The kind ot tree-representation that is determined by several
        function parameters. This could be an XML-representation or a
        lisp-like S-expression.

        Parameters:
            tab (str):  The indentation string, e.g. '\t' or '    '
            openF:  (Node->str) A function that returns an opening
                string (e.g. an XML-tag_name) for a given node
            closeF:  (Node->str) A function that returns a closeF
                string (e.g. an XML-tag_name) for a given node.
            dataF:  (str->str) A function that filters the data string
                before printing, e.g. to add quotation marks
Eckhart Arnold's avatar
Eckhart Arnold committed
259

260
        Returns (str):
Eckhart Arnold's avatar
Eckhart Arnold committed
261 262
            A string that contains a (serialized) tree representation
            of the node and its children.
Eckhart Arnold's avatar
Eckhart Arnold committed
263 264 265 266 267 268 269 270 271 272 273 274 275
        """
        head = openF(self)
        tail = closeF(self)

        if not self.result:
            return head + tail

        head = head + '\n'        # place the head, tail and content
        tail = '\n' + tail        # of the node on different lines

        if self.children:
            content = []
            for child in self.result:
276
                subtree = child._tree_repr(tab, openF, closeF, dataF).split('\n')
Eckhart Arnold's avatar
Eckhart Arnold committed
277 278 279 280 281 282 283
                content.append('\n'.join((tab + s) for s in subtree))
            return head + '\n'.join(content) + tail

        return head + '\n'.join([tab + dataF(s)
                                 for s in str(self.result).split('\n')]) + tail

    def as_sexpr(self, src=None):
284 285
        """
        Returns content as S-expression, i.e. in lisp-like form.
Eckhart Arnold's avatar
Eckhart Arnold committed
286

Eckhart Arnold's avatar
Eckhart Arnold committed
287 288 289 290
        Parameters:
            src:  The source text or `None`. In case the source text is
                given the position of the element in the text will be
                reported as line and column.
Eckhart Arnold's avatar
Eckhart Arnold committed
291 292
        """
        def opening(node):
Eckhart Arnold's avatar
Eckhart Arnold committed
293
            s = '(' + node.tag_name
Eckhart Arnold's avatar
Eckhart Arnold committed
294 295 296 297 298 299 300
            # s += " '(pos %i)" % node.pos
            if src:
                s += " '(pos %i  %i %i)" % (node.pos, *line_col(src, node.pos))
            if node.errors:
                s += " '(err '(%s))" % ' '.join(str(err).replace('"', r'\"')
                                                for err in node.errors)
            return s
301 302 303 304
        def pretty(s):
            return '"%s"' % s if s.find('"') < 0 \
                else "'%s'" % s if s.find("'") < 0 \
                else '"%s"' % s.replace('"', r'\"')
305
        return self._tree_repr('    ', opening, lambda node: ')', pretty)
306

Eckhart Arnold's avatar
Eckhart Arnold committed
307
    def as_xml(self, src=None):
308 309
        """
        Returns content as XML-tree.
Eckhart Arnold's avatar
Eckhart Arnold committed
310

Eckhart Arnold's avatar
Eckhart Arnold committed
311 312 313 314
        Parameters:
            src:  The source text or `None`. In case the source text is
                given the position will also be reported as line and
                column.
Eckhart Arnold's avatar
Eckhart Arnold committed
315 316 317
        """

        def opening(node):
Eckhart Arnold's avatar
Eckhart Arnold committed
318
            s = '<' + node.tag_name
Eckhart Arnold's avatar
Eckhart Arnold committed
319 320 321 322
            # s += ' pos="%i"' % node.pos
            if src:
                s += ' line="%i" col="%i"' % line_col(src, node.pos)
            if node.errors:
323
                s += ' err="%s"' % ''.join(str(err).replace('"', r'\"') for err in node.errors)
Eckhart Arnold's avatar
Eckhart Arnold committed
324 325 326 327
            s += ">"
            return s

        def closing(node):
Eckhart Arnold's avatar
Eckhart Arnold committed
328
            s = '</' + node.tag_name + '>'
Eckhart Arnold's avatar
Eckhart Arnold committed
329 330
            return s

331
        return self._tree_repr('    ', opening, closing)
Eckhart Arnold's avatar
Eckhart Arnold committed
332

Eckhart Arnold's avatar
Eckhart Arnold committed
333
    def add_error(self, error_str):
334
        self._errors.append(error_str)
335
        self.error_flag = True
Eckhart Arnold's avatar
Eckhart Arnold committed
336 337
        return self

338
    def collect_errors(self, clear_errors=False):
339 340
        """
        Returns all errors of this node or any child node in the form
341 342
        of a set of tuples (position, error_message), where position
        is always relative to this node.
Eckhart Arnold's avatar
Eckhart Arnold committed
343
        """
344 345 346 347 348 349 350 351 352 353
        if self.error_flag:
            errors = self.errors
            if clear_errors:
                self._errors = []
                self.error_flag = False
            if self.children:
                for child in self.result:
                    errors.extend(child.collect_errors(clear_errors))
            return errors
        return []
Eckhart Arnold's avatar
Eckhart Arnold committed
354

355 356 357 358 359 360 361
    def log(self, log_file_name, ext):
        global LOGGING
        if LOGGING:
            st_file_name = log_file_name + ext
            with open(os.path.join(LOGS_DIR(), st_file_name), "w", encoding="utf-8") as f:
                f.write(self.as_sexpr())

362
    def navigate(self, path):
363 364
        """EXPERIMENTAL! NOT YET TESTED!!!
        Returns the first descendant element matched by `path`, e.g.
365 366 367
        'd/s' returns 'l' from (d (s l)(e (r x1) (r x2))
        'e/r' returns 'x2'
        'e'   returns (r x1)(r x2)
368

Eckhart Arnold's avatar
Eckhart Arnold committed
369 370
        Parameters:
            path (str):  The path of the object, e.g. 'a/b/c'
371 372 373

        Returns:
            The object at the path, either a string or a Node or
Eckhart Arnold's avatar
Eckhart Arnold committed
374
            ``None``, if the path did not match.
375 376 377 378 379 380 381 382 383 384 385 386 387 388 389
        """
        pl = path.strip('')
        assert pl[0] != '/', 'Path must noch start with "/"!'
        nd = self
        for p in pl:
            if isinstance(nd.result, str):
                return p if (p == nd.result) and (p == pl[-1]) else None
            for child in nd.result:
                if str(child) == p:
                    nd = child
                    break
            else:
                return None
        return child

Eckhart Arnold's avatar
Eckhart Arnold committed
390 391

def error_messages(text, errors):
392 393 394 395 396
    """
    Converts the list of ``errors`` collected from the root node of the
    parse tree of `text` into a human readable (and IDE or editor
    parsable text) with line an column numbers. Error messages are
    separated by an empty line.
Eckhart Arnold's avatar
Eckhart Arnold committed
397 398
    """
    return "\n\n".join("line: %i, column: %i, error: %s" %
399
                       (*line_col(text, err.pos), err.msg)
Eckhart Arnold's avatar
Eckhart Arnold committed
400
                       for err in sorted(list(errors)))
Eckhart Arnold's avatar
Eckhart Arnold committed
401 402


403 404 405
# lambda compact_sexpr s : re.sub('\s(?=\))', '', re.sub('\s+', ' ', s)).strip()


406

Eckhart Arnold's avatar
Eckhart Arnold committed
407 408
########################################################################
#
409
# syntax tree transformation
Eckhart Arnold's avatar
Eckhart Arnold committed
410 411 412 413 414 415 416 417 418
#
########################################################################


def expand_table(compact_table):
    """Expands a table by separating keywords that are tuples or strings
    containing comma separated words into single keyword entries with
    the same values. Returns the expanded table.
    Example:
Eckhart Arnold's avatar
Eckhart Arnold committed
419
    >>> expand_table({"a, b": 1, "b": 1, ('d','e','f'):5, "c":3})
Eckhart Arnold's avatar
Eckhart Arnold committed
420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436
    {'a': 1, 'b': 1, 'c': 3, 'd': 5, 'e': 5, 'f': 5}
    """
    expanded_table = {}
    keys = list(compact_table.keys())
    for key in keys:
        value = compact_table[key]
        if isinstance(key, str):
            parts = (s.strip() for s in key.split(','))
        else:
            assert isinstance(key, collections.abc.Iterable)
            parts = key
        for p in parts:
            expanded_table[p] = value
    return expanded_table


def ASTTransform(node, transtable):
Eckhart Arnold's avatar
Eckhart Arnold committed
437 438 439
    """Transforms the parse tree starting with the given ``node`` into
    an abstract syntax tree by calling transformation functions
    registered in the transformation dictionary ``transtable``.
Eckhart Arnold's avatar
Eckhart Arnold committed
440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460
    """
    # normalize transformation entries by turning single transformations
    # into lists with a single item
    table = {name: transformation
             if isinstance(transformation, collections.abc.Sequence)
             else [transformation]
             for name, transformation in list(transtable.items())}
    table = expand_table(table)

    def recursive_ASTTransform(node):
        if node.children:
            for child in node.result:
                recursive_ASTTransform(child)
        transformation = table.get(node.parser.name,
                            table.get('~', [])) + table.get('*', [])
        for transform in transformation:
            transform(node)

    recursive_ASTTransform(node)


461 462 463 464
def no_transformation(node):
    pass


Eckhart Arnold's avatar
Eckhart Arnold committed
465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483
# ------------------------------------------------
#
# rearranging transformations:
#     - tree may be rearranged (flattened)
#     - order is preserved
#     - all leaves are kept
#
# ------------------------------------------------


def replace_by_single_child(node):
    """Remove single branch node, replacing it by its immediate descendant.
    (In case the descendant's name is empty (i.e. anonymous) the
    name of this node's parser is kept.)
    """
    if node.children and len(node.result) == 1:
        if not node.result[0].parser.name:
            node.result[0].parser.name = node.parser.name
        node.parser = node.result[0].parser
484
        node._errors.extend(node.result[0].errors)
Eckhart Arnold's avatar
Eckhart Arnold committed
485 486 487 488 489 490 491 492
        node.result = node.result[0].result


def reduce_single_child(node):
    """Reduce a single branch node, by transferring the result of its
    immediate descendant to this node, but keeping this node's parser entry.
    """
    if node.children and len(node.result) == 1:
493
        node._errors.extend(node.result[0].errors)
Eckhart Arnold's avatar
Eckhart Arnold committed
494 495 496 497 498 499 500 501
        node.result = node.result[0].result


# ------------------------------------------------
#
# destructive transformations:
#     - tree may be rearranged (flattened),
#     - order is preserved
502 503
#     - but (irrelevant) leaves may be dropped
#     - errors of dropped leaves will be lost
Eckhart Arnold's avatar
Eckhart Arnold committed
504 505 506
#
# ------------------------------------------------

507 508

def is_whitespace(node):
509 510
    """Removes whitespace and comments defined with the
    ``@comment``-directive."""
511
    return node.parser.name == WHITESPACE_KEYWORD
512 513


514 515
# def is_scanner_token(node):
#     return isinstance(node.parser, ScannerToken)
516 517


518 519 520 521
def is_empty(node):
    return not node.result


522
def is_expendable(node):
523
    return is_empty(node) or is_whitespace(node)  # or is_scanner_token(node)
Eckhart Arnold's avatar
Eckhart Arnold committed
524 525


526 527 528 529
def is_token(node, token_set={}):
    return node.parser.name == TOKEN_KEYWORD and (not token_set or node.result in token_set)


Eckhart Arnold's avatar
Eckhart Arnold committed
530
def remove_children_if(node, condition):
531 532
    """Removes all nodes from the result field if the function 
    ``condition(child_node)`` evaluates to ``True``."""
Eckhart Arnold's avatar
Eckhart Arnold committed
533
    if node.children:
534
        node.result = tuple(c for c in node.children if not condition(c))
Eckhart Arnold's avatar
Eckhart Arnold committed
535 536 537


remove_whitespace = partial(remove_children_if, condition=is_whitespace)
538
# remove_scanner_tokens = partial(remove_children_if, condition=is_scanner_token)
Eckhart Arnold's avatar
Eckhart Arnold committed
539 540 541
remove_expendables = partial(remove_children_if, condition=is_expendable)


542 543 544 545 546 547 548 549
def remove_tokens(node, tokens=set()):
    """Reomoves any among a particular set of tokens from the immediate
    descendants of a node. If ``tokens`` is the empty set, all tokens
    are removed.
    """
    remove_children_if(node, partial(is_token, token_set=tokens))


Eckhart Arnold's avatar
Eckhart Arnold committed
550 551 552 553 554 555
def flatten(node):
    """Recursively flattens all unnamed sub-nodes, in case there is more
    than one sub-node present. Flattening means that
    wherever a node has child nodes, the child nodes are inserted in place
    of the node. In other words, all leaves of this node and its child nodes
    are collected in-order as direct children of this node.
Eckhart Arnold's avatar
Eckhart Arnold committed
556 557 558 559 560 561
    This is meant to achieve these kinds of structural transformation:
        (1 (+ 2) (+ 3)     ->   (1 + 2 + 3)
        (1 (+ (2 + (3))))  ->   (1 + 2 + 3)
        
    Warning: Use with care. Du tue its recursive nature, flattening can
    have unexpected side-effects.
Eckhart Arnold's avatar
Eckhart Arnold committed
562
    """
563
    if node.children:
Eckhart Arnold's avatar
Eckhart Arnold committed
564
        new_result = []
565
        for child in node.children:
Eckhart Arnold's avatar
Eckhart Arnold committed
566 567 568 569 570 571 572 573 574
            if not child.parser.name and child.children:
                assert child.children, node.as_sexpr()
                flatten(child)
                new_result.extend(child.result)
            else:
                new_result.append(child)
        node.result = tuple(new_result)


575 576
def remove_brackets(node):
    """Removes any enclosing delimiters from a structure (e.g. quotation marks
Eckhart Arnold's avatar
Eckhart Arnold committed
577 578
    from a literal or braces from a group).
    """
579
    if len(node.children) >= 3:
580
        assert not node.children[0].children and not node.children[-1].children, node.as_sexpr()
Eckhart Arnold's avatar
Eckhart Arnold committed
581 582 583 584
        node.result = node.result[1:-1]


AST_SYMBOLS = {'replace_by_single_child', 'reduce_single_child',
di68kap's avatar
di68kap committed
585
               'no_operation', 'remove_children_if',
586 587
               'is_whitespace', 'is_expendable', 'remove_whitespace',
               # 'remove_scanner_tokens', 'is_scanner_token',
588
               'remove_expendables', 'flatten', 'remove_tokens',
589
               'remove_enclosing_delimiters',
Eckhart Arnold's avatar
Eckhart Arnold committed
590 591 592 593
               'TOKEN_KEYWORD', 'WHITESPACE_KEYWORD', 'partial'}



594
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
595 596 597
#
# Parser base classes
#
598 599
########################################################################

Eckhart Arnold's avatar
Eckhart Arnold committed
600

601 602
LEFT_RECURSION_DEPTH = 10   # because of pythons recursion depth limit, this
                            # value ought not to be set too high
603
MAX_DROPOUTS = 25   # stop trying to recover parsing after so many errors
Eckhart Arnold's avatar
Eckhart Arnold committed
604

605 606
WHITESPACE_KEYWORD = 'WSP__'
TOKEN_KEYWORD = 'TOKEN__'
Eckhart Arnold's avatar
Eckhart Arnold committed
607 608


609
class HistoryRecord:
di68kap's avatar
di68kap committed
610 611 612 613 614 615 616 617 618 619 620 621
    """
    Stores debugging information about one completed step in the
    parsing history. 
    
    A parsing step is "completed" when the last one of a nested
    sequence of parser-calls returns. The call stack including
    the last parser call will be frozen in the ``HistoryRecord``-
    object. In addition a reference to the generated leaf node
    (if any) will be stored and the result status of the last
    parser call, which ist either MATCH, FAIL (i.e. no match)
    or ERROR.
    """
622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648
    __slots__ = ('call_stack', 'node', 'remaining')

    MATCH = "MATCH"
    ERROR = "ERROR"
    FAIL = "FAIL"

    def __init__(self, call_stack, node, remaining):
        self.call_stack = call_stack
        self.node = node
        self.remaining = remaining

    @property
    def stack(self):
        return "->".join(str(parser) for parser in self.call_stack)

    @property
    def status(self):
        return self.FAIL if self.node is None else self.ERROR if self.node._errors else self.MATCH

    @property
    def extent(self):
        return ((-self.remaining - self.node.len, -self.remaining) if self.node
                else (-self.remaining, None))


def add_parser_guard(parser_func):
    def guarded_call(parser, text):
Eckhart Arnold's avatar
Eckhart Arnold committed
649 650 651 652 653 654 655
        try:
            location = len(text)
            # if location has already been visited by the current parser,
            # return saved result
            if location in parser.visited:
                return parser.visited[location]
            # break left recursion at the maximum allowed depth
656
            if parser.recursion_counter.setdefault(location, 0) > LEFT_RECURSION_DEPTH:
Eckhart Arnold's avatar
Eckhart Arnold committed
657 658 659
                return None, text

            parser.recursion_counter[location] += 1
660
            grammar = parser.grammar
Eckhart Arnold's avatar
Eckhart Arnold committed
661

662 663 664
            if grammar.track_history:
                grammar.call_stack.append(parser)
                grammar.moving_forward = True
Eckhart Arnold's avatar
Eckhart Arnold committed
665 666

            # run original __call__ method
667 668 669 670 671 672 673 674 675 676
            node, rest = parser_func(parser, text)

            if grammar.track_history:
                if grammar.moving_forward:  # and result[0] == None
                    grammar.moving_forward = False
                    record = HistoryRecord(grammar.call_stack.copy(), node, len(rest))
                    grammar.history.append(record)
                grammar.call_stack.pop()

            if node is not None:
Eckhart Arnold's avatar
Eckhart Arnold committed
677 678
                # in case of a recursive call saves the result of the first
                # (or left-most) call that matches
679 680
                parser.visited[location] = (node, rest)
                grammar.last_node = node
Eckhart Arnold's avatar
Eckhart Arnold committed
681 682 683
            elif location in parser.visited:
                # if parser did non match but a saved result exits, assume
                # left recursion and use the saved result
684
                node, rest = parser.visited[location]
Eckhart Arnold's avatar
Eckhart Arnold committed
685 686 687 688 689 690 691

            parser.recursion_counter[location] -= 1

        except RecursionError:
            node = Node(None, text[:min(10, max(1, text.find("\n")))] + " ...")
            node.add_error("maximum recursion depth of parser reached; "
                           "potentially due to too many errors!")
692
            node.error_flag = True
693
            rest = ''
Eckhart Arnold's avatar
Eckhart Arnold committed
694

695
        return node, rest
Eckhart Arnold's avatar
Eckhart Arnold committed
696

697
    return guarded_call
Eckhart Arnold's avatar
Eckhart Arnold committed
698 699 700


class ParserMetaClass(type):
701
    def __init__(cls, name, bases, attrs):
Eckhart Arnold's avatar
Eckhart Arnold committed
702 703 704
        # The following condition is necessary for classes that don't override
        # the __call__() method, because in these cases the non-overridden
        # __call__()-method would be substituted a second time!
705 706 707
        guarded_parser_call = add_parser_guard(cls.__call__)
        if cls.__call__.__code__ != guarded_parser_call.__code__:
            cls.__call__ = guarded_parser_call
708
        super(ParserMetaClass, cls).__init__(name, bases, attrs)
Eckhart Arnold's avatar
Eckhart Arnold committed
709

710

Eckhart Arnold's avatar
Eckhart Arnold committed
711
class Parser(metaclass=ParserMetaClass):
712 713 714
    def __init__(self, name=None):
        assert name is None or isinstance(name, str), str(name)
        self.name = name or ''
715
        self.grammar = None          # center for global variables etc.
716 717 718
        self.reset()

    def reset(self):
Eckhart Arnold's avatar
Eckhart Arnold committed
719 720 721 722 723
        self.visited = dict()
        self.recursion_counter = dict()
        self.cycle_detection = set()

    def __call__(self, text):
724
        return None, text               # default behaviour: don't match
Eckhart Arnold's avatar
Eckhart Arnold committed
725 726

    def __str__(self):
727
        return self.name or self.__class__.__name__
Eckhart Arnold's avatar
Eckhart Arnold committed
728

729
    @property
730 731
    def grammar(self):
        return self._grammar
732

733 734 735 736
    @grammar.setter
    def grammar(self, grammar_base):
        self._grammar = grammar_base
        self._grammar_assigned_notifier()
737

738
    def _grammar_assigned_notifier(self):
739 740
        pass

Eckhart Arnold's avatar
Eckhart Arnold committed
741 742
    def apply(self, func):
        """Applies function `func(parser)` recursively to this parser and all
743
        descendants of the tree of parsers. The same function can never
744
        be applied twice between calls of the ``reset()``-method!
Eckhart Arnold's avatar
Eckhart Arnold committed
745 746 747 748 749 750 751 752 753
        """
        if func in self.cycle_detection:
            return False
        else:
            self.cycle_detection.add(func)
            func(self)
            return True


754
class GrammarBase:
755 756
    root__ = None   # should be overwritten by grammar subclass

757
    @classmethod
758
    def _assign_parser_names(cls):
759
        """Initializes the `parser.name` fields of those
760 761
        Parser objects that are directly assigned to a class field with
        the field's name, e.g.
762
            class Grammar(GrammarBase):
763 764
                ...
                symbol = RE('(?!\\d)\\w+')
765
        After the call of this method symbol.name == "symbol"
766 767
        holds. Names assigned via the `name`-parameter of the
        constructor will not be overwritten.
768
        """
769
        if cls.parser_initialization__ == "done":
770
            return
771
        cdict = cls.__dict__
772 773 774
        for entry, parser in cdict.items():
            if isinstance(parser, Parser):
                if not parser.name or parser.name == TOKEN_KEYWORD:
Eckhart Arnold's avatar
Eckhart Arnold committed
775
                    parser.name = entry
776 777
                if (isinstance(parser, Forward) and (not parser.parser.name
                    or parser.parser.name == TOKEN_KEYWORD)):
778
                    parser.parser.name = entry
779
        cls.parser_initialization__ = "done"
780 781

    def __init__(self):
782 783
        self.all_parsers = set()
        self.dirty_flag = False
784 785 786
        self.track_history = LOGGING
        name = self.__class__.__name__
        self.log_file_name = name[:-7] if name.lower().endswith('grammar') else name
787
        self._reset()
788
        self._assign_parser_names()
789
        self.root__ = copy.deepcopy(self.__class__.root__)
790 791
        if self.wspL__:
            self.wsp_left_parser__ = RegExp(self.wspL__, WHITESPACE_KEYWORD)
792
            self.wsp_left_parser__.grammar = self
793
        else:
794
            self.wsp_left_parser__ = ZOMBIE_PARSER
795 796
        if self.wspR__:
            self.wsp_right_parser__ = RegExp(self.wspR__, WHITESPACE_KEYWORD)
797
            self.wsp_right_parser__.grammar = self
798
        else:
799
            self.wsp_right_parser__ = ZOMBIE_PARSER
Eckhart Arnold's avatar
Eckhart Arnold committed
800
        self.root__.apply(self._add_parser)
801 802

    def _reset(self):
803
        self.variables = dict()                 # support for Pop and Retrieve operators
804
        self.document = ""  # source document
Eckhart Arnold's avatar
Eckhart Arnold committed
805
        self.last_node = None
806
        self.call_stack = []                    # support for call stack tracing
807
        self.history = []                       # snapshots of call stacks
808
        self.moving_forward = True              # also needed for call stack tracing
Eckhart Arnold's avatar
Eckhart Arnold committed
809 810

    def _add_parser(self, parser):
811
        """Adds the copy of the classes parser object to this
812
        particular instance of GrammarBase.
Eckhart Arnold's avatar
Eckhart Arnold committed
813
        """
814 815
        setattr(self, parser.name, parser)
        self.all_parsers.add(parser)
816
        parser.grammar = self
Eckhart Arnold's avatar
Eckhart Arnold committed
817 818 819 820 821 822 823 824 825

    def parse(self, document):
        """Parses a document with with parser-combinators.

        Args:
            document (str): The source text to be parsed.
        Returns:
            Node: The root node ot the parse tree.
        """
826 827
        if self.root__ is None:
            raise NotImplementedError()
828
        if self.dirty_flag:
829
            self._reset()
830 831 832 833
            for parser in self.all_parsers:
                parser.reset()
        else:
            self.dirty_flag = True
834
        self.document = document
Eckhart Arnold's avatar
Eckhart Arnold committed
835
        parser = self.root__
836
        result = ""
837
        stitches = []
Eckhart Arnold's avatar
Eckhart Arnold committed
838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853
        rest = document
        while rest and len(stitches) < MAX_DROPOUTS:
            result, rest = parser(rest)
            if rest:
                fwd = rest.find("\n") + 1 or len(rest)
                skip, rest = rest[:fwd], rest[fwd:]
                if result is None:
                    error_msg = "Parser did not match! Invalid source file?"
                else:
                    stitches.append(result)
                    error_msg = "Parser stopped before end" + \
                                ("! trying to recover..."
                                 if len(stitches) < MAX_DROPOUTS
                                 else " too often! Terminating parser.")
                stitches.append(Node(None, skip))
                stitches[-1].add_error(error_msg)
854 855 856 857 858 859
        if stitches:
            if rest:
                stitches.append(Node(None, rest))
            result = Node(None, tuple(stitches))
        result.pos = 0 # calculate all positions
        return result
Eckhart Arnold's avatar
Eckhart Arnold committed
860

861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892
    def log_parsing_history(self):
        """Writes a log of the parsing history of the most recently parsed
        document. 
        """

        def prepare_line(record):
            excerpt = self.document.__getitem__(slice(*record.extent))[:25].replace('\n', '\\n')
            excerpt = "'%s'" % excerpt if len(excerpt) < 25 else "'%s...'" % excerpt
            return (record.stack, record.status, excerpt)

        def write_log(history, log_name):
            path = os.path.join(LOGS_DIR(), self.log_file_name + log_name + "_parser.log")
            if history:
                with open(path, "w", encoding="utf-8") as f:
                    f.write("\n".join(history))
            elif os.path.exists(path):
                os.remove(path)

        global LOGGING
        if LOGGING:
            assert self.history
            full_history, match_history, errors_only = [], [], []
            for record in self.history:
                line = ";  ".join(prepare_line(record))
                full_history.append(line)
                if record.node and record.node.parser.name != WHITESPACE_KEYWORD:
                    match_history.append(line)
                    if record.node.errors:
                        errors_only.append(line)
            write_log(full_history, '_full')
            write_log(match_history, '_match')
            write_log(errors_only, '_errors')
Eckhart Arnold's avatar
Eckhart Arnold committed
893

Eckhart Arnold's avatar
Eckhart Arnold committed
894

895

896
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
897 898 899
#
# Token and Regular Expression parser classes (i.e. leaf classes)
#
900
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
901 902


903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925

RX_SCANNER_TOKEN = re.compile('\w+')
BEGIN_SCANNER_TOKEN = '\x1b'
END_SCANNER_TOKEN = '\x1c'


def make_token(token, argument=''):
    """Turns the ``token`` and ``argument`` into a special token that
    will be caught by the `ScannerToken`-parser.
    
    This function is a support function that should be used by scanners
    to inject scanner tokens into the source text.
    """
    assert RX_SCANNER_TOKEN.match(token)
    assert argument.find(BEGIN_SCANNER_TOKEN) < 0
    assert argument.find(END_SCANNER_TOKEN) < 0

    return BEGIN_SCANNER_TOKEN + token + argument + END_SCANNER_TOKEN


nil_scanner = lambda text: text


Eckhart Arnold's avatar
Eckhart Arnold committed
926 927 928 929 930
class ScannerToken(Parser):
    def __init__(self, scanner_token):
        assert isinstance(scanner_token, str) and scanner_token and \
               scanner_token.isupper()
        assert RX_SCANNER_TOKEN.match(scanner_token)
931
        super(ScannerToken, self).__init__(scanner_token, name=TOKEN_KEYWORD)
Eckhart Arnold's avatar
Eckhart Arnold committed
932 933 934 935 936 937 938 939 940 941 942 943 944 945 946

    def __call__(self, text):
        if text[0:1] == BEGIN_SCANNER_TOKEN:
            end = text.find(END_SCANNER_TOKEN, 1)
            if end < 0:
                node = Node(self, '').add_error(
                    'END_SCANNER_TOKEN delimiter missing from scanner token. '
                    '(Most likely due to a scanner bug!)')
                return node, text[1:]
            elif end == 0:
                node = Node(self, '').add_error(
                    'Scanner token cannot have zero length. '
                    '(Most likely due to a scanner bug!)')
                return node, text[2:]
            elif text.find(BEGIN_SCANNER_TOKEN, 1, end) >= 0:
947
                node = Node(self, text[len(self.name) + 1:end])
Eckhart Arnold's avatar
Eckhart Arnold committed
948 949 950 951 952
                node.add_error(
                    'Scanner tokens must not be nested or contain '
                    'BEGIN_SCANNER_TOKEN delimiter as part of their argument. '
                    '(Most likely due to a scanner bug!)')
                return node, text[end:]
953 954
            if text[1:len(self.name) + 1] == self.name:
                return Node(self, text[len(self.name) + 1:end]), \
Eckhart Arnold's avatar
Eckhart Arnold committed
955 956 957 958 959
                       text[end + 1:]
        return None, text


class RegExp(Parser):
960
    def __init__(self, regexp, name=None):
961
        super(RegExp, self).__init__(name)
Eckhart Arnold's avatar
Eckhart Arnold committed
962 963 964
        self.regexp = re.compile(regexp) if isinstance(regexp, str) else regexp

    def __deepcopy__(self, memo):
Eckhart Arnold's avatar
Eckhart Arnold committed
965 966
        # This method is obsolete with the new `regex` module! It's
        # being kept for compatibility with Python's standard library
Eckhart Arnold's avatar
Eckhart Arnold committed
967 968 969 970
        try:
            regexp = copy.deepcopy(self.regexp)
        except TypeError:
            regexp = self.regexp.pattern
di68kap's avatar
di68kap committed
971
        duplicate = RegExp(regexp, self.name)
972
        duplicate.name = self.name  # this ist needed!!!!
973
        duplicate.regexp = self.regexp
974
        duplicate.grammar = self.grammar
Eckhart Arnold's avatar
Eckhart Arnold committed
975 976 977 978 979 980 981 982 983
        duplicate.visited = copy.deepcopy(self.visited, memo)
        duplicate.recursion_counter = copy.deepcopy(self.recursion_counter,
                                                    memo)
        return duplicate

    def __call__(self, text):
        match = text[0:1] != BEGIN_SCANNER_TOKEN and self.regexp.match(text)  # ESC starts a scanner token.
        if match:
            end = match.end()
984 985 986 987 988
            return Node(self, text[:end]), text[end:]
        return None, text


class RE(Parser):
Eckhart Arnold's avatar
Eckhart Arnold committed
989 990
    """Regular Expressions with optional leading or trailing whitespace.
    """
991
    def __init__(self, regexp, wL=None, wR=None, name=None):
992
        super(RE, self).__init__(name)
993
        # assert wR or regexp == '.' or isinstance(self, Token)
994 995
        self.wL = wL
        self.wR = wR
996 997
        self.wspLeft = RegExp(wL, WHITESPACE_KEYWORD) if wL else ZOMBIE_PARSER
        self.wspRight = RegExp(wR, WHITESPACE_KEYWORD) if wR else ZOMBIE_PARSER
998 999 1000 1001 1002
        self.main = RegExp(regexp)

    def __call__(self, text):
        # assert self.main.regexp.pattern != "@"
        t = text
1003
        wL, t = self.wspLeft(t)
1004 1005
        main, t = self.main(t)
        if main:
1006
            wR, t = self.wspRight(t)
1007 1008
            result = tuple(nd for nd in (wL, main, wR)
                           if nd and nd.result != '')
1009
            return Node(self, result), t
Eckhart Arnold's avatar
Eckhart Arnold committed
1010 1011 1012
        return None, text

    def __str__(self):
1013 1014 1015 1016
        if self.name == TOKEN_KEYWORD:
            return 'Token "%s"' % self.main.regexp.pattern.replace('\\', '')
        return self.name or ('RE ' + ('~' if self.wL else '')
                             + '/%s/' % self.main.regexp.pattern + ('~' if self.wR else ''))
1017

1018 1019
    def _grammar_assigned_notifier(self):
        if self.grammar:
1020
            if self.wL is None:
1021
                self.wspLeft = self.grammar.wsp_left_parser__
1022
            if self.wR is None:
1023
                self.wspRight = self.grammar.wsp_right_parser__
1024

1025 1026 1027
    def apply(self, func):
        if super(RE, self).apply(func):
            if self.wL:
1028
                self.wspLeft.apply(func)
1029
            if self.wR:
1030
                self.wspRight.apply(func)
1031
            self.main.apply(func)
Eckhart Arnold's avatar
Eckhart Arnold committed
1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042


def escape_re(s):
    """Returns `s` with all regular expression special characters escaped.
    """
    assert isinstance(s, str)
    re_chars = r"\.^$*+?{}[]()#<>=|!"
    for esc_ch in re_chars:
        s = s.replace(esc_ch, '\\' + esc_ch)
    return s

1043 1044 1045 1046

def Token(token, wL=None, wR=None, name=None):
    return RE(escape_re(token), wL, wR, name or TOKEN_KEYWORD)

Eckhart Arnold's avatar
Eckhart Arnold committed
1047 1048 1049 1050 1051 1052 1053 1054 1055

def mixin_comment(whitespace, comment):
    """Mixes comment-regexp into whitespace regexp.
    """
    wspc = '(?:' + whitespace + '(?:' + comment + whitespace + ')*)'
    return wspc



1056
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
1057 1058 1059
#
# Combinator parser classes (i.e. trunk classes of the parser tree)
#
1060
########################################################################
Eckhart Arnold's avatar
Eckhart Arnold committed
1061 1062 1063


class UnaryOperator(Parser):
1064 1065
    def __init__(self, parser, name=None):
        super(UnaryOperator, self).__init__(name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1066 1067 1068 1069 1070 1071 1072 1073 1074
        assert isinstance(parser, Parser)
        self.parser = parser

    def apply(self, func):
        if super(UnaryOperator, self).apply(func):
            self.parser.apply(func)


class NaryOperator(Parser):
1075 1076
    def __init__(self, *parsers, name=None):
        super(NaryOperator, self).__init__(name)
1077
        assert all([isinstance(parser, Parser) for parser in parsers]), str(parsers)
Eckhart Arnold's avatar
Eckhart Arnold committed
1078 1079 1080 1081 1082 1083 1084 1085 1086
        self.parsers = parsers

    def apply(self, func):
        if super(NaryOperator, self).apply(func):
            for parser in self.parsers:
                parser.apply(func)


class Optional(UnaryOperator):
1087 1088
    def __init__(self, parser, name=None):
        super(Optional, self).__init__(parser, name)
Eckhart Arnold's avatar
Eckhart Arnold committed
1089 1090
        assert isinstance(parser, Parser)
        assert not isinstance(parser, Optional), \
1091
            "Nesting options would be redundant: %s(%s)" % \
1092
            (str(name), str(parser.name))
Eckhart Arnold's avatar
Eckhart Arnold committed
1093
        assert not isinstance(parser, Required), \
1094
            "Nestion options with required elements is contradictory: " \