syntaxtree.py 42.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
# syntaxtree.py - syntax tree classes for DHParser
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.
17 18 19


"""
20 21
Module ``syntaxtree`` defines the ``Node``-class for syntax trees as well
as an abstract base class for parser-objects. The latter is defined
eckhart's avatar
eckhart committed
22
here, because node-objects refer to parser-objects. All concrete
23 24 25
parser classes are defined in the ``parse`` module.
"""

26
from collections import OrderedDict
eckhart's avatar
eckhart committed
27
import copy
28

eckhart's avatar
eckhart committed
29
from DHParser.error import Error, ErrorCode, linebreaks, line_col
30
from DHParser.stringview import StringView
di68kap's avatar
di68kap committed
31 32
from DHParser.toolkit import re
from typing import Callable, cast, Iterator, List, AbstractSet, Set, Union, Tuple, Optional
di68kap's avatar
di68kap committed
33

34

di68kap's avatar
di68kap committed
35
__all__ = ('WHITESPACE_PTYPE',
36
           'TOKEN_PTYPE',
37 38
           'ZOMBIE_TAG',
           'PLACEHOLDER',
eckhart's avatar
eckhart committed
39 40 41
           'ResultType',
           'StrictResultType',
           'ChildrenType',
42
           'Node',
43
           'FrozenNode',
di68kap's avatar
di68kap committed
44
           'RootNode',
45
           'parse_sxpr',
46
           'parse_xml',
eckhart's avatar
eckhart committed
47
           'parse_tree',
48 49
           'flatten_sxpr',
           'flatten_xml')
Eckhart Arnold's avatar
Eckhart Arnold committed
50 51


52 53 54 55 56 57 58 59
#######################################################################
#
# parser base and mock parsers
#
#######################################################################


WHITESPACE_PTYPE = ':Whitespace'
60
TOKEN_PTYPE = ':Token'
61

62
ZOMBIE_TAG = "__ZOMBIE__"
63

64 65 66 67 68 69 70
#######################################################################
#
# syntaxtree nodes
#
#######################################################################


71
ChildrenType = Tuple['Node', ...]
Eckhart Arnold's avatar
Eckhart Arnold committed
72
NoChildren = cast(ChildrenType, ())  # type: ChildrenType
eckhart's avatar
eckhart committed
73
StrictResultType = Union[ChildrenType, StringView, str]
74
ResultType = Union[ChildrenType, 'Node', StringView, str, None]
75 76


Eckhart Arnold's avatar
Eckhart Arnold committed
77
def flatten_sxpr(sxpr: str) -> str:
78 79
    """
    Returns S-expression ``sxpr`` as a one-liner without unnecessary
Eckhart Arnold's avatar
Eckhart Arnold committed
80 81 82
    whitespace.

    Example:
Eckhart Arnold's avatar
Eckhart Arnold committed
83
    >>> flatten_sxpr('(a\\n    (b\\n        c\\n    )\\n)\\n')
Eckhart Arnold's avatar
Eckhart Arnold committed
84 85
    '(a (b c))'
    """
86

87
    return re.sub(r'\s(?=\))', '', re.sub(r'\s+', ' ', sxpr)).strip()
Eckhart Arnold's avatar
Eckhart Arnold committed
88 89


90
def flatten_xml(xml: str) -> str:
91 92
    """
    Returns an XML-tree as a one liner without unnecessary whitespace,
93
    i.e. only whitespace within leaf-nodes is preserved.
94 95
    A more precise alternative to `flatten_xml` is to use Node.as_xml()
    ans passing a set containing the top level tag to parameter `inline_tags`.
96
    """
97

98 99 100 101
    # works only with regex
    # return re.sub(r'\s+(?=<\w)', '', re.sub(r'(?<=</\w+>)\s+', '', xml))
    def tag_only(m):
        return m.groupdict()['closing_tag']
102
    return re.sub(r'\s+(?=<[\w:])', '', re.sub(r'(?P<closing_tag></:?\w+>)\s+', tag_only, xml))
103 104


eckhart's avatar
eckhart committed
105
RX_AMP = re.compile(r'&(?!\w+;)')
106 107


Eckhart Arnold's avatar
Eckhart Arnold committed
108
class Node:  # (collections.abc.Sized): Base class omitted for cython-compatibility
109 110 111
    """
    Represents a node in the concrete or abstract syntax tree.

112 113
    TODO: Add some documentation and doc-tests here...

114
    Attributes:
115 116
        tag_name (str):  The name of the node, which is either its
            parser's name or, if that is empty, the parser's class name
117

118 119 120
        result (str or tuple):  The result of the parser which
            generated this node, which can be either a string or a
            tuple of child nodes.
121

122 123
        children (tuple):  The tuple of child nodes or an empty tuple
            if there are no child nodes. READ ONLY!
124

eckhart's avatar
eckhart committed
125 126 127 128
        content (str):  Yields the contents of the tree as string. The
            difference to ``str(node)`` is that ``node.content`` does
            not add the error messages to the returned string.

Eckhart Arnold's avatar
Eckhart Arnold committed
129
        parser (Parser):  The parser which generated this node.
130 131
            WARNING: In case you use mock syntax trees for testing or
            parser replacement during the AST-transformation: DO NOT
Eckhart Arnold's avatar
Eckhart Arnold committed
132 133
            rely on this being a real parser object in any phase after
            parsing (i.e. AST-transformation and compiling), for
134
            example by calling ``isinstance(node.parer, ...)``.
135

136 137 138
        len (int):  The full length of the node's string result if the
            node is a leaf node or, otherwise, the concatenated string
            result's of its descendants. The figure always represents
139
            the length before AST-transformation and will never change
140
            through AST-transformation. READ ONLY!
141

142 143
        pos (int):  the position of the node within the parsed text.

Eckhart Arnold's avatar
Eckhart Arnold committed
144
            The value of ``pos`` is -1 meaning invalid by default.
145
            Setting this value will set the positions of all child
Eckhart Arnold's avatar
Eckhart Arnold committed
146
            nodes relative to this value.
147 148

            To set the pos values of all nodes in a syntax tree, the
Eckhart Arnold's avatar
Eckhart Arnold committed
149
            pos value of the root node should be set to 0 right
150 151
            after parsing.

Eckhart Arnold's avatar
Eckhart Arnold committed
152
            Other than that, this value should be considered READ ONLY.
153 154
            At any rate, it should only be reassigned during the parsing
            stage and never during or after the AST-transformation.
155

156 157
        attr (dict): An optional dictionary of XML-attr. This
            dictionary is created lazily upon first usage. The attr
158 159
            will only be shown in the XML-Representation, not in the
            S-Expression-output.
160
    """
161

162
    __slots__ = '_result', 'children', '_len', '_pos', 'tag_name', '_xml_attr', '_content'
163

164
    def __init__(self, tag_name: str, result: ResultType, leafhint: bool = False) -> None:
165 166
        """
        Initializes the ``Node``-object with the ``Parser``-Instance
167 168
        that generated the node and the parser's result.
        """
eckhart's avatar
eckhart committed
169
        self._pos = -1                  # type: int
170
        # Assignment to self.result initializes the attr _result, children and _len
di68kap's avatar
di68kap committed
171
        # The following if-clause is merely an optimization, i.e. a fast-path for leaf-Nodes
172
        if leafhint:
eckhart's avatar
eckhart committed
173
            self._result = result       # type: StrictResultType  # cast(StrictResultType, result)
eckhart's avatar
eckhart committed
174
            self._content = None        # type: Optional[str]
175
            self.children = NoChildren  # type: ChildrenType
176
            self._len = -1              # type: int  # lazy evaluation
177 178
        else:
            self.result = result
179
        self.tag_name = tag_name        # type: str
180

eckhart's avatar
eckhart committed
181 182
    def __deepcopy__(self, memo):
        if self.children:
183
            duplicate = self.__class__(self.tag_name, copy.deepcopy(self.children), False)
eckhart's avatar
eckhart committed
184
        else:
185
            duplicate = self.__class__(self.tag_name, self.result, True)
eckhart's avatar
eckhart committed
186 187
        duplicate._pos = self._pos
        duplicate._len = self._len
eckhart's avatar
eckhart committed
188
        if self.attr_active():
di68kap's avatar
di68kap committed
189 190
            duplicate.attr.update(copy.deepcopy(self._xml_attr))
            # duplicate._xml_attr = copy.deepcopy(self._xml_attr)  # this is not cython compatible
eckhart's avatar
eckhart committed
191
        return duplicate
Eckhart Arnold's avatar
Eckhart Arnold committed
192

193
    def __str__(self):
194 195
        if isinstance(self, RootNode):
            root = cast(RootNode, self)
196
            errors = root.errors_sorted
197 198 199 200 201 202
            if errors:
                e_pos = errors[0].pos
                return self.content[:e_pos] + \
                   ' <<< Error on "%s" | %s >>> ' % \
                   (self.content[e_pos - self.pos:], '; '.join(e.message for e in errors))
        return self.content
Eckhart Arnold's avatar
Eckhart Arnold committed
203

204
    def __repr__(self):
205
        # mpargs = {'name': self.parser.name, 'ptype': self.parser.ptype}
206 207
        # name, ptype = (self._tag_name.split(':') + [''])[:2]
        # parg = "MockParser({name}, {ptype})".format(name=name, ptype=ptype)
208
        rarg = str(self) if not self.children else \
209
            "(" + ", ".join(child.__repr__() for child in self.children) + ")"
210
        return "Node(%s, %s)" % (self.tag_name, rarg)
211

Eckhart Arnold's avatar
Eckhart Arnold committed
212

213
    def __len__(self):
214
        if self._len < 0:
di68kap's avatar
di68kap committed
215
            self._len = sum(len(child) for child in self.children) \
216
                if self.children else len(self._result)
217 218 219 220 221 222 223 224
        return self._len


    def __bool__(self):
        # A node that is not None is always True, even if it's empty
        return True


225
    def __eq__(self, other):
226 227
        """
        Equality of nodes: Two nodes are considered as equal, if their tag
228 229
        name is the same, if their results are equal and if their attributes
        and attribute values are the same.
230
        """
231 232
        return self.tag_name == other.tag_name and self.result == other.result \
            and self.compare_attr(other)
233

Eckhart Arnold's avatar
Eckhart Arnold committed
234

235
    def __hash__(self):
236
        return hash(self.tag_name)
237

Eckhart Arnold's avatar
Eckhart Arnold committed
238

239 240 241
    def __getitem__(self, index_or_tagname: Union[int, str]) -> Union['Node', Iterator['Node']]:
        """
        Returns the child node with the given index if ``index_or_tagname`` is
242
        an integer or the first child node with the given tag name. Examples::
243

244
            >>> tree = parse_sxpr('(a (b "X") (X (c "d")) (e (X "F")))')
245 246
            >>> flatten_sxpr(tree[0].as_sxpr())
            '(b "X")'
247 248
            >>> flatten_sxpr(tree["X"].as_sxpr())
            '(X (c "d"))'
249 250 251 252

        Args:
            index_or_tagname(str): Either an index of a child node or a
                tag name.
253
        Returns:
254 255
            Node: All nodes which have a given tag name.
        """
256 257 258
        if self.children:
            if isinstance(index_or_tagname, int):
                return self.children[index_or_tagname]
259
            else:
260 261 262 263 264
                for child in self.children:
                    if child.tag_name == index_or_tagname:
                        return child
                raise KeyError(index_or_tagname)
        raise ValueError('Leave nodes have no children that can be indexed!')
265 266 267 268


    def __contains__(self, tag_name: str) -> bool:
        """
269
        Returns true if a child with the given tag name exists.
270
        Args:
271 272
            tag_name (str): tag_name which will be searched among to immediate
                descendants of this node.
273 274 275 276
        Returns:
            bool:  True, if at least one descendant node with the given tag
                name exists, False otherwise
        """
277 278 279 280 281
        # assert isinstance(tag_name, str)
        if self.children:
            for child in self.children:
                if child.tag_name == tag_name:
                    return True
282
            return False
283
        raise ValueError('Leave node cannot contain other nodes')
284 285


286 287 288 289 290 291 292 293 294 295 296 297 298 299
    def get(self, index_or_tagname: Union[int, str],
            surrogate: Union['Node', Iterator['Node']]) -> Union['Node', Iterator['Node']]:
        """Returns the child node with the given index if ``index_or_tagname``
        is an integer or the first child node with the given tag name. If no
        child with the given index or tag_name exists, the ``surrogate`` is
        returned instead. This mimics the behaviour of Python's dictionary's
        get-method.
        """
        try:
            return self[index_or_tagname]
        except KeyError:
            return surrogate


300
    def is_anonymous(self):
Eckhart Arnold's avatar
Eckhart Arnold committed
301
        return not self.tag_name or self.tag_name[0] == ':'
302

Eckhart Arnold's avatar
Eckhart Arnold committed
303

304
    @property
Eckhart Arnold's avatar
Eckhart Arnold committed
305
    def result(self) -> StrictResultType:
306 307 308 309 310
        """
        Returns the result from the parser that created the node.
        Error messages are not included in the result. Use `self.content()`
        if the result plus any error messages is needed.
        """
311 312
        return self._result

313

314
    @result.setter
Eckhart Arnold's avatar
Eckhart Arnold committed
315
    def result(self, result: ResultType):
Eckhart Arnold's avatar
Eckhart Arnold committed
316
        # # made obsolete by static type checking with mypy
Eckhart Arnold's avatar
Eckhart Arnold committed
317 318 319 320
        assert ((isinstance(result, tuple) and all(isinstance(child, Node) for child in result))
                or isinstance(result, Node)
                or isinstance(result, str)
                or isinstance(result, StringView)), "%s (%s)" % (str(result), str(type(result)))
Eckhart Arnold's avatar
Eckhart Arnold committed
321 322
        # Possible optimization: Do not allow single nodes as argument:
        # assert not isinstance(result, Node)
323 324
        self._len = -1        # lazy evaluation
        self._content = None
325 326 327 328 329 330 331 332 333
        if isinstance(result, Node):
            self.children = (result,)
            self._result = self.children
        else:
            if isinstance(result, tuple):
                self.children = result
                self._result = result or ''
            else:
                self.children = NoChildren
eckhart's avatar
eckhart committed
334
                self._result = result  # cast(StrictResultType, result)
335

336 337

    @property
eckhart's avatar
eckhart committed
338
    def content(self) -> str:
339
        """
340 341 342
        Returns content as string. If the node has child-nodes, the
        string content of the child-nodes is recursively read and then
        concatenated.
343
        """
344 345 346 347
        if self._content is None:
            if self.children:
                self._content = "".join(child.content for child in self.children)
            else:
348
                # self._content = self._result
349 350 351
                self._content = str(self._result)
                self._result = self._content  # self._result might be more efficient as a string!?
        return self._content
352 353 354 355 356
    #
    #
    # @content.setter
    # def content(self, content: str):
    #     self.result = content
di68kap's avatar
di68kap committed
357 358


359 360 361 362 363 364
    @property
    def structure(self) -> str:
        """
        Return structure (and content) as S-expression on a single line
        without any line breaks.
        """
365
        return flatten_sxpr(self.as_sxpr())
366 367


368
    @property
369
    def pos(self) -> int:
eckhart's avatar
eckhart committed
370 371
        """Returns the position of the Node's content in the source text."""
        if self._pos < 0:
Eckhart Arnold's avatar
Eckhart Arnold committed
372
            raise AssertionError("Position value not initialized! Use Node.with_pos()")
373 374
        return self._pos

eckhart's avatar
eckhart committed
375

Eckhart Arnold's avatar
Eckhart Arnold committed
376
    def with_pos(self, pos: int) -> 'Node':
eckhart's avatar
eckhart committed
377
        """
Eckhart Arnold's avatar
Eckhart Arnold committed
378
        Initialize position value. Usually, the parser guard
eckhart's avatar
eckhart committed
379 380
        (`parsers.add_parser_guard()`) takes care of assigning the
        position in the document to newly created nodes. However,
Eckhart Arnold's avatar
Eckhart Arnold committed
381
        when Nodes are created outside the reach of the parser
eckhart's avatar
eckhart committed
382
        guard, their document-position must be assigned manually.
Eckhart Arnold's avatar
Eckhart Arnold committed
383 384
        Position values of the child nodes are assigned recursively, too.
        Returns the node itself for convenience.
eckhart's avatar
eckhart committed
385
        """
386 387 388 389
        # condition self.pos == pos cannot be assumed when tokens or whitespace
        # are dropped early!
        # assert self._pos < 0 or self.pos == pos, ("pos mismatch %i != %i at Node: %s"
        #                                           % (self._pos, pos, repr(self)))
Eckhart Arnold's avatar
Eckhart Arnold committed
390 391 392 393 394 395 396 397 398 399
        if pos != self._pos >= 0:
            raise AssertionError("Position value cannot be reassigned to a different value!")
        if self._pos < 0:
            self._pos = pos
            # recursively adjust pos-values of all children
            offset = self.pos
            for child in self.children:
                if child._pos < 0:
                    child.with_pos(offset)
                offset = child.pos + len(child)
eckhart's avatar
eckhart committed
400 401
        return self

402

403 404 405 406 407 408 409 410 411 412 413 414 415
    @property
    def attr(self):
        """
        Returns a dictionary of XML-attr attached to the node.
        """
        try:
            if self._xml_attr is None:          # cython compatibility
                self._xml_attr = OrderedDict()
        except AttributeError:
            self._xml_attr = OrderedDict()
        return self._xml_attr


eckhart's avatar
eckhart committed
416 417 418 419 420 421 422 423 424 425 426 427 428
    def attr_active(self) -> bool:
        """
        Returns True, if XML-Attributes of this node have ever been set
        or queried, even if unsuccessfully.
        """
        try:
            if self._xml_attr is not None:
                return True
        except AttributeError:
            pass
        return False


429
    def compare_attr(self, other: 'Node') -> bool:
430
        """
431 432
        Returns True, if `self` and `other` have the same attributes with the
        same attribute values.
433
        """
434 435 436 437 438 439 440 441 442
        if self.attr_active():
            if other.attr_active():
                return self.attr == other.attr
            return len(self.attr) == 0
            # self has empty dictionary and other has no attributes
        elif other.attr_active():
            return len(other.attr) == 0
            # other has empty attribute dictionary and self as no attributes
        return True  # neither self nor other have any attributes
443

Eckhart Arnold's avatar
Eckhart Arnold committed
444

445
    def _tree_repr(self, tab, open_fn, close_fn, data_fn=lambda i: i,
446
                   density=0, inline=False, inline_fn=lambda node: False) -> str:
447 448 449 450 451 452 453 454
        """
        Generates a tree representation of this node and its children
        in string from.

        The kind ot tree-representation that is determined by several
        function parameters. This could be an XML-representation or a
        lisp-like S-expression.

455
        Args:
456
            tab (str):  The indentation string, e.g. '\t' or '    '
eckhart's avatar
eckhart committed
457
            open_fn:   (Node->str) A function that returns an opening
458
                string (e.g. an XML-tag_name) for a given node
eckhart's avatar
eckhart committed
459
            close_fn:  (Node->str) A function that returns a closeF
460
                string (e.g. an XML-tag_name) for a given node.
eckhart's avatar
eckhart committed
461
            data_fn:   (str->str) A function that filters the data string
462 463 464 465 466 467
                before printing, e.g. to add quotation marks

        Returns (str):
            A string that contains a (serialized) tree representation
            of the node and its children.
        """
eckhart's avatar
eckhart committed
468 469
        head = open_fn(self)
        tail = close_fn(self)
470 471

        if not self.result:
472
            return head.rstrip() + tail.lstrip()
473

eckhart's avatar
eckhart committed
474
        tail = tail.lstrip(None if density & 2 else '')
475

476
        inline = inline or inline_fn(self)
477 478 479 480 481
        if inline:
            head = head.rstrip()
            tail = tail.lstrip()
            usetab, sep = '', ''
        else:
eckhart's avatar
eckhart committed
482 483
            usetab = tab if head else ''    # no indentation if tag is already omitted
            sep = '\n'
484

485 486
        if self.children:
            content = []
487
            for child in self.children:
488
                subtree = child._tree_repr(tab, open_fn, close_fn, data_fn,
489
                                           density, inline, inline_fn)
eckhart's avatar
eckhart committed
490
                if subtree:
eckhart's avatar
eckhart committed
491 492
                    st = [subtree] if inline else subtree.split('\n')
                    content.append((sep + usetab).join(s for s in st))
493
            return head + usetab + (sep + usetab).join(content) + tail
494

eckhart's avatar
eckhart committed
495
        res = self.content
eckhart's avatar
eckhart committed
496
        if not inline and not head:
eckhart's avatar
eckhart committed
497
            # strip whitespace for omitted non inline node, e.g. CharData in mixed elements
eckhart's avatar
eckhart committed
498
            res = res.strip()
499 500
        if density & 1 and res.find('\n') < 0:  # and head[0] == "<":
            # except for XML, add a gap between opening statement and content
501
            gap = ' ' if not inline and head and head.rstrip()[-1:] != '>' else ''
eckhart's avatar
eckhart committed
502
            return head.rstrip() + gap + data_fn(res) + tail.lstrip()
503
        else:
504
            return head + '\n'.join([usetab + data_fn(s) for s in res.split('\n')]) + tail
505

Eckhart Arnold's avatar
Eckhart Arnold committed
506

507 508
    def as_sxpr(self, src: str = None,
                indentation: int = 2,
509
                compact: bool = False) -> str:
510
        """
511 512
        Returns content as S-expression, i.e. in lisp-like form. If this
        method is callad on a RootNode-object,
513

514
        Args:
515 516 517
            src:  The source text or `None`. In case the source text is
                given the position of the element in the text will be
                reported as line and column.
518 519
            indentation: The number of whitespaces for indentation
            compact:  If True, a compact representation is returned where
520
                brackets are omitted and only the indentation indicates the
Eckhart Arnold's avatar
Eckhart Arnold committed
521
                tree structure.
522 523
        """

eckhart's avatar
eckhart committed
524
        left_bracket, right_bracket, density = ('', '', 1) if compact else ('(', '\n)', 0)
eckhart's avatar
eckhart committed
525
        lbreaks = linebreaks(src) if src else []  # type: List[int]
526
        root = cast(RootNode, self) if isinstance(self, RootNode) else None  # type: Optional[Node]
527

528
        def opening(node) -> str:
eckhart's avatar
eckhart committed
529
            """Returns the opening string for the representation of `node`."""
eckhart's avatar
eckhart committed
530
            txt = [left_bracket, node.tag_name]
531
            # s += " '(pos %i)" % node.add_pos
eckhart's avatar
eckhart committed
532
            if node.attr_active():
533
                txt.extend(' `(%s "%s")' % (k, v) for k, v in node.attr.items())
534
            if src:
535 536
                line, col = line_col(lbreaks, node.pos)
                txt.append(" `(pos %i %i %i)" % (node.pos, line, col))
537 538
            if root and id(node) in root.error_nodes:
                txt.append(" `(err `%s)" % ' '.join(str(err) for err in root.get_errors(node)))
539
            return "".join(txt) + '\n'
540

eckhart's avatar
eckhart committed
541 542 543
        def closing(node) -> str:
            """Returns the closing string for the representation of `node`."""
            return right_bracket
544

eckhart's avatar
eckhart committed
545 546 547 548 549
        def pretty(strg):
            """Encloses `strg` with the right kind of quotation marks."""
            return '"%s"' % strg if strg.find('"') < 0 \
                else "'%s'" % strg if strg.find("'") < 0 \
                else '"%s"' % strg.replace('"', r'\"')
550

551
        return self._tree_repr(' ' * indentation, opening, closing, pretty, density=density)
Eckhart Arnold's avatar
Eckhart Arnold committed
552

eckhart's avatar
eckhart committed
553

554 555
    def as_xml(self, src: str = None,
               indentation: int = 2,
eckhart's avatar
eckhart committed
556 557 558
               inline_tags: Set[str] = set(),
               omit_tags: Set[str] = set(),
               empty_tags: Set[str] = set()) -> str:
559 560 561
        """
        Returns content as XML-tree.

562
        Args:
563 564 565
            src:  The source text or `None`. In case the source text is
                given the position will also be reported as line and
                column.
566
            indentation: The number of whitespaces for indentation
567 568 569
            inline_tags:  A set of tag names, the content of which will always be written
                on a single line, unless it contains explicit line feeds ('\n').
            omit_tags:  A set of tags from which only the content will be printed, but
570
                neither the opening tag nor its attr nor the closing tag. This
571 572 573
                allows producing a mix of plain text and child tags in the output,
                which otherwise is not supported by the Node object, because it
                requires its content to be either a tuple of children or string content.
574 575
            empty_tags:  A set of tags which shall be rendered as empty elements, e.g.
                "<empty/>" instead of "<empty><empty>".
576
        """
577
        root = cast(RootNode, self) if isinstance(self, RootNode) else None  # type: Optional[Node]
578

579
        def opening(node) -> str:
580 581 582
            """Returns the opening string for the representation of `node`."""
            if node.tag_name in omit_tags:
                return ''
eckhart's avatar
eckhart committed
583
            txt = ['<', node.tag_name]
eckhart's avatar
eckhart committed
584
            has_reserved_attrs = node.attr_active() \
eckhart's avatar
eckhart committed
585
                and any(r in node.attr for r in {'err', 'line', 'col'})
eckhart's avatar
eckhart committed
586
            if node.attr_active():
587
                txt.extend(' %s="%s"' % (k, v) for k, v in node.attr.items())
588
            if src and not has_reserved_attrs:
eckhart's avatar
eckhart committed
589
                txt.append(' line="%i" col="%i"' % line_col(line_breaks, node.pos))
590
            if root and id(node) in root.error_nodes and not has_reserved_attrs:
eckhart's avatar
eckhart committed
591
                txt.append(' err="%s"' % ''.join(str(err).replace('"', r'\"')
592
                                                 for err in root.get_error(node)))
593 594 595
            if node.tag_name in empty_tags:
                assert not node.result, ("Node %s with content %s is not an empty element!" %
                                         (node.tag_name, str(node)))
eckhart's avatar
eckhart committed
596
                ending = "/>\n" if not node.tag_name[0] == '?' else "?>\n"
597 598 599
            else:
                ending = ">\n"
            return "".join(txt + [ending])
600 601

        def closing(node):
602
            """Returns the closing string for the representation of `node`."""
603
            if node.tag_name in omit_tags or node.tag_name in empty_tags:
604
                return ''
605
            return ('\n</') + node.tag_name + '>'
606

607 608 609 610 611 612 613
        def sanitizer(content: str) -> str:
            """Substitute "&", "<", ">" in XML-content by the respective entities."""
            content = RX_AMP.sub('&amp;', content)
            content = content.replace('<', '&lt;').replace('>', '&gt;')
            return content


614 615 616 617 618
        def inlining(node):
            """Returns True, if `node`'s tag name is contained in `inline_tags`,
            thereby signalling that the children of this node shall not be
            printed on several lines to avoid unwanted gaps in the output.
            """
eckhart's avatar
eckhart committed
619
            return node.tag_name in inline_tags \
eckhart's avatar
eckhart committed
620
                or (node.attr_active()
eckhart's avatar
eckhart committed
621
                    and node.attr.get('xml:space', 'default') == 'preserve')
622

623
        line_breaks = linebreaks(src) if src else []
624
        return self._tree_repr(' ' * indentation, opening, closing, sanitizer,
625
                               density=1, inline_fn=inlining)
626

Eckhart Arnold's avatar
Eckhart Arnold committed
627

eckhart's avatar
eckhart committed
628
    def select(self, match_function: Callable, include_root: bool = False, reverse: bool = False) \
629
            -> Iterator['Node']:
eckhart's avatar
eckhart committed
630
        """
631
        Finds nodes in the tree that fulfill a given criterion.
eckhart's avatar
eckhart committed
632

633
        `select` is a generator that yields all nodes for which the
di68kap's avatar
di68kap committed
634
        given `match_function` evaluates to True. The tree is
635 636 637
        traversed pre-order.

        See function `Node.select_by_tag` for some examples.
eckhart's avatar
eckhart committed
638

639 640 641
        Args:
            match_function (function): A function  that takes as Node
                object as argument and returns True or False
642 643
            include_root (bool): If False, only descendant nodes will be
                checked for a match.
644 645
            reverse (bool): If True, the tree will be walked in reverse
                order, i.e. last children first.
646
        Yields:
647
            Node: All nodes of the tree for which
648 649
            ``match_function(node)`` returns True
        """
650
        if include_root and match_function(self):
651
            yield self
652 653 654
        child_iterator = reversed(self.children) if reverse else self.children
        for child in child_iterator:
            for node in child.select(match_function, True, reverse):
eckhart's avatar
eckhart committed
655
                yield node
656

Eckhart Arnold's avatar
Eckhart Arnold committed
657

eckhart's avatar
eckhart committed
658
    def select_by_tag(self, tag_names: Union[str, AbstractSet[str]],
eckhart's avatar
eckhart committed
659
                      include_root: bool = False) -> Iterator['Node']:
660
        """
661 662 663 664
        Returns an iterator that runs through all descendants that have one
        of the given tag names.

        Examples::
665

666
            >>> tree = parse_sxpr('(a (b "X") (X (c "d")) (e (X "F")))')
667
            >>> list(flatten_sxpr(item.as_sxpr()) for item in tree.select_by_tag("X", False))
668
            ['(X (c "d"))', '(X "F")']
669
            >>> list(flatten_sxpr(item.as_sxpr()) for item in tree.select_by_tag({"X", "b"}, False))
670
            ['(b "X")', '(X (c "d"))', '(X "F")']
671
            >>> any(tree.select_by_tag('a', False))
672
            False
673
            >>> list(flatten_sxpr(item.as_sxpr()) for item in tree.select_by_tag('a', True))
674
            ['(a (b "X") (X (c "d")) (e (X "F")))']
675 676
            >>> flatten_sxpr(next(tree.select_by_tag("X", False)).as_sxpr())
            '(X (c "d"))'
677 678

        Args:
679 680 681 682
            tag_name(set): A tag name or set of tag names that is being
                searched for
            include_root (bool): If False, only descendant nodes will be
                checked for a match.
683 684 685
        Yields:
            Node: All nodes which have a given tag name.
        """
686
        if isinstance(tag_names, str):
eckhart's avatar
eckhart committed
687
            tag_names = frozenset({tag_names})
688
        return self.select(lambda node: node.tag_name in tag_names, include_root)
689 690


691 692 693 694
    def pick(self, tag_names: Union[str, Set[str]]) -> Optional['Node']:
        """
        Picks the first descendant with one of the given tag_names.

695
        This function is mostly just syntactic sugar for
696 697 698 699 700 701 702 703 704 705
        ``next(node.select_by_tag(tag_names, False))``. However, rather than
        raising a StopIterationError if no descendant with the given tag-name
        exists, it returns None.
        """
        try:
            return next(self.select_by_tag(tag_names, False))
        except StopIteration:
            return None


706
    def tree_size(self) -> int:
eckhart's avatar
eckhart committed
707 708 709
        """
        Recursively counts the number of nodes in the tree including the root node.
        """
710 711 712
        return sum(child.tree_size() for child in self.children) + 1


713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730
class FrozenNode(Node):

    def __init__(self, tag_name: str, result: ResultType) -> None:
        if isinstance(result, str) or isinstance(result, StringView):
            result = str(result)
        else:
            raise TypeError('FrozenNode only accepts string as results. '
                            '(Only leaf-nodes can be frozen nodes.)')
        super(FrozenNode, self).__init__(tag_name, result, True)

    @property
    def result(self) -> StrictResultType:
        return self._result

    @result.setter
    def result(self, result: ResultType):
        raise TypeError('FrozenNode does not allow re-assignment of results.')

eckhart's avatar
eckhart committed
731 732 733 734
    @property
    def attr(self):
        raise AssertionError("Attributes cannot be accessed on a frozen node")

735 736 737 738 739 740 741 742
    # @property
    # def errors(self) -> List[Error]:
    #     return ()
    #
    # @errors.setter
    # def errors(self, errors: List[Error]):
    #     if errors:
    #         raise AssertionError('Cannot assign error list to frozen node')
eckhart's avatar
eckhart committed
743

Eckhart Arnold's avatar
Eckhart Arnold committed
744
    def with_pos(self, pos: int) -> 'Node':
745 746 747 748
        pass


PLACEHOLDER = Node('__PLACEHOLDER__', '')
di68kap's avatar
di68kap committed
749 750


di68kap's avatar
di68kap committed
751
class RootNode(Node):
752
    """TODO: Add Documentation!!!
di68kap's avatar
di68kap committed
753

754
        errors (list):  A list of all errors that have occured so far during
755 756
                processing (i.e. parsing, AST-transformation, compiling)
                of this tree.
di68kap's avatar
di68kap committed
757

758 759
        error_flag (int):  the highest warning or error level of all errors
                that occurred.
di68kap's avatar
di68kap committed
760
    """
761

eckhart's avatar
eckhart committed
762
    def __init__(self, node: Optional[Node] = None):
763
        super().__init__('__not_yet_ready__', '')
764
        self.errors = []           # type: List[Error]
765 766
        self.error_nodes = dict()      # type: Dict[int, List[Error]]  # id(node) -> error list
        self.error_positions = dict()  # type: Dict[int, Set[int]]  # pos -> set of id(node)
di68kap's avatar
di68kap committed
767
        self.error_flag = 0
eckhart's avatar
eckhart committed
768 769
        if node is not None:
            self.swallow(node)
770
        # customization for XML-Representation
eckhart's avatar
eckhart committed
771 772 773
        self.inline_tags = set()  # type: Set[str]
        self.omit_tags = set()  # type: Set[str]
        self.empty_tags = set()  # type: Set[str]
di68kap's avatar
di68kap committed
774

775 776 777 778 779 780 781 782 783 784
    def __deepcopy__(self, memodict={}):
        duplicate = self.__class__(None)
        if self.children:
            duplicate.children = copy.deepcopy(self.children)
            duplicate._result = duplicate.children
        else:
            duplicate.children = NoChildren
            duplicate._result = self._result
        duplicate._pos = self._pos
        duplicate._len = self._len
eckhart's avatar
eckhart committed
785
        if self.attr_active():
di68kap's avatar
di68kap committed
786
            duplicate.attr.update(copy.deepcopy(self._xml_attr))
787
            # duplicate._xml_attr = copy.deepcopy(self._xml_attr)  # this is blocked by cython
788
        duplicate.errors = copy.copy(self.errors)
789 790
        duplicate.error_nodes = copy.copy(self.error_nodes)
        duplicate.error_positions = copy.deepcopy(self.error_positions)
791 792 793 794
        duplicate.error_flag = self.error_flag
        duplicate.inline_tags = self.inline_tags
        duplicate.omit_tags = self.omit_tags
        duplicate.empty_tags = self.empty_tags
795
        duplicate.tag_name = self.tag_name
796 797 798
        return duplicate


799
    def swallow(self, node: Node) -> 'RootNode':
800 801
        """
        Put `self` in the place of `node` by copying all its data.
802 803 804 805 806 807 808 809 810
        Returns self.

        This is done by the parse.Grammar object after
        parsing has finished, so that the Grammar object always
        returns a syntax tree rooted in a RootNode object.

        It is possible to add errors to a RootNode object, before it
        has actually swallowed the root of the syntax tree.
        """
di68kap's avatar
di68kap committed
811 812 813 814
        self._result = node._result
        self.children = node.children
        self._len = node._len
        self._pos = node._pos
815
        self.tag_name = node.tag_name
eckhart's avatar
eckhart committed
816
        if node.attr_active():
817
            self._xml_attr = node._xml_attr
di68kap's avatar
di68kap committed
818
        self._content = node._content
819 820
        if id(node) in self.error_nodes:
            self.error_nodes[id(self)] = self.error_nodes[id(node)]
821
        return self
di68kap's avatar
di68kap committed
822

eckhart's avatar
eckhart committed
823
    def add_error(self, node: Node, error: Error) -> 'RootNode':
824 825 826
        """
        Adds an Error object to the tree, locating it at a specific node.
        """
eckhart's avatar
eckhart committed
827
        assert not isinstance(node, FrozenNode)
828
        assert node.pos == error.pos
829
        self.errors.append(error)
di68kap's avatar
di68kap committed
830
        self.error_flag = max(self.error_flag, error.code)
831 832
        self.error_nodes.setdefault(id(node), []).append(error)
        self.error_positions.setdefault(node.pos, set()).add(id(node))
di68kap's avatar
di68kap committed