parse.py 72.6 KB
Newer Older
1
# parse.py - parser combinators for DHParser
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.
Eckhart Arnold's avatar
Eckhart Arnold committed
17

18 19

"""
20
Module ``parse`` contains the python classes and functions for
21 22
DHParser's packrat-parser. It's central class is the
``Grammar``-class, which is the base class for any concrete
23 24 25 26 27 28 29 30 31
Grammar. Grammar-objects are callable and parsing is done by
calling a Grammar-object with a source text as argument.

The different parsing functions are callable descendants of class
``Parser``. Usually, they are organized in a tree and defined
within the namespace of a grammar-class. See ``ebnf.EBNFGrammar``
for an example.
"""

32

33
from collections import defaultdict
34
import copy
35 36 37 38

from DHParser.error import Error, linebreaks
from DHParser.log import is_logging, HistoryRecord
from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME
39
from DHParser.stringview import StringView, EMPTY_STRING_VIEW
40
from DHParser.syntaxtree import Node, RootNode, ParserBase, WHITESPACE_PTYPE, \
41
    TOKEN_PTYPE, ZOMBIE_PARSER
42
from DHParser.toolkit import sane_parser_name, escape_control_characters, re, typing
43
from typing import Callable, cast, Dict, DefaultDict, List, Set, Tuple, Union, Optional
44

45

46
__all__ = ('Parser',
47
           'UnknownParserError',
48 49
           'Grammar',
           'PreprocessorToken',
50
           'Token',
51
           'RegExp',
52 53
           'RE',
           'TKN',
54
           'Whitespace',
55 56 57 58 59 60 61 62 63 64 65 66
           'mixin_comment',
           # 'UnaryOperator',
           # 'NaryOperator',
           'Synonym',
           'Option',
           'ZeroOrMore',
           'OneOrMore',
           'Series',
           'Alternative',
           'AllOf',
           'SomeOf',
           'Unordered',
eckhart's avatar
eckhart committed
67
           'Required',
68 69 70 71 72 73 74 75 76 77
           'Lookahead',
           'NegativeLookahead',
           'Lookbehind',
           'NegativeLookbehind',
           'last_value',
           'counterpart',
           'accumulate',
           'Capture',
           'Retrieve',
           'Pop',
78
           'Forward')
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102


########################################################################
#
# Grammar and parsing infrastructure
#
########################################################################


LEFT_RECURSION_DEPTH = 8  # type: int
# because of python's recursion depth limit, this value ought not to be
# set too high. PyPy allows higher values than CPython
MAX_DROPOUTS = 3  # type: int
# stop trying to recover parsing after so many errors


def add_parser_guard(parser_func):
    """
    Add a wrapper function to a parser functions (i.e. Parser.__call__ method)
    that takes care of memoizing, left recursion and, optionally, tracing
    (aka "history tracking") of parser calls. Returns the wrapped call.
    """
    def guarded_call(parser: 'Parser', text: StringView) -> Tuple[Optional[Node], StringView]:
        try:
Eckhart Arnold's avatar
Eckhart Arnold committed
103 104
            grammar = parser.grammar
            location = grammar.document_length__ - len(text)
105

Eckhart Arnold's avatar
Eckhart Arnold committed
106
            if grammar.last_rb__loc__ >= location:
107 108 109 110 111
                grammar.rollback_to__(location)

            # if location has already been visited by the current parser,
            # return saved result
            if location in parser.visited:
112
                # no history recording in case of memoized results
113 114 115 116 117 118 119 120
                return parser.visited[location]

            if grammar.history_tracking__:
                grammar.call_stack__.append(parser)
                grammar.moving_forward__ = True

            # break left recursion at the maximum allowed depth
            if grammar.left_recursion_handling__:
121
                if parser.recursion_counter[location] > LEFT_RECURSION_DEPTH:
122 123 124 125
                    grammar.recursion_locations__.add(location)
                    return None, text
                parser.recursion_counter[location] += 1

126
            # PARSER CALL: run original __call__ method
127 128 129 130
            node, rest = parser_func(parser, text)

            if grammar.left_recursion_handling__:
                parser.recursion_counter[location] -= 1
131
                # don't clear recursion_locations__ !!!
132 133 134 135 136 137 138 139 140 141 142 143

            if node is None:
                # retrieve an earlier match result (from left recursion) if it exists
                if location in grammar.recursion_locations__:
                    if location in parser.visited:
                        node, rest = parser.visited[location]
                        # TODO: maybe add a warning about occurrence of left-recursion here?
                    # don't overwrite any positive match (i.e. node not None) in the cache
                    # and don't add empty entries for parsers returning from left recursive calls!
                elif grammar.memoization__:
                    # otherwise also cache None-results
                    parser.visited[location] = (None, rest)
eckhart's avatar
eckhart committed
144
            else:
145
                assert node._pos < 0
Eckhart Arnold's avatar
Eckhart Arnold committed
146 147 148
                node._pos = location
                assert node._pos >= 0, str("%i < %i" % (grammar.document_length__, location))
                if (grammar.last_rb__loc__ < location
eckhart's avatar
eckhart committed
149 150 151 152 153
                        and (grammar.memoization__ or location in grammar.recursion_locations__)):
                    # - variable manipulating parsers will not be entered into the cache,
                    #   because caching would interfere with changes of variable state
                    # - in case of left recursion, the first recursive step that
                    #   matches will store its result in the cache
154
                    # TODO: need a unit-test concerning interference of variable manipulation and left recursion algorithm?
eckhart's avatar
eckhart committed
155
                    parser.visited[location] = (node, rest)
156

157 158
            # Mind that memoized parser calls will not appear in the history record!
            # Does this make sense? Or should it be changed?
159 160
            if grammar.history_tracking__:
                # don't track returning parsers except in case an error has occurred
161
                # remaining = len(rest)
di68kap's avatar
di68kap committed
162
                if (grammar.moving_forward__ or (node and node.errors)):
163 164 165 166 167 168 169 170
                    record = HistoryRecord(grammar.call_stack__, node, text)
                    grammar.history__.append(record)
                    # print(record.stack, record.status, rest[:20].replace('\n', '|'))
                grammar.moving_forward__ = False
                grammar.call_stack__.pop()

        except RecursionError:
            node = Node(None, str(text[:min(10, max(1, text.find("\n")))]) + " ...")
171
            node._pos = location
eckhart's avatar
eckhart committed
172
            grammar.tree__.new_error(node, "maximum recursion depth of parser reached; "
173
                                     "potentially due to too many errors!")
174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
            rest = EMPTY_STRING_VIEW

        return node, rest

    return guarded_call


class Parser(ParserBase):
    """
    (Abstract) Base class for Parser combinator parsers. Any parser
    object that is actually used for parsing (i.e. no mock parsers)
    should should be derived from this class.

    Since parsers can contain other parsers (see classes UnaryOperator
    and NaryOperator) they form a cyclical directed graph. A root
    parser is a parser from which all other parsers can be reached.
    Usually, there is one root parser which serves as the starting
    point of the parsing process. When speaking of "the root parser"
    it is this root parser object that is meant.

    There are two different types of parsers:

    1. *Named parsers* for which a name is set in field `parser.name`.
       The results produced by these parsers can later be retrieved in
       the AST by the parser name.

    2. *Anonymous parsers* where the name-field just contains the empty
       string. AST-transformation of Anonymous parsers can be hooked
       only to their class name, and not to the individual parser.

    Parser objects are callable and parsing is done by calling a parser
    object with the text to parse.

    If the parser matches it returns a tuple consisting of a node
    representing the root of the concrete syntax tree resulting from the
    match as well as the substring `text[i:]` where i is the length of
    matched text (which can be zero in the case of parsers like
    `ZeroOrMore` or `Option`). If `i > 0` then the parser has "moved
    forward".

    If the parser does not match it returns `(None, text). **Note** that
    this is not the same as an empty match `("", text)`. Any empty match
    can for example be returned by the `ZeroOrMore`-parser in case the
    contained parser is repeated zero times.

    Attributes and Properties:
        visited:  Mapping of places this parser has already been to
                during the current parsing process onto the results the
                parser returned at the respective place. This dictionary
                is used to implement memoizing.

        recursion_counter:  Mapping of places to how often the parser
                has already been called recursively at this place. This
                is needed to implement left recursion. The number of
                calls becomes irrelevant once a resault has been memoized.

        cycle_detection:  The apply()-method uses this variable to make
                sure that one and the same function will not be applied
                (recursively) a second time, if it has already been
                applied to this parser.

        grammar:  A reference to the Grammar object to which the parser
                is attached.
    """

    ApplyFunc = Callable[['Parser'], None]

241
    def __init__(self) -> None:
242
        # assert isinstance(name, str), str(name)
243
        super().__init__()
eckhart's avatar
eckhart committed
244
        self._grammar = None  # type: Optional['Grammar']
245 246 247 248
        self.reset()

        # add "aspect oriented" wrapper around parser calls
        # for memoizing, left recursion and tracing
di68kap's avatar
di68kap committed
249
        if not isinstance(self, Forward):  # should Forward-Parser not be guarded? Not sure...
250 251 252 253 254 255
            guarded_parser_call = add_parser_guard(self.__class__.__call__)
            # The following check is necessary for classes that don't override
            # the __call__() method, because in these cases the non-overridden
            # __call__()-method would be substituted a second time!
            if self.__class__.__call__.__code__ != guarded_parser_call.__code__:
                self.__class__.__call__ = guarded_parser_call
256 257 258 259 260 261 262 263

    def __deepcopy__(self, memo):
        """Deepcopy method of the parser. Upon instantiation of a Grammar-
        object, parsers will be deep-copied to the Grammar object. If a
        derived parser-class changes the signature of the constructor,
        `__deepcopy__`-method must be replaced (i.e. overridden without
        calling the same method from the superclass) by the derived class.
        """
264 265
        duplicate = self.__class__()
        duplicate.name = self.name
266
        duplicate.ptype =   self.ptype
267
        return duplicate
268 269 270 271 272

    def reset(self):
        """Initializes or resets any parser variables. If overwritten,
        the `reset()`-method of the parent class must be called from the
        `reset()`-method of the derived class."""
273 274 275
        self.visited = dict()  # type: Dict[int, Tuple[Optional[Node], StringView]]
        self.recursion_counter = defaultdict(lambda :0)  # type: DefaultDict[int, int]
        self.cycle_detection = set()  # type: Set[Callable]
276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """Applies the parser to the given `text` and returns a node with
        the results or None as well as the text at the position right behind
        the matching string."""
        return None, text  # default behaviour: don't match

    def __add__(self, other: 'Parser') -> 'Series':
        """The + operator generates a series-parser that applies two
        parsers in sequence."""
        return Series(self, other)

    def __or__(self, other: 'Parser') -> 'Alternative':
        """The | operator generates an alternative parser that applies
        the first parser and, if that does not match, the second parser.
        """
        return Alternative(self, other)

    @property
    def grammar(self) -> 'Grammar':
        return self._grammar

    @grammar.setter
    def grammar(self, grammar: 'Grammar'):
        if self._grammar is None:
            self._grammar = grammar
            self._grammar_assigned_notifier()
        else:
            assert self._grammar == grammar, \
                "Parser has already been assigned to a different Grammar object!"

    def _grammar_assigned_notifier(self):
        """A function that notifies the parser object that it has been
        assigned to a grammar."""
        pass

    def apply(self, func: ApplyFunc) -> bool:
        """
        Applies function `func(parser)` recursively to this parser and all
        descendant parsers if any exist. The same function can never
        be applied twice between calls of the ``reset()``-method!
        Returns `True`, if function has been applied, `False` if function
        had been applied earlier already and thus has not been applied again.
        """
        if func in self.cycle_detection:
            return False
        else:
            assert not self.visited, "No calls to Parser.apply() during or " \
                                     "after ongoing parsing process. (Call Parser.reset() first.)"
            self.cycle_detection.add(func)
            func(self)
            return True


def mixin_comment(whitespace: str, comment: str) -> str:
    """
    Returns a regular expression that merges comment and whitespace
    regexps. Thus comments cann occur whereever whitespace is allowed
    and will be skipped just as implicit whitespace.

    Note, that because this works on the level of regular expressions,
    nesting comments is not possible. It also makes it much harder to
    use directives inside comments (which isn't recommended, anyway).
    """
    wspc = '(?:' + whitespace + '(?:' + comment + whitespace + ')*)'
    return wspc


344
class UnknownParserError(KeyError):
eckhart's avatar
eckhart committed
345
    """UnknownParserError is raised if a Grammar object is called with a
346
    parser that does not exist or if in the course of parsing a parser
eckhart's avatar
eckhart committed
347
    is referred to that does not exist."""
348 349


350 351 352 353 354 355 356 357 358 359 360 361 362
class Grammar:
    r"""
    Class Grammar directs the parsing process and stores global state
    information of the parsers, i.e. state information that is shared
    accross parsers.

    Grammars are basically collections of parser objects, which are
    connected to an instance object of class Grammar. There exist two
    ways of connecting parsers to grammar objects: Either by passing
    the root parser object to the constructor of a Grammar object
    ("direct instantiation"), or by assigning the root parser to the
    class variable "root__" of a descendant class of class Grammar.

eckhart's avatar
eckhart committed
363
    Example for direct instantiation of a grammar::
364

365
        >>> number = RE('\d+') + RE('\.') + RE('\d+') | RE('\d+')
366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390
        >>> number_parser = Grammar(number)
        >>> number_parser("3.1416").content
        '3.1416'

    Collecting the parsers that define a grammar in a descendant class of
    class Grammar and assigning the named parsers to class variables
    rather than global variables has several advantages:

    1. It keeps the namespace clean.

    2. The parser names of named parsers do not need to be passed to the
       constructor of the Parser object explicitly, but it suffices to
       assign them to class variables, which results in better
       readability of the Python code.

    3. The parsers in the class do not necessarily need to be connected
       to one single root parser, which is helpful for testing and
       building up a parser successively of several components.

    As a consequence, though, it is highly recommended that a Grammar
    class should not define any other variables or methods with names
    that are legal parser names. A name ending with a double
    underscore '__' is *not* a legal parser name and can safely be
    used.

391
    Example::
392 393 394 395 396 397 398 399 400

        class Arithmetic(Grammar):
            # special fields for implicit whitespace and comment configuration
            COMMENT__ = r'#.*(?:\n|$)'  # Python style comments
            wspR__ = mixin_comment(whitespace=r'[\t ]*', comment=COMMENT__)

            # parsers
            expression = Forward()
            INTEGER = RE('\\d+')
401 402 403
            factor = INTEGER | TKN("(") + expression + TKN(")")
            term = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
            root__ = expression

    Upon instantiation the parser objects are deep-copied to the
    Grammar object and assigned to object variables of the same name.
    Any parser that is directly assigned to a class variable is a
    'named' parser and its field `parser.name` contains the variable
    name after instantiation of the Grammar class. All other parsers,
    i.e. parsers that are defined within a `named` parser, remain
    "anonymous parsers" where `parser.name` is the empty string, unless
    a name has been passed explicitly upon instantiation.
    If one and the same parser is assigned to several class variables
    such as, for example the parser `expression` in the example above,
    the first name sticks.

    Grammar objects are callable. Calling a grammar object with a UTF-8
    encoded document, initiates the parsing of the document with the
    root parser. The return value is the concrete syntax tree. Grammar
    objects can be reused (i.e. called again) after parsing. Thus, it
    is not necessary to instantiate more than one Grammar object per
    thread.

    Grammar classes contain a few special class fields for implicit
    whitespace and comments that should be overwritten, if the defaults
    (no comments, horizontal right aligned whitespace) don't fit:

429 430 431 432 433
    Attributes:
        root__:  The root parser of the grammar. Theoretically, all parsers of the
                 grammar should be reachable by the root parser. However, for testing
                 of yet incomplete grammars class Grammar does not assume that this
                 is the case.
434

435 436 437 438 439
        parser_initializiation__:  Before the parser class (!) has been initialized,
                 which happens upon the first time it is instantiated (see
                 :func:_assign_parser_names()` for an explanation), this class
                 field contains a value other than "done". A value of "done" indicates
                 that the class has already been initialized.
440

eckhart's avatar
eckhart committed
441 442 443 444
        python__src__:  For the purpose of debugging and inspection, this field can
                 take the python src of the concrete grammar class
                 (see `dsl.grammar_provider`).

445 446 447 448 449 450 451 452 453 454 455 456 457
    Attributes:
        all_parsers__:  A set of all parsers connected to this grammar object

        history_tracking__:  A flag indicating that the parsing history shall
                be tracked

        _dirty_flag__:  A flag indicating that the Grammar has been called at
                least once so that the parsing-variables need to be reset
                when it is called again.

        document__:  the text that has most recently been parsed or that is
                currently being parsed.

eckhart's avatar
eckhart committed
458 459
        document_length__:  the length of the document.

460 461 462
        document_lbreaks__:  list of linebreaks within the document, starting
                with -1 and ending with EOF. This helps generating line
                and column number for history recording and will only be
463
                initialized if :attr:`history_tracking__` is true.
464

465 466 467 468 469
        tree__: The root-node of the parsing tree. This variable is available
               for error-reporting already during parsing  via
               ``self.grammar.tree__.add_error``, but it references the full
               parsing tree only after parsing has been finished.

470
        _reversed__:  the same text in reverse order - needed by the `Lookbehind`-
471 472 473
                parsers.

        variables__:  A mapping for variable names to a stack of their respective
474 475
                string values - needed by the :class:`Capture`-, :class:`Retrieve`-
                and :class:`Pop`-parsers.
476 477

        rollback__:  A list of tuples (location, rollback-function) that are
478 479 480 481
                deposited by the :class:`Capture`- and :class:`Pop`-parsers.
                If the parsing process reaches a dead end then all
                rollback-functions up to the point to which it retreats will be
                called and the state of the variable stack restored accordingly.
482 483 484

        last_rb__loc__:  The last, i.e. most advanced location in the text
                where a variable changing operation occurred. If the parser
485 486
                backtracks to a location at or before last_rb__loc__ (i.e.
                location <= last_rb__loc__) then a rollback of all variable
487 488
                changing operations is necessary that occurred after the
                location to which the parser backtracks. This is done by
489
                calling method :func:`rollback_to__(location)`.
490 491 492 493 494 495 496 497 498 499 500 501 502

        call_stack__:  A stack of all parsers that have been called. This
                is required for recording the parser history (for debugging)
                and, eventually, i.e. one day in the future, for tracing through
                the parsing process.

        history__:  A list of parser-call-stacks. A parser-call-stack is
                appended to the list each time a parser either matches, fails
                or if a parser-error occurs.

        moving_forward__: This flag indicates that the parsing process is currently
                moving forward . It is needed to reduce noise in history recording
                and should not be considered as having a valid value if history
503 504
                recording is turned off! (See :func:`add_parser_guard` and its local
                function :func:`guarded_call`)
505 506 507 508

        recursion_locations__:  Stores the locations where left recursion was
                detected. Needed to provide minimal memoization for the left
                recursion detection algorithm, but, strictly speaking, superfluous
509 510
                if full memoization is enabled. (See :func:`add_parser_guard` and its
                local function :func:`guarded_call`)
511 512 513 514 515 516 517 518 519 520

        memoization__:  Turns full memoization on or off. Turning memoization off
                results in less memory usage and sometimes reduced parsing time.
                In some situations it may drastically increase parsing time, so
                it is safer to leave it on. (Default: on)

        left_recursion_handling__:  Turns left recursion handling on or off.
                If turned off, a recursion error will result in case of left
                recursion.
    """
eckhart's avatar
eckhart committed
521
    python_src__ = ''  # type: str
522 523 524 525
    root__ = ZOMBIE_PARSER  # type: ParserBase
    # root__ must be overwritten with the root-parser by grammar subclass
    parser_initialization__ = "pending"  # type: str
    # some default values
526 527
    # COMMENT__ = r''  # type: str  # r'#.*(?:\n|$)'
    # WSP_RE__ = mixin_comment(whitespace=r'[\t ]*', comment=COMMENT__)  # type: str
528 529 530 531 532 533 534


    @classmethod
    def _assign_parser_names__(cls):
        """
        Initializes the `parser.name` fields of those
        Parser objects that are directly assigned to a class field with
535
        the field's name, e.g.::
536

537 538
            class Grammar(Grammar):
                ...
539
                symbol = RE('(?!\\d)\\w+')
540

di68kap's avatar
di68kap committed
541 542 543
        After the call of this method symbol.name == "symbol" holds.
        Parser names starting or ending with a double underscore like
        ``root__`` will be ignored. See :func:`sane_parser_name()`
544 545 546 547 548 549 550 551 552 553 554 555 556 557

        This is done only once, upon the first instantiation of the
        grammar class!

        Attention: If there exists more than one reference to the same
        parser, only the first one will be chosen for python versions
        greater or equal 3.6.  For python version <= 3.5 an arbitrarily
        selected reference will be chosen. See PEP 520
        (www.python.org/dev/peps/pep-0520/) for an explanation of why.
        """
        if cls.parser_initialization__ != "done":
            cdict = cls.__dict__
            for entry, parser in cdict.items():
                if isinstance(parser, Parser) and sane_parser_name(entry):
di68kap's avatar
di68kap committed
558
                    if isinstance(parser, Forward):
di68kap's avatar
di68kap committed
559
                        if not cast(Forward, parser).parser.name:
di68kap's avatar
di68kap committed
560 561
                            cast(Forward, parser).parser.name = entry
                    else:   # if not parser.name:
562
                        parser.name = entry
563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601
            cls.parser_initialization__ = "done"


    def __init__(self, root: Parser = None) -> None:
        # if not hasattr(self.__class__, 'parser_initialization__'):
        #     self.__class__.parser_initialization__ = "pending"
        # if not hasattr(self.__class__, 'wspL__'):
        #     self.wspL__ = ''
        # if not hasattr(self.__class__, 'wspR__'):
        #     self.wspR__ = ''
        self.all_parsers__ = set()             # type: Set[ParserBase]
        self._dirty_flag__ = False             # type: bool
        self.history_tracking__ = False        # type: bool
        self.memoization__ = True              # type: bool
        self.left_recursion_handling__ = True  # type: bool
        self._reset__()

        # prepare parsers in the class, first
        self._assign_parser_names__()

        # then deep-copy the parser tree from class to instance;
        # parsers not connected to the root object will be copied later
        # on demand (see Grammar.__getitem__()). Usually, the need to
        # do so only arises during testing.
        self.root__ = copy.deepcopy(root) if root else copy.deepcopy(self.__class__.root__)
        self.root__.apply(self._add_parser__)


    def __getitem__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            parser_template = getattr(self, key, None)
            if parser_template:
                # add parser to grammar object on the fly...
                parser = copy.deepcopy(parser_template)
                parser.apply(self._add_parser__)
                # assert self[key] == parser
                return self[key]
602
            raise UnknownParserError('Unknown parser "%s" !' % key)
603 604 605


    def _reset__(self):
606
        self.tree__ = RootNode()              # type: RootNode
607 608
        self.document__ = EMPTY_STRING_VIEW   # type: StringView
        self._reversed__ = EMPTY_STRING_VIEW  # type: StringView
eckhart's avatar
eckhart committed
609
        self.document_length__ = 0            # type: int
610 611
        self.document_lbreaks__ = []          # type: List[int]
        # variables stored and recalled by Capture and Retrieve parsers
612
        self.variables__ = defaultdict(lambda :[])  # type: DefaultDict[str, List[str]]
613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643
        self.rollback__ = []                  # type: List[Tuple[int, Callable]]
        self.last_rb__loc__ = -1              # type: int
        # support for call stack tracing
        self.call_stack__ = []                # type: List[Parser]
        # snapshots of call stacks
        self.history__ = []                   # type: List[HistoryRecord]
        # also needed for call stack tracing
        self.moving_forward__ = False         # type: bool
        self.recursion_locations__ = set()    # type: Set[int]


    @property
    def reversed__(self) -> StringView:
        """
        Returns a reversed version of the currently parsed document. As
        about the only case where this is needed is the Lookbehind-parser,
        this is done lazily.
        """
        if not self._reversed__:
            self._reversed__ = StringView(self.document__.text[::-1])
        return self._reversed__


    def _add_parser__(self, parser: Parser) -> None:
        """
        Adds the particular copy of the parser object to this
        particular instance of Grammar.
        """
        if parser.name:
            # prevent overwriting instance variables or parsers of a different class
            assert parser.name not in self.__dict__ or \
644
                isinstance(self.__dict__[parser.name], parser.__class__), \
645
                ('Cannot add parser "%s" because a field with the same name '
di68kap's avatar
di68kap committed
646 647
                 'already exists in grammar object: %s!'
                 % (parser.name, str(self.__dict__[parser.name])))
648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664
            setattr(self, parser.name, parser)
        self.all_parsers__.add(parser)
        parser.grammar = self


    def __call__(self, document: str, start_parser="root__") -> Node:
        """
        Parses a document with with parser-combinators.

        Args:
            document (str): The source text to be parsed.
            start_parser (str): The name of the parser with which to
                start. This is useful for testing particular parsers
                (i.e. particular parts of the EBNF-Grammar.)
        Returns:
            Node: The root node ot the parse tree.
        """
665 666

        def tail_pos(predecessors: Union[List[Node], Tuple[Node, ...]]) -> int:
eckhart's avatar
eckhart committed
667 668
            """Adds the position after the last node in the list of
            predecessors to the node."""
eckhart's avatar
eckhart committed
669
            return predecessors[-1].pos + len(predecessors[-1]) if predecessors else 0
eckhart's avatar
eckhart committed
670

671 672 673 674 675 676 677 678 679 680 681
        # assert isinstance(document, str), type(document)
        if self.root__ is None:
            raise NotImplementedError()
        if self._dirty_flag__:
            self._reset__()
            for parser in self.all_parsers__:
                parser.reset()
        else:
            self._dirty_flag__ = True
        self.history_tracking__ = is_logging()
        self.document__ = StringView(document)
eckhart's avatar
eckhart committed
682
        self.document_length__ = len(self.document__)
683
        self.document_lbreaks__ = linebreaks(document) if self.history_tracking__ else []
Eckhart Arnold's avatar
Eckhart Arnold committed
684
        self.last_rb__loc__ = -1  # rollback location
685 686 687 688 689 690 691 692 693
        parser = self[start_parser] if isinstance(start_parser, str) else start_parser
        assert parser.grammar == self, "Cannot run parsers from a different grammar object!" \
                                       " %s vs. %s" % (str(self), str(parser.grammar))
        result = None  # type: Optional[Node]
        stitches = []  # type: List[Node]
        rest = self.document__
        if not rest:
            result, _ = parser(rest)
            if result is None:
eckhart's avatar
eckhart committed
694
                result = Node(None, '').init_pos(0)
eckhart's avatar
eckhart committed
695
                self.tree__.new_error(result,
di68kap's avatar
di68kap committed
696 697
                                      'Parser "%s" did not match empty document.' % str(parser),
                                      Error.PARSER_DID_NOT_MATCH)
698 699 700 701 702 703 704 705 706 707
        while rest and len(stitches) < MAX_DROPOUTS:
            result, rest = parser(rest)
            if rest:
                fwd = rest.find("\n") + 1 or len(rest)
                skip, rest = rest[:fwd], rest[fwd:]
                if result is None:
                    error_msg = 'Parser did not match! Invalid source file?' \
                                '\n    Most advanced: %s\n    Last match:    %s;' % \
                                (str(HistoryRecord.most_advanced_match(self.history__)),
                                 str(HistoryRecord.last_match(self.history__)))
di68kap's avatar
di68kap committed
708
                    error_code = Error.PARSER_DID_NOT_MATCH
709 710 711 712 713 714 715 716
                else:
                    stitches.append(result)
                    error_msg = "Parser stopped before end" + \
                                (("! trying to recover" +
                                  (" but stopping history recording at this point."
                                   if self.history_tracking__ else "..."))
                                 if len(stitches) < MAX_DROPOUTS
                                 else " too often! Terminating parser.")
di68kap's avatar
di68kap committed
717
                    error_code = Error.PARSER_STOPPED_BEFORE_END
eckhart's avatar
eckhart committed
718
                stitches.append(Node(None, skip).init_pos(tail_pos(stitches)))
di68kap's avatar
di68kap committed
719
                self.tree__.new_error(stitches[-1], error_msg, error_code)
720
                if self.history_tracking__:
eckhart's avatar
eckhart committed
721 722 723 724 725 726 727
                    # # some parsers may have matched and left history records with nodes != None.
                    # # Because these are not connected to the stitched root node, their pos-
                    # # properties will not be initialized by setting the root node's pos property
                    # # to zero. Therefore, their pos properties need to be initialized here
                    # for record in self.history__:
                    #     if record.node and record.node._pos < 0:
                    #         record.node.init_pos(0)
728 729 730 731 732 733
                    record = HistoryRecord(self.call_stack__.copy(), stitches[-1], rest)
                    self.history__.append(record)
                    # stop history tracking when parser returned too early
                    self.history_tracking__ = False
        if stitches:
            if rest:
Eckhart Arnold's avatar
Eckhart Arnold committed
734 735
                stitches.append(Node(None, rest))
            result = Node(None, tuple(stitches)).init_pos(0)
736
        if any(self.variables__.values()):
di68kap's avatar
di68kap committed
737
            error_msg = "Capture-retrieve-stack not empty after end of parsing: " \
738
                + str(self.variables__)
di68kap's avatar
di68kap committed
739
            error_code = Error.CAPTURE_STACK_NOT_EMPTY
740 741 742 743 744
            if result:
                if result.children:
                    # add another child node at the end to ensure that the position
                    # of the error will be the end of the text. Otherwise, the error
                    # message above ("...after end of parsing") would appear illogical.
eckhart's avatar
eckhart committed
745
                    error_node = Node(ZOMBIE_PARSER, '').init_pos(tail_pos(result.children))
di68kap's avatar
di68kap committed
746
                    self.tree__.new_error(error_node, error_msg, error_code)
eckhart's avatar
eckhart committed
747
                    result.result = result.children + (error_node,)
748
                else:
di68kap's avatar
di68kap committed
749
                    self.tree__.new_error(result, error_msg, error_code)
eckhart's avatar
eckhart committed
750
        # result.pos = 0  # calculate all positions
751
        # result.collect_errors(self.document__)
752 753
        self.tree__.swallow(result)
        return self.tree__
754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771


    def push_rollback__(self, location, func):
        """
        Adds a rollback function that either removes or re-adds
        values on the variable stack (`self.variables`) that have been
        added (or removed) by Capture or Pop Parsers, the results of
        which have been dismissed.
        """
        self.rollback__.append((location, func))
        self.last_rb__loc__ = location


    def rollback_to__(self, location):
        """
        Rolls back the variable stacks (`self.variables`) to its
        state at an earlier location in the parsed document.
        """
Eckhart Arnold's avatar
Eckhart Arnold committed
772
        while self.rollback__ and self.rollback__[-1][0] >= location:
773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806
            _, rollback_func = self.rollback__.pop()
            # assert not loc > self.last_rb__loc__, \
            #     "Rollback confusion: line %i, col %i < line %i, col %i" % \
            #     (*line_col(self.document__, len(self.document__) - loc),
            #      *line_col(self.document__, len(self.document__) - self.last_rb__loc__))
            rollback_func()
        self.last_rb__loc__ == self.rollback__[-1][0] if self.rollback__ \
            else (len(self.document__) + 1)


def dsl_error_msg(parser: Parser, error_str: str) -> str:
    """
    Returns an error message for errors in the parser configuration,
    e.g. errors that result in infinite loops.

    Args:
        parser (Parser):  The parser where the error was noticed. Note
            that this is not necessarily the parser that caused the
            error but only where the error became apparent.
        error_str (str):  A short string describing the error.
    Returns:
        str: An error message including the call stack if history
        tacking has been turned in the grammar object.
    """
    msg = ["DSL parser specification error:", error_str, 'Caught by parser "%s".' % str(parser)]
    if parser.grammar.history__:
        msg.extend(["\nCall stack:", parser.grammar.history__[-1].stack])
    else:
        msg.extend(["\nEnable history tracking in Grammar object to display call stack."])
    return " ".join(msg)


########################################################################
#
807
# _Token and Regular Expression parser classes (i.e. leaf classes)
808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824
#
########################################################################


class PreprocessorToken(Parser):
    """
    Parses tokens that have been inserted by a preprocessor.

    Preprocessors can generate Tokens with the ``make_token``-function.
    These tokens start and end with magic characters that can only be
    matched by the PreprocessorToken Parser. Such tokens can be used to
    insert BEGIN - END delimiters at the beginning or ending of a
    quoted block, for example.
    """

    def __init__(self, token: str) -> None:
        assert token and token.isupper()
825
        assert RX_TOKEN_NAME.match(token)
826 827
        super().__init__()
        self.name = token
828

829 830 831
    def __deepcopy__(self, memo):
        duplicate = self.__class__(self.name)
        duplicate.name = self.name
832
        duplicate.ptype =   self.ptype
833 834
        return duplicate

835 836 837 838
    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        if text[0:1] == BEGIN_TOKEN:
            end = text.find(END_TOKEN, 1)
            if end < 0:
839
                node = Node(self, '')
eckhart's avatar
eckhart committed
840
                self.grammar.tree__.new_error(node,
841 842
                    'END_TOKEN delimiter missing from preprocessor token. '
                    '(Most likely due to a preprocessor bug!)')  # type: Node
843
                return node, text[1:]
844
            elif end == 0:
845
                node = Node(self, '')
eckhart's avatar
eckhart committed
846
                self.grammar.tree__.new_error(node,
847 848
                    'Preprocessor-token cannot have zero length. '
                    '(Most likely due to a preprocessor bug!)')
849
                return node, text[2:]
850 851
            elif text.find(BEGIN_TOKEN, 1, end) >= 0:
                node = Node(self, text[len(self.name) + 1:end])
eckhart's avatar
eckhart committed
852
                self.grammar.tree__.new_error(node,
853 854 855 856 857
                    'Preprocessor-tokens must not be nested or contain '
                    'BEGIN_TOKEN delimiter as part of their argument. '
                    '(Most likely due to a preprocessor bug!)')
                return node, text[end:]
            if text[1:len(self.name) + 1] == self.name:
858
                return Node(self, text[len(self.name) + 2:end]), text[end + 1:]
859 860 861
        return None, text


862
class Token(Parser):
eckhart's avatar
eckhart committed
863
    """
864
    Parses plain text strings. (Could be done by RegExp as well, but is faster.)
eckhart's avatar
eckhart committed
865

eckhart's avatar
eckhart committed
866 867
    Example::

868
        >>> while_token = Token("while")
eckhart's avatar
eckhart committed
869 870
        >>> Grammar(while_token)("while").content
        'while'
eckhart's avatar
eckhart committed
871
    """
872
    assert TOKEN_PTYPE == ":Token"
eckhart's avatar
eckhart committed
873

874 875
    def __init__(self, text: str) -> None:
        super().__init__()
eckhart's avatar
eckhart committed
876
        self.text = text
877
        self.len = len(text)
eckhart's avatar
eckhart committed
878 879

    def __deepcopy__(self, memo):
880 881
        duplicate = self.__class__(self.text)
        duplicate.name = self.name
882
        duplicate.ptype = self.ptype
883
        return duplicate
eckhart's avatar
eckhart committed
884 885 886

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        if text.startswith(self.text):
887
            return Node(self, self.text, True), text[self.len:]
eckhart's avatar
eckhart committed
888 889
        return None, text

890 891 892
    def __repr__(self):
        return ("'%s'" if self.text.find("'") <= 0 else '"%s"') % self.text

eckhart's avatar
eckhart committed
893

894 895 896 897 898 899 900 901 902
class RegExp(Parser):
    r"""
    Regular expression parser.

    The RegExp-parser parses text that matches a regular expression.
    RegExp can also be considered as the "atomic parser", because all
    other parsers delegate part of the parsing job to other parsers,
    but do not match text directly.

903 904 905 906 907
    Example::

        >>> word = RegExp(r'\w+')
        >>> Grammar(word)("Haus").content
        'Haus'
908

909
    EBNF-Notation:  ``/ ... /``
910

911
    EBNF-Example:   ``word = /\w+/``
912 913
    """

914 915
    def __init__(self, regexp) -> None:
        super().__init__()
916 917 918 919 920 921 922 923
        self.regexp = re.compile(regexp) if isinstance(regexp, str) else regexp

    def __deepcopy__(self, memo):
        # `regex` supports deep copies, but not `re`
        try:
            regexp = copy.deepcopy(self.regexp, memo)
        except TypeError:
            regexp = self.regexp.pattern
924 925
        duplicate = self.__class__(regexp)
        duplicate.name = self.name
926
        duplicate.ptype =   self.ptype
927
        return duplicate
928 929

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
930 931 932 933
        match = text.match(self.regexp)
        if match:
            capture = match.group(0)
            end = text.index(match.end())
934 935
            # regular expression must never match preprocessor-tokens!
            # TODO: Find a better solution here? e.g. static checking/re-mangling at compile time
936 937 938 939 940
            i = capture.find(BEGIN_TOKEN)
            if i >= 0:
                capture = capture[:i]
                end = i
            return Node(self, capture, True), text[end:]
941 942 943
        return None, text

    def __repr__(self):
944
        return escape_control_characters('/%s/' % self.regexp.pattern)
945 946


947 948
def withWS(parser_factory, wsL='', wsR='\s*'):
    """Syntactic Sugar for 'Series(Whitespace(wsL), parser_factory(), Whitespace(wsR))'.
949
    """
950 951 952 953 954 955 956 957 958 959 960 961
    if wsL and isinstance(wsL, str):
        wsL = Whitespace(wsL)
    if wsR and isinstance(wsR, str):
        wsR = Whitespace(wsR)
    if wsL and wsR:
        return Series(wsL, parser_factory(), wsR)
    elif wsL:
        return Series(wsL, parser_factory())
    elif wsR:
        return Series(parser_factory(), wsR)
    else:
        return parser_factory()
962

963

964 965 966
def RE(regexp, wsL='', wsR='\s*'):
    """Syntactic Sugar for 'Series(Whitespace(wsL), RegExp(regexp), Whitespace(wsR))'"""
    return withWS(lambda : RegExp(regexp), wsL, wsR)
967 968


969 970 971
def TKN(token, wsL='', wsR='\s*'):
    """Syntactic Sugar for 'Series(Whitespace(wsL), Token(token), Whitespace(wsR))'"""
    return withWS(lambda: Token(token), wsL, wsR)
972 973


974 975 976 977
class Whitespace(RegExp):
    """An variant of RegExp that signifies through its class name that it
    is a RegExp-parser for whitespace."""
    assert WHITESPACE_PTYPE == ":Whitespace"
eckhart's avatar
eckhart committed
978

eckhart's avatar
eckhart committed
979 980 981
    def __repr__(self):
        return '~'

982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001

########################################################################
#
# Containing parser classes, i.e. parsers that contain other parsers
# to which they delegate parsing
#
########################################################################


class UnaryOperator(Parser):
    """
    Base class of all unary parser operators, i.e. parser that contains
    one and only one other parser, like the optional parser for example.

    The UnaryOperator base class supplies __deepcopy__ and apply
    methods for unary parser operators. The __deepcopy__ method needs
    to be overwritten, however, if the constructor of a derived class
    has additional parameters.
    """

1002 1003
    def __init__(self, parser: Parser) -> None:
        super(UnaryOperator, self).__init__()
1004 1005 1006 1007 1008
        assert isinstance(parser, Parser), str(parser)
        self.parser = parser  # type: Parser

    def __deepcopy__(self, memo):
        parser = copy.deepcopy(self.parser, memo)
1009 1010
        duplicate = self.__class__(parser)
        duplicate.name = self.name
1011
        duplicate.ptype =   self.ptype
1012
        return duplicate
1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032

    def apply(self, func: Parser.ApplyFunc) -> bool:
        if super().apply(func):
            self.parser.apply(func)
            return True
        return False


class NaryOperator(Parser):
    """
    Base class of all Nnary parser operators, i.e. parser that
    contains one or more other parsers, like the alternative
    parser for example.

    The NnaryOperator base class supplies __deepcopy__ and apply methods
    for unary parser operators. The __deepcopy__ method needs to be
    overwritten, however, if the constructor of a derived class has
    additional parameters.
    """

1033 1034
    def __init__(self, *parsers: Parser) -> None:
        super().__init__()
1035 1036 1037 1038 1039
        assert all([isinstance(parser, Parser) for parser in parsers]), str(parsers)
        self.parsers = parsers  # type: Tuple[Parser, ...]

    def __deepcopy__(self, memo):
        parsers = copy.deepcopy(self.parsers, memo)
1040 1041
        duplicate = self.__class__(*parsers)
        duplicate.name = self.name
1042
        duplicate.ptype =   self.ptype
1043
        return duplicate
1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054

    def apply(self, func: Parser.ApplyFunc) -> bool:
        if super().apply(func):
            for parser in self.parsers:
                parser.apply(func)
            return True
        return False


class Option(UnaryOperator):
    r"""
1055
    Parser ``Option`` always matches, even if its child-parser
1056 1057
    did not match.

1058
    If the child-parser did not match ``Option`` returns a node
1059 1060
    with no content and does not move forward in the text.

1061
    If the child-parser did match, ``Option`` returns the a node
eckhart's avatar
eckhart committed
1062
    with the node returned by the child-parser as its single
1063 1064 1065
    child and the text at the position where the child-parser
    left it.

1066 1067
    Examples::

1068
        >>> number = Option(TKN('-')) + RegExp(r'\d+') + Option(RegExp(r'\.\d+'))
1069 1070 1071 1072 1073 1074
        >>> Grammar(number)('3.14159').content
        '3.14159'
        >>> Grammar(number)('3.14159').structure
        '(:Series (:Option) (:RegExp "3") (:Option (:RegExp ".14159")))'
        >>> Grammar(number)('-1').content
        '-1'
1075

1076
    EBNF-Notation: ``[ ... ]``
1077

1078
    EBNF-Example:  ``number = ["-"]  /\d+/  [ /\.\d+/ ]``
1079 1080
    """

1081 1082
    def __init__(self, parser: Parser) -> None:
        super().__init__(parser)
1083 1084
        # assert isinstance(parser, Parser)
        assert not isinstance(parser, Option), \
di68kap's avatar
di68kap committed
1085
            "Redundant nesting of options: %s" % (str(self.ptype), str(parser.name))
1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        node, text = self.parser(text)
        if node:
            return Node(self, node), text
        return Node(self, ()), text

    def __repr__(self):
        return '[' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.name else self.parser.repr) + ']'


class ZeroOrMore(Option):
    r"""
    `ZeroOrMore` applies a parser repeatedly as long as this parser
    matches. Like `Option` the `ZeroOrMore` parser always matches. In
    case of zero repetitions, the empty match `((), text)` is returned.

1104 1105
    Examples::

1106
        >>> sentence = ZeroOrMore(RE(r'\w+,?')) + TKN('.')
1107 1108 1109 1110
        >>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
        'Wo viel der Weisheit, da auch viel des Grämens.'
        >>> Grammar(sentence)('.').content  # an empty sentence also matches
        '.'
1111

1112
    EBNF-Notation: ``{ ... }``
1113

1114
    EBNF-Example:  ``sentence = { /\w+,?/ } "."``
1115 1116 1117 1118
    """

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        results = ()  # type: Tuple[Node, ...]
di68kap's avatar
di68kap committed
1119 1120
        n = len(text) + 1  # type: int
        infinite_loop_error = None  # type: Optional[Error]
1121 1122 1123 1124 1125 1126
        while text and len(text) < n:
            n = len(text)
            node, text = self.parser(text)
            if not node:
                break
            if len(text) == n:
Eckhart Arnold's avatar
Eckhart Arnold committed
1127
                node.errors.append(Error("Infinite loop!", node.pos, Error.MESSAGE))
eckhart's avatar
eckhart committed
1128
                infinite_loop_error = Error(dsl_error_msg(self, 'Infinite Loop encountered.'),
Eckhart Arnold's avatar
Eckhart Arnold committed
1129
                                            node.pos)
1130
            results += (node,)
di68kap's avatar
di68kap committed
1131 1132
        node = Node(self, results)
        if infinite_loop_error:
eckhart's avatar
eckhart committed
1133
            self.grammar.tree__.add_error(node, infinite_loop_error)
di68kap's avatar
di68kap committed
1134
        return node, text
1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146

    def __repr__(self):
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.name else self.parser.repr) + '}'


class OneOrMore(UnaryOperator):
    r"""
    `OneOrMore` applies a parser repeatedly as long as this parser
    matches. Other than `ZeroOrMore` which always matches, at least
    one match is required by `OneOrMore`.

1147 1148
    Examples::

1149
        >>> sentence = OneOrMore(RE(r'\w+,?')) + TKN('.')
1150 1151 1152 1153
        >>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
        'Wo viel der Weisheit, da auch viel des Grämens.'
        >>> str(Grammar(sentence)('.'))  # an empty sentence also matches
        ' <<< Error on "." | Parser did not match! Invalid source file?\n    Most advanced: None\n    Last match:    None; >>> '
1154

1155
    EBNF-Notation: ``{ ... }+``
1156

1157
    EBNF-Example:  ``sentence = { /\w+,?/ }+``
1158 1159
    """

1160 1161
    def __init__(self, parser: Parser) -> None:
        super().__init__(parser)
1162 1163
        assert not isinstance(parser, Option), \
            "Use ZeroOrMore instead of nesting OneOrMore and Option: " \
di68kap's avatar
di68kap committed
1164
            "%s(%s)" % (str(self.ptype), str(parser.name))
1165 1166 1167 1168

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
di68kap's avatar
di68kap committed
1169 1170
        n = len(text) + 1  # type: int
        infinite_loop_error = None  # type: Optional[Error]
1171 1172 1173 1174 1175 1176
        while text_ and len(text_) < n:
            n = len(text_)
            node, text_ = self.parser(text_)
            if not node:
                break
            if len(text_) == n:
Eckhart Arnold's avatar
Eckhart Arnold committed
1177
                node.errors.append(Error("Infinite loop!", node.pos, Error.MESSAGE))
eckhart's avatar
eckhart committed
1178
                infinite_loop_error = Error(dsl_error_msg(self, 'Infinite Loop encountered.'),
Eckhart Arnold's avatar
Eckhart Arnold committed
1179
                                            node.pos)
1180 1181 1182
            results += (node,)
        if results == ():
            return None, text
di68kap's avatar
di68kap committed
1183 1184
        node = Node(self, results)
        if infinite_loop_error:
eckhart's avatar
eckhart committed
1185
            self.grammar.tree__.add_error(node, infinite_loop_error)
di68kap's avatar
di68kap committed
1186
        return node, text_
1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197

    def __repr__(self):
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.name else self.parser.repr) + '}+'


class Series(NaryOperator):
    r"""
    Matches if each of a series of parsers matches exactly in the order of
    the series.

1198 1199
    Example::

1200
        >>> variable_name = RegExp('(?!\d)\w') + RE('\w*')
1201 1202 1203 1204
        >>> Grammar(variable_name)('variable_1').content
        'variable_1'
        >>> str(Grammar(variable_name)('1_variable'))
        ' <<< Error on "1_variable" | Parser did not match! Invalid source file?\n    Most advanced: None\n    Last match:    None; >>> '
1205

1206
    EBNF-Notation: ``... ...``    (sequence of parsers separated by a blank or new line)
1207

1208
    EBNF-Example:  ``series = letter letter_or_digit``
1209 1210 1211 1212
    """
    RX_ARGUMENT = re.compile(r'\s(\S)')
    NOPE = 1000

1213 1214
    def __init__(self, *parsers: Parser, mandatory: int = NOPE) -> None:
        super().__init__(*parsers)
1215 1216 1217 1218 1219 1220 1221 1222 1223 1224
        length = len(self.parsers)
        assert 1 <= length < Series.NOPE, \
            'Length %i of series exceeds maximum length of %i' % (length, Series.NOPE)
        if mandatory < 0:
            mandatory += length
        assert 0 <= mandatory < length or mandatory == Series.NOPE
        self.mandatory = mandatory

    def __deepcopy__(self, memo):
        parsers = copy.deepcopy(self.parsers, memo)
eckhart's avatar