parse.py 119 KB
Newer Older
eckhart's avatar
eckhart committed
2001
2002
2003
2004
2005
2006
2007

    EBNF-Notation: ``... ...``    (sequence of parsers separated by a blank or new line)

    EBNF-Example:  ``series = letter letter_or_digit``
    """
    RX_ARGUMENT = re.compile(r'\s(\S)')

2008
2009
    def __deepcopy__(self, memo):
        parsers = copy.deepcopy(self.parsers, memo)
2010
2011
        duplicate = self.__class__(*parsers, mandatory=self.mandatory,
                                   err_msgs=self.err_msgs, skip=self.skip)
2012
        copy_parser_attrs(self, duplicate)
2013
        return duplicate
2014

eckhart's avatar
eckhart committed
2015
    @cython.locals(pos=cython.int, reloc=cython.int)
2016
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2017
2018
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
2019
        error = None  # type: Optional[Error]
2020
        for pos, parser in enumerate(self.parsers):
2021
            node, text_ = parser(text_)
di68kap's avatar
di68kap committed
2022
            if node is None:
2023
2024
2025
                if pos < self.mandatory:
                    return None, text
                else:
eckhart's avatar
eckhart committed
2026
2027
2028
                    reloc = self.get_reentry_point(text_)
                    error, node, text_ = self.mandatory_violation(
                        text_, isinstance(parser, Lookahead), parser.repr, reloc)
2029
                    # check if parsing of the series can be resumed somewhere
2030
                    if reloc >= 0:
2031
                        nd, text_ = parser(text_)  # try current parser again
di68kap's avatar
di68kap committed
2032
                        if nd is not None:
2033
2034
2035
2036
2037
                            results += (node,)
                            node = nd
                    else:
                        results += (node,)
                        break
2038
            if node._result or not node.tag_name.startswith(':'):  # drop anonymous empty nodes
2039
                results += (node,)
eckhart's avatar
eckhart committed
2040
        # assert len(results) <= len(self.parsers) \
2041
        #        or len(self.parsers) >= len([p for p in results if p.tag_name != ZOMBIE_TAG])
eckhart's avatar
eckhart committed
2042
        ret_node = self._return_values(results)  # type: Node
2043
        if error and reloc < 0:
2044
            raise ParserError(ret_node.with_pos(self.grammar.document_length__ - len(text_)),
2045
                              text, error, first_throw=True)
eckhart's avatar
eckhart committed
2046
        return ret_node, text_
2047
2048
2049

    def __repr__(self):
        return " ".join([parser.repr for parser in self.parsers[:self.mandatory]]
2050
                        + (['§'] if self.mandatory != NO_MANDATORY else [])
2051
2052
2053
                        + [parser.repr for parser in self.parsers[self.mandatory:]])

    # The following operator definitions add syntactical sugar, so one can write:
2054
    # `RE('\d+') + Optional(RE('\.\d+)` instead of `Series(RE('\d+'), Optional(RE('\.\d+))`
2055
2056

    @staticmethod
eckhart's avatar
eckhart committed
2057
    def combined_mandatory(left: Parser, right: Parser) -> int:
2058
2059
2060
2061
2062
        """
        Returns the position of the first mandatory element (if any) when
        parsers `left` and `right` are joined to a sequence.
        """
        left_mandatory, left_length = (left.mandatory, len(left.parsers)) \
2063
2064
            if isinstance(left, Series) else (NO_MANDATORY, 1)
        if left_mandatory != NO_MANDATORY:
2065
            return left_mandatory
2066
2067
        right_mandatory = right.mandatory if isinstance(right, Series) else NO_MANDATORY
        if right_mandatory != NO_MANDATORY:
2068
            return right_mandatory + left_length
2069
        return NO_MANDATORY
2070
2071
2072

    def __add__(self, other: Parser) -> 'Series':
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
2073
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
2074
2075
2076
2077
2078
        return Series(*(self.parsers + other_parsers),
                      mandatory=self.combined_mandatory(self, other))

    def __radd__(self, other: Parser) -> 'Series':
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
2079
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
2080
2081
2082
2083
2084
        return Series(*(other_parsers + self.parsers),
                      mandatory=self.combined_mandatory(other, self))

    def __iadd__(self, other: Parser) -> 'Series':
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
2085
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
2086
2087
2088
2089
2090
        self.parsers += other_parsers
        self.mandatory = self.combined_mandatory(self, other)
        return self


eckhart's avatar
eckhart committed
2091
class Alternative(NaryParser):
2092
2093
2094
2095
2096
2097
    r"""
    Matches if one of several alternatives matches. Returns
    the first match.

    This parser represents the EBNF-operator "|" with the qualification
    that both the symmetry and the ambiguity of the EBNF-or-operator
2098
    are broken by selecting the first match.::
2099

2100
        # the order of the sub-expression matters!
2101
        >>> number = RE(r'\d+') | RE(r'\d+') + RE(r'\.') + RE(r'\d+')
2102
        >>> str(Grammar(number)("3.1416"))
2103
        '3 <<< Error on ".1416" | Parser stopped before end! Terminating parser. >>> '
2104

2105
        # the most selective expression should be put first:
2106
        >>> number = RE(r'\d+') + RE(r'\.') + RE(r'\d+') | RE(r'\d+')
2107
2108
        >>> Grammar(number)("3.1416").content
        '3.1416'
2109

2110
    EBNF-Notation: ``... | ...``
2111

2112
    EBNF-Example:  ``sentence = /\d+\.\d+/ | /\d+/``
2113
2114
    """

2115
    def __init__(self, *parsers: Parser) -> None:
2116
        super(Alternative, self).__init__(*parsers)
2117
2118
2119
2120
2121
        assert len(self.parsers) >= 1
        # only the last alternative may be optional. Could this be checked at compile time?
        assert all(not isinstance(p, Option) for p in self.parsers[:-1]), \
            "Parser-specification Error: only the last alternative may be optional!"

2122
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2123
2124
        for parser in self.parsers:
            node, text_ = parser(text)
di68kap's avatar
di68kap committed
2125
            if node is not None:
2126
2127
2128
2129
                return self._return_value(node), text_
                # return self._return_value(node if node._result or parser.pname else None), text_
                # return Node(self.tag_name,
                #             node if node._result or parser.pname else ()), text_
2130
2131
2132
        return None, text

    def __repr__(self):
2133
2134
        if self.pname:
            return ' | '.join(parser.repr for parser in self.parsers)
2135
2136
2137
2138
2139
2140
2141
        return '(' + ' | '.join(parser.repr for parser in self.parsers) + ')'

    def reset(self):
        super(Alternative, self).reset()
        return self

    # The following operator definitions add syntactical sugar, so one can write:
2142
2143
    # `RE('\d+') + RE('\.') + RE('\d+') | RE('\d+')` instead of:
    # `Alternative(Series(RE('\d+'), RE('\.'), RE('\d+')), RE('\d+'))`
2144
2145
2146

    def __or__(self, other: Parser) -> 'Alternative':
        other_parsers = cast('Alternative', other).parsers if isinstance(other, Alternative) \
2147
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
2148
2149
2150
2151
        return Alternative(*(self.parsers + other_parsers))

    def __ror__(self, other: Parser) -> 'Alternative':
        other_parsers = cast('Alternative', other).parsers if isinstance(other, Alternative) \
2152
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
2153
2154
2155
2156
        return Alternative(*(other_parsers + self.parsers))

    def __ior__(self, other: Parser) -> 'Alternative':
        other_parsers = cast('Alternative', other).parsers if isinstance(other, Alternative) \
2157
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
2158
2159
2160
2161
        self.parsers += other_parsers
        return self


Eckhart Arnold's avatar
Eckhart Arnold committed
2162
class AllOf(MandatoryNary):
2163
2164
2165
2166
2167
    """
    Matches if all elements of a list of parsers match. Each parser must
    match exactly once. Other than in a sequence, the order in which
    the parsers match is arbitrary, however.

2168
2169
    Example::

2170
        >>> prefixes = AllOf(TKN("A"), TKN("B"))
2171
2172
2173
2174
        >>> Grammar(prefixes)('A B').content
        'A B'
        >>> Grammar(prefixes)('B A').content
        'B A'
2175

2176
2177
2178
    Note: The semantics of the mandatory-parameter differs for `AllOf` from
    that of `Series`: Rather than the position of the sub-parser starting
    from which all following parsers cause the Series-parser to raise an
2179
2180
2181
2182
2183
    Error instead of returning a non-match, an error is raised if and only
    if the parsers up to (but not including the one at) the mandatory-position
    have already been exhausted, i.e. have already captured content for the
    AllOf-parser. Otherwise no error is raised, but just a non-match is
    returned.
2184

2185
    EBNF-Notation: ``<... ...>``    (sequence of parsers enclosed by angular brackets)
2186

2187
    EBNF-Example:  ``set = <letter letter_or_digit>``
2188
2189
    """

2190
2191
2192
2193
    def __init__(self, *parsers: Parser,
                 mandatory: int = NO_MANDATORY,
                 err_msgs: MessagesType = [],
                 skip: ResumeList = []) -> None:
2194
2195
        assert (not isinstance(parser, Option) and not isinstance(parser, OneOrMore)
                and not isinstance(parser, FlowParser) for parser in parsers)
2196
2197
        if len(parsers) == 1 and isinstance(parsers[0], Series):
            parsers = parsers[0].parsers
2198
2199
            if self.mandatory == NO_MANDATORY:
                self.mandatory = parsers[0].mandatory
2200
        assert len(parsers) > 1, "AllOf requires at least two sub-parsers."
eckhart's avatar
eckhart committed
2201
        super(AllOf, self).__init__(*parsers, mandatory=mandatory, err_msgs=err_msgs, skip=skip)
2202
        self.num_parsers = len(self.parsers)  # type: int
2203

2204
2205
2206
2207
    def __deepcopy__(self, memo):
        parsers = copy.deepcopy(self.parsers, memo)
        duplicate = self.__class__(*parsers, mandatory=self.mandatory,
                                   err_msgs=self.err_msgs, skip=self.skip)
di68kap's avatar
di68kap committed
2208
        duplicate.pname = self.pname
2209
        copy_parser_attrs(self, duplicate)
2210
        return duplicate
2211

2212
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2213
2214
2215
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
        parsers = list(self.parsers)  # type: List[Parser]
2216
        error = None  # type: Optional[Error]
2217
2218
2219
        while parsers:
            for i, parser in enumerate(parsers):
                node, text__ = parser(text_)
di68kap's avatar
di68kap committed
2220
                if node is not None:
eckhart's avatar
eckhart committed
2221
2222
                    if node._result or not node.tag_name.startswith(':'):
                        # drop anonymous empty nodes
2223
2224
                        results += (node,)
                        text_ = text__
2225
2226
2227
                    del parsers[i]
                    break
            else:
2228
                # TODO: Should mandatory-semantics be changed for AllOf to match that of Interleave???
2229
2230
2231
2232
2233
                for i, p in enumerate(self.parsers):
                    if p in parsers and i < self.mandatory:
                        return None, text
                # if self.num_parsers - len(parsers) < self.mandatory:
                #     return None, text
2234
2235
                reloc = self.get_reentry_point(text_)
                expected = '< ' + ' '.join([parser.repr for parser in parsers]) + ' >'
2236
                error, err_node, text_ = self.mandatory_violation(text_, False, expected, reloc)
2237
2238
2239
                results += (err_node,)
                if reloc < 0:
                    parsers = []
2240
        assert len(results) <= len(self.parsers) \
eckhart's avatar
eckhart committed
2241
            or len(self.parsers) >= len([p for p in results if p.tag_name != ZOMBIE_TAG])
eckhart's avatar
eckhart committed
2242
        nd = self._return_values(results)  # type: Node
2243
        if error and reloc < 0:
2244
            raise ParserError(nd.with_pos(self.grammar.document_length__ - len(text)),
2245
                              text, error, first_throw=True)
eckhart's avatar
eckhart committed
2246
        return nd, text_
2247
2248

    def __repr__(self):
2249
        return '< ' + ' '.join(parser.repr for parser in self.parsers) + ' >'
2250
2251


eckhart's avatar
eckhart committed
2252
class SomeOf(NaryParser):
2253
2254
2255
2256
2257
    """
    Matches if at least one element of a list of parsers match. No parser
    must match more than once . Other than in a sequence, the order in which
    the parsers match is arbitrary, however.

2258
2259
    Example::

2260
        >>> prefixes = SomeOf(TKN("A"), TKN("B"))
2261
2262
2263
2264
2265
2266
        >>> Grammar(prefixes)('A B').content
        'A B'
        >>> Grammar(prefixes)('B A').content
        'B A'
        >>> Grammar(prefixes)('B').content
        'B'
2267

2268
    EBNF-Notation: ``<... ...>``    (sequence of parsers enclosed by angular brackets)
2269

2270
    EBNF-Example:  ``set = <letter letter_or_digit>``
2271
2272
    """

2273
    def __init__(self, *parsers: Parser) -> None:
2274
        if len(parsers) == 1 and isinstance(parsers[0], Alternative):
2275
            parsers = parsers[0].parsers
2276
        assert len(parsers) > 1, "SomeOf requires at least two sub-parsers."
2277
2278
        assert (not isinstance(parser, Option) and not isinstance(parser, OneOrMore)
                and not isinstance(parser, FlowParser) for parser in parsers)
2279
        super(SomeOf, self).__init__(*parsers)
2280

2281
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2282
2283
2284
2285
2286
2287
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
        parsers = list(self.parsers)  # type: List[Parser]
        while parsers:
            for i, parser in enumerate(parsers):
                node, text__ = parser(text_)
di68kap's avatar
di68kap committed
2288
                if node is not None:
eckhart's avatar
eckhart committed
2289
2290
                    if node._result or not node.tag_name.startswith(':'):
                        # drop anonymous empty nodes
2291
2292
                        results += (node,)
                        text_ = text__
2293
2294
2295
2296
2297
2298
                    del parsers[i]
                    break
            else:
                parsers = []
        assert len(results) <= len(self.parsers)
        if results:
eckhart's avatar
eckhart committed
2299
            return self._return_values(results), text_
2300
2301
2302
2303
        else:
            return None, text

    def __repr__(self):
2304
        return '< ' + ' | '.join(parser.repr for parser in self.parsers) + ' >'
2305
2306


eckhart's avatar
eckhart committed
2307
def Unordered(parser: NaryParser) -> NaryParser:
2308
2309
2310
2311
2312
    """
    Returns an AllOf- or SomeOf-parser depending on whether `parser`
    is a Series (AllOf) or an Alternative (SomeOf).
    """
    if isinstance(parser, Series):
2313
        return AllOf(parser)
2314
    elif isinstance(parser, Alternative):
2315
        return SomeOf(parser)
2316
2317
2318
2319
    else:
        raise AssertionError("Unordered can take only Series or Alternative as parser.")


2320
class Interleave(MandatoryNary):
2321
2322
2323
2324
2325
2326
    """EXPERIMENTAL!!! NOT YET TESTED at all!!!"""

    def __init__(self, *parsers: Parser,
                 mandatory: int = NO_MANDATORY,
                 err_msgs: MessagesType = [],
                 skip: ResumeList = [],
2327
2328
2329
                 repetitions: Sequence[Tuple[int, int]] = ()) -> None:
        assert (not isinstance(parser, Option) and not isinstance(parser, OneOrMore)
                and not isinstance(parser, FlowParser) for parser in parsers)
2330
2331
        super(Interleave, self).__init__(
            *parsers, mandatory=mandatory, err_msgs=err_msgs, skip=skip)
2332
2333
2334
2335
        if len(repetitions) == 0:
            repetitions = [(1, 1)] * len(parsers)
        elif len(parsers) != len(repetitions):
            raise ValueError("Number of repetition-tuples unequal number of sub-parsers!")
2336
        self.repetitions = repetitions
2337
        self.non_mandatory = frozenset(parsers[i] for i in range(min(mandatory, len(parsers))))
2338
        self.parsers_set = frozenset(self.parsers)
2339
2340
2341
2342
2343
2344
2345
2346
2347

    def __deepcopy__(self, memo):
        parsers = copy.deepcopy(self.parsers, memo)
        duplicate = self.__class__(*parsers, mandatory=self.mandatory,
                                   err_msgs=self.err_msgs, skip=self.skip,
                                   repetitions=self.repetitions)
        duplicate.pname = self.pname
        copy_parser_attrs(self, duplicate)
        return duplicate
2348
2349
2350
2351

    def _parse(self, text: StringView) :
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
2352
        counter = [0] * len(self.parsers)
2353
2354
        consumed = set()  # type: Set[Parser]
        error = None  # type: Optional[Error]
2355
2356
        while True:
            for i, parser in enumerate(self.parsers):
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
                if parser not in consumed:
                    node, text__ = parser(text_)
                    if node is not None:
                        if node._result or not node.tag_name.startswith(':'):
                            # drop anonymous empty nodes
                            results += (node,)
                            text_ = text__
                        counter[i] += 1
                        if counter[i] >= self.repetitions[i][1]:
                            consumed.add(parser)
                        break
2368
            else:
2369
2370
2371
2372
2373
2374
2375
                for i, parser in enumerate(self.parsers):
                    if counter[i] >= self.repetitions[i][0]:
                        consumed.add(parser)
                if self.non_mandatory <= consumed:
                    if consumed == self.parsers_set:
                        break
                else:
2376
2377
                    return None, text
                reloc = self.get_reentry_point(text_)
2378
2379
                expected = ' ° '.join([parser.repr for parser in self.parsers])
                error, err_node, text_ = self.mandatory_violation(text_, False, expected, reloc)
2380
2381
                results += (err_node,)
                if reloc < 0:
2382
                    break
2383
2384
2385
2386
2387
2388
2389
        nd = self._return_values(results)  # type: Node
        if error and reloc < 0:
            raise ParserError(nd.with_pos(self.grammar.document_length__ - len(text)),
                              text, error, first_throw=True)
        return nd, text_

    def __repr__(self):
2390
        return ' ° '.join(parser.repr for parser in self.parsers)
2391
2392


2393
2394
########################################################################
#
eckhart's avatar
eckhart committed
2395
# Flow control parsers
2396
2397
2398
#
########################################################################

eckhart's avatar
eckhart committed
2399
class FlowParser(UnaryParser):
2400
    """
eckhart's avatar
eckhart committed
2401
    Base class for all flow parsers like Lookahead and Lookbehind.
2402
2403
2404
2405
2406
2407
    """
    def sign(self, bool_value) -> bool:
        """Returns the value. Can be overriden to return the inverted bool."""
        return bool_value


eckhart's avatar
eckhart committed
2408
2409
2410
2411
def Required(parser: Parser) -> Parser:
    return Series(parser, mandatory=0)


eckhart's avatar
eckhart committed
2412
# class Required(FlowParser):
2413
2414
2415
2416
#     """OBSOLETE. Use mandatory-parameter of Series-parser instead!
#     """
#     RX_ARGUMENT = re.compile(r'\s(\S)')
#
2417
#     def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2418
2419
2420
2421
2422
2423
#         node, text_ = self.parser(text)
#         if not node:
#             m = text.search(Required.RX_ARGUMENT)  # re.search(r'\s(\S)', text)
#             i = max(1, text.index(m.regs[1][0])) if m else 1
#             node = Node(self, text[:i])
#             text_ = text[i:]
eckhart's avatar
eckhart committed
2424
#             self.grammar.tree__.new_error(node,
2425
2426
#                                           '%s expected; "%s" found!' % (str(self.parser),
#                                           text[:10]), code=Error.MANDATORY_CONTINUATION)
2427
2428
2429
2430
2431
2432
#         return node, text_
#
#     def __repr__(self):
#         return '§' + self.parser.repr


eckhart's avatar
eckhart committed
2433
class Lookahead(FlowParser):
2434
2435
2436
2437
    """
    Matches, if the contained parser would match for the following text,
    but does not consume any text.
    """
2438
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2439
        node, _ = self.parser(text)
2440
        if self.sign(node is not None):
eckhart's avatar
eckhart committed
2441
2442
            # static analysis requires lookahead to be disabled at document end
            # or (self.grammar.static_analysis_pending__ and not text)):
2443
            return (EMPTY_NODE if self.anonymous else Node(self.tag_name, '')), text
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
        else:
            return None, text

    def __repr__(self):
        return '&' + self.parser.repr


class NegativeLookahead(Lookahead):
    """
    Matches, if the contained parser would *not* match for the following
    text.
    """
    def __repr__(self):
        return '!' + self.parser.repr

    def sign(self, bool_value) -> bool:
        return not bool_value


eckhart's avatar
eckhart committed
2463
class Lookbehind(FlowParser):
2464
2465
    """
    Matches, if the contained parser would match backwards. Requires
2466
    the contained parser to be a RegExp, _RE, PlainText or _Token parser.
2467
2468

    EXPERIMENTAL
2469
    """
2470
    def __init__(self, parser: Parser) -> None:
2471
2472
2473
        p = parser
        while isinstance(p, Synonym):
            p = p.parser
2474
        assert isinstance(p, RegExp) or isinstance(p, Token)
eckhart's avatar
eckhart committed
2475
        self.regexp = None
eckhart's avatar
eckhart committed
2476
        self.text = ''  # type: str
2477
        if isinstance(p, RegExp):
eckhart's avatar
eckhart committed
2478
2479
            self.regexp = cast(RegExp, p).regexp
        else:  # p is of type PlainText
2480
            self.text = cast(Token, p).text
2481
        super(Lookbehind, self).__init__(parser)
2482

2483
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
di68kap's avatar
di68kap committed
2484
        backwards_text = self.grammar.reversed__[text.__len__():]
eckhart's avatar
eckhart committed
2485
        if self.regexp is None:  # assert self.text is not None
di68kap's avatar
di68kap committed
2486
            does_match = backwards_text[:text.__len__()] == self.text
eckhart's avatar
eckhart committed
2487
2488
        else:  # assert self.regexp is not None
            does_match = backwards_text.match(self.regexp)
2489
2490
2491
2492
2493
        if self.sign(does_match):
            if self.drop_content:
                return EMPTY_NODE, text
            return Node(self.tag_name, ''), text
        return None, text
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512

    def __repr__(self):
        return '-&' + self.parser.repr


class NegativeLookbehind(Lookbehind):
    """
    Matches, if the contained parser would *not* match backwards. Requires
    the contained parser to be a RegExp-parser.
    """
    def __repr__(self):
        return '-!' + self.parser.repr

    def sign(self, bool_value) -> bool:
        return not bool(bool_value)


########################################################################
#
eckhart's avatar
eckhart committed
2513
# Capture and Retrieve parsers (for passing variables in the parser)
2514
2515
2516
2517
#
########################################################################


eckhart's avatar
eckhart committed
2518
class Capture(UnaryParser):
2519
2520
2521
2522
2523
    """
    Applies the contained parser and, in case of a match, saves the result
    in a variable. A variable is a stack of values associated with the
    contained parser's name. This requires the contained parser to be named.
    """
2524
2525
2526
2527
    def __init__(self, parser: Parser) -> None:
        assert not parser.drop_content, \
            "Cannot capture content of returned by parser, the content of which will be dropped!"
        super(Capture, self).__init__(parser)
2528

eckhart's avatar
eckhart committed
2529
2530
2531
    def _rollback(self):
        return self.grammar.variables__[self.pname].pop()

2532
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2533
        node, text_ = self.parser(text)
di68kap's avatar
di68kap committed
2534
        if node is not None:
di68kap's avatar
di68kap committed
2535
            assert self.pname, """Tried to apply an unnamed capture-parser!"""
2536
            assert not self.parser.drop_content, \
eckhart's avatar
eckhart committed
2537
2538
                "Cannot capture content of returned by parser, the content of which " \
                "will be dropped!"
eckhart's avatar
eckhart committed
2539
            self.grammar.variables__[self.pname].append(node.content)
di68kap's avatar
di68kap committed
2540
            location = self.grammar.document_length__ - text.__len__()
eckhart's avatar
eckhart committed
2541
            self.grammar.push_rollback__(location, self._rollback)  # lambda: stack.pop())
2542
2543
            # caching will be blocked by parser guard (see way above),
            # because it would prevent recapturing of rolled back captures
eckhart's avatar
eckhart committed
2544
            return self._return_value(node), text_
2545
2546
2547
2548
2549
2550
2551
        else:
            return None, text

    def __repr__(self):
        return self.parser.repr


2552
2553
2554
2555
2556
2557
MatchVariableFunc = Callable[[Union[StringView, str], List[str]], Optional[str]]
# (text, stack) -> value, where:
# text is the following text for be parsed
# stack is a stack of stored variables (for a particular symbol)
# and the return value is the matched text (which can be the empty string) or
# None, if no match occurred
2558
2559


2560
2561
2562
2563
2564
def last_value(text: Union[StringView, str], stack: List[str]) -> str:
    """Matches `text` with the most recent value on the capture stack.
    This is the default case when retrieving captured substrings."""
    value = stack[-1]
    return value if text.startswith(value) else None
2565
2566


2567
2568
2569
2570
2571
2572
2573
def optional_last_value(text: Union[StringView, str], stack: List[str]) -> str:
    """Matches `text` with the most recent value on the capture stack or
    with the empty string, i.e. `optional_match` never returns `None` but
    either the value on the stack or the empty string.
    Use case: Implement shorthand notation for matching tags, i.e.:
        Good Morning, Mrs. <emph>Smith</>!
    """
2574
    value = stack[-1]
2575
    return value if text.startswith(value) else ""
2576
2577


2578
2579
2580
2581
2582
2583
def matching_bracket(text: Union[StringView, str], stack: List[str]) -> str:
    """Returns a closing bracket for the opening bracket on the capture stack,
    i.e. if "[" was captured, "]" will be retrieved."""
    value = stack[-1]
    value = value.replace("(", ")").replace("[", "]").replace("{", "}").replace("<", ">")
    return value if text.startswith(value) else None
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594


class Retrieve(Parser):
    """
    Matches if the following text starts with the value of a particular
    variable. As a variable in this context means a stack of values,
    the last value will be compared with the following text. It will not
    be removed from the stack! (This is the difference between the
    `Retrieve` and the `Pop` parser.)
    The constructor parameter `symbol` determines which variable is
    used.
2595
2596
2597
2598

    Attributes:
        symbol: The parser that has stored the value to be retrieved, in
            other words: "the observed parser"
2599
        match_func: a procedure that through which the processing to the
eckhart's avatar
eckhart committed
2600
            retrieved symbols is channeled. In the simplest case it merely
2601
2602
            returns the last string stored by the observed parser. This can
            be (mis-)used to execute any kind of semantic action.
2603
2604
    """

2605
    def __init__(self, symbol: Parser, match_func: MatchVariableFunc = None) -> None:
2606
        super(Retrieve, self).__init__()
2607
        self.symbol = symbol
2608
        self.match = match_func if match_func else last_value
2609
2610

    def __deepcopy__(self, memo):
2611
        duplicate = self.__class__(self.symbol, self.match)
2612
        copy_parser_attrs(self, duplicate)
2613
        return duplicate
2614

2615
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2616
2617
2618
2619
2620
2621
2622
2623
2624
        # the following indirection allows the call() method to be called
        # from subclass without triggering the parser guard a second time
        return self.retrieve_and_match(text)

    def __repr__(self):
        return ':' + self.symbol.repr

    def retrieve_and_match(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """
2625
        Retrieves variable from stack through the match function passed to
2626
2627
2628
2629
2630
        the class' constructor and tries to match the variable's value with
        the following text. Returns a Node containing the value or `None`
        accordingly.
        """
        try:
di68kap's avatar
di68kap committed
2631
            stack = self.grammar.variables__[self.symbol.pname]
2632
            value = self.match(text, stack)
2633
        except (KeyError, IndexError):
di68kap's avatar
di68kap committed
2634
            node = Node(self.tag_name, '').with_pos(self.grammar.document_length__ - text.__len__())
eckhart's avatar
eckhart committed
2635
            self.grammar.tree__.new_error(
di68kap's avatar
di68kap committed
2636
                node, dsl_error_msg(self, "'%s' undefined or exhausted." % self.symbol.pname))
2637
            return node, text
2638
        if value is None:
2639
            return None, text
2640
2641
2642
        elif self.drop_content:
            return EMPTY_NODE, text[len(value):]
        return Node(self.tag_name, value), text[len(value):]
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655


class Pop(Retrieve):
    """
    Matches if the following text starts with the value of a particular
    variable. As a variable in this context means a stack of values,
    the last value will be compared with the following text. Other
    than the `Retrieve`-parser, the `Pop`-parser removes the value
    from the stack in case of a match.

    The constructor parameter `symbol` determines which variable is
    used.
    """
2656
2657
    def __init__(self, symbol: Parser, match_func: MatchVariableFunc = None) -> None:
        super(Pop, self).__init__(symbol, match_func)
eckhart's avatar
eckhart committed
2658
2659
2660

    def reset(self):
        super(Pop, self).reset()
eckhart's avatar
eckhart committed
2661
2662
2663
        self.values = []

    def __deepcopy__(self, memo):
2664
        duplicate = self.__class__(self.symbol, self.match)
2665
        copy_parser_attrs(self, duplicate)
eckhart's avatar
eckhart committed
2666
2667
2668
2669
        duplicate.values = self.values[:]
        return duplicate

    def _rollback(self):
2670
        self.grammar.variables__[self.symbol.pname].append(self.values.pop())
2671

2672
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
eckhart's avatar
eckhart committed
2673
        node, txt = self.retrieve_and_match(text)
di68kap's avatar
di68kap committed
2674
        if node is not None and not id(node) in self.grammar.tree__.error_nodes:
eckhart's avatar
eckhart committed
2675
            self.values.append(self.grammar.variables__[self.symbol.pname].pop())
di68kap's avatar
di68kap committed
2676
            location = self.grammar.document_length__ - text.__len__()
eckhart's avatar
eckhart committed
2677
            self.grammar.push_rollback__(location, self._rollback)  # lambda: stack.append(value))
2678
2679
2680
        return node, txt

    def __repr__(self):
2681
        return ':?' + self.symbol.repr if self.match.__name__.startswith('optional_') else '::'
2682
2683


2684
2685
2686
2687
2688
########################################################################
#
# Aliasing parser classes
#
########################################################################
2689
2690


eckhart's avatar
eckhart committed
2691
class Synonym(UnaryParser):
2692
2693
2694
2695
    r"""
    Simply calls another parser and encapsulates the result in
    another node if that parser matches.

2696
    This parser is needed to support synonyms in EBNF, e.g.::
2697

2698
2699
        jahr       = JAHRESZAHL
        JAHRESZAHL = /\d\d\d\d/
2700

2701
2702
    Otherwise the first line could not be represented by any parser
    class, in which case it would be unclear whether the parser
2703
    RegExp('\d\d\d\d') carries the name 'JAHRESZAHL' or 'jahr'.
2704
    """
2705
    def __init__(self, parser: Parser) -> None:
Eckhart Arnold's avatar
Eckhart Arnold committed
2706
        assert not parser.drop_content
2707
        super(Synonym, self).__init__(parser)
2708

2709
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
eckhart's avatar
eckhart committed
2710
2711
        # circumvent Parser.__call__ as an optimization (dangerous?)
        node, text = self.parser._parse(text)
di68kap's avatar
di68kap committed
2712
        if node is not None:
2713
2714
            if self.drop_content:
                return EMPTY_NODE, text
2715
2716
2717
2718
2719
            # if self.anonymous:
            #     if node.tag_name[0] != ':':  # implies != EMPTY_NODE
            #         node.tag_name = self.tag_name
            # else:
            if not self.anonymous:
2720
2721
2722
2723
                if node == EMPTY_NODE:
                    return Node(self.tag_name, ''), text
                node.tag_name = self.tag_name
        return node, text
2724
2725

    def __repr__(self):
di68kap's avatar
di68kap committed
2726
        return self.pname or self.parser.repr
2727
2728
2729
2730
2731
2732


class Forward(Parser):
    r"""
    Forward allows to declare a parser before it is actually defined.
    Forward declarations are needed for parsers that are recursively
2733
    nested, e.g.::
2734
2735
2736
2737
2738
2739
2740
2741
2742

        class Arithmetic(Grammar):
            '''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
2743
2744
2745
2746
            INTEGER    = RE('\\d+')
            factor     = INTEGER | TKN("(") + expression + TKN(")")
            term       = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
2747
            root__     = expression
2748
2749
2750
    """

    def __init__(self):
2751
        super(Forward, self).__init__()
eckhart's avatar
eckhart committed
2752
        self.parser = PARSER_PLACEHOLDER  # type: Parser
2753
2754
2755
2756
        self.cycle_reached = False

    def __deepcopy__(self, memo):
        duplicate = self.__class__()
di68kap's avatar
di68kap committed
2757
        # duplicate.pname = self.pname  # Forward-Parsers should never have a name!
2758
        duplicate.anonymous = self.anonymous
2759
        duplicate.tag_name = self.tag_name
2760
2761
        memo[id(self)] = duplicate
        parser = copy.deepcopy(self.parser, memo)
Eckhart Arnold's avatar
Eckhart Arnold committed
2762
        duplicate.parser = parser
eckhart's avatar
eckhart committed
2763
        duplicate.drop_content = parser.drop_content
2764
2765
2766
        return duplicate

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2767
2768
2769
2770
2771
2772
        """
        Overrides Parser.__call__, because Forward is not an independent parser
        but merely a redirects the call to another parser. Other then parser
        `Synonym`, which might be a meaningful marker for the syntax tree,
        parser Forward should never appear in the syntax tree.
        """
2773
2774
        return self.parser(text)

di68kap's avatar
di68kap committed
2775
2776
2777
2778
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        # for the exceptional case in class Synonym where the ._parse method is called directly
        return self.parser(text)

2779
2780
2781
2782
    def set_proxy(self, proxy: Optional[ParseFunc]):
        """`set_proxy` has no effects on Forward-objects!"""
        return

2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
    def __cycle_guard(self, func, alt_return):
        """
        Returns the value of `func()` or `alt_return` if a cycle has
        been reached (which can happen if `func` calls methods of
        child parsers).
        """
        if self.cycle_reached:
            return alt_return
        else:
            self.cycle_reached = True
            ret = func()
            self.cycle_reached = False
            return ret

    def __repr__(self):
        return self.__cycle_guard(lambda: repr(self.parser), '...')

    def __str__(self):
        return self.__cycle_guard(lambda: str(self.parser), '...')

2803
2804
    @property
    def repr(self) -> str:
di68kap's avatar
di68kap committed
2805
        """Returns the parser's name if it has a name or repr(self) if not."""
di68kap's avatar
di68kap committed
2806
        return self.parser.pname if self.parser.pname else self.__repr__()
2807

2808
2809
2810
2811
2812
2813
    def set(self, parser: Parser):
        """
        Sets the parser to which the calls to this Forward-object
        shall be delegated.
        """
        self.parser = parser
2814
        self.drop_content = parser.drop_content
2815

Eckhart Arnold's avatar
Eckhart Arnold committed
2816
    def sub_parsers(self) -> Tuple[Parser, ...]:
eckhart's avatar
eckhart committed
2817
2818
2819
        if is_parser_placeholder(self.parser):
            return tuple()
        return (self.parser,)