parse.py 140 KB
Newer Older
2001
        >>> number = Option(TKN('-')) + RegExp(r'\d+') + Option(RegExp(r'\.\d+'))
2002
2003
        >>> Grammar(number)('3.14159').content
        '3.14159'
eckhart's avatar
eckhart committed
2004
        >>> Grammar(number)('3.14159').as_sxpr()
eckhart's avatar
eckhart committed
2005
        '(:Series (:RegExp "3") (:RegExp ".14159"))'
2006
2007
        >>> Grammar(number)('-1').content
        '-1'
2008

2009
    EBNF-Notation: ``[ ... ]``
2010

2011
    EBNF-Example:  ``number = ["-"]  /\d+/  [ /\.\d+/ ]``
2012
2013
    """

2014
    def __init__(self, parser: Parser) -> None:
2015
        super(Option, self).__init__(parser)
2016

2017
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2018
        node, text = self.parser(text)
eckhart's avatar
eckhart committed
2019
        return self._return_value(node), text
2020

eckhart's avatar
eckhart committed
2021
2022
2023
    def is_optional(self) -> Optional[bool]:
        return True

2024
2025
    def __repr__(self):
        return '[' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
di68kap's avatar
di68kap committed
2026
                      and not self.parser.pname else self.parser.repr) + ']'
2027

eckhart's avatar
eckhart committed
2028
2029
    def static_analysis(self) -> List['AnalysisError']:
        errors = super().static_analysis()
2030
        if self.parser.is_optional():
eckhart's avatar
eckhart committed
2031
            errors.append(self.static_error(
2032
                "Redundant nesting of optional parser in " + self.location_info(),
eckhart's avatar
eckhart committed
2033
2034
                OPTIONAL_REDUNDANTLY_NESTED_WARNING))
        return errors
2035

2036
2037
2038
2039
2040
2041
2042

class ZeroOrMore(Option):
    r"""
    `ZeroOrMore` applies a parser repeatedly as long as this parser
    matches. Like `Option` the `ZeroOrMore` parser always matches. In
    case of zero repetitions, the empty match `((), text)` is returned.

2043
2044
    Examples::

2045
        >>> sentence = ZeroOrMore(RE(r'\w+,?')) + TKN('.')
2046
2047
2048
2049
        >>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
        'Wo viel der Weisheit, da auch viel des Grämens.'
        >>> Grammar(sentence)('.').content  # an empty sentence also matches
        '.'
2050
2051
        >>> forever = ZeroOrMore(RegExp(''))
        >>> Grammar(forever)('')  # infinite loops will automatically be broken
2052
        Node(':EMPTY', '')
2053

2054
    EBNF-Notation: ``{ ... }``
2055

2056
    EBNF-Example:  ``sentence = { /\w+,?/ } "."``
2057
2058
    """

eckhart's avatar
eckhart committed
2059
    @cython.locals(n=cython.int)
2060
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2061
        results = ()  # type: Tuple[Node, ...]
2062
2063
2064
2065
        length = text.__len__()
        n = length + 1  # type: int
        while length < n:  # text and length(text) < n:
            n = length
2066
            node, text = self.parser(text)
2067
            length = text.__len__()
di68kap's avatar
di68kap committed
2068
            if node is None:
2069
                break
2070
            if node._result or not node.tag_name.startswith(':'):  # drop anonymous empty nodes
2071
                results += (node,)
2072
            if length == n:
2073
                break  # avoid infinite loop
eckhart's avatar
eckhart committed
2074
2075
        nd = self._return_values(results)  # type: Node
        return nd, text
2076
2077
2078

    def __repr__(self):
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
di68kap's avatar
di68kap committed
2079
                      and not self.parser.pname else self.parser.repr) + '}'
2080
2081


eckhart's avatar
eckhart committed
2082
class OneOrMore(UnaryParser):
2083
2084
2085
2086
2087
    r"""
    `OneOrMore` applies a parser repeatedly as long as this parser
    matches. Other than `ZeroOrMore` which always matches, at least
    one match is required by `OneOrMore`.

2088
2089
    Examples::

2090
        >>> sentence = OneOrMore(RE(r'\w+,?')) + TKN('.')
2091
2092
2093
        >>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
        'Wo viel der Weisheit, da auch viel des Grämens.'
        >>> str(Grammar(sentence)('.'))  # an empty sentence also matches
2094
        ' <<< Error on "." | Parser "{/\\\\w+,?/ ~}+ `.` ~" did not match! >>> '
2095
2096
        >>> forever = OneOrMore(RegExp(''))
        >>> Grammar(forever)('')  # infinite loops will automatically be broken
2097
        Node(':EMPTY', '')
2098

2099
    EBNF-Notation: ``{ ... }+``
2100

2101
    EBNF-Example:  ``sentence = { /\w+,?/ }+``
2102
    """
2103
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2104
2105
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
2106
        match_flag = False
2107
2108
2109
2110
        length = text.__len__()
        n = length + 1  # type: int
        while length < n:  # text_ and len(text_) < n:
            n = length
2111
            node, text_ = self.parser(text_)
2112
            length = text_.__len__()
di68kap's avatar
di68kap committed
2113
            if node is None:
2114
                break
2115
            match_flag = True
2116
            if node._result or not node.tag_name.startswith(':'):  # drop anonymous empty nodes
2117
                results += (node,)
2118
            if length == n:
2119
                break  # avoid infinite loop
2120
        if not match_flag:
2121
            return None, text
eckhart's avatar
eckhart committed
2122
2123
        nd = self._return_values(results)  # type: Node
        return nd, text_
2124
2125
2126

    def __repr__(self):
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
di68kap's avatar
di68kap committed
2127
                      and not self.parser.pname else self.parser.repr) + '}+'
2128

eckhart's avatar
eckhart committed
2129
2130
    def static_analysis(self) -> List[AnalysisError]:
        errors = super().static_analysis()
2131
        if self.parser.is_optional():
eckhart's avatar
eckhart committed
2132
            errors.append(self.static_error(
2133
                "Use ZeroOrMore instead of nesting OneOrMore with an optional parser in " \
eckhart's avatar
eckhart committed
2134
2135
                + self.location_info(), BADLY_NESTED_OPTIONAL_PARSER))
        return errors
2136

2137

di68kap's avatar
di68kap committed
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
INFINITE = 2**30


def to_interleave(parser: Parser) -> Parser:
    """Converts a `Counted`-parser into an `Interleave`-parser. Any other
    parser is simply passed through."""
    if isinstance(parser, Counted):
        return Interleave(cast(Counted, parser).parser,
                          repetitions=[cast(Counted, parser).repetitions])
    return parser


class Counted(UnaryParser):
    """Counted applies a parser for a number of repetitions within a given range, i.e.
    the parser must at least for the lower bound number of repetitions in order to
2153
2154
2155
    match and it matches at most the upper bound number of repetitions.

    Examples:
di68kap's avatar
di68kap committed
2156

2157
    >>> A2_4 = Counted(Text('A'), (2, 4))
di68kap's avatar
di68kap committed
2158
    >>> A2_4
eckhart's avatar
eckhart committed
2159
    `A`{2,4}
2160
    >>> Grammar(A2_4)('AA').as_sxpr()
eckhart's avatar
eckhart committed
2161
    '(:Counted (:Text "A") (:Text "A"))'
2162
    >>> Grammar(A2_4)('AAAAA', complete_match=False).as_sxpr()
eckhart's avatar
eckhart committed
2163
    '(:Counted (:Text "A") (:Text "A") (:Text "A") (:Text "A"))'
2164
    >>> Grammar(A2_4)('A', complete_match=False).as_sxpr()
eckhart's avatar
eckhart committed
2165
    '(ZOMBIE__ `(Error (1040): Parser did not match!))'
2166
    >>> moves = OneOrMore(Counted(Text('A'), (1, 3)) + Counted(Text('B'), (1, 3)))
2167
2168
2169
    >>> result = Grammar(moves)('AAABABB')
    >>> result.tag_name, result.content
    (':OneOrMore', 'AAABABB')
2170
    >>> moves = Counted(Text('A'), (2, 3)) * Counted(Text('B'), (2, 3))
di68kap's avatar
di68kap committed
2171
    >>> moves
eckhart's avatar
eckhart committed
2172
    `A`{2,3} ° `B`{2,3}
2173
    >>> Grammar(moves)('AAABB').as_sxpr()
eckhart's avatar
eckhart committed
2174
    '(:Interleave (:Text "A") (:Text "A") (:Text "A") (:Text "B") (:Text "B"))'
di68kap's avatar
di68kap committed
2175
2176
2177

    While a Counted-parser could be treated as a special case of Interleave-parser,
    defining a dedicated class makes the purpose clearer and runs slightly faster.
di68kap's avatar
di68kap committed
2178
2179
2180
    """
    def __init__(self, parser: Parser, repetitions: Tuple[int, int]) -> None:
        super(Counted, self).__init__(parser)
eckhart's avatar
eckhart committed
2181
        self.repetitions = repetitions  # type: Tuple[int, int]
di68kap's avatar
di68kap committed
2182
2183
2184
2185
2186
2187
2188

    def __deepcopy__(self, memo):
        parser = copy.deepcopy(self.parser, memo)
        duplicate = self.__class__(parser, self.repetitions)
        copy_parser_base_attrs(self, duplicate)
        return duplicate

eckhart's avatar
eckhart committed
2189
    @cython.locals(n=cython.int, length=cython.int)
di68kap's avatar
di68kap committed
2190
2191
2192
    def _parse(self, text: StringView):
        results = ()  # Tuple[Node, ...]
        text_ = text
2193
        length = text_.__len__()
di68kap's avatar
di68kap committed
2194
2195
2196
2197
2198
        for _ in range(self.repetitions[0]):
            node, text_ = self.parser(text_)
            if node is None:
                return None, text
            results += (node,)
2199
2200
2201
2202
            n = length
            length = text_.__len__()
            if length == n:
                break  # avoid infinite loop
di68kap's avatar
di68kap committed
2203
2204
2205
2206
2207
        for _ in range(self.repetitions[1] - self.repetitions[0]):
            node, text_ = self.parser(text_)
            if node is None:
                break
            results += (node,)
2208
2209
2210
2211
            n = length
            length = text_.__len__()
            if length == n:
                break  # avoid infinite loop
di68kap's avatar
di68kap committed
2212
2213
2214
2215
2216
2217
        return self._return_values(results), text_

    def is_optional(self) -> Optional[bool]:
        return self.repetitions[0] == 0

    def __repr__(self):
2218
        return self.parser.repr + "{%i,%i}" % self.repetitions
di68kap's avatar
di68kap committed
2219

eckhart's avatar
eckhart committed
2220
    @cython.locals(a=cython.int, b=cython.int)
eckhart's avatar
eckhart committed
2221
2222
    def static_analysis(self) -> List['AnalysisError']:
        errors = super().static_analysis()
di68kap's avatar
di68kap committed
2223
2224
        a, b = self.repetitions
        if a < 0 or b < 0 or a > b or a > INFINITE or b > INFINITE:
eckhart's avatar
eckhart committed
2225
            errors.append(self.static_error(
di68kap's avatar
di68kap committed
2226
                'Repetition count [a=%i, b=%i] for parser %s violates requirement '
2227
                '0 <= a <= b <= infinity = 2^30' % (a, b, str(self)),
eckhart's avatar
eckhart committed
2228
2229
                BAD_REPETITION_COUNT))
        return errors
di68kap's avatar
di68kap committed
2230
2231


2232
MessagesType = List[Tuple[Union[str, RxPatternType, Callable], str]]
2233
NO_MANDATORY = 2**30
2234
2235


Eckhart Arnold's avatar
Eckhart Arnold committed
2236
class MandatoryNary(NaryParser):
2237
    r"""
2238
    Attributes:
eckhart's avatar
eckhart committed
2239
        mandatory:  Number of the element starting at which the element
2240
2241
                and all following elements are considered "mandatory". This
                means that rather than returning a non-match an error message
eckhart's avatar
eckhart committed
2242
                is issued. The default value is NO_MANDATORY, which means that
2243
2244
2245
                no elements are mandatory. NOTE: The semantics of the mandatory-
                parameter might change depending on the sub-class implementing
                it.
eckhart's avatar
eckhart committed
2246
        err_msgs:  A list of pairs of regular expressions (or simple
2247
2248
2249
                strings or boolean valued functions) and error messages
                that are chosen if the regular expression matches the text
                where the error occurred.
2250
        skip: A list of regular expressions. The rest of the text is searched for
eckhart's avatar
eckhart committed
2251
2252
                each of these. The closest match is the point where parsing will be
                resumed.
2253
    """
2254
    def __init__(self, *parsers: Parser,
2255
                 mandatory: int = NO_MANDATORY,
eckhart's avatar
eckhart committed
2256
                 err_msgs: MessagesType = [],
2257
                 skip: ResumeList = []) -> None:
Eckhart Arnold's avatar
Eckhart Arnold committed
2258
        super(MandatoryNary, self).__init__(*parsers)
2259
2260
2261
2262
        length = len(self.parsers)
        if mandatory < 0:
            mandatory += length

2263
        self.mandatory = mandatory  # type: int
Eckhart Arnold's avatar
Eckhart Arnold committed
2264
        self.err_msgs = err_msgs    # type: MessagesType
2265
        self.skip = skip            # type: ResumeList
2266

2267
2268
2269
2270
2271
2272
2273
    def __deepcopy__(self, memo):
        parsers = copy.deepcopy(self.parsers, memo)
        duplicate = self.__class__(*parsers, mandatory=self.mandatory,
                                   err_msgs=self.err_msgs, skip=self.skip)
        copy_parser_base_attrs(self, duplicate)
        return duplicate

eckhart's avatar
eckhart committed
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
    def get_reentry_point(self, text_: StringView) -> int:
        """Returns a reentry-point determined by the skip-list in `self.skip`.
        If no reentry-point was found or the skip-list ist empty, -1 is returned.
        """
        if self.skip:
            gr = self._grammar
            return reentry_point(text_, self.skip, gr.comment_rx__, gr.reentry_search_window__)
        return -1

    @cython.locals(i=cython.int, location=cython.int)
    def mandatory_violation(self,
                            text_: StringView,
                            failed_on_lookahead: bool,
                            expected: str,
                            reloc: int) -> Tuple[Error, Node, StringView]:
        """
2290
        Chooses the right error message in case of a mandatory violation and
eckhart's avatar
eckhart committed
2291
2292
2293
2294
        returns an error with this message, an error node, to which the error
        is attached, and the text segment where parsing is to continue.

        This is a helper function that abstracts functionality that is
eckhart's avatar
eckhart committed
2295
        needed by the Interleave- as well as the Series-parser.
eckhart's avatar
eckhart committed
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316

        :param parser: the grammar
        :param text_: the point, where the mandatory violation. As usual the
                string view represents the remaining text from this point.
        :param failed_on_lookahead: True if the violating parser was a
                Lookahead-Parser.
        :param expected:  the expected (but not found) text at this point.
        :param reloc: A position value that represents the reentry point for
                parsing after the error occurred.

        :return:   a tuple of an error object, a zombie node at the position
                where the mandatory violation occurred and to which the error
                object is attached and a string view for continuing the
                parsing process
        """
        grammar = self._grammar
        i = reloc if reloc >= 0 else 0
        location = grammar.document_length__ - len(text_)
        err_node = Node(ZOMBIE_TAG, text_[:i]).with_pos(location)
        found = text_[:10].replace('\n', '\\n ') + '...'
        for search, message in self.err_msgs:
2317
2318
2319
2320
2321
2322
            is_func = callable(search)           # search rule is a function: StringView -> bool
            is_str = isinstance(search, str)     # search rule is a simple string
            is_rxs = not is_func and not is_str  # search rule is a regular expression
            if (is_func and search(text_)) \
                    or (is_rxs and text_.match(search)) \
                    or (is_str and text_.startswith(search)):
eckhart's avatar
eckhart committed
2323
2324
2325
2326
2327
2328
                try:
                    msg = message.format(expected, found)
                    break
                except (ValueError, KeyError, IndexError) as e:
                    error = Error("Malformed error format string »{}« leads to »{}«"
                                  .format(message, str(e)),
2329
                                  location, MALFORMED_ERROR_STRING)
eckhart's avatar
eckhart committed
2330
2331
2332
2333
2334
2335
2336
                    grammar.tree__.add_error(err_node, error)
        else:
            if grammar.history_tracking__:
                pname = ':root'
                for pname, _ in reversed(grammar.call_stack__):
                    if not pname.startswith(':'):
                        break
2337
                msg = '%s expected by parser %s, »%s« found!' % (expected, repr(pname), found)
eckhart's avatar
eckhart committed
2338
2339
            else:
                msg = '%s expected, »%s« found!' % (expected, found)
2340
2341
        error = Error(msg, location, MANDATORY_CONTINUATION_AT_EOF
                      if (failed_on_lookahead and not text_) else MANDATORY_CONTINUATION)
eckhart's avatar
eckhart committed
2342
2343
        grammar.tree__.add_error(err_node, error)
        if reloc >= 0:
2344
2345
            # signal error to tracer directly, because this error is not raised!
            grammar.most_recent_error__ = ParserError(err_node, text_, error, first_throw=False)
eckhart's avatar
eckhart committed
2346
2347
        return error, err_node, text_[i:]

eckhart's avatar
eckhart committed
2348
2349
    def static_analysis(self) -> List['AnalysisError']:
        errors = super().static_analysis()
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
        msg = []
        length = len(self.parsers)
        if self.mandatory == NO_MANDATORY and self.err_msgs:
            msg.append('Custom error messages require that parameter "mandatory" is set!')
        elif self.mandatory == NO_MANDATORY and self.skip:
            msg.append('Search expressions for skipping text require parameter '
                       '"mandatory" to be set!')
        elif length == 0:
            msg.append('Number of elements %i is below minimum length of 1' % length)
        elif length >= NO_MANDATORY:
            msg.append('Number of elemnts %i of series exceeds maximum length of %i' \
                  % (length, NO_MANDATORY))
        elif not (0 <= self.mandatory < length or self.mandatory == NO_MANDATORY):
            msg.append('Illegal value %i for mandatory-parameter in a parser with %i elements!'
                  % (self.mandatory, length))
        if msg:
2366
2367
            msg.insert(0, 'Illegal configuration of mandatory Nary-parser '
                       + self.location_info())
eckhart's avatar
eckhart committed
2368
2369
            errors.append(self.static_error('\n'.join(msg), BAD_MANDATORY_SETUP))
        return errors
2370

eckhart's avatar
eckhart committed
2371

Eckhart Arnold's avatar
Eckhart Arnold committed
2372
class Series(MandatoryNary):
eckhart's avatar
eckhart committed
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
    r"""
    Matches if each of a series of parsers matches exactly in the order of
    the series.

    Example::

        >>> variable_name = RegExp(r'(?!\d)\w') + RE(r'\w*')
        >>> Grammar(variable_name)('variable_1').content
        'variable_1'
        >>> str(Grammar(variable_name)('1_variable'))
2383
        ' <<< Error on "1_variable" | Parser "/(?!\\\\d)\\\\w/ /\\\\w*/ ~" did not match! >>> '
eckhart's avatar
eckhart committed
2384
2385
2386
2387
2388
2389
2390

    EBNF-Notation: ``... ...``    (sequence of parsers separated by a blank or new line)

    EBNF-Example:  ``series = letter letter_or_digit``
    """
    RX_ARGUMENT = re.compile(r'\s(\S)')

eckhart's avatar
eckhart committed
2391
    @cython.locals(pos=cython.int, reloc=cython.int)
2392
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2393
2394
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
2395
        error = None  # type: Optional[Error]
2396
        for pos, parser in enumerate(self.parsers):
2397
            node, text_ = parser(text_)
di68kap's avatar
di68kap committed
2398
            if node is None:
2399
2400
2401
                if pos < self.mandatory:
                    return None, text
                else:
eckhart's avatar
eckhart committed
2402
2403
2404
                    reloc = self.get_reentry_point(text_)
                    error, node, text_ = self.mandatory_violation(
                        text_, isinstance(parser, Lookahead), parser.repr, reloc)
2405
                    # check if parsing of the series can be resumed somewhere
2406
                    if reloc >= 0:
2407
                        nd, text_ = parser(text_)  # try current parser again
di68kap's avatar
di68kap committed
2408
                        if nd is not None:
2409
2410
2411
2412
2413
                            results += (node,)
                            node = nd
                    else:
                        results += (node,)
                        break
2414
            if node._result or not node.tag_name.startswith(':'):  # drop anonymous empty nodes
2415
                results += (node,)
eckhart's avatar
eckhart committed
2416
        # assert len(results) <= len(self.parsers) \
2417
        #        or len(self.parsers) >= len([p for p in results if p.tag_name != ZOMBIE_TAG])
eckhart's avatar
eckhart committed
2418
        ret_node = self._return_values(results)  # type: Node
2419
        if error and reloc < 0:
2420
            raise ParserError(ret_node.with_pos(self.grammar.document_length__ - len(text_)),
2421
                              text, error, first_throw=True)
eckhart's avatar
eckhart committed
2422
        return ret_node, text_
2423
2424
2425

    def __repr__(self):
        return " ".join([parser.repr for parser in self.parsers[:self.mandatory]]
2426
                        + (['§'] if self.mandatory != NO_MANDATORY else [])
2427
2428
2429
                        + [parser.repr for parser in self.parsers[self.mandatory:]])

    # The following operator definitions add syntactical sugar, so one can write:
2430
    # `RE('\d+') + Optional(RE('\.\d+)` instead of `Series(RE('\d+'), Optional(RE('\.\d+))`
2431
2432

    @staticmethod
eckhart's avatar
eckhart committed
2433
    def combined_mandatory(left: Parser, right: Parser) -> int:
2434
2435
2436
2437
2438
        """
        Returns the position of the first mandatory element (if any) when
        parsers `left` and `right` are joined to a sequence.
        """
        left_mandatory, left_length = (left.mandatory, len(left.parsers)) \
2439
2440
            if isinstance(left, Series) else (NO_MANDATORY, 1)
        if left_mandatory != NO_MANDATORY:
2441
            return left_mandatory
2442
2443
        right_mandatory = right.mandatory if isinstance(right, Series) else NO_MANDATORY
        if right_mandatory != NO_MANDATORY:
2444
            return right_mandatory + left_length
2445
        return NO_MANDATORY
2446
2447
2448

    def __add__(self, other: Parser) -> 'Series':
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
2449
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
2450
2451
2452
2453
2454
        return Series(*(self.parsers + other_parsers),
                      mandatory=self.combined_mandatory(self, other))

    def __radd__(self, other: Parser) -> 'Series':
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
2455
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
2456
2457
2458
2459
2460
        return Series(*(other_parsers + self.parsers),
                      mandatory=self.combined_mandatory(other, self))

    def __iadd__(self, other: Parser) -> 'Series':
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
2461
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
2462
2463
2464
2465
2466
        self.parsers += other_parsers
        self.mandatory = self.combined_mandatory(self, other)
        return self


2467
2468
2469
2470
def starting_string(parser: Parser) -> str:
    """If parser starts with a fixed string, this will be returned.
    """
    # keep track of already visited parsers to avoid infinite circles
2471
    been_there = parser.grammar.static_analysis_caches__.setdefault('starting_strings', dict())  # type: Dict[Parser, str]
2472
2473
2474

    def find_starting_string(p: Parser) -> str:
        nonlocal been_there
2475
2476
2477
2478
        if p in been_there:
            return been_there[p]
        else:
            been_there[p] = ""
2479
2480
            if isinstance(p, Text):
                been_there[p] = cast(Text, p).text
2481
            elif isinstance(p, Series) or isinstance(p, Alternative):
2482
                been_there[p] = find_starting_string(cast(NaryParser, p).parsers[0])
2483
2484
            elif isinstance(p, Synonym) or isinstance(p, OneOrMore) \
                    or isinstance(p, Lookahead):
2485
                been_there[p] = find_starting_string(cast(UnaryParser, p).parser)
2486
2487
2488
            elif isinstance(p, Counted):
                counted = cast(Counted, p)  # type: Counted
                if not counted.is_optional():
2489
                    been_there[p] = find_starting_string(counted.parser)
2490
2491
2492
            elif isinstance(p, Interleave):
                interleave = cast(Interleave, p)
                if interleave.repetitions[0][0] >= 1:
2493
2494
                    been_there[p] = find_starting_string(interleave.parsers[0])
            return been_there[p]
2495
    return find_starting_string(parser)
2496
2497


eckhart's avatar
eckhart committed
2498
class Alternative(NaryParser):
2499
2500
2501
2502
2503
2504
    r"""
    Matches if one of several alternatives matches. Returns
    the first match.

    This parser represents the EBNF-operator "|" with the qualification
    that both the symmetry and the ambiguity of the EBNF-or-operator
2505
    are broken by selecting the first match.::
2506

2507
        # the order of the sub-expression matters!
2508
        >>> number = RE(r'\d+') | RE(r'\d+') + RE(r'\.') + RE(r'\d+')
2509
        >>> str(Grammar(number)("3.1416"))
2510
        '3 <<< Error on ".1416" | Parser stopped before end! Terminating parser. >>> '
2511

2512
        # the most selective expression should be put first:
2513
        >>> number = RE(r'\d+') + RE(r'\.') + RE(r'\d+') | RE(r'\d+')
2514
2515
        >>> Grammar(number)("3.1416").content
        '3.1416'
2516

2517
    EBNF-Notation: ``... | ...``
2518

2519
    EBNF-Example:  ``number = /\d+\.\d+/ | /\d+/``
2520
    """
2521
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2522
2523
        for parser in self.parsers:
            node, text_ = parser(text)
di68kap's avatar
di68kap committed
2524
            if node is not None:
2525
2526
2527
2528
                return self._return_value(node), text_
                # return self._return_value(node if node._result or parser.pname else None), text_
                # return Node(self.tag_name,
                #             node if node._result or parser.pname else ()), text_
2529
2530
2531
        return None, text

    def __repr__(self):
2532
2533
        if self.pname:
            return ' | '.join(parser.repr for parser in self.parsers)
2534
2535
        return '(' + ' | '.join(parser.repr for parser in self.parsers) + ')'

2536
2537
2538
    # def reset(self):
    #     super(Alternative, self).reset()
    #     return self
2539
2540

    # The following operator definitions add syntactical sugar, so one can write:
2541
2542
    # `RE('\d+') + RE('\.') + RE('\d+') | RE('\d+')` instead of:
    # `Alternative(Series(RE('\d+'), RE('\.'), RE('\d+')), RE('\d+'))`
2543
2544
2545

    def __or__(self, other: Parser) -> 'Alternative':
        other_parsers = cast('Alternative', other).parsers if isinstance(other, Alternative) \
2546
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
2547
2548
2549
2550
        return Alternative(*(self.parsers + other_parsers))

    def __ror__(self, other: Parser) -> 'Alternative':
        other_parsers = cast('Alternative', other).parsers if isinstance(other, Alternative) \
2551
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
2552
2553
2554
2555
        return Alternative(*(other_parsers + self.parsers))

    def __ior__(self, other: Parser) -> 'Alternative':
        other_parsers = cast('Alternative', other).parsers if isinstance(other, Alternative) \
2556
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
2557
2558
2559
        self.parsers += other_parsers
        return self

eckhart's avatar
eckhart committed
2560
2561
    def static_analysis(self) -> List['AnalysisError']:
        errors = super().static_analysis()
2562
2563
        if len(set(self.parsers)) != len(self.parsers):
            errors.append(self.static_error(
2564
                'Duplicate parsers in ' + self.location_info(),
2565
                DUPLICATE_PARSERS_IN_ALTERNATIVE))
2566
        if not all(not p.is_optional() for p in self.parsers[:-1]):
2567
2568
2569
            for i, p in enumerate(self.parsers):
                if p.is_optional():
                    break
2570
            errors.append(self.static_error(
2571
                "Parser-specification Error in " + self.location_info()
2572
2573
2574
                + "\nOnly the very last alternative may be optional! "
                + 'Parser "%s" at position %i out of %i is optional'
                %(p.tag_name, i + 1, len(self.parsers)),
2575
                BAD_ORDER_OF_ALTERNATIVES))
2576

2577
2578
2579
        # check for errors like "A" | "AB" where "AB" would never be reached,
        # because a substring at the beginning is already caught by an earlier
        # alternative
2580
2581
        # WARNING: This can become time-consuming!!!
        # EXPERIMENTAL
2582
2583

        def does_preempt(start, parser):
2584
2585
            cst = self.grammar(start, parser, complete_match=False)
            return not cst.errors and len(cst) >= 1
2586

2587
2588
2589
2590
        for i in range(2, len(self.parsers)):
            fixed_start = starting_string(self.parsers[i])
            if fixed_start:
                for k in range(i):
2591
                    if does_preempt(fixed_start, self.parsers[k]):
2592
                        errors.append(self.static_error(
2593
2594
2595
2596
                            "Parser-specification Error in " + self.location_info()
                            + "\nAlternative %i will never be reached, because its starting-"
                            'string "%s" is already captured by earlier alternative %i !'
                            % (i + 1, fixed_start, k + 1), BAD_ORDER_OF_ALTERNATIVES))
eckhart's avatar
eckhart committed
2597
        return errors
2598

2599

eckhart's avatar
eckhart committed
2600
class Interleave(MandatoryNary):
2601
    r"""Parse elements in arbitrary order.
2602

eckhart's avatar
eckhart committed
2603
    Examples::
2604
        >>> prefixes = TKN("A") * TKN("B")
2605
2606
2607
2608
        >>> Grammar(prefixes)('A B').content
        'A B'
        >>> Grammar(prefixes)('B A').content
        'B A'
2609

eckhart's avatar
eckhart committed
2610
        >>> prefixes = Interleave(TKN("A"), TKN("B"), repetitions=((0, 1), (0, 1)))
2611
2612
2613
2614
2615
2616
        >>> Grammar(prefixes)('A B').content
        'A B'
        >>> Grammar(prefixes)('B A').content
        'B A'
        >>> Grammar(prefixes)('B').content
        'B'
2617
2618
2619
2620

    EBNF-Notation: ``... ° ...``

    EBNF-Example:  ``float =  { /\d/ }+ ° /\./``
2621
2622
    """

2623
2624
2625
2626
    def __init__(self, *parsers: Parser,
                 mandatory: int = NO_MANDATORY,
                 err_msgs: MessagesType = [],
                 skip: ResumeList = [],
2627
                 repetitions: Sequence[Tuple[int, int]] = ()) -> None:
2628
2629
        super(Interleave, self).__init__(
            *parsers, mandatory=mandatory, err_msgs=err_msgs, skip=skip)
2630
2631
2632
2633
        if len(repetitions) == 0:
            repetitions = [(1, 1)] * len(parsers)
        elif len(parsers) != len(repetitions):
            raise ValueError("Number of repetition-tuples unequal number of sub-parsers!")
2634
        self.repetitions = repetitions
2635
        self.non_mandatory = frozenset(parsers[i] for i in range(min(mandatory, len(parsers))))
2636
        self.parsers_set = frozenset(self.parsers)
2637
2638
2639
2640
2641
2642

    def __deepcopy__(self, memo):
        parsers = copy.deepcopy(self.parsers, memo)
        duplicate = self.__class__(*parsers, mandatory=self.mandatory,
                                   err_msgs=self.err_msgs, skip=self.skip,
                                   repetitions=self.repetitions)
2643
        copy_parser_base_attrs(self, duplicate)
2644
        return duplicate
2645

eckhart's avatar
eckhart committed
2646
    @cython.locals(i=cython.int, n=cython.int, length=cython.int, reloc=cython.int)
di68kap's avatar
di68kap committed
2647
    def _parse(self, text: StringView):
2648
2649
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
2650
        counter = [0] * len(self.parsers)
2651
2652
        consumed = set()  # type: Set[Parser]
        error = None  # type: Optional[Error]
2653
        length = text_.__len__()
2654
        while True:
di68kap's avatar
di68kap committed
2655
            # there is an order of testing, but no promise about the order of testing, here!
2656
            for i, parser in enumerate(self.parsers):
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
                if parser not in consumed:
                    node, text__ = parser(text_)
                    if node is not None:
                        if node._result or not node.tag_name.startswith(':'):
                            # drop anonymous empty nodes
                            results += (node,)
                            text_ = text__
                        counter[i] += 1
                        if counter[i] >= self.repetitions[i][1]:
                            consumed.add(parser)
                        break
2668
            else:
2669
2670
2671
2672
2673
2674
2675
                for i, parser in enumerate(self.parsers):
                    if counter[i] >= self.repetitions[i][0]:
                        consumed.add(parser)
                if self.non_mandatory <= consumed:
                    if consumed == self.parsers_set:
                        break
                else:
2676
2677
                    return None, text
                reloc = self.get_reentry_point(text_)
2678
2679
                expected = ' ° '.join([parser.repr for parser in self.parsers])
                error, err_node, text_ = self.mandatory_violation(text_, False, expected, reloc)
2680
2681
                results += (err_node,)
                if reloc < 0:
2682
                    break
2683
2684
2685
2686
            n = length
            length = text_.__len__()
            if length == n:
                break  # avoid infinite loop
2687
2688
2689
2690
2691
2692
        nd = self._return_values(results)  # type: Node
        if error and reloc < 0:
            raise ParserError(nd.with_pos(self.grammar.document_length__ - len(text)),
                              text, error, first_throw=True)
        return nd, text_

eckhart's avatar
eckhart committed
2693
2694
2695
    def is_optional(self) -> Optional[bool]:
        return all(r[0] == 0 for r in self.repetitions)

2696
    def __repr__(self):
2697
2698
2699
2700
2701
        def rep(parser: Parser) -> str:
            return '(' + parser.repr + ')' \
                if isinstance(parser, Series) or isinstance(parser, Alternative) else parser.repr

        return ' ° '.join(rep(parser) for parser in self.parsers)
2702

di68kap's avatar
di68kap committed
2703
2704
2705
    def _prepare_combined(self, other: Parser) -> Tuple[Tuple[Parser], int, List[Tuple[int, int]]]:
        """Returns the other's parsers and repetitions if `other` is an Interleave-parser,
        otherwise returns ((other,),), [(1, 1])."""
di68kap's avatar
di68kap committed
2706
        other = to_interleave(other)
di68kap's avatar
di68kap committed
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
        other_parsers = cast('Interleave', other).parsers if isinstance(other, Interleave) \
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
        other_repetitions = cast('Interleave', other).repetitions \
            if isinstance(other, Interleave) else [(1, 1),]
        other_mandatory = cast('Interleave', other).mandatory \
            if isinstance(other, Interleave) else NO_MANDATORY
        if other_mandatory == NO_MANDATORY:
            mandatory = self.mandatory
            parsers = self.parsers + other_parsers
            repetitions = self.repetitions + other_repetitions
        elif self.mandatory == NO_MANDATORY:
            mandatory = other_mandatory
            parsers = other_parsers + self.parsers
            repetitions = other_repetitions + self.repetitions
        else:
            mandatory = self.mandatory + other_mandatory
            parsers = self.parsers[:self.mandatory] + other_parsers[:other_mandatory] \
                + self.parsers[self.mandatory:] + other_parsers[other_mandatory:]
            repetitions = self.repetitions[:self.mandatory] + other_repetitions[:other_mandatory] \
                + self.repetitions[self.mandatory:] + other_repetitions[other_mandatory:]
        return parsers, mandatory, repetitions

    def __mul__(self, other: Parser) -> 'Interleave':
        parsers, mandatory, repetitions = self._prepare_combined(other)
        return Interleave(*parsers, mandatory=mandatory, repetitions=repetitions)

    def __rmul__(self, other: Parser) -> 'Interleave':
        parsers, mandatory, repetitions = self._prepare_combined(other)
        return Interleave(*parsers, mandatory=mandatory, repetitions=repetitions)

    def __imul__(self, other: Parser) -> 'Interleave':
        parsers, mandatory, repetitions = self._prepare_combined(other)
        self.parsers = parsers
        self.mandatory = mandatory
        self.repetitions = repetitions
        return self

eckhart's avatar
eckhart committed
2744
    @cython.locals(a=cython.int, b=cython.int)
eckhart's avatar
eckhart committed
2745
    def static_analysis(self) -> List['AnalysisError']:
2746
        # assert len(set(parsers)) == len(parsers)  # commented out, because this could make sense
eckhart's avatar
eckhart committed
2747
        errors = super().static_analysis()
2748
        if not all(not parser.is_optional() and not isinstance(parser, FlowParser)
2749
                   for parser in self.parsers):
eckhart's avatar
eckhart committed
2750
            errors.append(self.static_error(
2751
2752
                "Flow-operators and optional parsers are neither allowed "
                "nor needed in an interleave-parser " + self.location_info(),
eckhart's avatar
eckhart committed
2753
                BADLY_NESTED_OPTIONAL_PARSER))
di68kap's avatar
di68kap committed
2754
2755
        for parser, (a, b) in zip(self.parsers, self.repetitions):
            if a < 0 or b < 0 or a > b or a > INFINITE or b > INFINITE:
eckhart's avatar
eckhart committed
2756
                errors.append(self.static_error(
di68kap's avatar
di68kap committed
2757
                    'Repetition count [a=%i, b=%i] for parser %s violates requirement '
2758
                    '0 <= a <= b <= infinity = 2^30' % (a, b, str(parser)),
eckhart's avatar
eckhart committed
2759
2760
                    BAD_REPETITION_COUNT))
        return errors
2761

2762

2763
2764
########################################################################
#
eckhart's avatar
eckhart committed
2765
# Flow control parsers
2766
2767
2768
#
########################################################################

eckhart's avatar
eckhart committed
2769
class FlowParser(UnaryParser):
2770
    """
eckhart's avatar
eckhart committed
2771
    Base class for all flow parsers like Lookahead and Lookbehind.
2772
2773
2774
2775
2776
2777
    """
    def sign(self, bool_value) -> bool:
        """Returns the value. Can be overriden to return the inverted bool."""
        return bool_value


eckhart's avatar
eckhart committed
2778
2779
2780
2781
def Required(parser: Parser) -> Parser:
    return Series(parser, mandatory=0)


eckhart's avatar
eckhart committed
2782
class Lookahead(FlowParser):
2783
2784
2785
2786
    """
    Matches, if the contained parser would match for the following text,
    but does not consume any text.
    """
2787
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2788
        node, _ = self.parser(text)
2789
        if self.sign(node is not None):
2790
            # TODO: Delete following comment
eckhart's avatar
eckhart committed
2791
2792
            # static analysis requires lookahead to be disabled at document end
            # or (self.grammar.static_analysis_pending__ and not text)):
2793
            return (EMPTY_NODE if self.anonymous else Node(self.tag_name, '')), text
2794
2795
2796
2797
2798
2799
        else:
            return None, text

    def __repr__(self):
        return '&' + self.parser.repr

eckhart's avatar
eckhart committed
2800
2801
    def static_analysis(self) -> List[AnalysisError]:
        errors = super().static_analysis()
eckhart's avatar
eckhart committed
2802
        if self.parser.is_optional():
eckhart's avatar
eckhart committed
2803
            errors.append((self.pname, self, Error(
eckhart's avatar
eckhart committed
2804
2805
                'Lookahead %s does not make sense with optional parser "%s"!' \
                % (self.pname, str(self.parser)),
eckhart's avatar
eckhart committed
2806
2807
                0, LOOKAHEAD_WITH_OPTIONAL_PARSER)))
        return errors
eckhart's avatar
eckhart committed
2808

2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821

class NegativeLookahead(Lookahead):
    """
    Matches, if the contained parser would *not* match for the following
    text.
    """
    def __repr__(self):
        return '!' + self.parser.repr

    def sign(self, bool_value) -> bool:
        return not bool_value


eckhart's avatar
eckhart committed
2822
class Lookbehind(FlowParser):
2823
2824
    """
    Matches, if the contained parser would match backwards. Requires
2825
    the contained parser to be a RegExp, _RE, Text parser.
2826
2827

    EXPERIMENTAL
2828
    """
2829
    def __init__(self, parser: Parser) -> None:
2830
2831
2832
        p = parser
        while isinstance(p, Synonym):
            p = p.parser
2833
        assert isinstance(p, RegExp) or isinstance(p, Text)
eckhart's avatar
eckhart committed
2834
        self.regexp = None
eckhart's avatar
eckhart committed
2835
        self.text = ''  # type: str
2836
        if isinstance(p, RegExp):
eckhart's avatar
eckhart committed
2837
            self.regexp = cast(RegExp, p).regexp
2838
2839
        else:  # p is of type Text
            self.text = cast(Text, p).text
2840
        super(Lookbehind, self).__init__(parser)
2841

2842
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
di68kap's avatar
di68kap committed
2843
        backwards_text = self.grammar.reversed__[text.__len__():]
eckhart's avatar
eckhart committed
2844
        if self.regexp is None:  # assert self.text is not None
di68kap's avatar
di68kap committed
2845
            does_match = backwards_text[:text.__len__()] == self.text
eckhart's avatar
eckhart committed
2846
2847
        else:  # assert self.regexp is not None
            does_match = backwards_text.match(self.regexp)
2848
2849
2850
2851
2852
        if self.sign(does_match):
            if self.drop_content:
                return EMPTY_NODE, text
            return Node(self.tag_name, ''), text
        return None, text
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871

    def __repr__(self):
        return '-&' + self.parser.repr


class NegativeLookbehind(Lookbehind):
    """
    Matches, if the contained parser would *not* match backwards. Requires
    the contained parser to be a RegExp-parser.
    """
    def __repr__(self):
        return '-!' + self.parser.repr

    def sign(self, bool_value) -> bool:
        return not bool(bool_value)


########################################################################
#
eckhart's avatar
eckhart committed
2872
# Capture and Retrieve parsers (for passing variables in the parser)
2873
2874
2875
2876
#
########################################################################


eckhart's avatar
eckhart committed
2877
class Capture(UnaryParser):
2878
2879
2880
2881
2882
    """
    Applies the contained parser and, in case of a match, saves the result
    in a variable. A variable is a stack of values associated with the
    contained parser's name. This requires the contained parser to be named.
    """
2883
2884
    def __init__(self, parser: Parser) -> None:
        super(Capture, self).__init__(parser)
2885

eckhart's avatar
eckhart committed
2886
2887
2888
    def _rollback(self):
        return self.grammar.variables__[self.pname].pop()

2889
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
2890
        node, text_ = self.parser(text)
di68kap's avatar
di68kap committed
2891
        if node is not None:
di68kap's avatar
di68kap committed
2892
            assert self.pname, """Tried to apply an unnamed capture-parser!"""
2893
            assert not self.parser.drop_content, \
2894
                "Cannot capture content from parsers that drop content!"
eckhart's avatar
eckhart committed
2895
            self.grammar.variables__[self.pname].append(node.content)
di68kap's avatar
di68kap committed
2896
            location = self.grammar.document_length__ - text.__len__()
eckhart's avatar
eckhart committed
2897
            self.grammar.push_rollback__(location, self._rollback)  # lambda: stack.pop())
2898
2899
            # caching will be blocked by parser guard (see way above),
            # because it would prevent recapturing of rolled back captures
eckhart's avatar
eckhart committed
2900
            return self._return_value(node), text_
2901
2902
2903
2904
2905
2906
        else:
            return None, text

    def __repr__(self):
        return self.parser.repr

eckhart's avatar
eckhart committed
2907
2908
    def static_analysis(self) -> List[AnalysisError]:
        errors = super().static_analysis()
2909
2910
2911
        if not self.pname:
            errors.append((self.pname, self, Error(
                'Capture only works as named parser! Error in parser: ' + str(self),
2912
                0, CAPTURE_WITHOUT_PARSERNAME
2913
            )))
2914
        if self.parser.apply(lambda plist: plist[-1].drop_content):
2915
2916
2917
            errors.append((self.pname, self, Error(
                'Captured symbol "%s" contains parsers that drop content, '
                'which can lead to unintended results!' % (self.<