parse.py 113 KB
Newer Older
eckhart's avatar
eckhart committed
1001
1002
1003
                self.comment_rx__ = re.compile(self.__class__.COMMENT__)
            else:
                self.comment_rx__ = RX_NEVER_MATCH
1004
        else:
eckhart's avatar
eckhart committed
1005
1006
            assert ((self.__class__.COMMENT__
                     and self.__class__.COMMENT__ == self.comment_rx__.pattern)
eckhart's avatar
eckhart committed
1007
                    or (not self.__class__.COMMENT__ and self.comment_rx__ == RX_NEVER_MATCH))
1008
        self.start_parser__ = None             # type: Optional[Parser]
1009
1010
        self._dirty_flag__ = False             # type: bool
        self.memoization__ = True              # type: bool
1011
1012
        self.history_tracking__ = False        # type: bool
        self.resume_notices__ = False          # type: bool
1013
        self.flatten_tree__ = get_config_value('flatten_tree')                    # type: bool
1014
1015
1016
        self.left_recursion_depth__ = get_config_value('left_recursion_depth')    # type: int
        self.max_parser_dropouts__ = get_config_value('max_parser_dropouts')      # type: int
        self.reentry_search_window__ = get_config_value('reentry_search_window')  # type: int
1017
1018
1019
1020
1021
1022
1023
        self._reset__()

        # prepare parsers in the class, first
        self._assign_parser_names__()

        # then deep-copy the parser tree from class to instance;
        # parsers not connected to the root object will be copied later
1024
1025
1026
        # on demand (see Grammar.__getitem__()).
        # (Usually, all parsers should be connected to the root object. But
        # during testing and development this does not need to be the case.)
eckhart's avatar
eckhart committed
1027
1028
1029
1030
        self.root_parser__ = copy.deepcopy(root) if root else copy.deepcopy(self.__class__.root__)
        self.root_parser__.apply(self._add_parser__)
        assert 'root_parser__' in self.__dict__
        assert self.root_parser__ == self.__dict__['root_parser__']
1031

1032
1033
1034
1035
1036
1037
1038
1039
1040
        if self.__class__.static_analysis_pending__ \
                and get_config_value('static_analysis') in {'early', 'late'}:
            try:
                result = self.static_analysis()
                if result:
                    raise GrammarError(result)
                self.__class__.static_analysis_pending__.pop()
            except (NameError, AttributeError):
                pass  # don't fail the initialization of PLACEHOLDER
1041

1042
1043
1044
1045
1046

    def __getitem__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
eckhart's avatar
eckhart committed
1047
            parser_template = getattr(self.__class__, key, None)
1048
1049
1050
1051
            if parser_template:
                # add parser to grammar object on the fly...
                parser = copy.deepcopy(parser_template)
                parser.apply(self._add_parser__)
eckhart's avatar
eckhart committed
1052
                assert self[key] == parser
1053
                return self[key]
1054
            raise UnknownParserError('Unknown parser "%s" !' % key)
1055

1056

1057
1058
    def __contains__(self, key):
        return key in self.__dict__ or hasattr(self, key)
1059

1060

1061
    def _reset__(self):
        """Resets the grammar object's per-run state so that the same object
        can be reused for parsing another document. Called by __call__ when
        the dirty-flag is set."""
        self.tree__ = RootNode()              # type: RootNode
        self.document__ = EMPTY_STRING_VIEW   # type: StringView
        # reversed document, computed lazily by the `reversed__` property
        self._reversed__ = EMPTY_STRING_VIEW  # type: StringView
        self.document_length__ = 0            # type: int
        # linebreak positions, computed lazily by `document_lbreaks__`
        self._document_lbreaks__ = []         # type: List[int]
        # variables stored and recalled by Capture and Retrieve parsers
        self.variables__ = defaultdict(lambda: [])  # type: DefaultDict[str, List[str]]
        # (location, rollback-function) pairs, see push_rollback__ / rollback_to__
        self.rollback__ = []                  # type: List[Tuple[int, Callable]]
        self.last_rb__loc__ = -1              # type: int
        # support for call stack tracing
        self.call_stack__ = []                # type: List[Tuple[str, int]]  # tag_name, location
        # snapshots of call stacks
        self.history__ = []                   # type: List[HistoryRecord]
        # also needed for call stack tracing
        self.moving_forward__ = False         # type: bool
        self.recursion_locations__ = set()    # type: Set[int]
        self.last_recursion_location__ = -1   # type: int
        self.most_recent_error__ = None       # type: Optional[ParserError]
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089


    @property
    def reversed__(self) -> StringView:
        """
        Lazily computed reversed view of the currently parsed document.
        About the only case where this is needed is the Lookbehind-parser,
        which is why the reversal is deferred until first access.
        """
        if self._reversed__:
            return self._reversed__
        self._reversed__ = StringView(self.document__.get_text()[::-1])
        return self._reversed__


    def _add_parser__(self, parser: Parser) -> None:
        """
        Adds the particular copy of the parser object to this
        particular instance of Grammar. Named parsers also become
        instance attributes (accessible as ``self.<pname>``).
        """
        if parser.pname:
            # prevent overwriting instance variables or parsers of a different class
            assert (parser.pname not in self.__dict__
                    or isinstance(self.__dict__[parser.pname], parser.__class__)), \
                ('Cannot add parser "%s" because a field with the same name '
                 'already exists in grammar object: %s!'
                 % (parser.pname, str(self.__dict__[parser.pname])))
            setattr(self, parser.pname, parser)
        # anonymous parsers are tagged with their parser type, named ones with their name
        parser.tag_name = parser.ptype if parser.anonymous else parser.pname
        self.all_parsers__.add(parser)
        # back-reference so the parser can reach grammar-wide state
        parser.grammar = self


eckhart's avatar
eckhart committed
1112
1113
    def __call__(self,
                 document: str,
                 start_parser: Union[str, Parser] = "root_parser__",
                 *, complete_match: bool = True) -> RootNode:
        """
        Parses a document with parser-combinators.

        Args:
            document (str): The source text to be parsed.
            start_parser (str or Parser): The name of the parser with which
                to start. This is useful for testing particular parsers
                (i.e. particular parts of the EBNF-Grammar.)
            complete_match (bool): If True, an error is generated, if
                `start_parser` did not match the entire document.
        Returns:
            Node: The root node to the parse tree.
        """

        def tail_pos(predecessors: Union[List[Node], Tuple[Node, ...], None]) -> int:
            """Adds the position after the last node in the list of
            predecessors to the node."""
            return predecessors[-1].pos + len(predecessors[-1]) if predecessors else 0

        def lookahead_failure_only(parser):
            """EXPERIMENTAL!

            Checks if failure to match document was only due to a succeeding
            lookahead parser, which is a common design pattern that can break test
            cases. (Testing for this case allows to modify the error message, so
            that the testing framework knows that the failure is only a
            test-case-artifact and no real failure.
            (See test/test_testing.TestLookahead !)
            """
            def is_lookahead(tag_name: str) -> bool:
                # NOTE(review): `eval` on a tag name is only safe because tag
                # names come from this grammar's own parsers, never from the
                # parsed document — confirm this invariant holds for all callers.
                return (tag_name in self and isinstance(self[tag_name], Lookahead)
                        or tag_name[0] == ':' and issubclass(eval(tag_name[1:]), Lookahead))
            # for h in reversed(self.history__[:-1]):
            #     for tn, pos in h.call_stack:
            #         if is_lookahead(tn) and h.status == HistoryRecord.MATCH:
            #              print(h.call_stack, pos, h.line_col)
            last_record = self.history__[-2] if len(self.history__) > 1 \
                else None  # type: Optional[HistoryRecord]
            return last_record and parser != self.root_parser__ \
                and any(h.status == HistoryRecord.MATCH  # or was it HistoryRecord.MATCH !?
                        and any(is_lookahead(tn) and location >= len(self.document__)
                                for tn, location in h.call_stack)
                        for h in self.history__[:-1])

        # assert isinstance(document, str), type(document)
        # Reset per-run state if this grammar object has been used before.
        if self._dirty_flag__:
            self._reset__()
            for parser in self.all_parsers__:
                parser.reset()
        else:
            self._dirty_flag__ = True

        self.document__ = StringView(document)
        self.document_length__ = len(self.document__)
        # linebreaks are only needed for history records; otherwise computed lazily
        self._document_lbreaks__ = linebreaks(document) if self.history_tracking__ else []
        self.last_rb__loc__ = -1  # rollback location
        parser = self[start_parser] if isinstance(start_parser, str) else start_parser
        # unwrap Forward-parsers so start_parser__ refers to the actual parser
        self.start_parser__ = parser.parser if isinstance(parser, Forward) else parser
        assert parser.grammar == self, "Cannot run parsers from a different grammar object!" \
                                       " %s vs. %s" % (str(self), str(parser.grammar))
        result = None  # type: Optional[Node]
        stitches = []  # type: List[Node]
        rest = self.document__
        # Special case: empty document — run the parser once and report failure.
        if not rest:
            result, _ = parser(rest)
            if result is None:
                result = Node(ZOMBIE_TAG, '').with_pos(0)
                if lookahead_failure_only(parser):
                    self.tree__.new_error(
                        result, 'Parser "%s" only did not match empty document '
                                'because of lookahead' % str(parser),
                        Error.PARSER_LOOKAHEAD_FAILURE_ONLY)
                else:
                    self.tree__.new_error(
                        result, 'Parser "%s" did not match empty document.' % str(parser),
                        Error.PARSER_DID_NOT_MATCH)

        # copy to local variable, so break condition can be triggered manually
        max_parser_dropouts = self.max_parser_dropouts__
        # Main loop: re-start the parser after each dropout, stitching the
        # partial results together, until the document is consumed or the
        # dropout limit is reached.
        while rest and len(stitches) < max_parser_dropouts:
            result, rest = parser(rest)
            if rest and complete_match:
                # skip to the next line (or to the end) before retrying
                fwd = rest.find("\n") + 1 or len(rest)
                skip, rest = rest[:fwd], rest[fwd:]
                if result is None:
                    err_info = '' if not self.history_tracking__ else \
                               '\n    Most advanced: %s\n    Last match:    %s;' % \
                               (str(HistoryRecord.most_advanced_match(self.history__)),
                                str(HistoryRecord.last_match(self.history__)))
                    # Check if a Lookahead-Parser did match. Needed for testing, because
                    # in a test case this is not necessarily an error.
                    if lookahead_failure_only(parser):
                        error_msg = 'Parser "%s" only did not match because of lookahead! ' \
                                    % str(parser) + err_info
                        error_code = Error.PARSER_LOOKAHEAD_FAILURE_ONLY
                    else:
                        error_msg = 'Parser "%s" did not match!' % str(parser) + err_info
                        error_code = Error.PARSER_DID_NOT_MATCH
                else:
                    stitches.append(result)
                    # find the most recent history record involving a Lookahead parser
                    for h in reversed(self.history__):
                        if h.node and h.node.tag_name != EMPTY_NODE.tag_name \
                                and any('Lookahead' in tag for tag, _ in h.call_stack):
                            break
                    else:
                        h = HistoryRecord([], EMPTY_NODE, StringView(''), (0, 0))
                    if h.status == h.MATCH and (h.node.pos + len(h.node) == len(self.document__)):
                        # TODO: this case still needs unit-tests and support in testing.py
                        error_msg = "Parser stopped before end, but matched with lookahead."
                        error_code = Error.PARSER_LOOKAHEAD_MATCH_ONLY
                        max_parser_dropouts = -1  # no further retries!
                    else:
                        # NOTE(review): due to operator precedence, the trailing
                        # " Terminating parser." only ever attaches to the "!"
                        # branch (max_parser_dropouts__ <= 1) — confirm intent.
                        error_msg = "Parser stopped before end" \
                            + (("! trying to recover"
                                + (" but stopping history recording at this point."
                                   if self.history_tracking__ else "..."))
                                if len(stitches) < self.max_parser_dropouts__
                                else " too often!" if self.max_parser_dropouts__ > 1 else "!"
                                     + " Terminating parser.")
                        error_code = Error.PARSER_STOPPED_BEFORE_END
                stitch = Node(ZOMBIE_TAG, skip).with_pos(tail_pos(stitches))
                stitches.append(stitch)
                error = Error(error_msg, stitch.pos, error_code)
                self.tree__.add_error(stitch, error)
                if self.history_tracking__:
                    lc = line_col(self.document_lbreaks__, error.pos)
                    self.history__.append(HistoryRecord([(stitch.tag_name, stitch.pos)], stitch,
                                                        self.document__[error.pos:], lc, [error]))
            else:
                # if complete_match is False, ignore the rest and leave while loop
                rest = StringView('')
        if stitches:
            if rest:
                stitches.append(Node(ZOMBIE_TAG, rest))
            result = Node(ZOMBIE_TAG, tuple(stitches)).with_pos(0)
        # A non-empty capture-retrieve stack after parsing indicates unmatched
        # Capture/Pop parsers; report this as an error at the document's end.
        if any(self.variables__.values()):
            error_msg = "Capture-retrieve-stack not empty after end of parsing: " \
                + str(self.variables__)
            error_code = Error.CAPTURE_STACK_NOT_EMPTY
            if result:
                if result.children:
                    # add another child node at the end to ensure that the position
                    # of the error will be the end of the text. Otherwise, the error
                    # message above ("...after end of parsing") would appear illogical.
                    error_node = Node(ZOMBIE_TAG, '').with_pos(tail_pos(result.children))
                    self.tree__.new_error(error_node, error_msg, error_code)
                    result.result = result.children + (error_node,)
                else:
                    self.tree__.new_error(result, error_msg, error_code)
        if result:
            self.tree__.swallow(result)
        self.start_parser__ = None
        # self.history_tracking__ = save_history_tracking
        return self.tree__
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281


    def push_rollback__(self, location, func):
        """
        Registers `func` as a rollback-callback for `location`.

        Rollback functions either remove or re-add values on the variable
        stack (`self.variables`) that have been added (or removed) by
        Capture or Pop Parsers, the results of which have been dismissed.
        """
        entry = (location, func)
        self.rollback__.append(entry)
        self.last_rb__loc__ = location

1282
1283
1284
1285
1286
    @property
    def document_lbreaks__(self) -> List[int]:
        """Positions of the linebreaks in the parsed document, computed
        lazily on first access and cached thereafter."""
        lbreaks = self._document_lbreaks__
        if not lbreaks:
            lbreaks = linebreaks(self.document__)
            self._document_lbreaks__ = lbreaks
        return lbreaks
1287
1288
1289
1290
1291
1292

    def rollback_to__(self, location):
        """
        Rolls back the variable stacks (`self.variables`) to its
        state at an earlier location in the parsed document.

        All rollback-functions registered for locations >= `location`
        are executed (newest first) and removed from the stack, and
        `last_rb__loc__` is updated to the new top of the stack.
        """
        while self.rollback__ and self.rollback__[-1][0] >= location:
            _, rollback_func = self.rollback__.pop()
            # assert not loc > self.last_rb__loc__, \
            #     "Rollback confusion: line %i, col %i < line %i, col %i" % \
            #     (*line_col(self.document__, len(self.document__) - loc),
            #      *line_col(self.document__, len(self.document__) - self.last_rb__loc__))
            rollback_func()
        # Bug fix: this statement used to read `==` instead of `=`, turning
        # the intended assignment into a no-op comparison, so last_rb__loc__
        # was never updated after a rollback.
        self.last_rb__loc__ = self.rollback__[-1][0] if self.rollback__ \
            else (self.document__.__len__() + 1)
1302
1303


1304
    def line_col__(self, text: Union[StringView, str]) -> Tuple[int, int]:
        """
        Returns the line and column where text is located in the document.
        Does not check, whether text is actually a substring of the currently
        parsed document.
        """
        # `text` is assumed to be a suffix-view of the document, so its
        # starting offset is the document length minus its own length.
        offset = self.document_length__ - len(text)
        return line_col(self.document_lbreaks__, offset)


1313
1314
    def static_analysis(self) -> List[GrammarErrorType]:
        """
        EXPERIMENTAL

        Checks the parser tree statically for possible errors. At the moment,
        no checks are implemented.

        :return: a list of error-tuples consisting of the narrowest containing
            named parser (i.e. the symbol on which the failure occurred),
            the actual parser that failed and an error object.
        """
        errors = []  # type: List[GrammarErrorType]
        # disabled, because no use case as of now
        # def visit_parser(parser: Parser) -> None:
        #     nonlocal error_list
        #
        # self.root_parser__.apply(visit_parser)
        return errors
1332
1333


1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
def dsl_error_msg(parser: Parser, error_str: str) -> str:
    """
    Returns an error message for errors in the parser configuration,
    e.g. errors that result in infinite loops.

    Args:
        parser (Parser):  The parser where the error was noticed. Note
            that this is not necessarily the parser that caused the
            error but only where the error became apparent.
        error_str (str):  A short string describing the error.
    Returns:
        str: An error message including the call stack if history
        tracking has been turned on in the grammar object.
    """
    parts = ["DSL parser specification error:", error_str,
             'Caught by parser "%s".' % str(parser)]
    history = parser.grammar.history__
    if history:
        parts.append("\nCall stack:")
        parts.append(history[-1].stack)
    else:
        parts.append("\nEnable history tracking in Grammar object to display call stack.")
    return " ".join(parts)


1356
# A bare placeholder Grammar instance. Grammar.__init__ deliberately swallows
# NameError/AttributeError during static analysis so that this placeholder can
# be created even before all parser classes below are defined.
GRAMMAR_PLACEHOLDER = Grammar()
eckhart's avatar
eckhart committed
1357
1358


1359
1360
########################################################################
#
# Token and Regular Expression parser classes (i.e. leaf classes)
#
########################################################################


class PreprocessorToken(Parser):
    """
    Parses tokens that have been inserted by a preprocessor.

    Preprocessors can generate Tokens with the ``make_token``-function.
    These tokens start and end with magic characters that can only be
    matched by the PreprocessorToken Parser. Such tokens can be used to
    insert BEGIN - END delimiters at the beginning or ending of a
    quoted block, for example.
    """

    def __init__(self, token: str) -> None:
        # token names must be non-empty and all-uppercase
        assert token and token.isupper()
        assert RX_TOKEN_NAME.match(token)
        super(PreprocessorToken, self).__init__()
        self.pname = token
        if token:  # always true here given the assert above; kept defensively
            self.anonymous = False

    def __deepcopy__(self, memo):
        """Manual deep copy: re-runs __init__ with the token name, then
        carries over the anonymous-flag and tag name."""
        duplicate = self.__class__(self.pname)
        # duplicate.pname = self.pname
        duplicate.anonymous = self.anonymous
        duplicate.tag_name = self.tag_name
        return duplicate

    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """Matches a BEGIN_TOKEN ... END_TOKEN span at the start of `text`.
        Malformed tokens produce a node plus an error on the grammar's tree
        rather than a hard failure, so parsing can continue."""
        if text[0:1] == BEGIN_TOKEN:
            end = text.find(END_TOKEN, 1)
            if end < 0:
                # unterminated token: consume only the BEGIN_TOKEN character
                node = Node(self.tag_name, '')  # type: Node
                self.grammar.tree__.new_error(
                    node,
                    'END_TOKEN delimiter missing from preprocessor token. '
                    '(Most likely due to a preprocessor bug!)')
                return node, text[1:]
            elif end == 0:
                # NOTE(review): since the search starts at index 1, end == 0
                # looks unreachable with str.find-semantics — confirm whether
                # StringView.find behaves differently.
                node = Node(self.tag_name, '')
                self.grammar.tree__.new_error(
                    node,
                    'Preprocessor-token cannot have zero length. '
                    '(Most likely due to a preprocessor bug!)')
                return node, text[2:]
            elif text.find(BEGIN_TOKEN, 1, end) >= 0:
                # another BEGIN_TOKEN before the closing delimiter: nesting error
                node = Node(self.tag_name, text[len(self.pname) + 1:end])
                self.grammar.tree__.new_error(
                    node,
                    'Preprocessor-tokens must not be nested or contain '
                    'BEGIN_TOKEN delimiter as part of their argument. '
                    '(Most likely due to a preprocessor bug!)')
                return node, text[end:]
            if text[1:len(self.pname) + 1] == self.pname:
                # well-formed token with the expected name; the argument is
                # the text between "BEGIN_TOKEN<name> " and END_TOKEN
                if self.drop_content:
                    return EMPTY_NODE, text[end + 1:]
                return Node(self.tag_name, text[len(self.pname) + 2:end]), text[end + 1:]
        return None, text


1424
class Token(Parser):
    """
    Parses plain text strings. (Could be done by RegExp as well, but is faster.)

    Example::

        >>> while_token = Token("while")
        >>> Grammar(while_token)("while").content
        'while'
    """
    assert TOKEN_PTYPE == ":Token"

    def __init__(self, text: str) -> None:
        super(Token, self).__init__()
        self.text = text
        self.len = len(text)  # cached so _parse need not recompute it

    def __deepcopy__(self, memo):
        """Manual deep copy: re-runs __init__ with the token text, then
        transfers the generic parser attributes."""
        duplicate = self.__class__(self.text)
        copy_parser_attrs(self, duplicate)
        return duplicate

    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """Matches `self.text` literally at the beginning of `text`."""
        if text.startswith(self.text):
            if self.drop_content:
                return EMPTY_NODE, text[self.len:]
            elif self.text or not self.anonymous:
                return Node(self.tag_name, self.text, True), text[self.len:]
            # empty anonymous token: avoid creating a node object
            return EMPTY_NODE, text
        return None, text

    def __repr__(self):
        # Quote with "..." whenever the text contains a single quote.
        # Bug fix: the condition used to read `<= 0`, which wrongly chose
        # single quotes when the apostrophe occurred at index 0.
        return ("'%s'" if self.text.find("'") < 0 else '"%s"') % abbreviate_middle(self.text, 80)
1457

eckhart's avatar
eckhart committed
1458

1459
1460
1461
1462
1463
1464
1465
1466
1467
class RegExp(Parser):
    r"""
    Regular expression parser.

    The RegExp-parser parses text that matches a regular expression.
    RegExp can also be considered as the "atomic parser", because all
    other parsers delegate part of the parsing job to other parsers,
    but do not match text directly.

    Example::

        >>> word = RegExp(r'\w+')
        >>> Grammar(word)("Haus").content
        'Haus'

    EBNF-Notation:  ``/ ... /``

    EBNF-Example:   ``word = /\w+/``
    """

    def __init__(self, regexp) -> None:
        super(RegExp, self).__init__()
        # accept either a pattern string or an already compiled regex object
        self.regexp = re.compile(regexp) if isinstance(regexp, str) else regexp

    def __deepcopy__(self, memo):
        # `regex` supports deep copies, but not `re`
        try:
            pattern = copy.deepcopy(self.regexp, memo)
        except TypeError:
            pattern = self.regexp.pattern
        duplicate = self.__class__(pattern)
        copy_parser_attrs(self, duplicate)
        return duplicate

    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """Matches the regular expression at the beginning of `text`."""
        match = text.match(self.regexp)
        if match is None:
            return None, text
        capture = match.group(0)
        if not capture and self.anonymous:
            # empty anonymous match: avoid creating a node object
            return EMPTY_NODE, text
        end = text.index(match.end())
        if self.drop_content:
            return EMPTY_NODE, text[end:]
        return Node(self.tag_name, capture, True), text[end:]

    def __repr__(self):
        pattern = abbreviate_middle(self.regexp.pattern, 120)
        return escape_control_characters('/%s/' % pattern)
1507
1508


Eckhart Arnold's avatar
Eckhart Arnold committed
1509
def DropToken(text: str) -> Token:
    """Creates a Token-parser for `text` whose matched content is dropped
    from the syntax tree."""
    dropped = Drop(Token(text))
    return cast(Token, dropped)
Eckhart Arnold's avatar
Eckhart Arnold committed
1511
1512
1513


def DropRegExp(regexp) -> RegExp:
    """Creates a RegExp-parser for `regexp` whose matched content is dropped
    from the syntax tree."""
    dropped = Drop(RegExp(regexp))
    return cast(RegExp, dropped)
Eckhart Arnold's avatar
Eckhart Arnold committed
1515
1516


eckhart's avatar
eckhart committed
1517
def withWS(parser_factory, wsL='', wsR=r'\s*'):
    """Syntactic Sugar for 'Series(Whitespace(wsL), parser_factory(), Whitespace(wsR))'.

    Either whitespace argument may be an empty/falsy value (omitted), a
    pattern string (wrapped in a Whitespace-parser), or a ready-made parser.
    """
    if wsL and isinstance(wsL, str):
        wsL = Whitespace(wsL)
    if wsR and isinstance(wsR, str):
        wsR = Whitespace(wsR)
    inner = parser_factory()
    if wsL and wsR:
        return Series(wsL, inner, wsR)
    if wsL:
        return Series(wsL, inner)
    if wsR:
        return Series(inner, wsR)
    return inner
1532

1533

eckhart's avatar
eckhart committed
1534
def RE(regexp, wsL='', wsR=r'\s*'):
    """Syntactic Sugar for 'Series(Whitespace(wsL), RegExp(regexp), Whitespace(wsR))'"""
    def make_core():
        return RegExp(regexp)
    return withWS(make_core, wsL, wsR)
1537
1538


eckhart's avatar
eckhart committed
1539
def TKN(token, wsL='', wsR=r'\s*'):
    """Syntactic Sugar for 'Series(Whitespace(wsL), Token(token), Whitespace(wsR))'"""
    def make_core():
        return Token(token)
    return withWS(make_core, wsL, wsR)
1542
1543


eckhart's avatar
eckhart committed
1544
1545
def DTKN(token, wsL='', wsR=r'\s*'):
    """Syntactic Sugar for 'Series(Whitespace(wsL), DropToken(token), Whitespace(wsR))'"""
    def make_core():
        return Drop(Token(token))
    return withWS(make_core, wsL, wsR)
eckhart's avatar
eckhart committed
1547
1548


1549
1550
1551
1552
class Whitespace(RegExp):
    """A variant of RegExp that signifies through its class name that it
    is a RegExp-parser for whitespace."""
    assert WHITESPACE_PTYPE == ":Whitespace"

    def __repr__(self):
        return "~"

1557
1558
1559

########################################################################
#
eckhart's avatar
eckhart committed
1560
# Meta parser classes, i.e. parsers that contain other parsers
1561
1562
1563
1564
1565
# to which they delegate parsing
#
########################################################################


eckhart's avatar
eckhart committed
1566
class MetaParser(Parser):
    """Class Meta-Parser contains functions for the optimization of
    return values of parsers that call other parsers (i.e descendants
    of classes UnaryParser and NaryParser).

    The optimization consists in flattening the tree by eliminating
    anonymous nodes. This is the same as what the function
    DHParser.transform.flatten() does, only at an earlier stage.
    The reasoning is that the earlier the tree is reduced, the less work
    remains to do at all the later processing stages.
    """

    def _return_value(self, node: Optional[Node]) -> Node:
        """
        Generates a return node if a single node has been returned from
        any descendant parsers. Anonymous empty nodes will be dropped.
        If `self` is an unnamed parser, a non-empty descendant node
        will be passed through. If the descendant node is anonymous,
        it will be dropped and only its result will be kept.
        In all other cases or if the optimization is turned off by
        setting `grammar.flatten_tree__` to False, a new node will be
        generated and the descendant node will be its single child.
        """
        assert node is None or isinstance(node, Node)
        if self._grammar.flatten_tree__:
            if node is not None:
                if self.anonymous:
                    if self.drop_content:
                        # content is deliberately discarded
                        return EMPTY_NODE
                    # pass the descendant node through unchanged
                    return node
                if node.tag_name[0] == ':':  # faster than node.is_anonymous()
                    # absorb the anonymous child: keep only its result
                    return Node(self.tag_name, node._result)
                return Node(self.tag_name, node)
            elif self.anonymous:
                return EMPTY_NODE  # avoid creation of a node object for anonymous empty nodes
            return Node(self.tag_name, ())
        if self.drop_content:
            return EMPTY_NODE
        return Node(self.tag_name, node or ())  # unoptimized code

    @cython.locals(N=cython.int)
    def _return_values(self, results: Tuple[Node, ...]) -> Node:
        """
        Generates a return node from a tuple of returned nodes from
        descendant parsers. Anonymous empty nodes will be removed from
        the tuple. Anonymous child nodes will be flattened if
        `grammar.flatten_tree__` is True.
        """
        assert isinstance(results, tuple)
        if self.drop_content:
            return EMPTY_NODE
        N = len(results)
        if N > 1:
            if self._grammar.flatten_tree__:
                nr = []  # type: List[Node]
                # flatten parse tree: hoist the children of anonymous
                # non-leaf nodes and drop anonymous empty leaves
                for child in results:
                    if child.children and child.tag_name[0] == ':':  # faster than c.is_anonymous():
                        nr.extend(child.children)
                    elif child._result or child.tag_name[0] != ':':
                        nr.append(child)
                return Node(self.tag_name, tuple(nr))
            return Node(self.tag_name, results)  # unoptimized code
        elif N == 1:
            # single result: delegate to the single-node optimization
            return self._return_value(results[0])
        elif self._grammar.flatten_tree__:
            if self.anonymous:
                return EMPTY_NODE  # avoid creation of a node object for anonymous empty nodes
            return Node(self.tag_name, ())
        return Node(self.tag_name, results)  # unoptimized code
eckhart's avatar
eckhart committed
1636
1637
1638


class UnaryParser(MetaParser):
    """
    Base class of all unary parsers, i.e. parsers that contain
    one and only one other parser, like the optional parser for example.

    The UnaryParser base class supplies __deepcopy__ and apply
    methods for unary parsers. The __deepcopy__ method needs
    to be overwritten, however, if the constructor of a derived class
    has additional parameters.
    """

    def __init__(self, parser: Parser) -> None:
        super(UnaryParser, self).__init__()
        assert isinstance(parser, Parser), str(parser)
        self.parser = parser  # type: Parser

    def __deepcopy__(self, memo):
        # Copy the contained parser first, then rebuild an instance of the
        # same (possibly derived) class around that copy.
        inner_copy = copy.deepcopy(self.parser, memo)
        clone = self.__class__(inner_copy)
        copy_parser_attrs(self, clone)
        return clone

    def sub_parsers(self) -> Tuple['Parser', ...]:
        """Return the single contained parser as a one-element tuple."""
        return (self.parser,)
1662
1663


eckhart's avatar
eckhart committed
1664
class NaryParser(MetaParser):
    """
    Base class of all n-ary parsers, i.e. parsers that
    contain one or more other parsers, like the alternative
    parser for example.

    The NaryParser base class supplies __deepcopy__ and apply methods
    for n-ary parsers. The __deepcopy__ method needs to be
    overwritten, however, if the constructor of a derived class has
    additional parameters.
    """

    def __init__(self, *parsers: Parser) -> None:
        super(NaryParser, self).__init__()
        assert all(isinstance(p, Parser) for p in parsers), str(parsers)
        self.parsers = parsers  # type: Tuple[Parser, ...]

    def __deepcopy__(self, memo):
        # Copy all contained parsers, then rebuild an instance of the
        # same (possibly derived) class around the copies.
        cloned_parsers = copy.deepcopy(self.parsers, memo)
        clone = self.__class__(*cloned_parsers)
        copy_parser_attrs(self, clone)
        return clone

    def sub_parsers(self) -> Tuple['Parser', ...]:
        """Return the tuple of all contained parsers."""
        return self.parsers
1689
1690


eckhart's avatar
eckhart committed
1691
class Option(UnaryParser):
    r"""
    Parser ``Option`` always matches, even if its child-parser
    did not match.

    If the child-parser did not match ``Option`` returns a node
    with no content and does not move forward in the text.

    If the child-parser did match, ``Option`` returns a node
    with the node returned by the child-parser as its single
    child and the text at the position where the child-parser
    left it.

    Examples::

        >>> number = Option(TKN('-')) + RegExp(r'\d+') + Option(RegExp(r'\.\d+'))
        >>> Grammar(number)('3.14159').content
        '3.14159'
        >>> Grammar(number)('3.14159').as_sxpr()
        '(:Series (:RegExp "3") (:RegExp ".14159"))'
        >>> Grammar(number)('-1').content
        '-1'

    EBNF-Notation: ``[ ... ]``

    EBNF-Example:  ``number = ["-"]  /\d+/  [ /\.\d+/ ]``
    """

    def __init__(self, parser: Parser) -> None:
        super(Option, self).__init__(parser)
        # assert isinstance(parser, Parser)
        assert not isinstance(parser, Option), \
            "Redundant nesting of options: %s(%s)" % (self.ptype, parser.pname)

    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        # Try the child parser; on failure `node` is None and `text` stays
        # where it was, so Option still reports a (possibly empty) match.
        node, text = self.parser(text)
        return self._return_value(node), text

    def __repr__(self):
        # An anonymous Alternative child sheds its outer parentheses, since
        # the brackets already group its content.
        return '[' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.pname else self.parser.repr) + ']'
1732
1733
1734
1735
1736
1737
1738
1739


class ZeroOrMore(Option):
    r"""
    `ZeroOrMore` applies a parser repeatedly as long as this parser
    matches. Like `Option` the `ZeroOrMore` parser always matches. In
    case of zero repetitions, the empty match `((), text)` is returned.

    Examples::

        >>> sentence = ZeroOrMore(RE(r'\w+,?')) + TKN('.')
        >>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
        'Wo viel der Weisheit, da auch viel des Grämens.'
        >>> Grammar(sentence)('.').content  # an empty sentence also matches
        '.'
        >>> forever = ZeroOrMore(RegExp(''))
        >>> Grammar(forever)('')  # infinite loops will automatically be broken
        Node(:EMPTY, )

    EBNF-Notation: ``{ ... }``

    EBNF-Example:  ``sentence = { /\w+,?/ } "."``
    """

    @cython.locals(n=cython.int, length=cython.int)
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """Apply the child parser as often as it matches, collecting all
        non-empty results. Always succeeds, possibly with zero matches."""
        results = ()  # type: Tuple[Node, ...]
        # `length` (rather than the builtin-shadowing name `len`) tracks the
        # remaining text; the loop stops as soon as no progress is made.
        length = text.__len__()
        n = length + 1  # type: int
        while length < n:
            n = length
            node, text = self.parser(text)
            length = text.__len__()
            if node is None:
                break
            if node._result or not node.tag_name.startswith(':'):  # drop anonymous empty nodes
                results += (node,)
            if length == n:
                break  # avoid infinite loop on zero-length matches
        nd = self._return_values(results)  # type: Node
        return nd, text

    def __repr__(self):
        # An anonymous Alternative child sheds its outer parentheses, since
        # the braces already group its content.
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.pname else self.parser.repr) + '}'
1777
1778


eckhart's avatar
eckhart committed
1779
class OneOrMore(UnaryParser):
    r"""
    `OneOrMore` applies a parser repeatedly as long as this parser
    matches. Other than `ZeroOrMore` which always matches, at least
    one match is required by `OneOrMore`.

    Examples::

        >>> sentence = OneOrMore(RE(r'\w+,?')) + TKN('.')
        >>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
        'Wo viel der Weisheit, da auch viel des Grämens.'
        >>> str(Grammar(sentence)('.'))  # an empty sentence does not match
        ' <<< Error on "." | Parser "{/\\w+,?/ ~}+ \'.\' ~" did not match! >>> '
        >>> forever = OneOrMore(RegExp(''))
        >>> Grammar(forever)('')  # infinite loops will automatically be broken
        Node(:EMPTY, )

    EBNF-Notation: ``{ ... }+``

    EBNF-Example:  ``sentence = { /\w+,?/ }+``
    """

    def __init__(self, parser: Parser) -> None:
        super(OneOrMore, self).__init__(parser)
        assert not isinstance(parser, Option), \
            "Use ZeroOrMore instead of nesting OneOrMore and Option: " \
            "%s(%s)" % (self.ptype, parser.pname)

    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """Apply the child parser as often as it matches; fail (return
        ``None``) unless it matched at least once."""
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
        match_flag = False
        # `length` (rather than the builtin-shadowing name `len`) tracks the
        # remaining text; the loop stops as soon as no progress is made.
        length = text.__len__()
        n = length + 1  # type: int
        while length < n:
            n = length
            node, text_ = self.parser(text_)
            length = text_.__len__()
            if node is None:
                break
            match_flag = True
            if node._result or not node.tag_name.startswith(':'):  # drop anonymous empty nodes
                results += (node,)
            if length == n:
                break  # avoid infinite loop on zero-length matches
        if not match_flag:
            # not a single match: report failure at the original position
            return None, text
        nd = self._return_values(results)  # type: Node
        return nd, text_

    def __repr__(self):
        # An anonymous Alternative child sheds its outer parentheses, since
        # the braces already group its content.
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.pname else self.parser.repr) + '}+'
1832
1833


1834
1835
1836
1837
# Pairs of (condition, error message): the first element may be a regular
# expression object or a plain string that is matched against the text at
# the error location in order to select the appropriate message.
MessagesType = List[Tuple[Union[str, Any], str]]
# Sentinel meaning "no element is mandatory"; doubles as the upper bound
# for the number of elements a series may contain (see assertions below).
NO_MANDATORY = 1000


eckhart's avatar
eckhart committed
1838
class MandatoryElementsParser(NaryParser):
    r"""
    Base class for parsers that can mark a suffix of their elements as
    "mandatory" in order to report errors instead of mismatches.

    Attributes:
        mandatory:  Number of the element starting at which the element
                and all following elements are considered "mandatory". This
                means that rather than returning a non-match an error message
                is issued. The default value is NO_MANDATORY, which means that
                no elements are mandatory.
        err_msgs:  A list of pairs of regular expressions (or simple
                strings for that matter) and error messages that are chosen
                if the regular expression matches the text where the error
                occurred.
        skip_list: A list of regular expressions. The rest of the text is searched for
                each of these. The closest match is the point where parsing will be
                resumed.
    """

    def __init__(self, *parsers: Parser,
                 mandatory: int = NO_MANDATORY,
                 err_msgs: MessagesType = [],
                 skip: ResumeList = []) -> None:
        # NOTE(review): the mutable default arguments ([]) are shared across
        # calls; this is safe only as long as they are never mutated — verify.
        super(MandatoryElementsParser, self).__init__(*parsers)
        length = len(self.parsers)
        if mandatory < 0:
            # negative values count from the end, like negative indices
            mandatory += length

        assert not (mandatory == NO_MANDATORY and err_msgs), \
            'Custom error messages require that parameter "mandatory" is set!'
        assert not (mandatory == NO_MANDATORY and skip), \
            'Search expressions for skipping text require that parameter "mandatory" is set!'
        assert length > 0, \
            'Number of elements %i is below minimum length of 1' % length
        assert length < NO_MANDATORY, \
            'Number of elemnts %i of series exceeds maximum length of %i' % (length, NO_MANDATORY)

        assert 0 <= mandatory < length or mandatory == NO_MANDATORY

        self.mandatory = mandatory  # type: int
        self.err_msgs = err_msgs    # type: MessagesType
        self.skip = skip            # type: ResumeList