parse.py 119 KB
Newer Older
1001
1002
1003


    def __init__(self, root: Optional[Parser] = None) -> None:
        """Initializes the grammar object: compiles the comment-regex,
        reads configuration values, deep-copies the parser tree from the
        class to this instance and - if pending - runs a static analysis.

        :param root: an optional parser to use as root parser instead of
            the class-level ``root__`` parser.
        """
        self.all_parsers__ = set()             # type: Set[Parser]
        # add compiled regular expression for comments, if it does not already exist
        if not hasattr(self, 'comment_rx__') or self.comment_rx__ is None:
            if hasattr(self.__class__, 'COMMENT__') and self.__class__.COMMENT__:
                self.comment_rx__ = re.compile(self.__class__.COMMENT__)
            else:
                # no comment-definition: use a regex that never matches anything
                self.comment_rx__ = RX_NEVER_MATCH
        else:
            # a pre-existing comment_rx__ must be consistent with the class' COMMENT__
            assert ((self.__class__.COMMENT__
                     and self.__class__.COMMENT__ == self.comment_rx__.pattern)
                    or (not self.__class__.COMMENT__ and self.comment_rx__ == RX_NEVER_MATCH))
        self.start_parser__ = None             # type: Optional[Parser]
        self._dirty_flag__ = False             # type: bool
        self.memoization__ = True              # type: bool
        self.history_tracking__ = False        # type: bool
        self.resume_notices__ = False          # type: bool
        # tunable behavior, read once from the global configuration
        self.flatten_tree__ = get_config_value('flatten_tree')                    # type: bool
        self.left_recursion_depth__ = get_config_value('left_recursion_depth')    # type: int
        self.max_parser_dropouts__ = get_config_value('max_parser_dropouts')      # type: int
        self.reentry_search_window__ = get_config_value('reentry_search_window')  # type: int
        self._reset__()

        # prepare parsers in the class, first
        self._assign_parser_names__()

        # then deep-copy the parser tree from class to instance;
        # parsers not connected to the root object will be copied later
        # on demand (see Grammar.__getitem__()).
        # (Usually, all parsers should be connected to the root object. But
        # during testing and development this does not need to be the case.)
        self.root_parser__ = copy.deepcopy(root) if root else copy.deepcopy(self.__class__.root__)
        self.root_parser__.apply(self._add_parser__)
        # _add_parser__ must have registered the root parser as an attribute
        assert 'root_parser__' in self.__dict__
        assert self.root_parser__ == self.__dict__['root_parser__']

        # run the (one-time) static analysis for this grammar class, if requested
        if self.__class__.static_analysis_pending__ \
                and get_config_value('static_analysis') in {'early', 'late'}:
            try:
                result = self.static_analysis()
                if result:
                    raise GrammarError(result)
                self.__class__.static_analysis_pending__.pop()
            except (NameError, AttributeError):
                pass  # don't fail the initialization of PLACEHOLDER
1048

1049
1050
    def __str__(self):
        """Returns the name of the grammar class as string representation."""
        grammar_name = self.__class__.__name__
        return grammar_name
1051
1052
1053
1054
1055

    def __getitem__(self, key):
        """Returns the parser registered under ``key``.

        If the parser is not yet present on the instance, it is
        deep-copied on the fly from the class-level template and added
        to this grammar object via ``_add_parser__``.

        Raises:
            UnknownParserError: if no parser with this name exists.
        """
        try:
            return self.__dict__[key]
        except KeyError:
            parser_template = getattr(self.__class__, key, None)
            if parser_template:
                # add parser to grammar object on the fly...
                parser = copy.deepcopy(parser_template)
                parser.apply(self._add_parser__)
                # _add_parser__ registered the copy as instance attribute `key`
                assert self[key] == parser
                return self[key]
            raise UnknownParserError('Unknown parser "%s" !' % key)
1064

1065
1066
    def __contains__(self, key):
        """Returns True if a parser or other attribute named ``key``
        exists on this grammar object (instance or class)."""
        if key in self.__dict__:
            return True
        return hasattr(self, key)
1067

1068

1069
    def _reset__(self):
        """Resets the per-document state of the grammar object so that a
        new document can be parsed. (Does not reset the parsers themselves;
        see ``__call__`` which also calls ``parser.reset()``.)"""
        self.tree__ = RootNode()              # type: RootNode
        self.document__ = EMPTY_STRING_VIEW   # type: StringView
        # reversed document, computed lazily by the `reversed__` property
        self._reversed__ = EMPTY_STRING_VIEW  # type: StringView
        self.document_length__ = 0            # type: int
        # linebreak positions, computed lazily by `document_lbreaks__`
        self._document_lbreaks__ = []         # type: List[int]
        # variables stored and recalled by Capture and Retrieve parsers
        self.variables__ = defaultdict(lambda: [])  # type: DefaultDict[str, List[str]]
        # pending rollback functions for dismissed Capture/Pop results
        self.rollback__ = []                  # type: List[Tuple[int, Callable]]
        self.last_rb__loc__ = -1              # type: int
        # support for call stack tracing
        self.call_stack__ = []                # type: List[CallItem]  # tag_name, location
        # snapshots of call stacks
        self.history__ = []                   # type: List[HistoryRecord]
        # also needed for call stack tracing
        self.moving_forward__ = False         # type: bool
        # locations where left recursion has been detected
        self.recursion_locations__ = set()    # type: Set[int]
        self.last_recursion_location__ = -1   # type: int
        self.most_recent_error__ = None       # type: Optional[ParserError]
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097


    @property
    def reversed__(self) -> StringView:
        """
        Returns a reversed version of the currently parsed document. As
        about the only case where this is needed is the Lookbehind-parser,
        this is done lazily.
        """
        if self._reversed__:
            return self._reversed__
        # compute and cache the reversed document on first access
        self._reversed__ = StringView(self.document__.get_text()[::-1])
        return self._reversed__


    def _add_parser__(self, parser: Parser) -> None:
        """
        Adds the particular copy of the parser object to this
        particular instance of Grammar.

        Named parsers additionally become attributes of the grammar
        object (``self.<pname>``), which is how ``__getitem__`` finds them.
        """
        if parser.pname:
            # prevent overwriting instance variables or parsers of a different class
            assert (parser.pname not in self.__dict__
                    or isinstance(self.__dict__[parser.pname], parser.__class__)), \
                ('Cannot add parser "%s" because a field with the same name '
                 'already exists in grammar object: %s!'
                 % (parser.pname, str(self.__dict__[parser.pname])))
            setattr(self, parser.pname, parser)
        # anonymous parsers are tagged with their parser-type, named ones with their name
        parser.tag_name = parser.ptype if parser.anonymous else parser.pname
        self.all_parsers__.add(parser)
        # back-reference, so the parser can reach grammar-wide state
        parser.grammar = self


eckhart's avatar
eckhart committed
1120
1121
    def __call__(self,
                 document: str,
                 start_parser: Union[str, Parser] = "root_parser__",
                 *, complete_match: bool = True) -> RootNode:
        """
        Parses a document with parser-combinators.

        Args:
            document (str): The source text to be parsed.
            start_parser (str or Parser): The name of the parser with which
                to start. This is useful for testing particular parsers
                (i.e. particular parts of the EBNF-Grammar.)
            complete_match (bool): If True, an error is generated, if
                `start_parser` did not match the entire document.
        Returns:
            Node: The root node to the parse tree.
        """

        def tail_pos(predecessors: Union[List[Node], Tuple[Node, ...], None]) -> int:
            """Adds the position after the last node in the list of
            predecessors to the node."""
            return predecessors[-1].pos + len(predecessors[-1]) if predecessors else 0

        def lookahead_failure_only(parser):
            """EXPERIMENTAL!

            Checks if failure to match document was only due to a succeeding
            lookahead parser, which is a common design pattern that can break test
            cases. (Testing for this case allows to modify the error message, so
            that the testing framework knows that the failure is only a
            test-case-artifact and no real failure.
            (See test/test_testing.TestLookahead !)
            """
            def is_lookahead(tag_name: str) -> bool:
                # a tag either names a Lookahead parser, or is an anonymous
                # ':ClassName' tag whose class is a Lookahead subclass
                # NOTE(review): `eval` on the tag name assumes tags only come
                # from this module's class names — confirm they are trusted
                return (tag_name in self and isinstance(self[tag_name], Lookahead)
                        or tag_name[0] == ':' and issubclass(eval(tag_name[1:]), Lookahead))

            # for h in reversed(self.history__[:-1]):
            #     for tn, pos in h.call_stack:
            #         if is_lookahead(tn) and h.status == HistoryRecord.MATCH:
            #              print(h.call_stack, pos, h.line_col)
            last_record = self.history__[-2] if len(self.history__) > 1 \
                else None  # type: Optional[HistoryRecord]
            return last_record and parser != self.root_parser__ \
                and any(h.status == HistoryRecord.MATCH  # or was it HistoryRecord.MATCH !?
                        and any(is_lookahead(tn) and location >= len(self.document__)
                                for tn, location in h.call_stack)
                        for h in self.history__[:-1])

        # assert isinstance(document, str), type(document)
        # re-initialize state if this grammar object has been used before
        if self._dirty_flag__:
            self._reset__()
            for parser in self.all_parsers__:
                parser.reset()
        else:
            self._dirty_flag__ = True
        self.document__ = StringView(document)
        self.document_length__ = len(self.document__)
        self._document_lbreaks__ = linebreaks(document) if self.history_tracking__ else []
        self.last_rb__loc__ = -1  # rollback location
        # resolve the start parser; unwrap Forward-parsers to their referent
        parser = self[start_parser] if isinstance(start_parser, str) else start_parser
        self.start_parser__ = parser.parser if isinstance(parser, Forward) else parser
        assert parser.grammar == self, "Cannot run parsers from a different grammar object!" \
                                       " %s vs. %s" % (str(self), str(parser.grammar))
        result = None  # type: Optional[Node]
        stitches = []  # type: List[Node]
        rest = self.document__
        # special case: empty document
        if not rest:
            result, _ = parser(rest)
            if result is None:
                result = Node(ZOMBIE_TAG, '').with_pos(0)
                if lookahead_failure_only(parser):
                    self.tree__.new_error(
                        result, 'Parser "%s" only did not match empty document '
                                'because of lookahead' % str(parser),
                        Error.PARSER_LOOKAHEAD_FAILURE_ONLY)
                else:
                    self.tree__.new_error(
                        result, 'Parser "%s" did not match empty document.' % str(parser),
                        Error.PARSER_DID_NOT_MATCH)

        # copy to local variable, so break condition can be triggered manually
        max_parser_dropouts = self.max_parser_dropouts__
        # main loop: after a dropout, skip to the next line and try to re-enter,
        # stitching the partial results together, until the document is consumed
        # or too many dropouts occurred
        while rest and len(stitches) < max_parser_dropouts:
            result, rest = parser(rest)
            if rest and complete_match:
                # skip to the beginning of the next line (or to the end of `rest`)
                fwd = rest.find("\n") + 1 or len(rest)
                skip, rest = rest[:fwd], rest[fwd:]
                if result is None:
                    err_info = '' if not self.history_tracking__ else \
                               '\n    Most advanced: %s\n    Last match:    %s;' % \
                               (str(HistoryRecord.most_advanced_match(self.history__)),
                                str(HistoryRecord.last_match(self.history__)))
                    # Check if a Lookahead-Parser did match. Needed for testing, because
                    # in a test case this is not necessarily an error.
                    if lookahead_failure_only(parser):
                        error_msg = 'Parser "%s" only did not match because of lookahead! ' \
                                    % str(parser) + err_info
                        error_code = Error.PARSER_LOOKAHEAD_FAILURE_ONLY
                    else:
                        error_msg = 'Parser "%s" did not match!' % str(parser) + err_info
                        error_code = Error.PARSER_DID_NOT_MATCH
                else:
                    stitches.append(result)
                    # look for the most recent history record involving a lookahead
                    for h in reversed(self.history__):
                        if h.node and h.node.tag_name != EMPTY_NODE.tag_name \
                                and any('Lookahead' in tag for tag, _ in h.call_stack):
                            break
                    else:
                        # no such record: use an empty dummy record
                        h = HistoryRecord([], EMPTY_NODE, StringView(''), (0, 0))
                    if h.status == h.MATCH and (h.node.pos + len(h.node) == len(self.document__)):
                        # TODO: this case still needs unit-tests and support in testing.py
                        error_msg = "Parser stopped before end, but matched with lookahead."
                        error_code = Error.PARSER_LOOKAHEAD_MATCH_ONLY
                        max_parser_dropouts = -1  # no further retries!
                    else:
                        error_msg = "Parser stopped before end" \
                            + (("! trying to recover"
                                + (" but stopping history recording at this point."
                                   if self.history_tracking__ else "..."))
                                if len(stitches) < self.max_parser_dropouts__
                                else " too often!" if self.max_parser_dropouts__ > 1 else "!"
                                     + " Terminating parser.")
                        error_code = Error.PARSER_STOPPED_BEFORE_END
                # record the skipped text as a zombie-node carrying the error
                stitch = Node(ZOMBIE_TAG, skip).with_pos(tail_pos(stitches))
                stitches.append(stitch)
                error = Error(error_msg, stitch.pos, error_code)
                self.tree__.add_error(stitch, error)
                if self.history_tracking__:
                    lc = line_col(self.document_lbreaks__, error.pos)
                    self.history__.append(HistoryRecord([(stitch.tag_name, stitch.pos)], stitch,
                                                        self.document__[error.pos:], lc, [error]))
            else:
                # if complete_match is False, ignore the rest and leave while loop
                rest = StringView('')
        if stitches:
            if rest:
                stitches.append(Node(ZOMBIE_TAG, rest))
            # wrap all partial results in a single zombie root node
            result = Node(ZOMBIE_TAG, tuple(stitches)).with_pos(0)
        if any(self.variables__.values()):
            # leftover captured variables indicate an unbalanced Capture/Pop
            error_msg = "Capture-retrieve-stack not empty after end of parsing: " \
                + str(self.variables__)
            error_code = Error.CAPTURE_STACK_NOT_EMPTY
            if result:
                if result.children:
                    # add another child node at the end to ensure that the position
                    # of the error will be the end of the text. Otherwise, the error
                    # message above ("...after end of parsing") would appear illogical.
                    error_node = Node(ZOMBIE_TAG, '').with_pos(tail_pos(result.children))
                    self.tree__.new_error(error_node, error_msg, error_code)
                    result.result = result.children + (error_node,)
                else:
                    self.tree__.new_error(result, error_msg, error_code)
        if result:
            self.tree__.swallow(result)
        self.start_parser__ = None
        # self.history_tracking__ = save_history_tracking
        return self.tree__
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289


    def push_rollback__(self, location, func):
        """
        Adds a rollback function that either removes or re-adds
        values on the variable stack (`self.variables`) that have been
        added (or removed) by Capture or Pop Parsers, the results of
        which have been dismissed.
        """
        entry = (location, func)
        self.rollback__.append(entry)
        self.last_rb__loc__ = location

1290

1291
1292
1293
1294
1295
    @property
    def document_lbreaks__(self) -> List[int]:
        """Positions of the linebreaks in the parsed document, computed
        lazily on first access and cached afterwards."""
        lbreaks = self._document_lbreaks__
        if not lbreaks:
            lbreaks = linebreaks(self.document__)
            self._document_lbreaks__ = lbreaks
        return lbreaks
1296

1297

1298
1299
1300
1301
1302
    def rollback_to__(self, location):
        """
        Rolls back the variable stacks (`self.variables`) to its
        state at an earlier location in the parsed document.

        All rollback functions registered for locations at or after
        `location` are executed (most recent first) and removed from
        the rollback stack; `last_rb__loc__` is updated accordingly.
        """
        while self.rollback__ and self.rollback__[-1][0] >= location:
            _, rollback_func = self.rollback__.pop()
            # assert not loc > self.last_rb__loc__, \
            #     "Rollback confusion: line %i, col %i < line %i, col %i" % \
            #     (*line_col(self.document__, len(self.document__) - loc),
            #      *line_col(self.document__, len(self.document__) - self.last_rb__loc__))
            rollback_func()
        # BUGFIX: the original line used `==` (a no-op comparison) instead of
        # `=`, so last_rb__loc__ was never actually updated after a rollback.
        self.last_rb__loc__ = self.rollback__[-1][0] if self.rollback__ \
            else (len(self.document__) + 1)
1312
1313


1314
    def line_col__(self, text: Union[StringView, str]) -> Tuple[int, int]:
        """
        Returns the line and column where text is located in the document.
        Does not check, whether text is actually a substring of the currently
        parsed document.
        """
        # position of `text` = document length minus the length of its tail
        offset = self.document_length__ - len(text)
        return line_col(self.document_lbreaks__, offset)


1323
1324
1325
    def as_ebnf(self) -> str:
        """
        Serializes the Grammar object as a grammar-description in the
        Extended Backus-Naur-Form. Does not serialize directives and
        may contain abbreviations with three dots " ... " for very long
        expressions.
        """
        lines = ['# This grammar does not include any of the DHParser-specific ',
                 '# directives and may contain abbreviations ("...")!', '']
        # every sanely named Parser-attribute of the instance is one definition
        lines.extend(str(parser) for entry, parser in self.__dict__.items()
                     if isinstance(parser, Parser) and sane_parser_name(entry))
        lines.append('')
        return '\n'.join(lines)


1339
1340
    def static_analysis(self) -> List[GrammarErrorType]:
        """
        EXPERIMENTAL!!!

        Checks the parser tree statically for possible errors. At the moment,
        no checks are implemented

        :return: a list of error-tuples consisting of the narrowest containing
            named parser (i.e. the symbol on which the failure occurred),
            the actual parser that failed and an error object.
        """
        findings = []  # type: List[GrammarErrorType]
        # disabled, because no use case as of now
        # def visit_parser(parser: Parser) -> None:
        #     nonlocal findings
        #
        # self.root_parser__.apply(visit_parser)
        return findings
1358
1359


1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
def dsl_error_msg(parser: Parser, error_str: str) -> str:
    """
    Returns an error message for errors in the parser configuration,
    e.g. errors that result in infinite loops.

    Args:
        parser (Parser):  The parser where the error was noticed. Note
            that this is not necessarily the parser that caused the
            error but only where the error became apparent.
        error_str (str):  A short string describing the error.
    Returns:
        str: An error message including the call stack if history
        tacking has been turned in the grammar object.
    """
    parts = ["DSL parser specification error:", error_str,
             'Caught by parser "%s".' % str(parser)]
    history = parser.grammar.history__
    if history:
        parts.append("\nCall stack:")
        parts.append(history[-1].stack)
    else:
        parts.append("\nEnable history tracking in Grammar object to display call stack.")
    return " ".join(parts)


1382
# Placeholder grammar instance, used where a Grammar object is required
# before a real grammar has been set up (see the NameError/AttributeError
# guard in Grammar.__init__).
GRAMMAR_PLACEHOLDER = Grammar()
eckhart's avatar
eckhart committed
1383
1384


1385
1386
########################################################################
#
1387
# _Token and Regular Expression parser classes (i.e. leaf classes)
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
#
########################################################################


class PreprocessorToken(Parser):
    """
    Parses tokens that have been inserted by a preprocessor.

    Preprocessors can generate Tokens with the ``make_token``-function.
    These tokens start and end with magic characters that can only be
    matched by the PreprocessorToken Parser. Such tokens can be used to
    insert BEGIN - END delimiters at the beginning or ending of a
    quoted block, for example.
    """

    def __init__(self, token: str) -> None:
        # token names must be upper-case identifiers matching RX_TOKEN_NAME
        assert token and token.isupper()
        assert RX_TOKEN_NAME.match(token)
        super(PreprocessorToken, self).__init__()
        self.pname = token
        if token:
            self.anonymous = False

    def __deepcopy__(self, memo):
        # re-create from the token name; copy the remaining parser attributes
        duplicate = self.__class__(self.pname)
        # duplicate.pname = self.pname
        duplicate.anonymous = self.anonymous
        duplicate.tag_name = self.tag_name
        return duplicate

    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """Matches a single preprocessor token at the start of `text`.
        Malformed tokens yield an error node instead of failing outright."""
        if text[0:1] == BEGIN_TOKEN:
            end = text.find(END_TOKEN, 1)
            if end < 0:
                # no END_TOKEN found: report and skip the BEGIN_TOKEN character
                node = Node(self.tag_name, '')  # type: Node
                self.grammar.tree__.new_error(
                    node,
                    'END_TOKEN delimiter missing from preprocessor token. '
                    '(Most likely due to a preprocessor bug!)')
                return node, text[1:]
            elif end == 0:
                # degenerate zero-length token: skip both delimiters
                node = Node(self.tag_name, '')
                self.grammar.tree__.new_error(
                    node,
                    'Preprocessor-token cannot have zero length. '
                    '(Most likely due to a preprocessor bug!)')
                return node, text[2:]
            elif text.find(BEGIN_TOKEN, 1, end) >= 0:
                # nested BEGIN_TOKEN before the closing END_TOKEN
                node = Node(self.tag_name, text[len(self.pname) + 1:end])
                self.grammar.tree__.new_error(
                    node,
                    'Preprocessor-tokens must not be nested or contain '
                    'BEGIN_TOKEN delimiter as part of their argument. '
                    '(Most likely due to a preprocessor bug!)')
                return node, text[end:]
            # well-formed token: it only matches if its name equals self.pname
            if text[1:len(self.pname) + 1] == self.pname:
                if self.drop_content:
                    return EMPTY_NODE, text[end + 1:]
                # content = text between the token name (+ separator) and END_TOKEN
                return Node(self.tag_name, text[len(self.pname) + 2:end]), text[end + 1:]
        return None, text


1450
class Token(Parser):
    """
    Parses plain text strings. (Could be done by RegExp as well, but is faster.)

    Example::

        >>> while_token = Token("while")
        >>> Grammar(while_token)("while").content
        'while'
    """
    assert TOKEN_PTYPE == ":Token"

    def __init__(self, text: str) -> None:
        super(Token, self).__init__()
        self.text = text
        # cached length of the token string, used for slicing in _parse
        self.len = len(text)

    def __deepcopy__(self, memo):
        duplicate = self.__class__(self.text)
        copy_parser_attrs(self, duplicate)
        return duplicate

    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        # guard clause: token does not occur at the head of the text
        if not text.startswith(self.text):
            return None, text
        if self.drop_content:
            return EMPTY_NODE, text[self.len:]
        if self.text or not self.anonymous:
            return Node(self.tag_name, self.text, True), text[self.len:]
        # empty anonymous token: avoid creating a node object
        return EMPTY_NODE, text

    def __repr__(self):
        return '`%s`' % abbreviate_middle(self.text, 80)
1484

eckhart's avatar
eckhart committed
1485

1486
1487
1488
1489
1490
1491
1492
1493
1494
class RegExp(Parser):
    r"""
    Regular expression parser.

    The RegExp-parser parses text that matches a regular expression.
    RegExp can also be considered as the "atomic parser", because all
    other parsers delegate part of the parsing job to other parsers,
    but do not match text directly.

    Example::

        >>> word = RegExp(r'\w+')
        >>> Grammar(word)("Haus").content
        'Haus'

    EBNF-Notation:  ``/ ... /``

    EBNF-Example:   ``word = /\w+/``
    """

    def __init__(self, regexp) -> None:
        super(RegExp, self).__init__()
        # accept either a pattern string or an already compiled regex object
        self.regexp = re.compile(regexp) if isinstance(regexp, str) else regexp

    def __deepcopy__(self, memo):
        # `regex` supports deep copies, but not `re`
        try:
            regexp = copy.deepcopy(self.regexp, memo)
        except TypeError:
            # fall back to re-compiling from the pattern string
            regexp = self.regexp.pattern
        duplicate = self.__class__(regexp)
        copy_parser_attrs(self, duplicate)
        return duplicate

    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """Matches the regular expression at the very beginning of `text`."""
        match = text.match(self.regexp)
        if match:
            capture = match.group(0)
            if capture or not self.anonymous:
                end = text.index(match.end())
                if self.drop_content:
                    return EMPTY_NODE, text[end:]
                return Node(self.tag_name, capture, True), text[end:]
            # empty anonymous match: no node object needed
            return EMPTY_NODE, text
        return None, text

    def __repr__(self):
        # well-known grammar-level patterns are rendered with their
        # conventional EBNF shorthand instead of the raw pattern
        pattern = self.regexp.pattern
        try:
            if pattern == self._grammar.WSP_RE__:
                return '~'
            elif pattern == self._grammar.COMMENT__:
                return 'comment__'
            elif pattern == self._grammar.WHITESPACE__:
                return 'whitespace__'
        except (AttributeError, NameError):
            # parser not (yet) attached to a grammar object
            pass
        return '/' + escape_control_characters('%s' % abbreviate_middle(pattern, 118)) + '/'
1544
1545


Eckhart Arnold's avatar
Eckhart Arnold committed
1546
def DropToken(text: str) -> Token:
    """Returns a Token parser for `text` whose matched content is
    dropped from the concrete syntax tree."""
    dropped = Drop(Token(text))
    return cast(Token, dropped)
Eckhart Arnold's avatar
Eckhart Arnold committed
1548
1549
1550


def DropRegExp(regexp) -> RegExp:
    """Returns a RegExp parser for `regexp` whose matched content is
    dropped from the concrete syntax tree."""
    dropped = Drop(RegExp(regexp))
    return cast(RegExp, dropped)
Eckhart Arnold's avatar
Eckhart Arnold committed
1552
1553


eckhart's avatar
eckhart committed
1554
def withWS(parser_factory, wsL='', wsR=r'\s*'):
    """Syntactic Sugar for 'Series(Whitespace(wsL), parser_factory(), Whitespace(wsR))'.
    """
    # promote pattern strings to Whitespace parsers; keep parser objects as-is
    left = Whitespace(wsL) if wsL and isinstance(wsL, str) else wsL
    right = Whitespace(wsR) if wsR and isinstance(wsR, str) else wsR
    if left and right:
        return Series(left, parser_factory(), right)
    if left:
        return Series(left, parser_factory())
    if right:
        return Series(parser_factory(), right)
    # no adjacent whitespace requested: return the bare parser
    return parser_factory()
1569

1570

eckhart's avatar
eckhart committed
1571
def RE(regexp, wsL='', wsR=r'\s*'):
    """Syntactic Sugar for 'Series(Whitespace(wsL), RegExp(regexp), Whitespace(wsR))'"""
    def regexp_factory():
        return RegExp(regexp)
    return withWS(regexp_factory, wsL, wsR)
1574
1575


eckhart's avatar
eckhart committed
1576
def TKN(token, wsL='', wsR=r'\s*'):
    """Syntactic Sugar for 'Series(Whitespace(wsL), Token(token), Whitespace(wsR))'"""
    def token_factory():
        return Token(token)
    return withWS(token_factory, wsL, wsR)
1579
1580


eckhart's avatar
eckhart committed
1581
1582
def DTKN(token, wsL='', wsR=r'\s*'):
    """Syntactic Sugar for 'Series(Whitespace(wsL), DropToken(token), Whitespace(wsR))'"""
    def drop_token_factory():
        return Drop(Token(token))
    return withWS(drop_token_factory, wsL, wsR)
eckhart's avatar
eckhart committed
1584
1585


1586
1587
1588
1589
class Whitespace(RegExp):
    """A variant of RegExp that signifies through its class name that it
    is a RegExp-parser for whitespace."""
    # sanity check: the module-level parser-type constant must stay in sync
    assert WHITESPACE_PTYPE == ":Whitespace"

    def __repr__(self):
        # whitespace is conventionally rendered as '~' in the EBNF notation
        return '~'

1594
1595
1596

########################################################################
#
eckhart's avatar
eckhart committed
1597
# Meta parser classes, i.e. parsers that contain other parsers
1598
1599
1600
1601
1602
# to which they delegate parsing
#
########################################################################


eckhart's avatar
eckhart committed
1603
class MetaParser(Parser):
    """Class Meta-Parser contains functions for the optimization of
    return values of parsers that call other parsers (i.e. descendants
    of classes UnaryParser and NaryParser).

    The optimization consists in flattening the tree by eliminating
    anonymous nodes. This is the same as what the function
    DHParser.transform.flatten() does, only at an earlier stage.
    The reasoning is that the earlier the tree is reduced, the less work
    remains to do at all the later processing stages.
    """

    def _return_value(self, node: Optional[Node]) -> Node:
        """
        Generates a return node if a single node has been returned from
        any descendant parsers. Anonymous empty nodes will be dropped.
        If `self` is an unnamed parser, a non-empty descendant node
        will be passed through. If the descendant node is anonymous,
        it will be dropped and only its result will be kept.
        In all other cases or if the optimization is turned off by
        setting `grammar.flatten_tree__` to False, a new node will be
        generated and the descendant node will be its single child.
        """
        assert node is None or isinstance(node, Node)
        if self._grammar.flatten_tree__:
            if node is not None:
                if self.anonymous:
                    if self.drop_content:
                        # content is to be discarded entirely
                        return EMPTY_NODE
                    # anonymous parser: pass the descendant node through unchanged
                    return node
                if node.tag_name[0] == ':':  # faster than node.is_anonymous()
                    # named parser wrapping an anonymous child: absorb the
                    # child's result directly instead of nesting it
                    return Node(self.tag_name, node._result)
                return Node(self.tag_name, node)
            elif self.anonymous:
                return EMPTY_NODE  # avoid creation of a node object for anonymous empty nodes
            return Node(self.tag_name, ())
        if self.drop_content:
            return EMPTY_NODE
        return Node(self.tag_name, node or ())  # unoptimized code

    @cython.locals(N=cython.int)
    def _return_values(self, results: Tuple[Node, ...]) -> Node:
        """
        Generates a return node from a tuple of returned nodes from
        descendant parsers. Anonymous empty nodes will be removed from
        the tuple. Anonymous child nodes will be flattened if
        `grammar.flatten_tree__` is True.
        """
        assert isinstance(results, tuple)
        if self.drop_content:
            return EMPTY_NODE
        N = len(results)
        if N > 1:
            if self._grammar.flatten_tree__:
                nr = []  # type: List[Node]
                # flatten parse tree: splice children of anonymous nodes into
                # this level and drop empty anonymous nodes altogether
                for child in results:
                    if child.children and child.tag_name[0] == ':':  # faster than c.is_anonymous():
                        nr.extend(child.children)
                    elif child._result or child.tag_name[0] != ':':
                        nr.append(child)
                return Node(self.tag_name, tuple(nr))
            return Node(self.tag_name, results)  # unoptimized code
        elif N == 1:
            # single result: delegate to the single-node optimization
            return self._return_value(results[0])
        elif self._grammar.flatten_tree__:
            if self.anonymous:
                return EMPTY_NODE  # avoid creation of a node object for anonymous empty nodes
            return Node(self.tag_name, ())
        return Node(self.tag_name, results)  # unoptimized code
eckhart's avatar
eckhart committed
1673
1674
1675


class UnaryParser(MetaParser):
    """
    Base class of all unary parsers, i.e. parsers that contain
    one and only one other parser, like the optional parser for example.

    The UnaryParser base class supplies __deepcopy__ and sub_parsers
    methods. The __deepcopy__ method needs to be overwritten, however,
    if the constructor of a derived class has additional parameters.
    """

    def __init__(self, parser: Parser) -> None:
        super().__init__()
        assert isinstance(parser, Parser), str(parser)
        self.parser = parser  # type: Parser

    def __deepcopy__(self, memo):
        copied_parser = copy.deepcopy(self.parser, memo)
        clone = self.__class__(copied_parser)
        copy_parser_attrs(self, clone)
        return clone

    def sub_parsers(self) -> Tuple['Parser', ...]:
        # exactly one contained parser
        return (self.parser,)
1699
1700


eckhart's avatar
eckhart committed
1701
class NaryParser(MetaParser):
    """
    Base class of all n-ary parsers, i.e. parsers that contain
    one or more other parsers, like the alternative parser for example.

    The NaryParser base class supplies __deepcopy__ and sub_parsers
    methods. The __deepcopy__ method needs to be overwritten, however,
    if the constructor of a derived class has additional parameters.
    """

    def __init__(self, *parsers: Parser) -> None:
        super().__init__()
        assert all(isinstance(p, Parser) for p in parsers), str(parsers)
        self.parsers = parsers  # type: Tuple[Parser, ...]

    def __deepcopy__(self, memo):
        cloned_parsers = copy.deepcopy(self.parsers, memo)
        clone = self.__class__(*cloned_parsers)
        copy_parser_attrs(self, clone)
        return clone

    def sub_parsers(self) -> Tuple['Parser', ...]:
        return self.parsers
1726
1727


eckhart's avatar
eckhart committed
1728
class Option(UnaryParser):
    r"""
    Parser ``Option`` always matches, even if its child-parser
    did not match.

    If the child-parser did not match ``Option`` returns a node
    with no content and does not move forward in the text.

    If the child-parser did match, ``Option`` returns a node
    with the node returned by the child-parser as its single
    child and the text at the position where the child-parser
    left it.

    Examples::

        >>> number = Option(TKN('-')) + RegExp(r'\d+') + Option(RegExp(r'\.\d+'))
        >>> Grammar(number)('3.14159').content
        '3.14159'
        >>> Grammar(number)('3.14159').as_sxpr()
        '(:Series (:RegExp "3") (:RegExp ".14159"))'
        >>> Grammar(number)('-1').content
        '-1'

    EBNF-Notation: ``[ ... ]``

    EBNF-Example:  ``number = ["-"]  /\d+/  [ /\.\d+/ ]``
    """

    def __init__(self, parser: Parser) -> None:
        super().__init__(parser)
        # nesting Option inside Option would be a no-op
        assert not isinstance(parser, Option), \
            "Redundant nesting of options: %s(%s)" % (self.ptype, parser.pname)

    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        node, rest = self.parser(text)
        # a failed child match still yields a (possibly empty) return value
        return self._return_value(node), rest

    def __repr__(self):
        # strip the parentheses of an unnamed Alternative before bracketing
        if isinstance(self.parser, Alternative) and not self.parser.pname:
            inner = self.parser.repr[1:-1]
        else:
            inner = self.parser.repr
        return '[' + inner + ']'
1769
1770
1771
1772
1773
1774
1775
1776


class ZeroOrMore(Option):
    r"""
    `ZeroOrMore` applies a parser repeatedly as long as this parser
    matches. Like `Option` the `ZeroOrMore` parser always matches. In
    case of zero repetitions, the empty match `((), text)` is returned.

    Examples::

        >>> sentence = ZeroOrMore(RE(r'\w+,?')) + TKN('.')
        >>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
        'Wo viel der Weisheit, da auch viel des Grämens.'
        >>> Grammar(sentence)('.').content  # an empty sentence also matches
        '.'
        >>> forever = ZeroOrMore(RegExp(''))
        >>> Grammar(forever)('')  # infinite loops will automatically be broken
        Node(':EMPTY', '')

    EBNF-Notation: ``{ ... }``

    EBNF-Example:  ``sentence = { /\w+,?/ } "."``
    """

    @cython.locals(n=cython.int, length=cython.int)
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """Applies the child parser to `text` as often as it matches and
        returns a node collecting the results. Never returns None."""
        results = ()  # type: Tuple[Node, ...]
        # `length` instead of `len`: do not shadow the builtin
        length = text.__len__()
        n = length + 1  # type: int
        while length < n:
            n = length
            node, text = self.parser(text)
            length = text.__len__()
            if node is None:
                break
            if node._result or not node.tag_name.startswith(':'):  # drop anonymous empty nodes
                results += (node,)
            if length == n:
                break  # avoid infinite loop on empty matches
        nd = self._return_values(results)  # type: Node
        return nd, text

    def __repr__(self):
        # strip the parentheses of an unnamed Alternative before bracing
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.pname else self.parser.repr) + '}'
1814
1815


eckhart's avatar
eckhart committed
1816
class OneOrMore(UnaryParser):
    r"""
    `OneOrMore` applies a parser repeatedly as long as this parser
    matches. Other than `ZeroOrMore` which always matches, at least
    one match is required by `OneOrMore`.

    Examples::

        >>> sentence = OneOrMore(RE(r'\w+,?')) + TKN('.')
        >>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
        'Wo viel der Weisheit, da auch viel des Grämens.'
        >>> str(Grammar(sentence)('.'))  # an empty sentence also matches
        ' <<< Error on "." | Parser "{/\\w+,?/ ~}+ `.` ~" did not match! >>> '
        >>> forever = OneOrMore(RegExp(''))
        >>> Grammar(forever)('')  # infinite loops will automatically be broken
        Node(':EMPTY', '')

    EBNF-Notation: ``{ ... }+``

    EBNF-Example:  ``sentence = { /\w+,?/ }+``
    """

    def __init__(self, parser: Parser) -> None:
        super(OneOrMore, self).__init__(parser)
        assert not isinstance(parser, Option), \
            "Use ZeroOrMore instead of nesting OneOrMore and Option: " \
            "%s(%s)" % (self.ptype, parser.pname)

    # declare loop locals for cython, consistent with ZeroOrMore._parse
    @cython.locals(n=cython.int, length=cython.int)
    def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """Applies the child parser to `text` as often as it matches.
        Returns (None, text) if the child parser did not match even once."""
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
        match_flag = False
        # `length` instead of `len`: do not shadow the builtin
        length = text.__len__()
        n = length + 1  # type: int
        while length < n:
            n = length
            node, text_ = self.parser(text_)
            length = text_.__len__()
            if node is None:
                break
            match_flag = True
            if node._result or not node.tag_name.startswith(':'):  # drop anonymous empty nodes
                results += (node,)
            if length == n:
                break  # avoid infinite loop on empty matches
        if not match_flag:
            # not a single match: report failure, leave `text` untouched
            return None, text
        nd = self._return_values(results)  # type: Node
        return nd, text_

    def __repr__(self):
        # strip the parentheses of an unnamed Alternative before bracing
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.pname else self.parser.repr) + '}+'
1869
1870


1871
1872
1873
1874
# Type alias: list of (pattern, error message) pairs; the pattern may be a
# plain string or a compiled regular expression object.
MessagesType = List[Tuple[Union[str, Any], str]]
# Sentinel index for the `mandatory` parameter meaning "no element is mandatory"
NO_MANDATORY = 1000


Eckhart Arnold's avatar
Eckhart Arnold committed
1875
class MandatoryNary(NaryParser):
1876
    r"""
1877
    Attributes:
eckhart's avatar
eckhart committed
1878
        mandatory:  Number of the element starting at which the element
1879
1880
                and all following elements are considered "mandatory". This
                means that rather than returning a non-match an error message
eckhart's avatar
eckhart committed
1881
                is issued. The default value is NO_MANDATORY, which means that
1882
1883
1884
                no elements are mandatory. NOTE: The semantics of the mandatory-
                parameter might change depending on the sub-class implementing
                it.
eckhart's avatar
eckhart committed
1885
1886
1887
1888
1889
1890
1891
        err_msgs:  A list of pairs of regular expressions (or simple
                strings for that matter) and error messages that are chosen
                if the regular expression matches the text where the error
                occurred.
        skip_list: A list of regular expressions. The rest of the text is searched for
                each of these. The closest match is the point where parsing will be
                resumed.
1892
    """
1893