parse.py 79.5 KB
Newer Older
1001
        return duplicate
eckhart's avatar
eckhart committed
1002
1003
1004

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        if text.startswith(self.text):
1005
            return Node(self, self.text, True), text[self.len:]
eckhart's avatar
eckhart committed
1006
1007
        return None, text

1008
1009
1010
    def __repr__(self):
        return ("'%s'" if self.text.find("'") <= 0 else '"%s"') % self.text

eckhart's avatar
eckhart committed
1011

1012
1013
1014
1015
1016
1017
1018
1019
1020
class RegExp(Parser):
    r"""
    Regular expression parser.

    The RegExp-parser parses text that matches a regular expression.
    RegExp can also be considered as the "atomic parser", because all
    other parsers delegate part of the parsing job to other parsers,
    but do not match text directly.

1021
1022
1023
1024
1025
    Example::

        >>> word = RegExp(r'\w+')
        >>> Grammar(word)("Haus").content
        'Haus'
1026

1027
    EBNF-Notation:  ``/ ... /``
1028

1029
    EBNF-Example:   ``word = /\w+/``
1030
1031
    """

1032
1033
    def __init__(self, regexp) -> None:
        super().__init__()
1034
1035
1036
1037
1038
1039
1040
1041
        self.regexp = re.compile(regexp) if isinstance(regexp, str) else regexp

    def __deepcopy__(self, memo):
        # `regex` supports deep copies, but not `re`
        try:
            regexp = copy.deepcopy(self.regexp, memo)
        except TypeError:
            regexp = self.regexp.pattern
1042
1043
        duplicate = self.__class__(regexp)
        duplicate.name = self.name
eckhart's avatar
eckhart committed
1044
        duplicate.ptype = self.ptype
1045
        return duplicate
1046
1047

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
1048
1049
1050
1051
        match = text.match(self.regexp)
        if match:
            capture = match.group(0)
            end = text.index(match.end())
1052
1053
            # regular expression must never match preprocessor-tokens!
            # TODO: Find a better solution here? e.g. static checking/re-mangling at compile time
1054
1055
1056
1057
1058
            i = capture.find(BEGIN_TOKEN)
            if i >= 0:
                capture = capture[:i]
                end = i
            return Node(self, capture, True), text[end:]
1059
1060
1061
        return None, text

    def __repr__(self):
1062
        return escape_control_characters('/%s/' % self.regexp.pattern)
1063
1064


eckhart's avatar
eckhart committed
1065
def withWS(parser_factory, wsL='', wsR=r'\s*'):
1066
    """Syntactic Sugar for 'Series(Whitespace(wsL), parser_factory(), Whitespace(wsR))'.
1067
    """
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
    if wsL and isinstance(wsL, str):
        wsL = Whitespace(wsL)
    if wsR and isinstance(wsR, str):
        wsR = Whitespace(wsR)
    if wsL and wsR:
        return Series(wsL, parser_factory(), wsR)
    elif wsL:
        return Series(wsL, parser_factory())
    elif wsR:
        return Series(parser_factory(), wsR)
    else:
        return parser_factory()
1080

1081

eckhart's avatar
eckhart committed
1082
def RE(regexp, wsL='', wsR=r'\s*'):
1083
    """Syntactic Sugar for 'Series(Whitespace(wsL), RegExp(regexp), Whitespace(wsR))'"""
eckhart's avatar
eckhart committed
1084
    return withWS(lambda: RegExp(regexp), wsL, wsR)
1085
1086


eckhart's avatar
eckhart committed
1087
def TKN(token, wsL='', wsR=r'\s*'):
1088
1089
    """Syntactic Sugar for 'Series(Whitespace(wsL), Token(token), Whitespace(wsR))'"""
    return withWS(lambda: Token(token), wsL, wsR)
1090
1091


1092
1093
1094
1095
class Whitespace(RegExp):
    """An variant of RegExp that signifies through its class name that it
    is a RegExp-parser for whitespace."""
    assert WHITESPACE_PTYPE == ":Whitespace"
eckhart's avatar
eckhart committed
1096

eckhart's avatar
eckhart committed
1097
1098
1099
    def __repr__(self):
        return '~'

1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119

########################################################################
#
# Containing parser classes, i.e. parsers that contain other parsers
# to which they delegate parsing
#
########################################################################


class UnaryOperator(Parser):
    """
    Base class of all unary parser operators, i.e. parser that contains
    one and only one other parser, like the optional parser for example.

    The UnaryOperator base class supplies __deepcopy__ and apply
    methods for unary parser operators. The __deepcopy__ method needs
    to be overwritten, however, if the constructor of a derived class
    has additional parameters.
    """

1120
1121
    def __init__(self, parser: Parser) -> None:
        super(UnaryOperator, self).__init__()
1122
1123
1124
1125
1126
        assert isinstance(parser, Parser), str(parser)
        self.parser = parser  # type: Parser

    def __deepcopy__(self, memo):
        parser = copy.deepcopy(self.parser, memo)
1127
1128
        duplicate = self.__class__(parser)
        duplicate.name = self.name
eckhart's avatar
eckhart committed
1129
        duplicate.ptype = self.ptype
1130
        return duplicate
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150

    def apply(self, func: Parser.ApplyFunc) -> bool:
        if super().apply(func):
            self.parser.apply(func)
            return True
        return False


class NaryOperator(Parser):
    """
    Base class of all Nnary parser operators, i.e. parser that
    contains one or more other parsers, like the alternative
    parser for example.

    The NnaryOperator base class supplies __deepcopy__ and apply methods
    for unary parser operators. The __deepcopy__ method needs to be
    overwritten, however, if the constructor of a derived class has
    additional parameters.
    """

1151
1152
    def __init__(self, *parsers: Parser) -> None:
        super().__init__()
1153
1154
1155
1156
1157
        assert all([isinstance(parser, Parser) for parser in parsers]), str(parsers)
        self.parsers = parsers  # type: Tuple[Parser, ...]

    def __deepcopy__(self, memo):
        parsers = copy.deepcopy(self.parsers, memo)
1158
1159
        duplicate = self.__class__(*parsers)
        duplicate.name = self.name
eckhart's avatar
eckhart committed
1160
        duplicate.ptype = self.ptype
1161
        return duplicate
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172

    def apply(self, func: Parser.ApplyFunc) -> bool:
        if super().apply(func):
            for parser in self.parsers:
                parser.apply(func)
            return True
        return False


class Option(UnaryOperator):
    r"""
1173
    Parser ``Option`` always matches, even if its child-parser
1174
1175
    did not match.

1176
    If the child-parser did not match ``Option`` returns a node
1177
1178
    with no content and does not move forward in the text.

1179
    If the child-parser did match, ``Option`` returns the a node
eckhart's avatar
eckhart committed
1180
    with the node returned by the child-parser as its single
1181
1182
1183
    child and the text at the position where the child-parser
    left it.

1184
1185
    Examples::

1186
        >>> number = Option(TKN('-')) + RegExp(r'\d+') + Option(RegExp(r'\.\d+'))
1187
1188
1189
1190
1191
1192
        >>> Grammar(number)('3.14159').content
        '3.14159'
        >>> Grammar(number)('3.14159').structure
        '(:Series (:Option) (:RegExp "3") (:Option (:RegExp ".14159")))'
        >>> Grammar(number)('-1').content
        '-1'
1193

1194
    EBNF-Notation: ``[ ... ]``
1195

1196
    EBNF-Example:  ``number = ["-"]  /\d+/  [ /\.\d+/ ]``
1197
1198
    """

1199
1200
    def __init__(self, parser: Parser) -> None:
        super().__init__(parser)
1201
1202
        # assert isinstance(parser, Parser)
        assert not isinstance(parser, Option), \
eckhart's avatar
eckhart committed
1203
            "Redundant nesting of options: %s%s" % (str(parser.name), str(self.ptype))
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        node, text = self.parser(text)
        if node:
            return Node(self, node), text
        return Node(self, ()), text

    def __repr__(self):
        return '[' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.name else self.parser.repr) + ']'


class ZeroOrMore(Option):
    r"""
    `ZeroOrMore` applies a parser repeatedly as long as this parser
    matches. Like `Option` the `ZeroOrMore` parser always matches. In
    case of zero repetitions, the empty match `((), text)` is returned.

1222
1223
    Examples::

1224
        >>> sentence = ZeroOrMore(RE(r'\w+,?')) + TKN('.')
1225
1226
1227
1228
        >>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
        'Wo viel der Weisheit, da auch viel des Grämens.'
        >>> Grammar(sentence)('.').content  # an empty sentence also matches
        '.'
1229

1230
    EBNF-Notation: ``{ ... }``
1231

1232
    EBNF-Example:  ``sentence = { /\w+,?/ } "."``
1233
1234
1235
1236
    """

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        results = ()  # type: Tuple[Node, ...]
di68kap's avatar
di68kap committed
1237
1238
        n = len(text) + 1  # type: int
        infinite_loop_error = None  # type: Optional[Error]
1239
1240
1241
1242
1243
1244
        while text and len(text) < n:
            n = len(text)
            node, text = self.parser(text)
            if not node:
                break
            if len(text) == n:
Eckhart Arnold's avatar
Eckhart Arnold committed
1245
                node.errors.append(Error("Infinite loop!", node.pos, Error.MESSAGE))
eckhart's avatar
eckhart committed
1246
                infinite_loop_error = Error(dsl_error_msg(self, 'Infinite Loop encountered.'),
Eckhart Arnold's avatar
Eckhart Arnold committed
1247
                                            node.pos)
1248
            results += (node,)
di68kap's avatar
di68kap committed
1249
1250
        node = Node(self, results)
        if infinite_loop_error:
eckhart's avatar
eckhart committed
1251
            self.grammar.tree__.add_error(node, infinite_loop_error)
di68kap's avatar
di68kap committed
1252
        return node, text
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264

    def __repr__(self):
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.name else self.parser.repr) + '}'


class OneOrMore(UnaryOperator):
    r"""
    `OneOrMore` applies a parser repeatedly as long as this parser
    matches. Other than `ZeroOrMore` which always matches, at least
    one match is required by `OneOrMore`.

1265
1266
    Examples::

1267
        >>> sentence = OneOrMore(RE(r'\w+,?')) + TKN('.')
1268
1269
1270
        >>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
        'Wo viel der Weisheit, da auch viel des Grämens.'
        >>> str(Grammar(sentence)('.'))  # an empty sentence also matches
di68kap's avatar
di68kap committed
1271
        ' <<< Error on "." | Parser did not match! >>> '
1272

1273
    EBNF-Notation: ``{ ... }+``
1274

1275
    EBNF-Example:  ``sentence = { /\w+,?/ }+``
1276
1277
    """

1278
1279
    def __init__(self, parser: Parser) -> None:
        super().__init__(parser)
1280
1281
        assert not isinstance(parser, Option), \
            "Use ZeroOrMore instead of nesting OneOrMore and Option: " \
di68kap's avatar
di68kap committed
1282
            "%s(%s)" % (str(self.ptype), str(parser.name))
1283
1284
1285
1286

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
di68kap's avatar
di68kap committed
1287
1288
        n = len(text) + 1  # type: int
        infinite_loop_error = None  # type: Optional[Error]
1289
1290
1291
1292
1293
1294
        while text_ and len(text_) < n:
            n = len(text_)
            node, text_ = self.parser(text_)
            if not node:
                break
            if len(text_) == n:
Eckhart Arnold's avatar
Eckhart Arnold committed
1295
                node.errors.append(Error("Infinite loop!", node.pos, Error.MESSAGE))
eckhart's avatar
eckhart committed
1296
                infinite_loop_error = Error(dsl_error_msg(self, 'Infinite Loop encountered.'),
Eckhart Arnold's avatar
Eckhart Arnold committed
1297
                                            node.pos)
1298
1299
1300
            results += (node,)
        if results == ():
            return None, text
di68kap's avatar
di68kap committed
1301
1302
        node = Node(self, results)
        if infinite_loop_error:
eckhart's avatar
eckhart committed
1303
            self.grammar.tree__.add_error(node, infinite_loop_error)
di68kap's avatar
di68kap committed
1304
        return node, text_
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315

    def __repr__(self):
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.name else self.parser.repr) + '}+'


class Series(NaryOperator):
    r"""
    Matches if each of a series of parsers matches exactly in the order of
    the series.

1316
    Attributes:
1317
1318
1319
1320
1321
1322
1323
1324
        mandatory (int):  Number of the element statring at which the element
                and all following elements are considered "mandatory". This
                means that rather than returning a non-match an error message
                is isssued. The default value is Series.NOPE, which means that
                no elements are mandatory.
        errmsg (str):  An optional error message that overrides the default
               message for mandatory continuation errors. This can be used to
               provide more helpful error messages to the user.
1325

1326
1327
    Example::

1328
        >>> variable_name = RegExp('(?!\d)\w') + RE('\w*')
1329
1330
1331
        >>> Grammar(variable_name)('variable_1').content
        'variable_1'
        >>> str(Grammar(variable_name)('1_variable'))
di68kap's avatar
di68kap committed
1332
        ' <<< Error on "1_variable" | Parser did not match! >>> '
1333

1334
    EBNF-Notation: ``... ...``    (sequence of parsers separated by a blank or new line)
1335

1336
    EBNF-Example:  ``series = letter letter_or_digit``
1337
1338
1339
1340
    """
    RX_ARGUMENT = re.compile(r'\s(\S)')
    NOPE = 1000

1341
    def __init__(self, *parsers: Parser, mandatory: int = NOPE, errmsg: str="") -> None:
1342
        super().__init__(*parsers)
1343
1344
1345
1346
1347
1348
1349
        length = len(self.parsers)
        assert 1 <= length < Series.NOPE, \
            'Length %i of series exceeds maximum length of %i' % (length, Series.NOPE)
        if mandatory < 0:
            mandatory += length
        assert 0 <= mandatory < length or mandatory == Series.NOPE
        self.mandatory = mandatory
1350
        self.errmsg = errmsg
1351
1352
1353

    def __deepcopy__(self, memo):
        parsers = copy.deepcopy(self.parsers, memo)
1354
        duplicate = self.__class__(*parsers, mandatory=self.mandatory, errmsg=self.errmsg)
1355
        duplicate.name = self.name
eckhart's avatar
eckhart committed
1356
        duplicate.ptype = self.ptype
1357
        return duplicate
1358
1359
1360
1361

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
di68kap's avatar
di68kap committed
1362
        mandatory_violation = None
1363
        for pos, parser in enumerate(self.parsers):
1364
1365
1366
1367
1368
1369
            node, text_ = parser(text_)
            if not node:
                if pos < self.mandatory:
                    return None, text
                else:
                    # Provide useful error messages
Eckhart Arnold's avatar
Eckhart Arnold committed
1370
1371
1372
1373
                    # TODO: Change to accomodate to resuming after errors are caught.
                    # match = text.search(Series.RX_ARGUMENT)
                    # i = max(1, text.index(match.regs[1][0])) if match else 1
                    i = 0
di68kap's avatar
di68kap committed
1374
1375
                    location = self.grammar.document_length__ - len(text_)
                    node = Node(self, text_[:i]).init_pos(location)
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
                    # self.grammar.tree__.add_error(
                    #     node, Error("§ %s violation" % parser.repr, location, Error.MESSAGE))
                    # # node.errors.append(Error("§ %s violation" % parser.repr,
                    # #                          location, Error.MESSAGE))
                    found = text_[:10].replace('\n', '\\n ')
                    if self.errmsg:
                        msg = self.errmsg % found if self.errmsg.find("%s") >= 0 \
                            else self.errmsg
                    else:
                        msg = '%s expected, "%s" found!' % (parser.repr, found)
                    mandatory_violation = Error(msg, location, Error.MANDATORY_CONTINUATION)
                    self.grammar.tree__.add_error(node, mandatory_violation)
1388
                    text_ = text_[i:]
1389
1390
                    results += (node,)
                    break
1391
1392
1393
1394
            results += (node,)
            # if node.error_flag:  # break on first error
            #    break
        assert len(results) <= len(self.parsers)
di68kap's avatar
di68kap committed
1395
1396
        node = Node(self, results)
        if mandatory_violation:
1397
            raise ParserError(node, text, first_throw=True)
1398
            # self.grammar.tree__.add_error(node, mandatory_violation)
di68kap's avatar
di68kap committed
1399
        return node, text_
1400
1401
1402
1403
1404
1405
1406

    def __repr__(self):
        return " ".join([parser.repr for parser in self.parsers[:self.mandatory]]
                        + (['§'] if self.mandatory != Series.NOPE else [])
                        + [parser.repr for parser in self.parsers[self.mandatory:]])

    # The following operator definitions add syntactical sugar, so one can write:
1407
    # `RE('\d+') + Optional(RE('\.\d+)` instead of `Series(RE('\d+'), Optional(RE('\.\d+))`
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425

    @staticmethod
    def combined_mandatory(left: Parser, right: Parser):
        """
        Returns the position of the first mandatory element (if any) when
        parsers `left` and `right` are joined to a sequence.
        """
        left_mandatory, left_length = (left.mandatory, len(left.parsers)) \
            if isinstance(left, Series) else (Series.NOPE, 1)
        if left_mandatory != Series.NOPE:
            return left_mandatory
        right_mandatory = right.mandatory if isinstance(right, Series) else Series.NOPE
        if right_mandatory != Series.NOPE:
            return right_mandatory + left_length
        return Series.NOPE

    def __add__(self, other: Parser) -> 'Series':
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
1426
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
1427
1428
1429
1430
1431
        return Series(*(self.parsers + other_parsers),
                      mandatory=self.combined_mandatory(self, other))

    def __radd__(self, other: Parser) -> 'Series':
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
1432
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
1433
1434
1435
1436
1437
        return Series(*(other_parsers + self.parsers),
                      mandatory=self.combined_mandatory(other, self))

    def __iadd__(self, other: Parser) -> 'Series':
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
1438
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
        self.parsers += other_parsers
        self.mandatory = self.combined_mandatory(self, other)
        return self


class Alternative(NaryOperator):
    r"""
    Matches if one of several alternatives matches. Returns
    the first match.

    This parser represents the EBNF-operator "|" with the qualification
    that both the symmetry and the ambiguity of the EBNF-or-operator
1451
    are broken by selecting the first match.::
1452

1453
        # the order of the sub-expression matters!
1454
        >>> number = RE('\d+') | RE('\d+') + RE('\.') + RE('\d+')
1455
1456
        >>> str(Grammar(number)("3.1416"))
        '3 <<< Error on ".141" | Parser stopped before end! trying to recover... >>> '
1457

1458
        # the most selective expression should be put first:
1459
        >>> number = RE('\d+') + RE('\.') + RE('\d+') | RE('\d+')
1460
1461
        >>> Grammar(number)("3.1416").content
        '3.1416'
1462

1463
    EBNF-Notation: ``... | ...``
1464

1465
    EBNF-Example:  ``sentence = /\d+\.\d+/ | /\d+/``
1466
1467
    """

1468
1469
    def __init__(self, *parsers: Parser) -> None:
        super().__init__(*parsers)
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
        assert len(self.parsers) >= 1
        # only the last alternative may be optional. Could this be checked at compile time?
        assert all(not isinstance(p, Option) for p in self.parsers[:-1]), \
            "Parser-specification Error: only the last alternative may be optional!"

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        for parser in self.parsers:
            node, text_ = parser(text)
            if node:
                return Node(self, node), text_
        return None, text

    def __repr__(self):
        return '(' + ' | '.join(parser.repr for parser in self.parsers) + ')'

    def reset(self):
        super(Alternative, self).reset()
        return self

    # The following operator definitions add syntactical sugar, so one can write:
1490
1491
    # `RE('\d+') + RE('\.') + RE('\d+') | RE('\d+')` instead of:
    # `Alternative(Series(RE('\d+'), RE('\.'), RE('\d+')), RE('\d+'))`
1492
1493
1494

    def __or__(self, other: Parser) -> 'Alternative':
        other_parsers = cast('Alternative', other).parsers if isinstance(other, Alternative) \
1495
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
1496
1497
1498
1499
        return Alternative(*(self.parsers + other_parsers))

    def __ror__(self, other: Parser) -> 'Alternative':
        other_parsers = cast('Alternative', other).parsers if isinstance(other, Alternative) \
1500
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
1501
1502
1503
1504
        return Alternative(*(other_parsers + self.parsers))

    def __ior__(self, other: Parser) -> 'Alternative':
        other_parsers = cast('Alternative', other).parsers if isinstance(other, Alternative) \
1505
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
        self.parsers += other_parsers
        return self


class AllOf(NaryOperator):
    """
    Matches if all elements of a list of parsers match. Each parser must
    match exactly once. Other than in a sequence, the order in which
    the parsers match is arbitrary, however.

1516
1517
    Example::

1518
        >>> prefixes = AllOf(TKN("A"), TKN("B"))
1519
1520
1521
1522
        >>> Grammar(prefixes)('A B').content
        'A B'
        >>> Grammar(prefixes)('B A').content
        'B A'
1523

1524
    EBNF-Notation: ``<... ...>``    (sequence of parsers enclosed by angular brackets)
1525

1526
    EBNF-Example:  ``set = <letter letter_or_digit>``
1527
1528
    """

1529
    def __init__(self, *parsers: Parser) -> None:
1530
1531
1532
1533
1534
1535
1536
1537
        if len(parsers) == 1 and isinstance(parsers[0], Series):
            assert isinstance(parsers[0], Series), \
                "Parser-specification Error: No single arguments other than a Series " \
                "allowed as arguments for AllOf-Parser !"
            series = cast(Series, parsers[0])
            assert series.mandatory == Series.NOPE, \
                "AllOf cannot contain mandatory (§) elements!"
            parsers = series.parsers
1538
        super().__init__(*parsers)
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
        parsers = list(self.parsers)  # type: List[Parser]
        while parsers:
            for i, parser in enumerate(parsers):
                node, text__ = parser(text_)
                if node:
                    results += (node,)
                    text_ = text__
                    del parsers[i]
                    break
            else:
                return None, text
        assert len(results) <= len(self.parsers)
        return Node(self, results), text_

    def __repr__(self):
        return '<' + ' '.join(parser.repr for parser in self.parsers) + '>'


class SomeOf(NaryOperator):
    """
    Matches if at least one element of a list of parsers match. No parser
    must match more than once . Other than in a sequence, the order in which
    the parsers match is arbitrary, however.

1567
1568
    Example::

1569
        >>> prefixes = SomeOf(TKN("A"), TKN("B"))
1570
1571
1572
1573
1574
1575
        >>> Grammar(prefixes)('A B').content
        'A B'
        >>> Grammar(prefixes)('B A').content
        'B A'
        >>> Grammar(prefixes)('B').content
        'B'
1576

1577
    EBNF-Notation: ``<... ...>``    (sequence of parsers enclosed by angular brackets)
1578

1579
    EBNF-Example:  ``set = <letter letter_or_digit>``
1580
1581
    """

1582
    def __init__(self, *parsers: Parser) -> None:
1583
1584
1585
1586
1587
1588
        if len(parsers) == 1:
            assert isinstance(parsers[0], Alternative), \
                "Parser-specification Error: No single arguments other than a Alternative " \
                "allowed as arguments for SomeOf-Parser !"
            alternative = cast(Alternative, parsers[0])
            parsers = alternative.parsers
1589
        super().__init__(*parsers)
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
        parsers = list(self.parsers)  # type: List[Parser]
        while parsers:
            for i, parser in enumerate(parsers):
                node, text__ = parser(text_)
                if node:
                    results += (node,)
                    text_ = text__
                    del parsers[i]
                    break
            else:
                parsers = []
        assert len(results) <= len(self.parsers)
        if results:
            return Node(self, results), text_
        else:
            return None, text

    def __repr__(self):
        return '<' + ' | '.join(parser.repr for parser in self.parsers) + '>'


1615
def Unordered(parser: NaryOperator) -> NaryOperator:
1616
1617
1618
1619
1620
    """
    Returns an AllOf- or SomeOf-parser depending on whether `parser`
    is a Series (AllOf) or an Alternative (SomeOf).
    """
    if isinstance(parser, Series):
1621
        return AllOf(parser)
1622
    elif isinstance(parser, Alternative):
1623
        return SomeOf(parser)
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
    else:
        raise AssertionError("Unordered can take only Series or Alternative as parser.")


########################################################################
#
# Flow control operators
#
########################################################################

class FlowOperator(UnaryOperator):
    """
    Base class for all flow operator parsers like Lookahead and Lookbehind.
    """
    def sign(self, bool_value) -> bool:
        """Returns the value. Can be overriden to return the inverted bool."""
        return bool_value


eckhart's avatar
eckhart committed
1643
1644
1645
1646
def Required(parser: Parser) -> Parser:
    return Series(parser, mandatory=0)


1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
# class Required(FlowOperator):
#     """OBSOLETE. Use mandatory-parameter of Series-parser instead!
#     """
#     RX_ARGUMENT = re.compile(r'\s(\S)')
#
#     def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
#         node, text_ = self.parser(text)
#         if not node:
#             m = text.search(Required.RX_ARGUMENT)  # re.search(r'\s(\S)', text)
#             i = max(1, text.index(m.regs[1][0])) if m else 1
#             node = Node(self, text[:i])
#             text_ = text[i:]
eckhart's avatar
eckhart committed
1659
#             self.grammar.tree__.new_error(node,
1660
1661
#                                           '%s expected; "%s" found!' % (str(self.parser),
#                                           text[:10]), code=Error.MANDATORY_CONTINUATION)
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
#         return node, text_
#
#     def __repr__(self):
#         return '§' + self.parser.repr


class Lookahead(FlowOperator):
    """
    Matches, if the contained parser would match for the following text,
    but does not consume any text.
    """
    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        node, _ = self.parser(text)
        if self.sign(node is not None):
            return Node(self, ''), text
        else:
            return None, text

    def __repr__(self):
        return '&' + self.parser.repr


class NegativeLookahead(Lookahead):
    """
    Matches, if the contained parser would *not* match for the following
    text.
    """
    def __repr__(self):
        return '!' + self.parser.repr

    def sign(self, bool_value) -> bool:
        return not bool_value


class Lookbehind(FlowOperator):
    """
    Matches, if the contained parser would match backwards. Requires
1699
    the contained parser to be a RegExp, _RE, PlainText or _Token parser.
1700
1701

    EXPERIMENTAL
1702
    """
1703
    def __init__(self, parser: Parser) -> None:
1704
1705
1706
        p = parser
        while isinstance(p, Synonym):
            p = p.parser
1707
        assert isinstance(p, RegExp) or isinstance(p, Token)
eckhart's avatar
eckhart committed
1708
        self.regexp = None
eckhart's avatar
eckhart committed
1709
        self.text = ''  # type: str
1710
        if isinstance(p, RegExp):
eckhart's avatar
eckhart committed
1711
1712
            self.regexp = cast(RegExp, p).regexp
        else:  # p is of type PlainText
1713
            self.text = cast(Token, p).text
1714
        super().__init__(parser)
1715
1716
1717

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        backwards_text = self.grammar.reversed__[len(text):]
eckhart's avatar
eckhart committed
1718
1719
1720
1721
1722
        if self.regexp is None:  # assert self.text is not None
            does_match = backwards_text[:len(self.text)] == self.text
        else:  # assert self.regexp is not None
            does_match = backwards_text.match(self.regexp)
        return (Node(self, ''), text) if self.sign(does_match) else (None, text)
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756

    def __repr__(self):
        return '-&' + self.parser.repr


class NegativeLookbehind(Lookbehind):
    """
    Matches, if the contained parser would *not* match backwards. Requires
    the contained parser to be a RegExp-parser.
    """
    def __repr__(self):
        return '-!' + self.parser.repr

    def sign(self, bool_value) -> bool:
        return not bool(bool_value)


########################################################################
#
# Capture and Retrieve operators (for passing variables in the parser)
#
########################################################################


class Capture(UnaryOperator):
    """
    Applies the contained parser and, in case of a match, saves the result
    in a variable. A variable is a stack of values associated with the
    contained parser's name. This requires the contained parser to be named.
    """
    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        node, text_ = self.parser(text)
        if node:
            assert self.name, """Tried to apply an unnamed capture-parser!"""
1757
            stack = self.grammar.variables__[self.name]
1758
            stack.append(node.content)
Eckhart Arnold's avatar
Eckhart Arnold committed
1759
1760
            location = self.grammar.document_length__ - len(text)
            self.grammar.push_rollback__(location, lambda: stack.pop())
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
            # caching will be blocked by parser guard (see way above),
            # because it would prevent recapturing of rolled back captures
            return Node(self, node), text_
        else:
            return None, text

    def __repr__(self):
        return self.parser.repr


RetrieveFilter = Callable[[List[str]], str]


def last_value(stack: List[str]) -> str:
    return stack[-1]


def counterpart(stack: List[str]) -> str:
    value = stack[-1]
    return value.replace("(", ")").replace("[", "]").replace("{", "}").replace("<", ">")


def accumulate(stack: List[str]) -> str:
    return "".join(stack) if len(stack) > 1 else stack[-1]  # provoke IndexError if stack empty


class Retrieve(Parser):
    """
    Matches if the following text starts with the value of a particular
    variable. As a variable in this context means a stack of values,
    the last value will be compared with the following text. It will not
    be removed from the stack! (This is the difference between the
    `Retrieve` and the `Pop` parser.)
    The constructor parameter `symbol` determines which variable is
    used.
1796
1797
1798
1799
1800
1801
1802
1803

    Attributes:
        symbol: The parser that has stored the value to be retrieved, in
            other words: "the observed parser"
        rfilter: a procedure that through which the processing to the
            retrieved symbols is channeld. In the simplemost case it merely
            returns the last string stored by the observed parser. This can
            be (mis-)used to execute any kind of semantic action.
1804
1805
    """

1806
1807
    def __init__(self, symbol: Parser, rfilter: RetrieveFilter = None) -> None:
        super().__init__()
1808
1809
1810
1811
        self.symbol = symbol
        self.filter = rfilter if rfilter else last_value

    def __deepcopy__(self, memo):
1812
1813
        duplicate = self.__class__(self.symbol, self.filter)
        duplicate.name = self.name
eckhart's avatar
eckhart committed
1814
        duplicate.ptype = self.ptype
1815
        return duplicate
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        # the following indirection allows the call() method to be called
        # from subclass without triggering the parser guard a second time
        return self.retrieve_and_match(text)

    def __repr__(self):
        return ':' + self.symbol.repr

    def retrieve_and_match(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """
        Retrieves variable from stack through the filter function passed to
        the class' constructor and tries to match the variable's value with
        the following text. Returns a Node containing the value or `None`
        accordingly.

        This functionality has been move from the __call__ method to an
        independent method to allow calling it from a subclasses __call__
        method without triggering the parser guard a second time.
        """
        try:
            stack = self.grammar.variables__[self.symbol.name]
            value = self.filter(stack)
        except (KeyError, IndexError):
1840
            node = Node(self, '')
eckhart's avatar
eckhart committed
1841
            self.grammar.tree__.new_error(
1842
1843
                node, dsl_error_msg(self, "'%s' undefined or exhausted." % self.symbol.name))
            return node, text
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
        if text.startswith(value):
            return Node(self, value), text[len(value):]
        else:
            return None, text


class Pop(Retrieve):
    """
    Matches if the following text starts with the value of a particular
    variable. As a variable in this context means a stack of values,
    the last value will be compared with the following text. Other
    than the `Retrieve`-parser, the `Pop`-parser removes the value
    from the stack in case of a match.

    The constructor parameter `symbol` determines which variable is
    used.
    """

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        node, txt = super().retrieve_and_match(text)
1864
        if node and not node.errors:
1865
1866
            stack = self.grammar.variables__[self.symbol.name]
            value = stack.pop()
Eckhart Arnold's avatar
Eckhart Arnold committed
1867
1868
            location = self.grammar.document_length__ - len(text)
            self.grammar.push_rollback__(location, lambda: stack.append(value))
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
        return node, txt

    def __repr__(self):
        return '::' + self.symbol.repr


########################################################################
#
# Aliasing parser classes
#
########################################################################


class Synonym(UnaryOperator):
    r"""
    Simply calls another parser and encapsulates the result in
    another node if that parser matches.

1887
    This parser is needed to support synonyms in EBNF, e.g.::
1888

1889
1890
        jahr       = JAHRESZAHL
        JAHRESZAHL = /\d\d\d\d/
1891

1892
1893
    Otherwise the first line could not be represented by any parser
    class, in which case it would be unclear whether the parser
1894
    RegExp('\d\d\d\d') carries the name 'JAHRESZAHL' or 'jahr'.
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
    """

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        node, text = self.parser(text)
        if node:
            return Node(self, node), text
        return None, text

    def __repr__(self):
        return self.name or self.parser.repr


class Forward(Parser):
    r"""
    Forward allows to declare a parser before it is actually defined.
    Forward declarations are needed for parsers that are recursively
1911
    nested, e.g.::
1912
1913
1914
1915
1916
1917
1918
1919
1920

        class Arithmetic(Grammar):
            '''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
1921
1922
1923
1924
            INTEGER    = RE('\\d+')
            factor     = INTEGER | TKN("(") + expression + TKN(")")
            term       = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
1925
            root__     = expression
1926
1927
1928
    """

    def __init__(self):
1929
        super().__init__()
1930
1931
1932
1933
1934
        self.parser = None
        self.cycle_reached = False

    def __deepcopy__(self, memo):
        duplicate = self.__class__()
di68kap's avatar
di68kap committed
1935
        # duplicate.name = self.name  # Forward-Parsers should never have a name!
eckhart's avatar
eckhart committed
1936
        duplicate.ptype = self.ptype
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
        memo[id(self)] = duplicate
        parser = copy.deepcopy(self.parser, memo)
        duplicate.set(parser)
        return duplicate

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        return self.parser(text)

    def __cycle_guard(self, func, alt_return):
        """
        Returns the value of `func()` or `alt_return` if a cycle has
        been reached (which can happen if `func` calls methods of
        child parsers).
        """
        if self.cycle_reached:
            return alt_return
        else:
            self.cycle_reached = True
            ret = func()
            self.cycle_reached = False
            return ret

    def __repr__(self):
        return self.__cycle_guard(lambda: repr(self.parser), '...')

    def __str__(self):
        return self.__cycle_guard(lambda: str(self.parser), '...')

1965
1966
    @property
    def repr(self) -> str:
di68kap's avatar
di68kap committed
1967
        """Returns the parser's name if it has a name or repr(self) if not."""
1968
1969
        return self.parser.name if self.parser.name else repr(self)

1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
    def set(self, parser: Parser):
        """
        Sets the parser to which the calls to this Forward-object
        shall be delegated.
        """
        self.parser = parser

    def apply(self, func: Parser.ApplyFunc) -> bool:
        if super().apply(func):
            self.parser.apply(func)
            return True
        return False