parse.py 81.1 KB
Newer Older
eckhart's avatar
eckhart committed
1001
1002
                self.grammar.tree__.new_error(
                    node,
1003
1004
                    'Preprocessor-token cannot have zero length. '
                    '(Most likely due to a preprocessor bug!)')
1005
                return node, text[2:]
1006
1007
            elif text.find(BEGIN_TOKEN, 1, end) >= 0:
                node = Node(self, text[len(self.name) + 1:end])
eckhart's avatar
eckhart committed
1008
1009
                self.grammar.tree__.new_error(
                    node,
1010
1011
1012
1013
1014
                    'Preprocessor-tokens must not be nested or contain '
                    'BEGIN_TOKEN delimiter as part of their argument. '
                    '(Most likely due to a preprocessor bug!)')
                return node, text[end:]
            if text[1:len(self.name) + 1] == self.name:
1015
                return Node(self, text[len(self.name) + 2:end]), text[end + 1:]
1016
1017
1018
        return None, text


1019
class Token(Parser):
eckhart's avatar
eckhart committed
1020
    """
1021
    Parses plain text strings. (Could be done by RegExp as well, but is faster.)
eckhart's avatar
eckhart committed
1022

eckhart's avatar
eckhart committed
1023
1024
    Example::

1025
        >>> while_token = Token("while")
eckhart's avatar
eckhart committed
1026
1027
        >>> Grammar(while_token)("while").content
        'while'
eckhart's avatar
eckhart committed
1028
    """
1029
    assert TOKEN_PTYPE == ":Token"
eckhart's avatar
eckhart committed
1030

1031
1032
    def __init__(self, text: str) -> None:
        super().__init__()
eckhart's avatar
eckhart committed
1033
        self.text = text
1034
        self.len = len(text)
eckhart's avatar
eckhart committed
1035
1036

    def __deepcopy__(self, memo):
1037
1038
        duplicate = self.__class__(self.text)
        duplicate.name = self.name
1039
        duplicate.ptype = self.ptype
1040
        return duplicate
eckhart's avatar
eckhart committed
1041
1042
1043

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        if text.startswith(self.text):
1044
            return Node(self, self.text, True), text[self.len:]
eckhart's avatar
eckhart committed
1045
1046
        return None, text

1047
1048
1049
    def __repr__(self):
        return ("'%s'" if self.text.find("'") <= 0 else '"%s"') % self.text

eckhart's avatar
eckhart committed
1050

1051
1052
1053
1054
1055
1056
1057
1058
1059
class RegExp(Parser):
    r"""
    Regular expression parser.

    The RegExp-parser parses text that matches a regular expression.
    RegExp can also be considered as the "atomic parser", because all
    other parsers delegate part of the parsing job to other parsers,
    but do not match text directly.

1060
1061
1062
1063
1064
    Example::

        >>> word = RegExp(r'\w+')
        >>> Grammar(word)("Haus").content
        'Haus'
1065

1066
    EBNF-Notation:  ``/ ... /``
1067

1068
    EBNF-Example:   ``word = /\w+/``
1069
1070
    """

1071
1072
    def __init__(self, regexp) -> None:
        super().__init__()
1073
1074
1075
1076
1077
1078
1079
1080
        self.regexp = re.compile(regexp) if isinstance(regexp, str) else regexp

    def __deepcopy__(self, memo):
        # `regex` supports deep copies, but not `re`
        try:
            regexp = copy.deepcopy(self.regexp, memo)
        except TypeError:
            regexp = self.regexp.pattern
1081
1082
        duplicate = self.__class__(regexp)
        duplicate.name = self.name
eckhart's avatar
eckhart committed
1083
        duplicate.ptype = self.ptype
1084
        return duplicate
1085
1086

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
1087
1088
1089
1090
        match = text.match(self.regexp)
        if match:
            capture = match.group(0)
            end = text.index(match.end())
1091
1092
            # regular expression must never match preprocessor-tokens!
            # TODO: Find a better solution here? e.g. static checking/re-mangling at compile time
1093
1094
1095
1096
1097
            i = capture.find(BEGIN_TOKEN)
            if i >= 0:
                capture = capture[:i]
                end = i
            return Node(self, capture, True), text[end:]
1098
1099
1100
        return None, text

    def __repr__(self):
1101
        return escape_control_characters('/%s/' % self.regexp.pattern)
1102
1103


eckhart's avatar
eckhart committed
1104
def withWS(parser_factory, wsL='', wsR=r'\s*'):
1105
    """Syntactic Sugar for 'Series(Whitespace(wsL), parser_factory(), Whitespace(wsR))'.
1106
    """
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
    if wsL and isinstance(wsL, str):
        wsL = Whitespace(wsL)
    if wsR and isinstance(wsR, str):
        wsR = Whitespace(wsR)
    if wsL and wsR:
        return Series(wsL, parser_factory(), wsR)
    elif wsL:
        return Series(wsL, parser_factory())
    elif wsR:
        return Series(parser_factory(), wsR)
    else:
        return parser_factory()
1119

1120

eckhart's avatar
eckhart committed
1121
def RE(regexp, wsL='', wsR=r'\s*'):
1122
    """Syntactic Sugar for 'Series(Whitespace(wsL), RegExp(regexp), Whitespace(wsR))'"""
eckhart's avatar
eckhart committed
1123
    return withWS(lambda: RegExp(regexp), wsL, wsR)
1124
1125


eckhart's avatar
eckhart committed
1126
def TKN(token, wsL='', wsR=r'\s*'):
1127
1128
    """Syntactic Sugar for 'Series(Whitespace(wsL), Token(token), Whitespace(wsR))'"""
    return withWS(lambda: Token(token), wsL, wsR)
1129
1130


1131
1132
1133
1134
class Whitespace(RegExp):
    """An variant of RegExp that signifies through its class name that it
    is a RegExp-parser for whitespace."""
    assert WHITESPACE_PTYPE == ":Whitespace"
eckhart's avatar
eckhart committed
1135

eckhart's avatar
eckhart committed
1136
1137
1138
    def __repr__(self):
        return '~'

1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158

########################################################################
#
# Containing parser classes, i.e. parsers that contain other parsers
# to which they delegate parsing
#
########################################################################


class UnaryOperator(Parser):
    """
    Base class of all unary parser operators, i.e. parser that contains
    one and only one other parser, like the optional parser for example.

    The UnaryOperator base class supplies __deepcopy__ and apply
    methods for unary parser operators. The __deepcopy__ method needs
    to be overwritten, however, if the constructor of a derived class
    has additional parameters.
    """

1159
1160
    def __init__(self, parser: Parser) -> None:
        super(UnaryOperator, self).__init__()
1161
1162
1163
1164
1165
        assert isinstance(parser, Parser), str(parser)
        self.parser = parser  # type: Parser

    def __deepcopy__(self, memo):
        parser = copy.deepcopy(self.parser, memo)
1166
1167
        duplicate = self.__class__(parser)
        duplicate.name = self.name
eckhart's avatar
eckhart committed
1168
        duplicate.ptype = self.ptype
1169
        return duplicate
1170

eckhart's avatar
eckhart committed
1171
1172
1173
    def _apply(self, func: ApplyFunc, flip: FlagFunc) -> bool:
        if super()._apply(func, flip):
            self.parser._apply(func, flip)
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
            return True
        return False


class NaryOperator(Parser):
    """
    Base class of all Nnary parser operators, i.e. parser that
    contains one or more other parsers, like the alternative
    parser for example.

    The NnaryOperator base class supplies __deepcopy__ and apply methods
    for unary parser operators. The __deepcopy__ method needs to be
    overwritten, however, if the constructor of a derived class has
    additional parameters.
    """

1190
1191
    def __init__(self, *parsers: Parser) -> None:
        super().__init__()
1192
1193
1194
1195
1196
        assert all([isinstance(parser, Parser) for parser in parsers]), str(parsers)
        self.parsers = parsers  # type: Tuple[Parser, ...]

    def __deepcopy__(self, memo):
        parsers = copy.deepcopy(self.parsers, memo)
1197
1198
        duplicate = self.__class__(*parsers)
        duplicate.name = self.name
eckhart's avatar
eckhart committed
1199
        duplicate.ptype = self.ptype
1200
        return duplicate
1201

eckhart's avatar
eckhart committed
1202
1203
    def _apply(self, func: ApplyFunc, flip: FlagFunc) -> bool:
        if super()._apply(func, flip):
1204
            for parser in self.parsers:
eckhart's avatar
eckhart committed
1205
                parser._apply(func, flip)
1206
1207
1208
1209
1210
1211
            return True
        return False


class Option(UnaryOperator):
    r"""
1212
    Parser ``Option`` always matches, even if its child-parser
1213
1214
    did not match.

1215
    If the child-parser did not match ``Option`` returns a node
1216
1217
    with no content and does not move forward in the text.

1218
    If the child-parser did match, ``Option`` returns the a node
eckhart's avatar
eckhart committed
1219
    with the node returned by the child-parser as its single
1220
1221
1222
    child and the text at the position where the child-parser
    left it.

1223
1224
    Examples::

1225
        >>> number = Option(TKN('-')) + RegExp(r'\d+') + Option(RegExp(r'\.\d+'))
1226
1227
1228
1229
1230
1231
        >>> Grammar(number)('3.14159').content
        '3.14159'
        >>> Grammar(number)('3.14159').structure
        '(:Series (:Option) (:RegExp "3") (:Option (:RegExp ".14159")))'
        >>> Grammar(number)('-1').content
        '-1'
1232

1233
    EBNF-Notation: ``[ ... ]``
1234

1235
    EBNF-Example:  ``number = ["-"]  /\d+/  [ /\.\d+/ ]``
1236
1237
    """

1238
1239
    def __init__(self, parser: Parser) -> None:
        super().__init__(parser)
1240
1241
        # assert isinstance(parser, Parser)
        assert not isinstance(parser, Option), \
eckhart's avatar
eckhart committed
1242
            "Redundant nesting of options: %s%s" % (str(parser.name), str(self.ptype))
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        node, text = self.parser(text)
        if node:
            return Node(self, node), text
        return Node(self, ()), text

    def __repr__(self):
        return '[' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.name else self.parser.repr) + ']'


class ZeroOrMore(Option):
    r"""
    `ZeroOrMore` applies a parser repeatedly as long as this parser
    matches. Like `Option` the `ZeroOrMore` parser always matches. In
    case of zero repetitions, the empty match `((), text)` is returned.

1261
1262
    Examples::

1263
        >>> sentence = ZeroOrMore(RE(r'\w+,?')) + TKN('.')
1264
1265
1266
1267
        >>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
        'Wo viel der Weisheit, da auch viel des Grämens.'
        >>> Grammar(sentence)('.').content  # an empty sentence also matches
        '.'
1268

1269
    EBNF-Notation: ``{ ... }``
1270

1271
    EBNF-Example:  ``sentence = { /\w+,?/ } "."``
1272
1273
1274
1275
    """

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        results = ()  # type: Tuple[Node, ...]
di68kap's avatar
di68kap committed
1276
1277
        n = len(text) + 1  # type: int
        infinite_loop_error = None  # type: Optional[Error]
1278
1279
1280
1281
1282
1283
        while text and len(text) < n:
            n = len(text)
            node, text = self.parser(text)
            if not node:
                break
            if len(text) == n:
Eckhart Arnold's avatar
Eckhart Arnold committed
1284
                node.errors.append(Error("Infinite loop!", node.pos, Error.MESSAGE))
eckhart's avatar
eckhart committed
1285
                infinite_loop_error = Error(dsl_error_msg(self, 'Infinite Loop encountered.'),
Eckhart Arnold's avatar
Eckhart Arnold committed
1286
                                            node.pos)
1287
            results += (node,)
di68kap's avatar
di68kap committed
1288
1289
        node = Node(self, results)
        if infinite_loop_error:
eckhart's avatar
eckhart committed
1290
            self.grammar.tree__.add_error(node, infinite_loop_error)
di68kap's avatar
di68kap committed
1291
        return node, text
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303

    def __repr__(self):
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.name else self.parser.repr) + '}'


class OneOrMore(UnaryOperator):
    r"""
    `OneOrMore` applies a parser repeatedly as long as this parser
    matches. Other than `ZeroOrMore` which always matches, at least
    one match is required by `OneOrMore`.

1304
1305
    Examples::

1306
        >>> sentence = OneOrMore(RE(r'\w+,?')) + TKN('.')
1307
1308
1309
        >>> Grammar(sentence)('Wo viel der Weisheit, da auch viel des Grämens.').content
        'Wo viel der Weisheit, da auch viel des Grämens.'
        >>> str(Grammar(sentence)('.'))  # an empty sentence also matches
di68kap's avatar
di68kap committed
1310
        ' <<< Error on "." | Parser did not match! >>> '
1311

1312
    EBNF-Notation: ``{ ... }+``
1313

1314
    EBNF-Example:  ``sentence = { /\w+,?/ }+``
1315
1316
    """

1317
1318
    def __init__(self, parser: Parser) -> None:
        super().__init__(parser)
1319
1320
        assert not isinstance(parser, Option), \
            "Use ZeroOrMore instead of nesting OneOrMore and Option: " \
di68kap's avatar
di68kap committed
1321
            "%s(%s)" % (str(self.ptype), str(parser.name))
1322
1323
1324
1325

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
di68kap's avatar
di68kap committed
1326
1327
        n = len(text) + 1  # type: int
        infinite_loop_error = None  # type: Optional[Error]
1328
1329
1330
1331
1332
1333
        while text_ and len(text_) < n:
            n = len(text_)
            node, text_ = self.parser(text_)
            if not node:
                break
            if len(text_) == n:
Eckhart Arnold's avatar
Eckhart Arnold committed
1334
                node.errors.append(Error("Infinite loop!", node.pos, Error.MESSAGE))
eckhart's avatar
eckhart committed
1335
                infinite_loop_error = Error(dsl_error_msg(self, 'Infinite Loop encountered.'),
Eckhart Arnold's avatar
Eckhart Arnold committed
1336
                                            node.pos)
1337
1338
1339
            results += (node,)
        if results == ():
            return None, text
di68kap's avatar
di68kap committed
1340
1341
        node = Node(self, results)
        if infinite_loop_error:
eckhart's avatar
eckhart committed
1342
            self.grammar.tree__.add_error(node, infinite_loop_error)
di68kap's avatar
di68kap committed
1343
        return node, text_
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354

    def __repr__(self):
        return '{' + (self.parser.repr[1:-1] if isinstance(self.parser, Alternative)
                      and not self.parser.name else self.parser.repr) + '}+'


class Series(NaryOperator):
    r"""
    Matches if each of a series of parsers matches exactly in the order of
    the series.

1355
    Attributes:
1356
1357
1358
1359
1360
1361
1362
1363
        mandatory (int):  Number of the element statring at which the element
                and all following elements are considered "mandatory". This
                means that rather than returning a non-match an error message
                is isssued. The default value is Series.NOPE, which means that
                no elements are mandatory.
        errmsg (str):  An optional error message that overrides the default
               message for mandatory continuation errors. This can be used to
               provide more helpful error messages to the user.
1364

1365
1366
    Example::

1367
        >>> variable_name = RegExp('(?!\d)\w') + RE('\w*')
1368
1369
1370
        >>> Grammar(variable_name)('variable_1').content
        'variable_1'
        >>> str(Grammar(variable_name)('1_variable'))
di68kap's avatar
di68kap committed
1371
        ' <<< Error on "1_variable" | Parser did not match! >>> '
1372

1373
    EBNF-Notation: ``... ...``    (sequence of parsers separated by a blank or new line)
1374

1375
    EBNF-Example:  ``series = letter letter_or_digit``
1376
1377
1378
1379
    """
    RX_ARGUMENT = re.compile(r'\s(\S)')
    NOPE = 1000

1380
    def __init__(self, *parsers: Parser, mandatory: int = NOPE, errmsg: str="") -> None:
1381
        super().__init__(*parsers)
1382
1383
1384
1385
1386
1387
1388
        length = len(self.parsers)
        assert 1 <= length < Series.NOPE, \
            'Length %i of series exceeds maximum length of %i' % (length, Series.NOPE)
        if mandatory < 0:
            mandatory += length
        assert 0 <= mandatory < length or mandatory == Series.NOPE
        self.mandatory = mandatory
1389
        self.errmsg = errmsg
1390
1391
1392

    def __deepcopy__(self, memo):
        parsers = copy.deepcopy(self.parsers, memo)
1393
        duplicate = self.__class__(*parsers, mandatory=self.mandatory, errmsg=self.errmsg)
1394
        duplicate.name = self.name
eckhart's avatar
eckhart committed
1395
        duplicate.ptype = self.ptype
1396
        return duplicate
1397
1398
1399
1400

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
di68kap's avatar
di68kap committed
1401
        mandatory_violation = None
1402
        for pos, parser in enumerate(self.parsers):
1403
1404
1405
1406
1407
1408
            node, text_ = parser(text_)
            if not node:
                if pos < self.mandatory:
                    return None, text
                else:
                    # Provide useful error messages
Eckhart Arnold's avatar
Eckhart Arnold committed
1409
1410
1411
1412
                    # TODO: Change to accomodate to resuming after errors are caught.
                    # match = text.search(Series.RX_ARGUMENT)
                    # i = max(1, text.index(match.regs[1][0])) if match else 1
                    i = 0
di68kap's avatar
di68kap committed
1413
                    location = self.grammar.document_length__ - len(text_)
1414
                    node = Node(None, text_[:i]).init_pos(location)
1415
1416
1417
1418
1419
1420
                    # self.grammar.tree__.add_error(
                    #     node, Error("§ %s violation" % parser.repr, location, Error.MESSAGE))
                    # # node.errors.append(Error("§ %s violation" % parser.repr,
                    # #                          location, Error.MESSAGE))
                    found = text_[:10].replace('\n', '\\n ')
                    if self.errmsg:
1421
                        msg = self.errmsg.format(parser.repr, found)
1422
1423
1424
1425
                    else:
                        msg = '%s expected, "%s" found!' % (parser.repr, found)
                    mandatory_violation = Error(msg, location, Error.MANDATORY_CONTINUATION)
                    self.grammar.tree__.add_error(node, mandatory_violation)
1426
                    text_ = text_[i:]
1427
1428
                    results += (node,)
                    break
1429
1430
1431
1432
            results += (node,)
            # if node.error_flag:  # break on first error
            #    break
        assert len(results) <= len(self.parsers)
di68kap's avatar
di68kap committed
1433
1434
        node = Node(self, results)
        if mandatory_violation:
1435
            raise ParserError(node, text, first_throw=True)
1436
            # self.grammar.tree__.add_error(node, mandatory_violation)
di68kap's avatar
di68kap committed
1437
        return node, text_
1438
1439
1440
1441
1442
1443
1444

    def __repr__(self):
        return " ".join([parser.repr for parser in self.parsers[:self.mandatory]]
                        + (['§'] if self.mandatory != Series.NOPE else [])
                        + [parser.repr for parser in self.parsers[self.mandatory:]])

    # The following operator definitions add syntactical sugar, so one can write:
1445
    # `RE('\d+') + Optional(RE('\.\d+)` instead of `Series(RE('\d+'), Optional(RE('\.\d+))`
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463

    @staticmethod
    def combined_mandatory(left: Parser, right: Parser):
        """
        Returns the position of the first mandatory element (if any) when
        parsers `left` and `right` are joined to a sequence.
        """
        left_mandatory, left_length = (left.mandatory, len(left.parsers)) \
            if isinstance(left, Series) else (Series.NOPE, 1)
        if left_mandatory != Series.NOPE:
            return left_mandatory
        right_mandatory = right.mandatory if isinstance(right, Series) else Series.NOPE
        if right_mandatory != Series.NOPE:
            return right_mandatory + left_length
        return Series.NOPE

    def __add__(self, other: Parser) -> 'Series':
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
1464
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
1465
1466
1467
1468
1469
        return Series(*(self.parsers + other_parsers),
                      mandatory=self.combined_mandatory(self, other))

    def __radd__(self, other: Parser) -> 'Series':
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
1470
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
1471
1472
1473
1474
1475
        return Series(*(other_parsers + self.parsers),
                      mandatory=self.combined_mandatory(other, self))

    def __iadd__(self, other: Parser) -> 'Series':
        other_parsers = cast('Series', other).parsers if isinstance(other, Series) \
1476
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
        self.parsers += other_parsers
        self.mandatory = self.combined_mandatory(self, other)
        return self


class Alternative(NaryOperator):
    r"""
    Matches if one of several alternatives matches. Returns
    the first match.

    This parser represents the EBNF-operator "|" with the qualification
    that both the symmetry and the ambiguity of the EBNF-or-operator
1489
    are broken by selecting the first match.::
1490

1491
        # the order of the sub-expression matters!
1492
        >>> number = RE('\d+') | RE('\d+') + RE('\.') + RE('\d+')
1493
1494
        >>> str(Grammar(number)("3.1416"))
        '3 <<< Error on ".141" | Parser stopped before end! trying to recover... >>> '
1495

1496
        # the most selective expression should be put first:
1497
        >>> number = RE('\d+') + RE('\.') + RE('\d+') | RE('\d+')
1498
1499
        >>> Grammar(number)("3.1416").content
        '3.1416'
1500

1501
    EBNF-Notation: ``... | ...``
1502

1503
    EBNF-Example:  ``sentence = /\d+\.\d+/ | /\d+/``
1504
1505
    """

1506
1507
    def __init__(self, *parsers: Parser) -> None:
        super().__init__(*parsers)
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
        assert len(self.parsers) >= 1
        # only the last alternative may be optional. Could this be checked at compile time?
        assert all(not isinstance(p, Option) for p in self.parsers[:-1]), \
            "Parser-specification Error: only the last alternative may be optional!"

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        for parser in self.parsers:
            node, text_ = parser(text)
            if node:
                return Node(self, node), text_
        return None, text

    def __repr__(self):
        return '(' + ' | '.join(parser.repr for parser in self.parsers) + ')'

    def reset(self):
        super(Alternative, self).reset()
        return self

    # The following operator definitions add syntactical sugar, so one can write:
1528
1529
    # `RE('\d+') + RE('\.') + RE('\d+') | RE('\d+')` instead of:
    # `Alternative(Series(RE('\d+'), RE('\.'), RE('\d+')), RE('\d+'))`
1530
1531
1532

    def __or__(self, other: Parser) -> 'Alternative':
        other_parsers = cast('Alternative', other).parsers if isinstance(other, Alternative) \
1533
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
1534
1535
1536
1537
        return Alternative(*(self.parsers + other_parsers))

    def __ror__(self, other: Parser) -> 'Alternative':
        other_parsers = cast('Alternative', other).parsers if isinstance(other, Alternative) \
1538
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
1539
1540
1541
1542
        return Alternative(*(other_parsers + self.parsers))

    def __ior__(self, other: Parser) -> 'Alternative':
        other_parsers = cast('Alternative', other).parsers if isinstance(other, Alternative) \
1543
            else cast(Tuple[Parser, ...], (other,))  # type: Tuple[Parser, ...]
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
        self.parsers += other_parsers
        return self


class AllOf(NaryOperator):
    """
    Matches if all elements of a list of parsers match. Each parser must
    match exactly once. Other than in a sequence, the order in which
    the parsers match is arbitrary, however.

1554
1555
    Example::

1556
        >>> prefixes = AllOf(TKN("A"), TKN("B"))
1557
1558
1559
1560
        >>> Grammar(prefixes)('A B').content
        'A B'
        >>> Grammar(prefixes)('B A').content
        'B A'
1561

1562
    EBNF-Notation: ``<... ...>``    (sequence of parsers enclosed by angular brackets)
1563

1564
    EBNF-Example:  ``set = <letter letter_or_digit>``
1565
1566
    """

1567
    def __init__(self, *parsers: Parser) -> None:
1568
1569
1570
1571
1572
1573
1574
1575
        if len(parsers) == 1 and isinstance(parsers[0], Series):
            assert isinstance(parsers[0], Series), \
                "Parser-specification Error: No single arguments other than a Series " \
                "allowed as arguments for AllOf-Parser !"
            series = cast(Series, parsers[0])
            assert series.mandatory == Series.NOPE, \
                "AllOf cannot contain mandatory (§) elements!"
            parsers = series.parsers
1576
        super().__init__(*parsers)
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
        parsers = list(self.parsers)  # type: List[Parser]
        while parsers:
            for i, parser in enumerate(parsers):
                node, text__ = parser(text_)
                if node:
                    results += (node,)
                    text_ = text__
                    del parsers[i]
                    break
            else:
                return None, text
        assert len(results) <= len(self.parsers)
        return Node(self, results), text_

    def __repr__(self):
        return '<' + ' '.join(parser.repr for parser in self.parsers) + '>'


class SomeOf(NaryOperator):
    """
    Matches if at least one element of a list of parsers match. No parser
    must match more than once . Other than in a sequence, the order in which
    the parsers match is arbitrary, however.

1605
1606
    Example::

1607
        >>> prefixes = SomeOf(TKN("A"), TKN("B"))
1608
1609
1610
1611
1612
1613
        >>> Grammar(prefixes)('A B').content
        'A B'
        >>> Grammar(prefixes)('B A').content
        'B A'
        >>> Grammar(prefixes)('B').content
        'B'
1614

1615
    EBNF-Notation: ``<... ...>``    (sequence of parsers enclosed by angular brackets)
1616

1617
    EBNF-Example:  ``set = <letter letter_or_digit>``
1618
1619
    """

1620
    def __init__(self, *parsers: Parser) -> None:
1621
1622
1623
1624
1625
1626
        if len(parsers) == 1:
            assert isinstance(parsers[0], Alternative), \
                "Parser-specification Error: No single arguments other than a Alternative " \
                "allowed as arguments for SomeOf-Parser !"
            alternative = cast(Alternative, parsers[0])
            parsers = alternative.parsers
1627
        super().__init__(*parsers)
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
        parsers = list(self.parsers)  # type: List[Parser]
        while parsers:
            for i, parser in enumerate(parsers):
                node, text__ = parser(text_)
                if node:
                    results += (node,)
                    text_ = text__
                    del parsers[i]
                    break
            else:
                parsers = []
        assert len(results) <= len(self.parsers)
        if results:
            return Node(self, results), text_
        else:
            return None, text

    def __repr__(self):
        return '<' + ' | '.join(parser.repr for parser in self.parsers) + '>'


1653
def Unordered(parser: NaryOperator) -> NaryOperator:
1654
1655
1656
1657
1658
    """
    Returns an AllOf- or SomeOf-parser depending on whether `parser`
    is a Series (AllOf) or an Alternative (SomeOf).
    """
    if isinstance(parser, Series):
1659
        return AllOf(parser)
1660
    elif isinstance(parser, Alternative):
1661
        return SomeOf(parser)
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
    else:
        raise AssertionError("Unordered can take only Series or Alternative as parser.")


########################################################################
#
# Flow control operators
#
########################################################################

class FlowOperator(UnaryOperator):
    """
    Base class for all flow operator parsers like Lookahead and Lookbehind.
    """
    def sign(self, bool_value) -> bool:
        """Returns the value. Can be overriden to return the inverted bool."""
        return bool_value


eckhart's avatar
eckhart committed
1681
1682
1683
1684
def Required(parser: Parser) -> Parser:
    return Series(parser, mandatory=0)


1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
# class Required(FlowOperator):
#     """OBSOLETE. Use mandatory-parameter of Series-parser instead!
#     """
#     RX_ARGUMENT = re.compile(r'\s(\S)')
#
#     def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
#         node, text_ = self.parser(text)
#         if not node:
#             m = text.search(Required.RX_ARGUMENT)  # re.search(r'\s(\S)', text)
#             i = max(1, text.index(m.regs[1][0])) if m else 1
#             node = Node(self, text[:i])
#             text_ = text[i:]
eckhart's avatar
eckhart committed
1697
#             self.grammar.tree__.new_error(node,
1698
1699
#                                           '%s expected; "%s" found!' % (str(self.parser),
#                                           text[:10]), code=Error.MANDATORY_CONTINUATION)
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
#         return node, text_
#
#     def __repr__(self):
#         return '§' + self.parser.repr


class Lookahead(FlowOperator):
    """
    Matches, if the contained parser would match for the following text,
    but does not consume any text.
    """
    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        node, _ = self.parser(text)
        if self.sign(node is not None):
            return Node(self, ''), text
        else:
            return None, text

    def __repr__(self):
        return '&' + self.parser.repr


class NegativeLookahead(Lookahead):
    """
    Matches, if the contained parser would *not* match for the following
    text.
    """
    def __repr__(self):
        return '!' + self.parser.repr

    def sign(self, bool_value) -> bool:
        return not bool_value


class Lookbehind(FlowOperator):
    """
    Matches, if the contained parser would match backwards. Requires
1737
    the contained parser to be a RegExp, _RE, PlainText or _Token parser.
1738
1739

    EXPERIMENTAL
1740
    """
1741
    def __init__(self, parser: Parser) -> None:
1742
1743
1744
        p = parser
        while isinstance(p, Synonym):
            p = p.parser
1745
        assert isinstance(p, RegExp) or isinstance(p, Token)
eckhart's avatar
eckhart committed
1746
        self.regexp = None
eckhart's avatar
eckhart committed
1747
        self.text = ''  # type: str
1748
        if isinstance(p, RegExp):
eckhart's avatar
eckhart committed
1749
1750
            self.regexp = cast(RegExp, p).regexp
        else:  # p is of type PlainText
1751
            self.text = cast(Token, p).text
1752
        super().__init__(parser)
1753
1754
1755

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        backwards_text = self.grammar.reversed__[len(text):]
eckhart's avatar
eckhart committed
1756
1757
1758
1759
1760
        if self.regexp is None:  # assert self.text is not None
            does_match = backwards_text[:len(self.text)] == self.text
        else:  # assert self.regexp is not None
            does_match = backwards_text.match(self.regexp)
        return (Node(self, ''), text) if self.sign(does_match) else (None, text)
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794

    def __repr__(self):
        return '-&' + self.parser.repr


class NegativeLookbehind(Lookbehind):
    """
    Matches, if the contained parser would *not* match backwards. Requires
    the contained parser to be a RegExp-parser.
    """
    def __repr__(self):
        return '-!' + self.parser.repr

    def sign(self, bool_value) -> bool:
        return not bool(bool_value)


########################################################################
#
# Capture and Retrieve operators (for passing variables in the parser)
#
########################################################################


class Capture(UnaryOperator):
    """
    Applies the contained parser and, in case of a match, saves the result
    in a variable. A variable is a stack of values associated with the
    contained parser's name. This requires the contained parser to be named.
    """
    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        node, text_ = self.parser(text)
        if node:
            assert self.name, """Tried to apply an unnamed capture-parser!"""
1795
            stack = self.grammar.variables__[self.name]
1796
            stack.append(node.content)
Eckhart Arnold's avatar
Eckhart Arnold committed
1797
1798
            location = self.grammar.document_length__ - len(text)
            self.grammar.push_rollback__(location, lambda: stack.pop())
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
            # caching will be blocked by parser guard (see way above),
            # because it would prevent recapturing of rolled back captures
            return Node(self, node), text_
        else:
            return None, text

    def __repr__(self):
        return self.parser.repr


RetrieveFilter = Callable[[List[str]], str]


def last_value(stack: List[str]) -> str:
    return stack[-1]


def counterpart(stack: List[str]) -> str:
    value = stack[-1]
    return value.replace("(", ")").replace("[", "]").replace("{", "}").replace("<", ">")


def accumulate(stack: List[str]) -> str:
    return "".join(stack) if len(stack) > 1 else stack[-1]  # provoke IndexError if stack empty


class Retrieve(Parser):
    """
    Matches if the following text starts with the value of a particular
    variable. As a variable in this context means a stack of values,
    the last value will be compared with the following text. It will not
    be removed from the stack! (This is the difference between the
    `Retrieve` and the `Pop` parser.)
    The constructor parameter `symbol` determines which variable is
    used.
1834
1835
1836
1837
1838
1839
1840
1841

    Attributes:
        symbol: The parser that has stored the value to be retrieved, in
            other words: "the observed parser"
        rfilter: a procedure that through which the processing to the
            retrieved symbols is channeld. In the simplemost case it merely
            returns the last string stored by the observed parser. This can
            be (mis-)used to execute any kind of semantic action.
1842
1843
    """

1844
1845
    def __init__(self, symbol: Parser, rfilter: RetrieveFilter = None) -> None:
        super().__init__()
1846
1847
1848
1849
        self.symbol = symbol
        self.filter = rfilter if rfilter else last_value

    def __deepcopy__(self, memo):
1850
1851
        duplicate = self.__class__(self.symbol, self.filter)
        duplicate.name = self.name
eckhart's avatar
eckhart committed
1852
        duplicate.ptype = self.ptype
1853
        return duplicate
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        # the following indirection allows the call() method to be called
        # from subclass without triggering the parser guard a second time
        return self.retrieve_and_match(text)

    def __repr__(self):
        return ':' + self.symbol.repr

    def retrieve_and_match(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        """
        Retrieves variable from stack through the filter function passed to
        the class' constructor and tries to match the variable's value with
        the following text. Returns a Node containing the value or `None`
        accordingly.

        This functionality has been move from the __call__ method to an
        independent method to allow calling it from a subclasses __call__
        method without triggering the parser guard a second time.
        """
        try:
            stack = self.grammar.variables__[self.symbol.name]
            value = self.filter(stack)
        except (KeyError, IndexError):
1878
            node = Node(self, '')
eckhart's avatar
eckhart committed
1879
            self.grammar.tree__.new_error(
1880
1881
                node, dsl_error_msg(self, "'%s' undefined or exhausted." % self.symbol.name))
            return node, text
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
        if text.startswith(value):
            return Node(self, value), text[len(value):]
        else:
            return None, text


class Pop(Retrieve):
    """
    Matches if the following text starts with the value of a particular
    variable. As a variable in this context means a stack of values,
    the last value will be compared with the following text. Other
    than the `Retrieve`-parser, the `Pop`-parser removes the value
    from the stack in case of a match.

    The constructor parameter `symbol` determines which variable is
    used.
    """

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        node, txt = super().retrieve_and_match(text)
1902
        if node and not node.errors:
1903
1904
            stack = self.grammar.variables__[self.symbol.name]
            value = stack.pop()
Eckhart Arnold's avatar
Eckhart Arnold committed
1905
1906
            location = self.grammar.document_length__ - len(text)
            self.grammar.push_rollback__(location, lambda: stack.append(value))
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
        return node, txt

    def __repr__(self):
        return '::' + self.symbol.repr


########################################################################
#
# Aliasing parser classes
#
########################################################################


class Synonym(UnaryOperator):
    r"""
    Simply calls another parser and encapsulates the result in
    another node if that parser matches.

1925
    This parser is needed to support synonyms in EBNF, e.g.::
1926

1927
1928
        jahr       = JAHRESZAHL
        JAHRESZAHL = /\d\d\d\d/
1929

1930
1931
    Otherwise the first line could not be represented by any parser
    class, in which case it would be unclear whether the parser
1932
    RegExp('\d\d\d\d') carries the name 'JAHRESZAHL' or 'jahr'.
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
    """

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        node, text = self.parser(text)
        if node:
            return Node(self, node), text
        return None, text

    def __repr__(self):
        return self.name or self.parser.repr


class Forward(Parser):
    r"""
    Forward allows to declare a parser before it is actually defined.
    Forward declarations are needed for parsers that are recursively
1949
    nested, e.g.::
1950
1951
1952
1953
1954
1955
1956
1957
1958

        class Arithmetic(Grammar):
            '''
            expression =  term  { ("+" | "-") term }
            term       =  factor  { ("*" | "/") factor }
            factor     =  INTEGER | "("  expression  ")"
            INTEGER    =  /\d+/~
            '''
            expression = Forward()
1959
1960
1961
1962
            INTEGER    = RE('\\d+')
            factor     = INTEGER | TKN("(") + expression + TKN(")")
            term       = factor + ZeroOrMore((TKN("*") | TKN("/")) + factor)
            expression.set(term + ZeroOrMore((TKN("+") | TKN("-")) + term))
1963
            root__     = expression
1964
1965
1966
    """

    def __init__(self):
1967
        super().__init__()
1968
1969
1970
1971
1972
        self.parser = None
        self.cycle_reached = False

    def __deepcopy__(self, memo):
        duplicate = self.__class__()
di68kap's avatar
di68kap committed
1973
        # duplicate.name = self.name  # Forward-Parsers should never have a name!
eckhart's avatar
eckhart committed
1974
        duplicate.ptype = self.ptype
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
        memo[id(self)] = duplicate
        parser = copy.deepcopy(self.parser, memo)
        duplicate.set(parser)
        return duplicate

    def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
        return self.parser(text)

    def __cycle_guard(self, func, alt_return):
        """
        Returns the value of `func()` or `alt_return` if a cycle has
        been reached (which can happen if `func` calls methods of
        child parsers).
        """
        if self.cycle_reached:
            return alt_return
        else:
            self.cycle_reached = True
            ret = func()
            self.cycle_reached = False
            return ret

    def __repr__(self):
        return self.__cycle_guard(lambda: repr(self.parser), '...')

    def __str__(self):