""" preprocess.py - preprocessing of source files for DHParser

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

import bisect
import collections
import functools
from typing import Union, Callable

from DHParser.toolkit import re

__all__ = ('RX_TOKEN_NAME',
           'BEGIN_TOKEN',
           'TOKEN_DELIMITER',
           'END_TOKEN',
           'PreprocessorFunc',
           'make_token',
           'nil_preprocessor',
           'pp_tokenized',
           'tokenized_to_original_mapping',
           'source_map')

BEGIN_TOKEN = '\x1b'
TOKEN_DELIMITER = '\x1c'
END_TOKEN = '\x1d'
RESERVED_TOKEN_CHARS = BEGIN_TOKEN + TOKEN_DELIMITER + END_TOKEN

RX_TOKEN_NAME = re.compile(r'\w+')
RX_TOKEN_ARGUMENT = re.compile(r'[^\x1b\x1c\x1d]*')
RX_TOKEN = re.compile(r'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d')

PreprocessorFunc = Union[Callable[[str], str], functools.partial]
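
# Illustrative sketch of the token format (the token name 'INDENT' and its
# argument are made up for this example): a preprocessor token has the form
# BEGIN_TOKEN + name + TOKEN_DELIMITER + argument + END_TOKEN and is matched
# by RX_TOKEN:
#
#     >>> RX_TOKEN.match('\x1bINDENT\x1c    \x1d').groupdict()
#     {'name': 'INDENT', 'argument': '    '}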


def make_token(token: str, argument: str = '') -> str:
    """
    Turns the ``token`` and ``argument`` into a special token that
    will be caught by the `PreprocessorToken`-parser.

    This function is a support function that should be used by
    preprocessors to inject preprocessor tokens into the source text.
    """
    assert RX_TOKEN_NAME.match(token)
    assert RX_TOKEN_ARGUMENT.match(argument)

    return BEGIN_TOKEN + token + TOKEN_DELIMITER + argument + END_TOKEN
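
# Usage sketch (the token name 'REGEX' is only an example):
#
#     >>> make_token('REGEX', r'\s+')
#     '\x1bREGEX\x1c\\s+\x1d'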


def nil_preprocessor(text: str) -> str:
    """A preprocessor that does nothing, i.e. just returns the input."""
    return text


def pp_tokenized(tokenized: str) -> str:
    """Returns a pretty-printable version of a document that contains tokens."""
    return tokenized.replace('\x1b', '<').replace('\x1c', '|').replace('\x1d', '>')
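
# Example (illustrative; 'REGEX' is an arbitrary token name):
#
#     >>> pp_tokenized(make_token('REGEX', r'\s+') + ' remaining text')
#     '<REGEX|\\s+> remaining text'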


#######################################################################
#
# Source Maps - mapping source code positions between different
#               transformations of the source text
#
#######################################################################


SourceMap = collections.namedtuple('SourceMap', ['positions', 'offsets'])


def tokenized_to_original_mapping(tokenized_source: str) -> SourceMap:
    """
    Generates a source map for mapping positions in a text that has
    been enriched with token markers to their original positions.

    Args:
        tokenized_source: the source text enriched with token markers
    Returns:
        a source map, i.e. a list of positions and a list of corresponding
        offsets. The list of positions is ordered from smallest to highest.
        An offset is valid for its associated position and all following
        positions until (and excluding) the next position in the list of
        positions.
    """
    positions, offsets = [0], [0]
    o = 0
    i = tokenized_source.find(BEGIN_TOKEN)
    e = -1  # default, in case the text does not contain any tokens
    while i >= 0:
        d = tokenized_source.find(TOKEN_DELIMITER, i)
        e = tokenized_source.find(END_TOKEN, i)
        assert 0 <= d < e
        # the token markers and the token name do not occur in the original text
        o -= (d - i + 2)
        positions.extend([d + 1, e + 1])
        offsets.extend([o + 1, o])  # within the argument, the end marker does not count yet
        i = tokenized_source.find(BEGIN_TOKEN, e + 1)
    if e + 1 < len(tokenized_source):
        positions.append(len(tokenized_source))
        offsets.append(offsets[-1])

    # post conditions
    assert len(positions) == len(offsets), '\n' + str(positions) + '\n' + str(offsets)
    assert positions[0] == 0
    assert all(positions[i] < positions[i + 1] for i in range(len(positions) - 1))
    assert all(offsets[i] >= offsets[i + 1] for i in range(len(offsets) - 1))

    return SourceMap(positions, offsets)
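
# Illustrative sketch: assuming the token argument reproduces the text it
# replaced (here 'ab', so the original document read 'abcd'), the mapping
# for '\x1bT\x1cab\x1dcd' looks like this:
#
#     >>> smap = tokenized_to_original_mapping(make_token('T', 'ab') + 'cd')
#     >>> smap.positions, smap.offsets
#     ([0, 3, 6, 8], [0, -3, -4, -4])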


def source_map(position: int, srcmap: SourceMap) -> int:
    """
    Maps a position in a (pre-)processed text to its corresponding
    position in the original document according to the given source map.

    Args:
        position: the position in the processed text
        srcmap:   the source map, i.e. a mapping of locations to
                  offset values
    Returns:
        the mapped position
    """
    i = bisect.bisect_right(srcmap.positions, position)
    if i:
        # the min() clips positions that fall inside token markers, so that they
        # never map beyond the original position of the following segment
        return min(position + srcmap.offsets[i - 1],
                   srcmap.positions[i] + srcmap.offsets[i])
    raise ValueError
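
# Usage sketch, continuing the example above (tokenized '\x1bT\x1cab\x1dcd'
# versus original 'abcd'):
#
#     >>> source_map(6, smap)   # 'c' in the tokenized text
#     2
#     >>> source_map(3, smap)   # 'a', first character of the token argument
#     0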

# TODO: allow preprocessors to return their own source map (really a map or a function (easier)?)
# TODO: apply source maps in sequence.