# preprocess.py - preprocessing of source files for DHParser
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences and Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.


"""
Module ``preprocess`` contains functions for preprocessing source
code before the parsing stage as well as source mapping facilities
to map the locations of parser and compiler errors to the
non-preprocessed source text.

Preprocessing (and source mapping of errors) will only be needed
for some domain-specific languages, most notably those that
cannot be described entirely with context-free grammars.
"""


import bisect
import functools
import os
from typing import Union, Optional, Callable, Tuple, NamedTuple, List, Any

from DHParser.toolkit import re, dataclasses


__all__ = ('RX_TOKEN_NAME',
           'BEGIN_TOKEN',
           'TOKEN_DELIMITER',
           'END_TOKEN',
           'SourceMap',
           'SourceMapFunc',
           'PreprocessorFunc',
           'Preprocessed',
           'PreprocessorResult',
           'make_token',
           'strip_tokens',
           'nil_preprocessor',
           'chain_preprocessors',
           'prettyprint_tokenized',
           'neutral_mapping',
           'tokenized_to_original_mapping',
           'source_map',
           'with_source_mapping',
           'gen_find_include_func',
           'preprocess_includes')


#######################################################################
#
# Types and constants
#
#######################################################################

BEGIN_TOKEN = '\x1b'
TOKEN_DELIMITER = '\x1c'
END_TOKEN = '\x1d'
RESERVED_TOKEN_CHARS = BEGIN_TOKEN + TOKEN_DELIMITER + END_TOKEN

RX_TOKEN_NAME = re.compile(r'\w+')
RX_TOKEN_ARGUMENT = re.compile(r'[^\x1b\x1c\x1d]*')
RX_TOKEN = re.compile(r'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d')
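
# A preprocessor token thus has the form BEGIN_TOKEN + name + TOKEN_DELIMITER
# + argument + END_TOKEN. Pretty-printed (with '<', '|', '>' standing in for
# the three control characters) an illustrative token 'INDENT' with the
# argument '    ' reads: <INDENT|    >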


@dataclasses.dataclass
class SourceMap:
    source_name: str       # name or path or uri of the original source file
    positions: List[int]   # a list of locations
    offsets: List[int]     # the corresponding offsets to be added from these locations onward


class SourceLocation(NamedTuple):
    source_name: str  # the file name (or path or uri) of the source code
    pos: int   # a position within this file


SourceMapFunc = Union[Callable[[int], SourceLocation],
                      functools.partial]


class Preprocessed(NamedTuple):
    preprocessed_text: str
    back_mapping: SourceMapFunc


@dataclasses.dataclass
class IncludeMap(SourceMap):
    file_names: List[str]  # list of file_names to which the source locations relate

    def has_includes(self) -> bool:
        return any(fname != self.source_name for fname in self.file_names)


class IncludeInfo(NamedTuple):
    begin: int
    length: int
    file_name: str


PreprocessorResult = Union[str, Preprocessed]


FindIncludeFunc = Union[Callable[[str, int], IncludeInfo],   # (document: str,  start: int)
                        functools.partial]
PreprocessorFunc = Union[Callable[[str, str], PreprocessorResult],  # text: str, filename: str
                         functools.partial]


#######################################################################
#
# Chaining of preprocessors
#
#######################################################################


def nil_preprocessor(source_text: str, source_name: str) -> Preprocessed:
    """
    A preprocessor that does nothing, i.e. just returns the input.
    """
    return Preprocessed(source_text, lambda i: SourceLocation(source_name, i))


def _apply_mappings(position: int, mappings: List[SourceMapFunc]) -> SourceLocation:
    """
    Sequentially apply a number of mapping functions to a source position.
    In the context of source mapping, the source position usually is a
    position within a preprocessed source text and mappings should therefore
    be a list of reverse-mappings in reversed order.
    """
    filename = ''
    for mapping in mappings:
        filename, position = mapping(position)
    return SourceLocation(filename, position)


def _apply_preprocessors(source_text: str, source_name: str,
                         preprocessors: Tuple[PreprocessorFunc, ...]) \
        -> Preprocessed:
    """
    Applies several preprocessing functions sequentially to a source text
    and returns the preprocessed text as well as a function that maps text-
    positions in the processed text onto the corresponding position in the
    original source text.
    """
    processed = source_text
    mapping_chain = []
    for prep in preprocessors:
        processed, mapping_func = with_source_mapping(prep(processed, source_name))
        mapping_chain.append(mapping_func)
    mapping_chain.reverse()
    return Preprocessed(processed, functools.partial(_apply_mappings, mappings=mapping_chain))


def chain_preprocessors(*preprocessors) -> PreprocessorFunc:
    """
    Merges a sequence of preprocessor functions into a single function.
    """
    return functools.partial(_apply_preprocessors, preprocessors=preprocessors)
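
# Illustrative sketch, not part of the module: chaining two hypothetical
# preprocessors that both follow the PreprocessorFunc signature. The chained
# function again takes (source_text, source_name), and its back-mapping
# threads positions through both steps in reverse order:
#
#   preprocess = chain_preprocessors(resolve_includes, tokenize_indentation)
#   text, mapping = preprocess(source, 'example.dsl')
#   source_name, pos = mapping(position_in_preprocessed_text)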


#######################################################################
#
# Tokenization support
#
# In DHParser the source text is usually not tokenized, but,
# optionally, it can be enriched by tokens (or parts of it replaced
# by tokens) to, say, indicate beginnings and endings of indented
# or quoted blocks that are difficult to capture with an EBNF-parser.
#
######################################################################


def make_token(token: str, argument: str = '') -> str:
    """
    Turns the ``token`` and ``argument`` into a special token that
    will be caught by the ``PreprocessorToken``-parser.

    This function is a support function that should be used by
    preprocessors to inject preprocessor tokens into the source text.
    """
    assert RX_TOKEN_NAME.match(token)
    assert RX_TOKEN_ARGUMENT.match(argument)

    return BEGIN_TOKEN + token + TOKEN_DELIMITER + argument + END_TOKEN


def prettyprint_tokenized(tokenized: str) -> str:
    """Returns a pretty-printable version of a document that contains tokens."""
    return tokenized.replace('\x1b', '<').replace('\x1c', '|').replace('\x1d', '>')
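
# Illustrative sketch, not part of the module: injecting a token with
# make_token() and rendering the control characters readably with
# prettyprint_tokenized(). The token name 'INDENT' is just an example:
#
#   >>> tokenized = make_token('INDENT', '    ') + 'if x:'
#   >>> prettyprint_tokenized(tokenized)
#   '<INDENT|    >if x:'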


def strip_tokens(tokenized: str) -> str:
    """Replaces all tokens with the token's arguments."""
    result = []
    pos = 0
    match = RX_TOKEN.search(tokenized, pos)
    while match:
        start, end = match.span()
        result.append(tokenized[pos:start])
        result.append(match.groupdict()['argument'])
        pos = end
        match = RX_TOKEN.search(tokenized, pos)
    result.append(tokenized[pos:])
    return ''.join(result)
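
# Illustrative sketch, not part of the module: strip_tokens() replaces each
# token by its argument, so a text that has merely been enriched with tokens
# round-trips back to the untokenized source:
#
#   >>> strip_tokens(make_token('INDENT', '    ') + 'if x:')
#   '    if x:'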


#######################################################################
#
# Source Maps - mapping source code positions between different
#               transformations of the source text
#
#######################################################################


def neutral_mapping(pos: int) -> SourceLocation:
    """Maps a source position onto itself and sets the source file name
    to the empty string."""
    return SourceLocation('', pos)


def tokenized_to_original_mapping(tokenized_text: str, source_name: str = 'UNKNOWN_FILE') -> SourceMap:
    """
    Generates a source map for mapping positions in a text that has
    been enriched with token markers to their original positions.

    :param tokenized_text:  the source text enriched with token markers.
    :param source_name:  the name or path or uri of the original source file.
    :returns:  a source map, i.e. a list of positions and a list of corresponding
        offsets. The list of positions is ordered from smallest to highest.
        An offset is valid for its associated position and all following
        positions until (and excluding) the next position in the list of
        positions.
    """
    positions, offsets = [0], [0]
    o = 0
    i = tokenized_text.find(BEGIN_TOKEN)
    e = -2
    while i >= 0:
        d = tokenized_text.find(TOKEN_DELIMITER, i)
        e = tokenized_text.find(END_TOKEN, i)
        assert 0 <= d < e
        o -= (d - i + 2)
        positions.extend([d + 1, e + 1])
        offsets.extend([o + 1, o])
        i = tokenized_text.find(BEGIN_TOKEN, e + 1)
    if e + 1 < len(tokenized_text):
        positions.append(len(tokenized_text) + 1)
        offsets.append(offsets[-1])

    # post conditions
    assert len(positions) == len(offsets), '\n' + str(positions) + '\n' + str(offsets)
    assert positions[0] == 0
    assert all(positions[i] < positions[i + 1] for i in range(len(positions) - 1))

    # specific condition for preprocessor tokens
    assert all(offsets[i] > offsets[i + 1] for i in range(len(offsets) - 2))

    return SourceMap(source_name, positions, offsets)


def source_map(position: int, srcmap: SourceMap) -> SourceLocation:
    """
    Maps a position in a (pre-)processed text to its corresponding
    position in the original document according to the given source map.

    :param  position: the position in the processed text
    :param  srcmap:  the source map, i.e. a mapping of locations to offset values
    :returns:  the mapped position
    """
    i = bisect.bisect_right(srcmap.positions, position)
    if i:
        return SourceLocation(
            srcmap.source_name,
            min(position + srcmap.offsets[i - 1], srcmap.positions[i] + srcmap.offsets[i]))
    raise ValueError
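
# Illustrative sketch, not part of the module: deriving a source map for a
# token-enriched text and mapping a position in it back to the untokenized
# original. The token name 'BLOCK' and the file name are just examples:
#
#   tokenized = make_token('BLOCK', 'content') + ' tail'
#   srcmap = tokenized_to_original_mapping(tokenized, 'example.dsl')
#   source_map(len(tokenized) - 1, srcmap)
#   # -> SourceLocation('example.dsl', 11), i.e. the last character of the
#   #    untokenized text 'content tail'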


def with_source_mapping(result: PreprocessorResult) -> Preprocessed:
    """
    Normalizes preprocessor results by adding a mapping if a preprocessor
    only returns the transformed source code and no mapping by itself. It is
    assumed that in this case the preprocessor has just enriched the source
    code with tokens, so that a source mapping can be derived automatically
    with :func:`tokenized_to_original_mapping` (see above).

    :param result:  Either a preprocessed text as a string containing
            preprocessor tokens, or a tuple of a preprocessed text AND a source
            mapping function. In the former case the source mapping will be
            generated, in the latter it will simply be passed through.
    :returns:  A tuple of the preprocessed text and the source-mapping function
            that returns the original text position when called with a position
            in the preprocessed text.
    """
    if isinstance(result, str):
        srcmap = tokenized_to_original_mapping(result)
        token_mapping = functools.partial(source_map, srcmap=srcmap)
        return Preprocessed(result, token_mapping)
    # else: # DOES NOT WORK, because there is no way to reliably find out whether
    #       # token back-mapping has already been done by the provided mapping
    #     text, mapping = cast(Preprocessed, result)
    #     if not (hasattr(mapping, 'func') and mapping.func == source_map):
    #         srcmap = tokenized_to_original_mapping(text)
    #         token_mapping = functools.partial(source_map, srcmap=srcmap)
    #         return Preprocessed(
    #             text, functools.partial(_apply_mappings, mappings=[token_mapping, mapping]))
    return result
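
# Illustrative sketch, not part of the module: a preprocessor that merely
# enriches the source with tokens may return a plain string; with_source_mapping()
# then derives the back-mapping automatically. The preprocessor below is a
# made-up example:
#
#   def mark_start(source_text: str, source_name: str) -> str:
#       return make_token('START', '') + source_text
#
#   text, mapping = with_source_mapping(mark_start('source', 'example.dsl'))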


#######################################################################
#
# Includes - support for chaining source texts via an include command
#
#######################################################################


def gen_find_include_func(rx: Union[str, Any],
                          comment_rx: Optional[Union[str, Any]] = None) -> FindIncludeFunc:
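    """Generates a "find-include" function from the regular expression ``rx``,
    which must match an include directive and capture the name of the included
    file in a named group 'name'. If ``comment_rx`` is given, include
    directives that lie within comments matched by this expression are
    ignored."""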
    if isinstance(rx, str):  rx = re.compile(rx)
    if isinstance(comment_rx, str):  comment_rx = re.compile(comment_rx)

    def find_include(text: str, begin: int) -> IncludeInfo:
        nonlocal rx
        m = rx.search(text, begin)
        if m:
            begin = m.start()
            return IncludeInfo(begin, m.end() - begin, m.group('name'))
        else:
            return IncludeInfo(-1, 0, '')

    def find_comment(text: str, begin: int) -> Tuple[int, int]:
        m = comment_rx.search(text, begin)
        return m.span() if m else (-1, -2)

    def meta_find_include(text: str, begin: int) -> IncludeInfo:
        a, b = find_comment(text, begin)
        info = find_include(text, begin)
        k, length, name = info
        while a < b <= k:
            a, b = find_comment(text, b)
        while (a < k < b) or (a < k + length < b):
            info = find_include(text, b)
            k, length, name = info
            while a < b <= k:
                a, b = find_comment(text, b)
        return info

    return find_include if comment_rx is None else meta_find_include
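
# Illustrative sketch, not part of the module: a find-include function for a
# hypothetical DSL in which includes are written as @include(path) and
# comments start with '%'. The regular expression must capture the included
# file name in a named group 'name'; the comment expression keeps include
# directives inside comments from being resolved:
#
#   find_include = gen_find_include_func(
#       r'@include\((?P<name>[^)\n]+)\)', comment_rx=r'%[^\n]*')
#   find_include('text @include(chapter1.dsl) more', 0)
#   # -> IncludeInfo(begin=5, length=22, file_name='chapter1.dsl')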


def generate_include_map(source_name: str,
                         source_text: str,
                         find_next_include: FindIncludeFunc) -> Tuple[IncludeMap, str]:
    file_names: set = set()

    def generate_map(source_name, source_text, find_next) -> Tuple[IncludeMap, str]:
        nonlocal file_names
        map = IncludeMap(source_name, [0], [0], [source_name])
        result = []

        if source_name in file_names:
            raise ValueError(f'Circular include of {source_name} detected!')
        file_names.add(source_name)

        dirname = os.path.dirname(source_name)
        source_pointer = 0
        source_offset = 0
        result_pointer = 0
        last_begin = -1
        begin, length, include_name = find_next(source_text, 0)
        include_name = os.path.join(dirname, include_name)
        while begin >= 0:
            assert begin > last_begin
            source_delta = begin - source_pointer
            source_pointer += source_delta
            result_pointer += source_delta
            with open(include_name, 'r', encoding='utf-8') as f:
                included_text = f.read()
            inner_map, inner_text = generate_map(include_name, included_text, find_next)
            inner_map.positions = [pos + result_pointer for pos in inner_map.positions]
            inner_map.offsets = [offset - result_pointer for offset in inner_map.offsets]
            if source_delta == 0:
                map.file_names = map.file_names[:-1] + inner_map.file_names[:-1]
                map.positions = map.positions[:-1] + inner_map.positions[:-1]
                map.offsets = map.offsets[:-1] + inner_map.offsets[:-1]
                result.append(inner_text)
            else:
                result.append(source_text[source_pointer - source_delta: source_pointer])
                map.file_names += inner_map.file_names[:-1]
                map.positions += inner_map.positions[:-1]
                map.offsets += inner_map.offsets[:-1]
                result.append(inner_text)
            inner_length = len(inner_text)
            result_pointer += inner_length
            map.file_names.append(source_name)
            map.positions.append(result_pointer)
            source_pointer += length
            source_offset += length - inner_length
            map.offsets.append(source_offset)
            begin, length, include_name = find_next(source_text, source_pointer)
            include_name = os.path.join(dirname, include_name)
        rest = source_text[source_pointer:]
        if rest:
            result.append(rest)
            map.positions.append(map.positions[-1] + len(rest))
            map.offsets.append(source_offset)
            map.file_names.append(source_name)
        file_names.remove(source_name)
        return map, ''.join(result)

    return generate_map(source_name, source_text, find_next_include)


def srcmap_includes(position: int, inclmap: IncludeMap) -> SourceLocation:
    i = bisect.bisect_right(inclmap.positions, position)
    if i:
        return SourceLocation(
            inclmap.file_names[i - 1],
            position + inclmap.offsets[i - 1])
    raise ValueError


def preprocess_includes(source_text: Optional[str],
                        source_name: str,
                        find_next_include: FindIncludeFunc) -> Preprocessed:
    if not source_text:
        with open(source_name, 'r', encoding='utf-8') as f:
            source_text = f.read()
    include_map, result = generate_include_map(source_name, source_text, find_next_include)
    mapping_func = functools.partial(srcmap_includes, inclmap=include_map)
    return Preprocessed(result, mapping_func)
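
# Illustrative sketch, not part of the module: flattening a hypothetical main
# file 'main.dsl' that contains @include(...) directives (see the example
# after gen_find_include_func above). Error positions in the flattened text
# map back to the file and position they originally came from:
#
#   find_include = gen_find_include_func(r'@include\((?P<name>[^)\n]+)\)')
#   flattened, mapping = preprocess_includes(None, 'main.dsl', find_include)
#   file_name, pos = mapping(position_in_flattened_text)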