Commit a985e61e authored by Eckhart Arnold

preprocess.py: algorithm for includes reworked

parent d4e21b3e
......@@ -48,9 +48,9 @@ column-number
"""
import os
from typing import Iterable, Iterator, Union, List, Any, Sequence, Tuple
from typing import Iterable, Iterator, Union, List, Optional, Sequence, Tuple
from DHParser.preprocess import SourceMapFunc, neutral_mapping
from DHParser.preprocess import SourceMapFunc, SourceLocation
from DHParser.stringview import StringView
from DHParser.toolkit import linebreaks, line_col, is_filename
......@@ -195,8 +195,6 @@ class Error:
:ivar orig_doc: the name or path or url of the original source file to
which ``orig_pos`` is related. This is relevant if the preprocessed
document has been assembled from several source files.
:ivar orig_offset: the offset of the included ``orig_doc`` within the
outermost including document.
:ivar line: the line number where the error occurred in the original text.
Lines are counted from 1 onward.
:ivar column: the column where the error occurred in the original text.
......@@ -211,7 +209,7 @@ class Error:
__slots__ = ['message', 'code', '_pos', 'line', 'column', 'length',
'end_line', 'end_column', 'related', 'orig_pos', 'orig_doc',
'orig_offset', 'relatedUri']
'relatedUri']
def __init__(self, message: str, pos: int, code: ErrorCode = ERROR,
line: int = -1, column: int = -1, length: int = 1,
......@@ -229,7 +227,6 @@ class Error:
self.code = code # type: ErrorCode
self.orig_pos = orig_pos # type: int
self.orig_doc = orig_doc # type: str
self.orig_offset = 0 # type: int
self.line = line # type: int
self.column = column # type: int
# support for Language Server Protocol Diagnostics
......@@ -382,7 +379,7 @@ def only_errors(messages: Iterable[Error], level: int = ERROR) -> Iterator[Error
def adjust_error_locations(errors: List[Error],
original_text: Union[StringView, str],
source_mapping: SourceMapFunc = neutral_mapping):
source_mapping: Optional[SourceMapFunc] = None):
"""Adds (or adjusts) line and column numbers of error messages inplace.
Args:
......@@ -406,17 +403,18 @@ def adjust_error_locations(errors: List[Error],
return 1, c - base_c + 1
line_breaks = linebreaks(original_text)
if not source_mapping:
source_mapping = lambda pos: SourceLocation('', line_breaks, pos)
for err in errors:
assert err.pos >= 0
err.orig_doc, err.orig_offset, err.orig_pos = source_mapping(err.pos)
err.line, err.column = relative_lc(line_breaks, err.orig_pos, err.orig_offset)
err.orig_doc, lbreaks, err.orig_pos = source_mapping(err.pos)
err.line, err.column = line_col(lbreaks, err.orig_pos)
# Adjust the length in case it exceeds the text size. As this is
# non-fatal, the length is adjusted rather than an error raised, to
# avoid unnecessary special-case treatment in other places.
if err.orig_pos + err.length > lbreaks[-1]:  # lbreaks[-1] == len(file text)
err.length = lbreaks[-1] - err.orig_pos
err.end_line, err.end_column = relative_lc(
line_breaks, err.orig_pos + err.length, err.orig_offset)
err.end_line, err.end_column = line_col(lbreaks, err.orig_pos + err.length)
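
Editorial note: with orig_offset gone, each reverse mapping now hands back the line-break list of the file a position belongs to, and the plain line_col() lookup replaces the removed relative_lc() helper. A standalone sketch of that lookup (line_col_sketch is a hypothetical name), assuming the DHParser.toolkit convention that linebreaks(text) == [-1, <positions of '\n'>, len(text)]:

    import bisect
    from typing import List, Tuple

    def line_col_sketch(lbreaks: List[int], pos: int) -> Tuple[int, int]:
        # Lines and columns are counted from 1 onward; lbreaks is assumed
        # to be [-1, <positions of '\n' in the text>, len(text)].
        line = bisect.bisect_left(lbreaks, pos)
        column = pos - lbreaks[line - 1]
        return line, column

    assert line_col_sketch([-1, 3, 7], 5) == (2, 2)   # 'abc\ndef': 'e' is line 2, col 2
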
# def canonical_error_strings(errors: List[Error], source_file_name: str = '') -> List[str]:
# """Returns the list of error strings in canonical form that can be parsed by most
......
......@@ -31,9 +31,9 @@ cannot completely be described with context-free grammars.
import bisect
import functools
import os
from typing import Union, Optional, Callable, Tuple, NamedTuple, List, Any
from typing import Union, Optional, Callable, Tuple, NamedTuple, List, Dict, Any
from DHParser.toolkit import re, dataclasses
from DHParser.toolkit import re, linebreaks
__all__ = ('RX_TOKEN_NAME',
......@@ -50,7 +50,6 @@ __all__ = ('RX_TOKEN_NAME',
'nil_preprocessor',
'chain_preprocessors',
'prettyprint_tokenized',
'neutral_mapping',
'tokenized_to_original_mapping',
'source_map',
'with_source_mapping',
......@@ -74,17 +73,28 @@ RX_TOKEN_ARGUMENT = re.compile(r'[^\x1b\x1c\x1d]*')
RX_TOKEN = re.compile(r'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d')
@dataclasses.dataclass
class SourceMap:
source_name: str # name or path or uri of the original source file
positions: List[int] # a list of locations
offsets: List[int] # the corresponding offsets to be added from these locations onward
class IncludeInfo(NamedTuple):
begin: int
length: int
file_name: str
class SourceMap(NamedTuple):
source_name: str # name or path or uri of the original source file
positions: List[int] # a list of locations
offsets: List[int] # the corresponding offsets to be added from these locations onward
file_names: List[str] # list of file_names to which the source locations relate
lbreaks_dict: Dict[str, List[int]] # line breaks of the included texts
def has_includes(sm: SourceMap) -> bool:
return any(fname != sm.source_name for fname in sm.file_names)
class SourceLocation(NamedTuple):
source_name: str # the file name (or path or uri) of the source code
source_offset: int # the offset of this file within the complete source text
pos: int # a position within this file
source_name: str # the file name (or path or uri) of the source code
lbreaks: List[int] # positions of the line-breaks in the source file
pos: int # a position within this file
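
Editorial note: because the middle field of SourceLocation is now the line-break list rather than an offset, a consumer of a SourceMapFunc can produce a file:line:column string directly. A minimal sketch (format_location and the toy mapping are invented for illustration), assuming linebreaks() and line_col() behave as used elsewhere in this commit:

    from DHParser.toolkit import linebreaks, line_col
    from DHParser.preprocess import SourceLocation

    def format_location(mapping, pos: int) -> str:
        name, lbreaks, local_pos = mapping(pos)   # mapping: SourceMapFunc
        line, column = line_col(lbreaks, local_pos)
        return f'{name}:{line}:{column}'

    src = 'abc\ndef'
    toy_mapping = lambda i: SourceLocation('main.txt', linebreaks(src), i)
    print(format_location(toy_mapping, 5))        # -> main.txt:2:2
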
SourceMapFunc = Union[Callable[[int], SourceLocation],
......@@ -96,23 +106,7 @@ class Preprocessed(NamedTuple):
back_mapping: SourceMapFunc
@dataclasses.dataclass
class IncludeMap(SourceMap):
file_names: List[str] # list of file_names to which the source locations relate
def has_includes(self) -> bool:
return any(fname != self.source_name for fname in self.file_names)
class IncludeInfo(NamedTuple):
begin: int
length: int
file_name: str
PreprocessorResult = Union[str, Preprocessed]
FindIncludeFunc = Union[Callable[[str, int], IncludeInfo], # (document: str, start: int)
functools.partial]
PreprocessorFunc = Union[Callable[[str, str], PreprocessorResult], # text: str, filename: str
......@@ -130,7 +124,8 @@ def nil_preprocessor(source_text: str, source_name: str) -> Preprocessed:
"""
A preprocessor that does nothing, i.e. just returns the input.
"""
return Preprocessed(source_text, lambda i: SourceLocation(source_name, 0, i))
lbreaks = linebreaks(source_text)
return Preprocessed(source_text, lambda i: SourceLocation(source_name, lbreaks, i))
def _apply_mappings(position: int, mappings: List[SourceMapFunc]) -> SourceLocation:
......@@ -140,10 +135,10 @@ def _apply_mappings(position: int, mappings: List[SourceMapFunc]) -> SourceLocat
position within a preprocessed source text and mappings should therefore
be a list of reverse-mappings in reversed order.
"""
filename = ''
filename, lbreaks = '', []
for mapping in mappings:
filename, offset, position = mapping(position)
return SourceLocation(filename, offset, position)
filename, lbreaks, position = mapping(position)
return SourceLocation(filename, lbreaks, position)
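
Editorial note: _apply_mappings simply threads the position through each reverse mapping in turn, so the file name and line breaks of the innermost file win. A toy illustration of that loop (the stage functions and all values are invented):

    from DHParser.preprocess import SourceLocation

    # applied first: undo a 4-character prefix that a preprocessor added
    undo_prefix = lambda i: SourceLocation('main.txt', [-1, 7], i - 4)
    # applied second: identity mapping within main.txt
    identity = lambda i: SourceLocation('main.txt', [-1, 7], i)

    name, lbreaks, pos = '', [], 10
    for mapping in [undo_prefix, identity]:   # reversed order: last stage first
        name, lbreaks, pos = mapping(pos)
    print(name, pos)                          # -> main.txt 6
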
def _apply_preprocessors(source_text: str, source_name: str,
......@@ -228,12 +223,6 @@ def strip_tokens(tokenized: str) -> str:
#######################################################################
def neutral_mapping(pos: int) -> SourceLocation:
'''Maps source locations onto themselves and sets the source file name
to the empty string.'''
return SourceLocation('', 0, pos)
def tokenized_to_original_mapping(tokenized_text: str, source_name: str='UNKNOWN_FILE') -> SourceMap:
"""
Generates a source map for mapping positions in a text that has
......@@ -271,7 +260,9 @@ def tokenized_to_original_mapping(tokenized_text: str, source_name: str='UNKNOWN
# specific condition for preprocessor tokens
assert all(offsets[i] > offsets[i + 1] for i in range(len(offsets) - 2))
return SourceMap(source_name, positions, offsets)
lbreaks = linebreaks(tokenized_text)
L = len(positions)
return SourceMap(source_name, positions, offsets, [source_name] * L, { source_name: lbreaks})
def source_map(position: int, srcmap: SourceMap) -> SourceLocation:
......@@ -281,13 +272,15 @@ def source_map(position: int, srcmap: SourceMap) -> SourceLocation:
:param position: the position in the processed text
:param srcmap: the source map, i.e. a mapping of locations to offset values
and source texts.
:returns: the mapped position
"""
i = bisect.bisect_right(srcmap.positions, position)
if i:
source_name = srcmap.file_names[i - 1]
return SourceLocation(
srcmap.source_name,
0,
source_name,
srcmap.lbreaks_dict[source_name],
min(position + srcmap.offsets[i - 1], srcmap.positions[i] + srcmap.offsets[i]))
raise ValueError
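
Editorial note: a worked toy example of the bisect lookup above (all values invented; SourceMap, source_map and has_includes are assumed to be importable from DHParser.preprocess). Positions 0-9 map onto themselves, later positions are shifted back by 6, and the final entry is a sentinel so that the min()-clamp always has a successor to read:

    from DHParser.preprocess import SourceMap, SourceLocation, source_map, has_includes

    sm = SourceMap(source_name='doc.txt',
                   positions=[0, 10, 20],         # interval starts, plus sentinel
                   offsets=[0, -6, -6],           # offsets added from there onward
                   file_names=['doc.txt'] * 3,    # no includes: always the same file
                   lbreaks_dict={'doc.txt': [-1, 4, 14]})

    assert source_map(12, sm) == SourceLocation('doc.txt', [-1, 4, 14], 6)
    assert not has_includes(sm)                   # every entry names 'doc.txt' itself
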
......@@ -366,12 +359,12 @@ def gen_find_include_func(rx: Union[str, Any],
def generate_include_map(source_name: str,
source_text: str,
find_next_include: FindIncludeFunc) -> Tuple[IncludeMap, str]:
find_next_include: FindIncludeFunc) -> Tuple[SourceMap, str]:
file_names: set = set()
def generate_map(source_name, source_text, find_next) -> Tuple[IncludeMap, str]:
def generate_map(source_name, source_text, find_next) -> Tuple[SourceMap, str]:
nonlocal file_names
map = IncludeMap(source_name, [0], [0], [source_name])
map = SourceMap(source_name, [0], [0], [source_name], {source_name: linebreaks(source_text)})
result = []
if source_name in file_names:
......@@ -393,19 +386,21 @@ def generate_include_map(source_name: str,
with open(include_name, 'r', encoding='utf-8') as f:
included_text = f.read()
inner_map, inner_text = generate_map(include_name, included_text, find_next)
inner_map.positions = [pos + result_pointer for pos in inner_map.positions]
inner_map.offsets = [offset - result_pointer for offset in inner_map.offsets]
assert len(inner_map.positions) == len(inner_map.offsets) == len(inner_map.file_names)
for i in range(len(inner_map.positions)):
inner_map.positions[i] += result_pointer
inner_map.offsets[i] -= result_pointer
if source_delta == 0:
map.file_names = map.file_names[:-1] + inner_map.file_names[:-1]
map.positions = map.positions[:-1] + inner_map.positions[:-1]
map.offsets = map.offsets[:-1] + inner_map.offsets[:-1]
result.append(inner_text)
map.file_names.pop()
map.positions.pop()
map.offsets.pop()
else:
result.append(source_text[source_pointer - source_delta: source_pointer])
map.file_names += inner_map.file_names[:-1]
map.positions += inner_map.positions[:-1]
map.offsets += inner_map.offsets[:-1]
result.append(inner_text)
map.file_names.extend(inner_map.file_names[:-1])
map.positions.extend(inner_map.positions[:-1])
map.offsets.extend(inner_map.offsets[:-1])
map.lbreaks_dict.update(inner_map.lbreaks_dict)
result.append(inner_text)
inner_length = len(inner_text)
result_pointer += inner_length
map.file_names.append(source_name)
......@@ -422,19 +417,20 @@ def generate_include_map(source_name: str,
map.offsets.append(source_offset)
map.file_names.append(source_name)
file_names.remove(source_name)
# map.file_offsets = [-offset for offset in map.offsets] # only for debugging!
return map, ''.join(result)
return generate_map(source_name, source_text, find_next_include)
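
Editorial note: generate_map() recursively splices each included file into the result, rebasing the inner map by result_pointer, collecting per-file line breaks in lbreaks_dict, and using the file_names set to refuse circular includes. A usage sketch; the include regex and the single-argument call to gen_find_include_func are assumptions, since the full signature is truncated above:

    from DHParser.preprocess import gen_find_include_func
    # assumed importable at module level like the other preprocessing helpers:
    from DHParser.preprocess import generate_include_map

    # assumed include syntax: include(<filename>), captured by a 'name' group
    find_include = gen_find_include_func(r'include\((?P<name>[^)\n]*)\)')

    # generate_include_map reads the included files from disk, hence commented out:
    # srcmap, text = generate_include_map('main.txt', main_source, find_include)
    # 'text' is main_source with every include(...) replaced by the file's content;
    # 'srcmap' maps positions in 'text' back into the individual source files.
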
def srcmap_includes(position: int, inclmap: IncludeMap) -> SourceLocation:
def srcmap_includes(position: int, inclmap: SourceMap) -> SourceLocation:
i = bisect.bisect_right(inclmap.positions, position)
if i:
offset = inclmap.offsets[i - 1]
source_name = inclmap.file_names[i - 1]
return SourceLocation(
inclmap.file_names[i - 1],
-offset,
position + offset)
source_name,
inclmap.lbreaks_dict[source_name],
position + inclmap.offsets[i - 1])
raise ValueError
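
Editorial note: unlike source_map(), the offset here translates a position in the assembled text into a position local to the file it came from, and the returned line breaks are that file's own. A worked toy example (all values invented; the assembled text is '01' plus the 8 characters of sub.txt plus '67', where the original main.txt contained the 16-character directive include(sub.txt)):

    from DHParser.preprocess import SourceMap, srcmap_includes  # assumed importable

    incl = SourceMap(source_name='main.txt',
                     positions=[0, 2, 10, 12],
                     offsets=[0, -2, 8, 8],
                     file_names=['main.txt', 'sub.txt', 'main.txt', 'main.txt'],
                     lbreaks_dict={'main.txt': [-1, 20], 'sub.txt': [-1, 8]})

    print(srcmap_includes(5, incl))
    # -> SourceLocation('sub.txt', [-1, 8], 3): the 4th character of sub.txt
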
......
......@@ -592,7 +592,7 @@ from typing import Callable, cast, Iterator, Sequence, List, Set, Union, \
from DHParser.configuration import get_config_value, ALLOWED_PRESET_VALUES
from DHParser.error import Error, ErrorCode, ERROR, PARSER_STOPPED_BEFORE_END, \
adjust_error_locations
from DHParser.preprocess import SourceMapFunc, neutral_mapping
from DHParser.preprocess import SourceMapFunc, SourceLocation
from DHParser.stringview import StringView # , real_indices
from DHParser.toolkit import re, cython, linebreaks, line_col, JSONnull, \
validate_XML_attribute_value, fix_XML_attribute_value, lxml_XML_attribute_value, \
......@@ -2688,7 +2688,7 @@ class RootNode(Node):
def __init__(self, node: Optional[Node] = None,
source: Union[str, StringView] = '',
source_mapping: SourceMapFunc = neutral_mapping):
source_mapping: Optional[SourceMapFunc] = None):
super().__init__('__not_yet_ready__', '')
self.errors = [] # type: List[Error]
self.error_nodes = dict() # type: Dict[int, List[Error]] # id(node) -> error list
......@@ -2696,7 +2696,11 @@ class RootNode(Node):
self.error_flag = 0
# info on source code (to be carried along all stages of tree-processing)
self.source = source # type: str
self.source_mapping = source_mapping # type: SourceMapFunc
if source_mapping is None:
line_breaks = linebreaks(source)
self.source_mapping = lambda pos: SourceLocation('', line_breaks, pos)
else:
self.source_mapping = source_mapping # type: SourceMapFunc
self.lbreaks = linebreaks(source) # List[int]
# customization for XML-Representation
self.inline_tags = set() # type: Set[str]
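
Editorial note: the fallback reproduces what the removed neutral_mapping did, except that it carries the source's real line breaks instead of a dummy offset. A standalone sketch of that default (default_mapping is a hypothetical name), assuming linebreaks() from DHParser.toolkit:

    from DHParser.toolkit import linebreaks
    from DHParser.preprocess import SourceLocation, SourceMapFunc

    def default_mapping(source: str) -> SourceMapFunc:
        line_breaks = linebreaks(source)
        return lambda pos: SourceLocation('', line_breaks, pos)

    loc = default_mapping('abc\ndef')(5)
    print(loc)   # positions map onto themselves; the file name stays empty
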
......
......@@ -49,7 +49,7 @@ from DHParser import start_logging, suspend_logging, resume_logging, is_filename
positions_of, replace_tag_names, add_attributes, delimit_children, merge_connected, \
has_attr, has_parent, ThreadLocalSingletonFactory, Error, canonical_error_strings, \
has_errors, apply_unless, WARNING, ERROR, FATAL, EMPTY_NODE, TreeReduction, CombinedParser, \
Preprocessed, neutral_mapping, preprocess_includes, gen_find_include_func, flatten_sxpr, \
Preprocessed, preprocess_includes, gen_find_include_func, flatten_sxpr, \
replace_content_with
......
......@@ -130,7 +130,7 @@ def preprocess_comments(src: str, src_name: str) -> Tuple[str, SourceMapFunc]:
positions.append(pos)
offsets.append(offsets[-1])
return '\n'.join(lines), \
partial(source_map, srcmap=SourceMap('FILE_NAME_DUMMY', positions, offsets))
partial(source_map, srcmap=SourceMap('DUMMY', positions, offsets, ['DUMMY']*len(positions), {'DUMMY': []}))
class TestTokenParsing:
......@@ -284,7 +284,7 @@ class TestIncludes:
name, lbreaks, k = mapping(i)
# print(i, k, name)
txt = main if name == 'main.txt' else sub
assert text[i] == txt[k], f'{i}: {text[i]} != {txt[k]} in {name}'
assert text[i] == txt[k], f'{i},{k}: {text[i]} != {txt[k]} in {name}'
perform('include(sub.txt)xyz', 'abc')
perform('012include(sub.txt)xyz', 'abc')
......@@ -310,8 +310,10 @@ class TestIncludes:
assert text == substrings['main']
# print(text)
for i in range(len(text)):
name, offset, k = mapping(i)
name, lbreaks, k = mapping(i)
txt = ensemble[name]
# print(name, substrings[name], text[offset:offset + len(substrings[name])])
# assert text[offset:offset + len(substrings[name])] == substrings[name]
# print(name, txt, i, k)
assert text[i] == txt[k], f'{i}: {text[i]} != {txt[k]} in {name}'
......