Commit 16a4c0de authored by Eckhart Arnold

preprocess.py: preprocessor functions now yield source file name and error location instead of just error location
parent 489b6732
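
In short: a preprocessor now receives the name of the source file along with
the text, and its source-mapping function returns a (file name, position) pair
instead of a bare position. A minimal sketch of the new convention, distilled
from the diff below (my_preprocessor is a hypothetical example, not part of
DHParser):

    from typing import Tuple
    from DHParser.preprocess import SourceLocation, SourceMapFunc

    def my_preprocessor(source_text: str, source_name: str) -> Tuple[str, SourceMapFunc]:
        # This sketch does not transform the text; it merely returns it together
        # with a mapping that sends every position onto itself in source_name.
        return source_text, lambda i: SourceLocation(source_name, i)
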
DHParser Version 0.9.5
......................
- Python 3.5 compatibility dropped! DHParser now requires at least Python 3.6.
  This allows the use of typing.NamedTuple and dataclasses for a clearer
  expression of types.
- batch processing: just use several files, a directory, or a shell pattern
  as parameters for the xxxParser.py or xxxServer.py scripts
- DHParser.compile.Compiler is now more tolerant w.r.t. compiler methods
@@ -13,6 +16,7 @@ DHParser Version 0.9.5
- DHParser.parse: added early horizontal tree-reduction to class CombinedParser
- DHParser.ebnf: added "@reduction"-directive
DHParser Version 0.9.4 (6.10.2020)
..................................
......
@@ -41,7 +41,8 @@ import traceback
from typing import Any, Optional, Tuple, List, Set, Union, Callable, cast
from DHParser.configuration import get_config_value
-from DHParser.preprocess import with_source_mapping, PreprocessorFunc, SourceMapFunc
+from DHParser.preprocess import with_source_mapping, PreprocessorFunc, SourceMapFunc, \
+    SourceLocation
from DHParser.syntaxtree import Node, RootNode, EMPTY_PTYPE, TreeContext
from DHParser.transform import TransformationFunc
from DHParser.parse import Grammar
@@ -343,6 +344,7 @@ def compile_source(source: str,
"""
ast = None # type: Optional[Node]
original_text = load_if_file(source) # type: str
source_name = source if is_filename(source) else 'source'
compiler.source = original_text
log_file_name = logfile_basename(source, compiler) if is_logging() else '' # type: str
if not hasattr(parser, 'free_char_parsefunc__') or parser.history_tracking__:
@@ -355,9 +357,9 @@
if preprocessor is None:
source_text = original_text # type: str
-source_mapping = identity # type: SourceMapFunc
+source_mapping = lambda i: SourceLocation(source_name, i) # type: SourceMapFunc
else:
-source_text, source_mapping = with_source_mapping(preprocessor(original_text))
+source_text, source_mapping = with_source_mapping(preprocessor(original_text, source_name))
# parsing
......
@@ -2262,8 +2262,9 @@ class EBNFCompiler(Compiler):
the previously compiled formal language.
"""
name = self.grammar_name + "Preprocessor"
return "def nop(arg):\n return arg\n\n\n" \
"def %s(text):\n return text, nop\n" % name \
return "def nop(pos, source_name):\n return SourceLocation(source_name, pos)\n\n\n" \
"def %s(source_text, source_name):\n"\
" return source_text, partial(nop, source_name)\n" % name \
+ PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)
......
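
For orientation, the stub that this method now generates expands to roughly
the following, assuming a hypothetical grammar named "Arithmetic" (imports
added here for self-containment; the PREPROCESSOR_FACTORY part appended by
the last line is omitted):

    from functools import partial
    from DHParser.preprocess import SourceLocation

    def nop(source_name, pos):
        return SourceLocation(source_name, pos)

    def ArithmeticPreprocessor(source_text, source_name):
        return source_text, partial(nop, source_name)
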
@@ -50,7 +50,7 @@ column-number
import os
from typing import Iterable, Iterator, Union, List, Any, Sequence, Tuple
-from DHParser.preprocess import SourceMapFunc
+from DHParser.preprocess import SourceMapFunc, neutral_mapping
from DHParser.stringview import StringView
from DHParser.toolkit import linebreaks, line_col, is_filename
@@ -180,6 +180,31 @@ RECURSION_DEPTH_LIMIT_HIT = ErrorCode(10400)
class Error:
"""The Error class encapsulates the all information for a single
error.
:ivar message: the error message as text string
:ivar pos: the position where the error occurred in the preprocessed text
:ivar code: the error-code, which also indicates the severity of the
error. 0: no error
< 100: notice
< 1000: warning
< 10000: error
>= 10000: fatal error. syntax tree will not be passed on to
the next compilation stage!
:ivar orig_pos: the position of the error in the original source file,
not in the preprocessed document.
:ivar orig_doc: the name or path or url of the original source file to
which ``orig_pos`` is related. This is relevant, if the preprocessed
document has been plugged together from several source files.
:ivar line: the line number where the error occurred in the original text.
Lines are counted from 1 onward.
:ivar column: the column where the error occurred in the original text.
Columns are counted from 1 onward.
:ivar length: the length in characters of the faulty passage (default is 1)
:ivar end_line: the line number of the position after the last character
covered by the error in the original source.
:ivar end_column: the column number of the position after the last character
covered by the error in the original source.
:ivar related: a sequence of related errors.
"""
__slots__ = ['message', 'code', '_pos', 'line', 'column', 'length',
@@ -188,7 +213,7 @@ class Error:
def __init__(self, message: str, pos: int, code: ErrorCode = ERROR,
line: int = -1, column: int = -1, length: int = 1,
-related: Sequence[Tuple['Error', str]] = [],
+related: Sequence['Error'] = [],
orig_pos: int = -1, orig_doc: str = '') -> None:
assert isinstance(code, ErrorCode)
assert not isinstance(pos, ErrorCode)
@@ -209,7 +234,7 @@
self.length = length # type: int
self.end_line = -1 # type: int
self.end_column = -1 # type: int
-self.related = tuple(related) # type: Sequence[Tuple['Error', str]]
+self.related = tuple(related) # type: Sequence['Error']
def __str__(self):
prefix = ''
@@ -267,11 +292,11 @@
"""Returns the Error as as Language Server Protocol Diagnostic object.
https://microsoft.github.io/language-server-protocol/specifications/specification-current/#diagnostic
"""
-def relatedObj(relatedError: Sequence[Tuple['Error', str]]) -> dict:
-    err, uri = relatedError
+def relatedObj(relatedError: 'Error') -> dict:
+    uri = relatedError.orig_doc
return {
-'location': {'uri': uri, 'range': err.rangeObj()},
-'message': err.message
+'location': {'uri': uri, 'range': relatedError.rangeObj()},
+'message': relatedError.message
}
if self.code < WARNING:
@@ -354,7 +379,7 @@ def only_errors(messages: Iterable[Error], level: int = ERROR) -> Iterator[Error
def adjust_error_locations(errors: List[Error],
original_text: Union[StringView, str],
-source_mapping: SourceMapFunc = lambda i: i):
+source_mapping: SourceMapFunc = neutral_mapping):
"""Adds (or adjusts) line and column numbers of error messages inplace.
Args:
@@ -368,7 +393,7 @@ def adjust_error_locations(errors: List[Error],
line_breaks = linebreaks(original_text)
for err in errors:
assert err.pos >= 0
-err.orig_pos = source_mapping(err.pos)
+err.orig_doc, err.orig_pos = source_mapping(err.pos)
err.line, err.column = line_col(line_breaks, err.orig_pos)
# adjust length in case it exceeds the text size. As this is non-fatal
# it should be adjusted rather than an error raised to avoid
......
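
The net effect on error reporting: adjust_error_locations now also fills in
err.orig_doc, so an error can name the file it stems from. A small sketch
(the file name 'doc.txt' and the message are made up; everything else uses
only names visible in this diff):

    from DHParser.error import Error, adjust_error_locations
    from DHParser.preprocess import SourceLocation

    text = 'line one\nline two'
    err = Error('sample failure', pos=10)  # position in the (pre-)processed text
    adjust_error_locations([err], text, lambda i: SourceLocation('doc.txt', i))
    assert (err.orig_doc, err.orig_pos) == ('doc.txt', 10)
    assert (err.line, err.column) == (2, 2)  # lines and columns count from 1
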
@@ -29,9 +29,8 @@ cannot completely be described entirely with context-free grammars.
import bisect
import collections
import functools
-from typing import Union, Callable, Tuple, List
+from typing import Union, Callable, Tuple, NamedTuple, List
from DHParser.toolkit import re
@@ -49,6 +48,7 @@ __all__ = ('RX_TOKEN_NAME',
'chain_preprocessors',
'prettyprint_tokenized',
'SourceMap',
'neutral_mapping',
'tokenized_to_original_mapping',
'source_map',
'with_source_mapping')
@@ -62,19 +62,31 @@ RX_TOKEN_NAME = re.compile(r'\w+')
RX_TOKEN_ARGUMENT = re.compile(r'[^\x1b\x1c\x1d]*')
RX_TOKEN = re.compile(r'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d')
-SourceMapFunc = Union[Callable[[int], int], functools.partial]
+class SourceMap(NamedTuple):
+    source_name: str      # name or path or uri of the original source file
+    positions: List[int]  # a list of locations
+    offsets: List[int]    # the corresponding offsets to be added from these locations onward
+
+class SourceLocation(NamedTuple):
+    name: str             # the file name (or path or uri) of the source code
+    pos: int              # a position within this file
+
+SourceMapFunc = Union[Callable[[int], SourceLocation], functools.partial]
PreprocessorResult = Union[str, Tuple[str, SourceMapFunc]]
-PreprocessorFunc = Union[Callable[[str], PreprocessorResult], functools.partial]
+PreprocessorFunc = Union[Callable[[str, str], PreprocessorResult], functools.partial]
-def nil_preprocessor(text: str) -> Tuple[str, SourceMapFunc]:
+def nil_preprocessor(source_text: str, source_name: str) -> Tuple[str, SourceMapFunc]:
"""
A preprocessor that does nothing, i.e. just returns the input.
"""
-return text, lambda i: i
+return source_text, lambda i: SourceLocation(source_name, i)
-def _apply_mappings(position: int, mappings: List[SourceMapFunc]) -> int:
+def _apply_mappings(position: int, mappings: List[SourceMapFunc]) -> SourceLocation:
"""
Sequentially apply a number of mapping functions to a source position.
In the context of source mapping, the source position usually is a
@@ -82,11 +94,12 @@ def _apply_mappings(position: int, mappings: List[SourceMapFunc]) -> int:
be a list of reverse-mappings in reversed order.
"""
for mapping in mappings:
-    position = mapping(position)
-return position
+    filename, position = mapping(position)
+return SourceLocation(filename, position)
-def _apply_preprocessors(text: str, preprocessors: Tuple[PreprocessorFunc, ...]) \
+def _apply_preprocessors(source_text: str, source_name: str,
+                         preprocessors: Tuple[PreprocessorFunc, ...]) \
-> Tuple[str, SourceMapFunc]:
"""
Applies several preprocessing functions sequentially to a source text
@@ -94,10 +107,10 @@ def _apply_preprocessors(text: str, preprocessors: Tuple[PreprocessorFunc, ...])
positions in the processed text onto the corresponding position in the
original source text.
"""
-processed = text
+processed = source_text
mapping_chain = []
for prep in preprocessors:
-processed, mapping_func = with_source_mapping(prep(processed))
+processed, mapping_func = with_source_mapping(prep(processed, source_name))
mapping_chain.append(mapping_func)
mapping_chain.reverse()
return processed, functools.partial(_apply_mappings, mappings=mapping_chain)
@@ -164,18 +177,20 @@ def strip_tokens(tokenized: str) -> str:
#######################################################################
-SourceMap = collections.namedtuple('SourceMap', ['positions', 'offsets'])
+def neutral_mapping(pos: int) -> SourceLocation:
+    '''Maps a source location onto itself and sets the source file name
+    to the empty string.'''
+    return SourceLocation('', pos)
-def tokenized_to_original_mapping(tokenized_source: str) -> SourceMap:
+def tokenized_to_original_mapping(tokenized_text: str, source_name: str = 'UNKNOWN_FILE') -> SourceMap:
"""
Generates a source map for mapping positions in a text that has
been enriched with token markers to their original positions.
-Args:
-    tokenized_source: the source text enriched with token markers
-Returns:
-    a source map, i.e. a list of positions and a list of corresponding
+:param tokenized_text: the source text enriched with token markers.
+:param source_name: the name or path or uri of the original source file.
+:returns: a source map, i.e. a list of positions and a list of corresponding
offsets. The list of positions is ordered from smallest to highest.
An offset is valid for its associated position and all following
positions until (and excluding) the next position in the list of
@@ -183,18 +198,18 @@ def tokenized_to_original_mapping(tokenized_source: str) -> SourceMap:
"""
positions, offsets = [0], [0]
o = 0
-i = tokenized_source.find(BEGIN_TOKEN)
+i = tokenized_text.find(BEGIN_TOKEN)
e = -2
while i >= 0:
-d = tokenized_source.find(TOKEN_DELIMITER, i)
-e = tokenized_source.find(END_TOKEN, i)
+d = tokenized_text.find(TOKEN_DELIMITER, i)
+e = tokenized_text.find(END_TOKEN, i)
assert 0 <= d < e
o -= (d - i + 2)
positions.extend([d + 1, e + 1])
offsets.extend([o + 1, o])
-i = tokenized_source.find(BEGIN_TOKEN, e + 1)
-if e + 1 < len(tokenized_source):
-    positions.append(len(tokenized_source) + 1)
+i = tokenized_text.find(BEGIN_TOKEN, e + 1)
+if e + 1 < len(tokenized_text):
+    positions.append(len(tokenized_text) + 1)
offsets.append(offsets[-1])
# post conditions
@@ -205,23 +220,23 @@ def tokenized_to_original_mapping(tokenized_source: str) -> SourceMap:
# specific condition for preprocessor tokens
assert all(offsets[i] > offsets[i + 1] for i in range(len(offsets) - 2))
-return SourceMap(positions, offsets)
+return SourceMap(source_name, positions, offsets)
-def source_map(position: int, srcmap: SourceMap) -> int:
+def source_map(position: int, srcmap: SourceMap) -> SourceLocation:
"""
Maps a position in a (pre-)processed text to its corresponding
position in the original document according to the given source map.
-Args:
-    position: the position in the processed text
-    srcmap: the source map, i.e. a mapping of locations to offset values
-Returns:
-    the mapped position
+:param position: the position in the processed text
+:param srcmap: the source map, i.e. a mapping of locations to offset values
+:returns: the mapped position
"""
i = bisect.bisect_right(srcmap.positions, position)
if i:
-return min(position + srcmap.offsets[i - 1], srcmap.positions[i] + srcmap.offsets[i])
+return SourceLocation(
+    srcmap.source_name,
+    min(position + srcmap.offsets[i - 1], srcmap.positions[i] + srcmap.offsets[i]))
raise ValueError
@@ -232,6 +247,14 @@ def with_source_mapping(result: PreprocessorResult) -> Tuple[str, SourceMapFunc]
assumed that in this case the preprocessor has just enriched the source
code with tokens, so that a source mapping can be derived automatically
with :func:`tokenized_to_original_mapping` (see above).
:param result: Either a preprocessed text as string containing
    preprocessor tokens, or a tuple of a preprocessed text AND a source
    mapping function. In the former case the source mapping will be
    generated, in the latter it will simply be passed through.
:returns: A tuple of the preprocessed text and the source-mapping function
    that returns the original text position when called with a position
    in the preprocessed text.
"""
if isinstance(result, str):
srcmap = tokenized_to_original_mapping(result)
......
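
Putting the preprocess.py pieces together, a quick sketch of the token-based
mapping with the new source-name plumbing (the token name 'INDENT' and the
file name are made up):

    from DHParser.preprocess import BEGIN_TOKEN, TOKEN_DELIMITER, END_TOKEN, \
        tokenized_to_original_mapping, source_map

    # an 'INDENT' token wrapping 'abc'; stripped of markers, the text is 'abcdef'
    tokenized = BEGIN_TOKEN + 'INDENT' + TOKEN_DELIMITER + 'abc' + END_TOKEN + 'def'
    srcmap = tokenized_to_original_mapping(tokenized, 'example.dsl')
    name, pos = source_map(tokenized.find('def'), srcmap)
    assert (name, pos) == ('example.dsl', 3)  # 'def' starts at 3 in 'abcdef'
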
@@ -592,7 +592,7 @@ from typing import Callable, cast, Iterator, Sequence, List, Set, Union, \
from DHParser.configuration import get_config_value, ALLOWED_PRESET_VALUES
from DHParser.error import Error, ErrorCode, ERROR, PARSER_STOPPED_BEFORE_END, \
adjust_error_locations
-from DHParser.preprocess import SourceMapFunc
+from DHParser.preprocess import SourceMapFunc, neutral_mapping
from DHParser.stringview import StringView # , real_indices
from DHParser.toolkit import re, cython, linebreaks, line_col, JSONnull, \
validate_XML_attribute_value, fix_XML_attribute_value, lxml_XML_attribute_value, \
@@ -2688,7 +2688,7 @@ class RootNode(Node):
def __init__(self, node: Optional[Node] = None,
source: Union[str, StringView] = '',
-source_mapping: SourceMapFunc = identity):
+source_mapping: SourceMapFunc = neutral_mapping):
super().__init__('__not_yet_ready__', '')
self.errors = [] # type: List[Error]
self.error_nodes = dict() # type: Dict[int, List[Error]] # id(node) -> error list
......
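
Since RootNode now defaults to neutral_mapping (which returns
SourceLocation('', pos)), code that constructs a tree without a preprocessor
keeps working unchanged. A hypothetical explicit use:

    from DHParser.preprocess import SourceLocation
    from DHParser.syntaxtree import RootNode

    # route all error positions of this tree into a (made-up) file 'part1.dsl'
    root = RootNode(source='2 + * 3',
                    source_mapping=lambda i: SourceLocation('part1.dsl', i))
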
@@ -200,6 +200,7 @@ class ThreadLocalSingletonFactory:
return singleton
@functools.lru_cache()
def is_filename(strg: str) -> bool:
"""
Tries to guess whether string ``strg`` is a file name.
......
@@ -938,7 +938,7 @@ if __name__ == "__main__":
result, errors = compile_src(file_names[0])
if errors:
-for err_str in canonical_erorr_strings(errors, file_names[0]):
+for err_str in canonical_error_strings(errors, file_names[0]):
print(err_str)
sys.exit(1)
else:
......
@@ -84,7 +84,7 @@ class TestSourceMapping:
pos = source_map(0, srcmap)
-def preprocess_indentation(src: str) -> str:
+def preprocess_indentation(src: str, src_name: str) -> str:
transformed = []
indent_level = 0
for line in src.split('\n'):
@@ -109,7 +109,8 @@ def preprocess_indentation(src: str) -> str:
# print(prettyprint_tokenized(tokenized))
return tokenized
-def preprocess_comments(src: str) -> Tuple[str, SourceMapFunc]:
+def preprocess_comments(src: str, src_name: str) -> Tuple[str, SourceMapFunc]:
lines = src.split('\n')
positions, offsets = [0], [0]
pos = 0
@@ -123,7 +124,8 @@ def preprocess_comments(src: str) -> Tuple[str, SourceMapFunc]:
pos += len(lines[i])
positions.append(pos)
offsets.append(offsets[-1])
-return '\n'.join(lines), partial(source_map, srcmap=SourceMap(positions, offsets))
+return '\n'.join(lines), \
+    partial(source_map, srcmap=SourceMap('FILE_NAME_DUMMY', positions, offsets))
class TestTokenParsing:
@@ -143,13 +145,13 @@ class TestTokenParsing:
print(x) # another comment
print(y)
""")
-tokenized = preprocess_indentation(code)
+tokenized = preprocess_indentation(code, 'no_uri')
srcmap = tokenized_to_original_mapping(tokenized)
def verify_mapping(self, teststr, orig_text, preprocessed_text, mapping):
mapped_pos = preprocessed_text.find(teststr)
assert mapped_pos >= 0
-original_pos = mapping(mapped_pos)
+file_name, original_pos = mapping(mapped_pos)
# original_pos = source_map(mapped_pos, self.srcmap)
assert orig_text[original_pos:original_pos + len(teststr)] == teststr, \
'"%s" (%i) wrongly mapped onto "%s" (%i)' % \
@@ -174,13 +176,13 @@ class TestTokenParsing:
previous_index = 0
L = len(self.code)
for mapped_index in range(len(self.tokenized)):
-index = source_map(mapped_index, self.srcmap)
+_, index = source_map(mapped_index, self.srcmap)
assert previous_index <= index <= L, \
"%i <= %i <= %i violated" % (previous_index, index, L)
previous_index = index
def test_non_token_preprocessor(self):
-tokenized, mapping = preprocess_comments(self.code)
+tokenized, mapping = preprocess_comments(self.code, 'no_uri')
self.verify_mapping("def func", self.code, tokenized, mapping)
self.verify_mapping("x > 0:", self.code, tokenized, mapping)
self.verify_mapping("if y > 0:", self.code, tokenized, mapping)
@@ -189,7 +191,7 @@ class TestTokenParsing:
def test_chained_preprocessors(self):
pchain = chain_preprocessors(preprocess_comments, preprocess_indentation)
-tokenized, mapping = pchain(self.code)
+tokenized, mapping = pchain(self.code, 'no_uri')
self.verify_mapping("def func", self.code, tokenized, mapping)
self.verify_mapping("x > 0:", self.code, tokenized, mapping)
self.verify_mapping("if y > 0:", self.code, tokenized, mapping)
......