Commit 319fdeac authored by Eckhart Arnold

- proper source mapping of destination characters that are mapped onto the same source

parent 309c7376
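For illustration only (not part of this commit's diff): a source-mapping function translates positions in the preprocessed text back to positions in the original source. When a preprocessor inserts a token, the inserted characters have no counterpart in the original text, so they should all map back onto the same source position. A minimal, self-contained sketch of that idea with hypothetical numbers:

    # Illustrative sketch only -- simplified two-segment case, not code from this commit.
    # Original text: "abcdefgh"; a 6-character token is inserted after "abcd", so in the
    # preprocessed text positions 0..3 are unchanged and positions 4..9 lie inside the token.
    positions = [0, 10]   # segment start positions in the *preprocessed* text
    offsets = [0, -6]     # offset to add to recover the *original* position

    def map_back(pos: int) -> int:
        # Clamp positions inside the token onto the insertion point, mirroring
        # min(position + offsets[i - 1], positions[i] + offsets[i]) from source_map().
        return min(pos + offsets[0], positions[1] + offsets[1])

    assert map_back(2) == 2   # before the token: unchanged
    assert map_back(5) == 4   # inside the token: mapped onto the insertion point
    assert map_back(9) == 4   # every token character maps onto the same source position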
@@ -157,7 +157,7 @@ def error_str(messages: Iterable[Error]) -> str:
     Returns all true errors (i.e. not just warnings) from the
     `messages` as a concatenated multiline string.
     """
-    return '\n\n'.join(str(m) for m in messages if is_error(m.level))
+    return '\n\n'.join(str(m) for m in messages if is_error(m.code))


 def grammar_instance(grammar_representation) -> Tuple[Grammar, str]:
@@ -287,7 +287,8 @@ def grammar_provider(ebnf_src: str, branding="DSL") -> Grammar:

 def load_compiler_suite(compiler_suite: str) -> \
-        Tuple[PreprocessorFactoryFunc, ParserFactoryFunc, TransformerFactoryFunc, CompilerFactoryFunc]:
+        Tuple[PreprocessorFactoryFunc, ParserFactoryFunc,
+              TransformerFactoryFunc, CompilerFactoryFunc]:
     """
     Extracts a compiler suite from file or string ``compiler suite``
     and returns it as a tuple (preprocessor, parser, ast, compiler).
...
@@ -417,7 +417,7 @@ class EBNFCompiler(Compiler):
         the previously compiled formal language.
         """
         name = self.grammar_name + "Preprocessor"
-        return "def %s(text):\n    return text\n" % name \
+        return "def %s(text):\n    return text, lambda i: i\n" % name \
                + PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)
...
@@ -43,6 +43,7 @@ class Error:

     # warning codes

     REDEFINED_DIRECTIVE_WARNING = 101
+    REDECLARED_TOKEN_WARNING = 102

     # error codes
@@ -106,7 +107,7 @@ def only_errors(messages: Iterable[Error], level: int = Error.ERROR) -> Iterator
     Returns an Iterator that yields only those messages that have
     at least the given error level.
     """
-    return (err for err in messages if err.level >= level)
+    return (err for err in messages if err.code >= level)


 def linebreaks(text: Union[StringView, str]) -> List[int]:
...
...@@ -65,7 +65,7 @@ from DHParser.stringview import StringView, EMPTY_STRING_VIEW ...@@ -65,7 +65,7 @@ from DHParser.stringview import StringView, EMPTY_STRING_VIEW
from DHParser.syntaxtree import Node, TransformationFunc, ParserBase, WHITESPACE_PTYPE, \ from DHParser.syntaxtree import Node, TransformationFunc, ParserBase, WHITESPACE_PTYPE, \
TOKEN_PTYPE, ZOMBIE_PARSER TOKEN_PTYPE, ZOMBIE_PARSER
from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME, \ from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME, \
PreprocessorFunc PreprocessorFunc, with_source_mapping
from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name, \ from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name, \
escape_control_characters, load_if_file, re, typing escape_control_characters, load_if_file, re, typing
from typing import Any, Callable, cast, Dict, List, Set, Tuple, Union, Optional from typing import Any, Callable, cast, Dict, List, Set, Tuple, Union, Optional
@@ -859,7 +859,8 @@ class Grammar:
         Returns:
             Node: The root node ot the parse tree.
         """
-        def tail_pos(predecessors: List[Node]) -> int:
+
+        def tail_pos(predecessors: Union[List[Node], Tuple[Node, ...]]) -> int:
             """Adds the position after the last node in the list of
             predecessors to the node."""
             return predecessors[-1].pos + len(predecessors[-1]) if predecessors else 0
@@ -1007,7 +1008,9 @@ class Grammar:
             log_file_name = name[:-7] if name.lower().endswith('grammar') else name
         elif log_file_name.lower().endswith('.log'):
             log_file_name = log_file_name[:-4]
-        full_history, match_history, errors_only = [], [], []
+        full_history = []   # type: List[str]
+        match_history = []  # type: List[str]
+        errors_only = []    # type: List[str]
         for record in self.history__:
             line = record.as_html_tr() if html else str(record)
             append_line(full_history, line)
...@@ -1359,8 +1362,7 @@ class Option(UnaryOperator): ...@@ -1359,8 +1362,7 @@ class Option(UnaryOperator):
super(Option, self).__init__(parser, name) super(Option, self).__init__(parser, name)
# assert isinstance(parser, Parser) # assert isinstance(parser, Parser)
assert not isinstance(parser, Option), \ assert not isinstance(parser, Option), \
"Redundant nesting of options: %s(%s)" % \ "Redundant nesting of options: %s(%s)" % (str(name), str(parser.name))
(str(name), str(parser.name))
# assert not isinstance(parser, Required), \ # assert not isinstance(parser, Required), \
# "Nesting options with required elements is contradictory: " \ # "Nesting options with required elements is contradictory: " \
# "%s(%s)" % (str(name), str(parser.name)) # "%s(%s)" % (str(name), str(parser.name))
@@ -2218,7 +2220,7 @@ def compile_source(source: str,
     source_text = load_if_file(source)
     log_file_name = logfile_basename(source, compiler)
     if preprocessor is not None:
-        source_text = preprocessor(source_text)
+        source_text, source_mapping = with_source_mapping(preprocessor(source_text))
     syntax_tree = parser(source_text)
     if is_logging():
         syntax_tree.log(log_file_name + '.cst')
...
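A hedged sketch (not part of the diff) of how the mapping obtained above via `with_source_mapping` could be used, e.g. to relate a position reported for the preprocessed text back to the original source. The preprocessor below is hypothetical; it keeps the text length unchanged, so the identity mapping it returns is a correct source map:

    from DHParser.preprocess import with_source_mapping

    def my_preprocessor(text: str):
        # Hypothetical preprocessor: does not change the text length,
        # therefore the identity function is a valid SourceMapFunc.
        return text.upper(), lambda i: i

    source_text, source_mapping = with_source_mapping(my_preprocessor("line one"))
    pos_in_preprocessed = 7                          # e.g. a position reported by the parser
    pos_in_original = source_mapping(pos_in_preprocessed)   # maps back onto the original text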
@@ -19,7 +19,7 @@ permissions and limitations under the License.

 import bisect
 import collections
 import functools
-from typing import Union, Callable
+from typing import Union, Callable, Tuple, List

 from DHParser.toolkit import re
@@ -27,12 +27,17 @@ __all__ = ('RX_TOKEN_NAME',
            'BEGIN_TOKEN',
            'TOKEN_DELIMITER',
            'END_TOKEN',
+           'SourceMapFunc',
            'PreprocessorFunc',
+           'PreprocessorResult',
            'make_token',
            'nil_preprocessor',
-           'pp_tokenized',
+           'chain_preprocessors',
+           'prettyprint_tokenized',
+           'SourceMap',
            'tokenized_to_original_mapping',
-           'source_map')
+           'source_map',
+           'with_source_mapping')


 BEGIN_TOKEN = '\x1b'
 TOKEN_DELIMITER = '\x1c'
@@ -43,7 +48,64 @@ RX_TOKEN_NAME = re.compile(r'\w+')
 RX_TOKEN_ARGUMENT = re.compile(r'[^\x1b\x1c\x1d]*')
 RX_TOKEN = re.compile(r'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d')

-PreprocessorFunc = Union[Callable[[str], str], functools.partial]
+SourceMapFunc = Union[Callable[[int], int], functools.partial]
+PreprocessorResult = Union[str, Tuple[str, SourceMapFunc]]
+PreprocessorFunc = Union[Callable[[str], PreprocessorResult], functools.partial]
+
+
+def nil_preprocessor(text: str) -> Tuple[str, SourceMapFunc]:
+    """
+    A preprocessor that does nothing, i.e. just returns the input.
+    """
+    return text, lambda i: i
+
+
+def _apply_mappings(position: int, mappings: List[SourceMapFunc]) -> int:
+    """
+    Sequentially applies a number of mapping functions to a source position.
+    In the context of source mapping, the source position usually is a
+    position within a preprocessed source text and `mappings` should therefore
+    be a list of reverse-mappings in reversed order.
+    """
+    for mapping in mappings:
+        position = mapping(position)
+    return position
+
+
+def _apply_preprocessors(text: str, preprocessors: Tuple[PreprocessorFunc, ...]) \
+        -> Tuple[str, SourceMapFunc]:
+    """
+    Applies several preprocessing functions sequentially to a source text
+    and returns the preprocessed text as well as a function that maps text
+    positions in the processed text onto the corresponding positions in the
+    original source text.
+    """
+    processed = text
+    mapping_chain = []
+    for prep in preprocessors:
+        processed, mapping_func = with_source_mapping(prep(processed))
+        mapping_chain.append(mapping_func)
+    mapping_chain.reverse()
+    return processed, functools.partial(_apply_mappings, mappings=mapping_chain)
+
+
+def chain_preprocessors(*preprocessors) -> PreprocessorFunc:
+    """
+    Merges a sequence of preprocessor functions into a single function.
+    """
+    return functools.partial(_apply_preprocessors, preprocessors=preprocessors)
+
+
+#######################################################################
+#
+# Tokenization support
+#
+# In DHParser the source text is usually not tokenized, but,
+# optionally, it can be enriched by tokens (or parts of it replaced
+# by tokens) to, say, indicate beginnings and endings of indented
+# or quoted blocks that are difficult to capture with an EBNF-parser.
+#
+######################################################################
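A hedged usage sketch (not part of the commit) of `chain_preprocessors`: two hypothetical preprocessors, each returning its own source mapping per the new `PreprocessorResult` type, are merged into one. Both keep the text length unchanged, so identity mappings are correct here:

    from DHParser.preprocess import chain_preprocessors

    def lowercase(text):
        # hypothetical preprocessor: length-preserving, identity mapping is correct
        return text.lower(), lambda i: i

    def normalize_quotes(text):
        # hypothetical preprocessor: replaces guillemets with plain quotes (same length)
        return text.replace('»', '"').replace('«', '"'), lambda i: i

    preprocess = chain_preprocessors(lowercase, normalize_quotes)
    text, mapping = preprocess('»Hello«')
    assert text == '"hello"'
    assert mapping(3) == 3   # nothing was inserted, so positions map straight back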
@@ -60,12 +122,7 @@ def make_token(token: str, argument: str = '') -> str:
     return BEGIN_TOKEN + token + TOKEN_DELIMITER + argument + END_TOKEN


-def nil_preprocessor(text: str) -> str:
-    """A preprocessor that does nothing, i.e. just returns the input."""
-    return text
-
-
-def pp_tokenized(tokenized: str) -> str:
+def prettyprint_tokenized(tokenized: str) -> str:
     """Returns a pretty-printable version of a document that contains tokens."""
     return tokenized.replace('\x1b', '<').replace('\x1c', '|').replace('\x1d', '>')
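For illustration, assuming this version of DHParser.preprocess: `make_token()` wraps a token name and argument in the control characters defined above, and `prettyprint_tokenized()` renders the result readably:

    from DHParser.preprocess import make_token, prettyprint_tokenized

    token = make_token('INDENT', '    ')
    assert token == '\x1bINDENT\x1c    \x1d'
    assert prettyprint_tokenized(token) == '<INDENT|    >'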
@@ -98,6 +155,7 @@ def tokenized_to_original_mapping(tokenized_source: str) -> SourceMap:
     positions, offsets = [0], [0]
     o = 0
     i = tokenized_source.find(BEGIN_TOKEN)
+    e = -1
     while i >= 0:
         d = tokenized_source.find(TOKEN_DELIMITER, i)
         e = tokenized_source.find(END_TOKEN, i)
@@ -114,9 +172,9 @@ def tokenized_to_original_mapping(tokenized_source: str) -> SourceMap:
     assert len(positions) == len(offsets), '\n' + str(positions) + '\n' + str(offsets)
     assert positions[0] == 0
     assert all(positions[i] < positions[i + 1] for i in range(len(positions) - 1))
-    assert all(offsets[i] >= offsets[i + 1] for i in range(len(offsets) - 1))
+    assert all(offsets[i] > offsets[i + 1] for i in range(len(offsets) - 2))

-    return SourceMap(positions, offsets, len(positions))
+    return SourceMap(positions, offsets)


 def source_map(position: int, srcmap: SourceMap) -> int:
@@ -136,5 +194,17 @@ def source_map(position: int, srcmap: SourceMap) -> int:
             return min(position + srcmap.offsets[i - 1], srcmap.positions[i] + srcmap.offsets[i])
     raise ValueError

-# TODO: allow preprocessors to return their own source map (really a map or a function (easier)?)
-# TODO: apply source maps in sequence.
+
+def with_source_mapping(result: PreprocessorResult) -> Tuple[str, SourceMapFunc]:
+    """
+    Normalizes preprocessor results by adding a mapping if a preprocessor
+    only returns the transformed source code and no mapping by itself. It is
+    assumed that in this case the preprocessor has just enriched the source
+    code with tokens, so that a source mapping can be derived automatically
+    with `tokenized_to_original_mapping` (see above).
+    """
+    if isinstance(result, str):
+        srcmap = tokenized_to_original_mapping(result)
+        mapping_func = functools.partial(source_map, srcmap=srcmap)
+        return result, mapping_func
+    return result
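A hedged sketch (not part of the diff) of the two kinds of results `with_source_mapping()` accepts. The concrete positions in the second case depend on the internals of `tokenized_to_original_mapping`, so they are only indicated in a comment:

    from DHParser.preprocess import with_source_mapping, make_token

    # 1. A (text, mapping) tuple is passed through unchanged:
    text, mapping = with_source_mapping(('unchanged text', lambda i: i))
    assert mapping(5) == 5

    # 2. A plain string is assumed to be token-enriched source code;
    #    a mapping is derived automatically:
    tokenized = 'head ' + make_token('BLOCK', 'body') + ' tail'
    text, mapping = with_source_mapping(tokenized)
    original_pos = mapping(3)   # position 3 lies before the token, so it should map onto 3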
@@ -326,7 +326,7 @@ def replace_by_child(context: List[Node], criteria: CriteriaType=single_child):

 @transformation_factory(int, str, Callable)
-def content_from_child(context: List[None], criteria: CriteriaType=single_child):
+def content_from_child(context: List[Node], criteria: CriteriaType = single_child):
     """
     Reduces a node, by transferring the result of the first of its
     immediate descendants that meets the `criteria` to this node,
...
@@ -77,7 +77,7 @@ Match-test "m2"
         ""
         ""
       )
-      (:RE
+      (:Whitespace
         "// Kommentar"
       )
     )
@@ -253,19 +253,6 @@ Match-test "m4"
         )
       )
     )
-    (LZ
-      (:RegExp
-        ""
-        ""
-      )
-      (:RegExp
-        "// Kommentar"
-      )
-      (:RegExp
-        ""
-        ""
-      )
-    )
   )

 Match-test "m5"
@@ -304,7 +291,7 @@ Match-test "m5"
         ""
         ""
       )
-      (:RE
+      (:Whitespace
         "// Kommentar"
       )
     )
@@ -546,12 +533,6 @@ Match-test "m3"
         "// Kommentar"
       )
     )
-    (LZ
-      (:RegExp
-        ""
-        ""
-      )
-    )
   )

 Match-test "m4"
@@ -571,13 +552,6 @@ Match-test "m4"
         ""
       )
     )
-    (LZ
-      (:RegExp
-        ""
-        ""
-        ""
-      )
-    )
   )

 Fail-test "f1"
...
@@ -14,9 +14,7 @@ Match-test "1"
     (Lemma
       (LemmaWort
         (LAT_WORT
-          (:RegExp
-            "facitergula"
-          )
+          "facitergula"
         )
       )
     )
@@ -44,9 +42,7 @@ Match-test "1"
 ### AST
     (LemmaVarianten
       (LAT_WORT
-        (:RegExp
-          "fascitergula"
-        )
+        "fascitergula"
       )
       (:ZeroOrMore
         (:Series
@@ -59,9 +55,7 @@ Match-test "1"
         )
       )
       (LAT_WORT
-        (:RegExp
-          "facietergula"
-        )
+        "facietergula"
       )
     )
     (:Series
@@ -74,9 +68,7 @@ Match-test "1"
         )
       )
       (LAT_WORT
-        (:RegExp
-          "facistergula"
-        )
+        "facistergula"
       )
     )
     (:Series
@@ -89,9 +81,7 @@ Match-test "1"
         )
       )
       (LAT_WORT
-        (:RegExp
-          "farcutergula"
-        )
+        "farcutergula"
       )
     )
   )
@@ -106,9 +96,7 @@ Match-test "2"

 ### AST
     (LemmaVarianten
       (LAT_WORT
-        (:RegExp
-          "fascitergula"
-        )
+        "fascitergula"
       )
     )
@@ -138,20 +126,8 @@ Match-test "3"
         )
       )
       (Zusatz
-        (:Series
-          (:Token
-            "{"
-          )
-          (DEU_WORT
-            (DEU_KLEIN
-              (:RegExp
-                "sim."
-              )
-            )
-          )
-          (:Token
-            "}"
-          )
+        (DEU_WORT
+          "sim."
         )
       )
     )
@@ -166,60 +142,24 @@ Match-test "4"
 ### AST
     (LemmaVarianten
       (LAT_WORT
-        (:RegExp
-          "fascitergula"
-        )
+        "fascitergula"
       )
       (:ZeroOrMore
-        (:Series
-          (:Token
-            (:RegExp
-              ","
-            )
-            (:Whitespace
-              " "
-            )
-          )
-          (LAT_WORT
-            (:RegExp
-              "facietergula"
-            )
-          )
-        )
-        (:Series
-          (:Token
-            (:RegExp
-              ","
-            )
-            (:Whitespace
-              " "
-            )
-          )
-          (LemmaVariante
-            (LAT_WORT
-              (:RegExp
-                "fascistergula"
-              )
-              (:Whitespace
-                " "
-              )
-            )
-            (Zusatz
-              (:Series
-                (:Token
-                  "{"
-                )
-                (DEU_WORT
-                  (DEU_KLEIN
-                    (:RegExp
-                      "sim."
-                    )
-                  )
-                )
-                (:Token
-                  "}"
-                )
-              )
-            )
-          )
-        )
+        (LAT_WORT
+          "facietergula"
+        )
+        (LemmaVariante
+          (LAT_WORT
+            (:RegExp
+              "fascistergula"
+            )
+            (:Whitespace
+              " "
+            )
+          )
+          (Zusatz
+            (DEU_WORT
+              "sim."
+            )
+          )
+        )
       )
     )