Commit 319fdeac authored by Eckhart Arnold

- proper source mapping of destination characters that are mapped onto the same source

parent 309c7376
......@@ -157,7 +157,7 @@ def error_str(messages: Iterable[Error]) -> str:
Returns all true errors (i.e. not just warnings) from the
`messages` as a concatenated multiline string.
"""
return '\n\n'.join(str(m) for m in messages if is_error(m.level))
return '\n\n'.join(str(m) for m in messages if is_error(m.code))
def grammar_instance(grammar_representation) -> Tuple[Grammar, str]:
......@@ -287,7 +287,8 @@ def grammar_provider(ebnf_src: str, branding="DSL") -> Grammar:
def load_compiler_suite(compiler_suite: str) -> \
Tuple[PreprocessorFactoryFunc, ParserFactoryFunc, TransformerFactoryFunc, CompilerFactoryFunc]:
Tuple[PreprocessorFactoryFunc, ParserFactoryFunc,
TransformerFactoryFunc, CompilerFactoryFunc]:
"""
Extracts a compiler suite from file or string ``compiler_suite``
and returns it as a tuple (preprocessor, parser, ast, compiler).
......
......@@ -417,7 +417,7 @@ class EBNFCompiler(Compiler):
the previously compiled formal language.
"""
name = self.grammar_name + "Preprocessor"
return "def %s(text):\n return text\n" % name \
return "def %s(text):\n return text, lambda i: i\n" % name \
+ PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)
......
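# Illustration only: with the changed skeleton above, the preprocessor stub
# generated for a hypothetical grammar named "Arithmetic" would render roughly
# as follows (PREPROCESSOR_FACTORY appends further boilerplate, not shown):

def ArithmeticPreprocessor(text):
    # identity preprocessor: returns the text unchanged plus an identity mapping
    return text, lambda i: i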
......@@ -43,6 +43,7 @@ class Error:
# warning codes
REDEFINED_DIRECTIVE_WARNING = 101
REDECLARED_TOKEN_WARNING = 102
# error codes
......@@ -106,7 +107,7 @@ def only_errors(messages: Iterable[Error], level: int = Error.ERROR) -> Iterator
Returns an Iterator that yields only those messages that have
at least the given error level.
"""
return (err for err in messages if err.level >= level)
return (err for err in messages if err.code >= level)
def linebreaks(text: Union[StringView, str]) -> List[int]:
......
......@@ -65,7 +65,7 @@ from DHParser.stringview import StringView, EMPTY_STRING_VIEW
from DHParser.syntaxtree import Node, TransformationFunc, ParserBase, WHITESPACE_PTYPE, \
TOKEN_PTYPE, ZOMBIE_PARSER
from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME, \
PreprocessorFunc
PreprocessorFunc, with_source_mapping
from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name, \
escape_control_characters, load_if_file, re, typing
from typing import Any, Callable, cast, Dict, List, Set, Tuple, Union, Optional
......@@ -859,7 +859,8 @@ class Grammar:
Returns:
Node: The root node of the parse tree.
"""
def tail_pos(predecessors: List[Node]) -> int:
def tail_pos(predecessors: Union[List[Node], Tuple[Node, ...]]) -> int:
"""Adds the position after the last node in the list of
predecessors to the node."""
return predecessors[-1].pos + len(predecessors[-1]) if predecessors else 0
......@@ -1007,7 +1008,9 @@ class Grammar:
log_file_name = name[:-7] if name.lower().endswith('grammar') else name
elif log_file_name.lower().endswith('.log'):
log_file_name = log_file_name[:-4]
full_history, match_history, errors_only = [], [], []
full_history = [] # type: List[str]
match_history = [] # type: List[str]
errors_only = [] # type: List[str]
for record in self.history__:
line = record.as_html_tr() if html else str(record)
append_line(full_history, line)
......@@ -1359,8 +1362,7 @@ class Option(UnaryOperator):
super(Option, self).__init__(parser, name)
# assert isinstance(parser, Parser)
assert not isinstance(parser, Option), \
"Redundant nesting of options: %s(%s)" % \
(str(name), str(parser.name))
"Redundant nesting of options: %s(%s)" % (str(name), str(parser.name))
# assert not isinstance(parser, Required), \
# "Nesting options with required elements is contradictory: " \
# "%s(%s)" % (str(name), str(parser.name))
......@@ -2218,7 +2220,7 @@ def compile_source(source: str,
source_text = load_if_file(source)
log_file_name = logfile_basename(source, compiler)
if preprocessor is not None:
source_text = preprocessor(source_text)
source_text, source_mapping = with_source_mapping(preprocessor(source_text))
syntax_tree = parser(source_text)
if is_logging():
syntax_tree.log(log_file_name + '.cst')
......
......@@ -19,7 +19,7 @@ permissions and limitations under the License.
import bisect
import collections
import functools
from typing import Union, Callable
from typing import Union, Callable, Tuple, List
from DHParser.toolkit import re
......@@ -27,12 +27,17 @@ __all__ = ('RX_TOKEN_NAME',
'BEGIN_TOKEN',
'TOKEN_DELIMITER',
'END_TOKEN',
'SourceMapFunc',
'PreprocessorFunc',
'PreprocessorResult',
'make_token',
'nil_preprocessor',
'pp_tokenized',
'chain_preprocessors',
'prettyprint_tokenized',
'SourceMap',
'tokenized_to_original_mapping',
'source_map')
'source_map',
'with_source_mapping')
BEGIN_TOKEN = '\x1b'
TOKEN_DELIMITER = '\x1c'
......@@ -43,7 +48,64 @@ RX_TOKEN_NAME = re.compile(r'\w+')
RX_TOKEN_ARGUMENT = re.compile(r'[^\x1b\x1c\x1d]*')
RX_TOKEN = re.compile(r'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d')
PreprocessorFunc = Union[Callable[[str], str], functools.partial]
SourceMapFunc = Union[Callable[[int], int], functools.partial]
PreprocessorResult = Union[str, Tuple[str, SourceMapFunc]]
PreprocessorFunc = Union[Callable[[str], PreprocessorResult], functools.partial]
def nil_preprocessor(text: str) -> Tuple[str, SourceMapFunc]:
"""
A preprocessor that does nothing, i.e. returns the text unchanged,
together with a trivial identity source mapping.
"""
return text, lambda i: i
def _apply_mappings(position: int, mappings: List[SourceMapFunc]) -> int:
"""
Sequentially applies a number of mapping functions to a source position.
In the context of source mapping, the source position is usually a
position within a preprocessed source text, and `mappings` should therefore
be a list of reverse-mappings in reverse order of the preprocessing stages,
i.e. the reverse-mapping of the last stage comes first.
"""
for mapping in mappings:
position = mapping(position)
return position
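# Toy illustration of the calling convention (the two lambdas below are invented
# stand-ins for the reverse-mappings of two preprocessing stages): if stage one
# shifted every position by +2 and stage two by a further +3, the reverse-mapping
# of the *last* stage must be applied first to get back to the original source:

undo_stage2 = lambda i: i - 3   # reverse-mapping of the second (last) stage
undo_stage1 = lambda i: i - 2   # reverse-mapping of the first stage
assert _apply_mappings(10, [undo_stage2, undo_stage1]) == 5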
def _apply_preprocessors(text: str, preprocessors: Tuple[PreprocessorFunc, ...]) \
-> Tuple[str, SourceMapFunc]:
"""
Applies several preprocessing functions sequentially to a source text
and returns the preprocessed text as well as a function that maps text
positions in the processed text onto the corresponding positions in the
original source text.
"""
processed = text
mapping_chain = []
for prep in preprocessors:
processed, mapping_func = with_source_mapping(prep(processed))
mapping_chain.append(mapping_func)
mapping_chain.reverse()
return processed, functools.partial(_apply_mappings, mappings=mapping_chain)
def chain_preprocessors(*preprocessors) -> PreprocessorFunc:
"""
Merges a sequence of preprocessor functions into a single preprocessor function.
"""
return functools.partial(_apply_preprocessors, preprocessors=preprocessors)
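# A minimal usage sketch with two invented preprocessors that each prepend a
# prefix and supply their own reverse-mapping
# (standalone: from DHParser.preprocess import chain_preprocessors):

def prefix_a(text):
    return 'A' + text, lambda i: max(0, i - 1)    # supplies its own reverse-mapping

def prefix_bb(text):
    return 'BB' + text, lambda i: max(0, i - 2)   # supplies its own reverse-mapping

preprocess = chain_preprocessors(prefix_a, prefix_bb)
processed, mapping = preprocess('source')
assert processed == 'BBAsource'
assert mapping(processed.find('source')) == 0     # 3 -> 1 -> 0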
#######################################################################
#
# Tokenization support
#
# In DHParser the source text is usually not tokenized, but,
# optionally, it can be enriched by tokens (or parts of it can be
# replaced by tokens) to indicate, for example, the beginnings and
# endings of indented or quoted blocks that are difficult to capture
# with an EBNF-parser.
#
######################################################################
def make_token(token: str, argument: str = '') -> str:
......@@ -60,12 +122,7 @@ def make_token(token: str, argument: str = '') -> str:
return BEGIN_TOKEN + token + TOKEN_DELIMITER + argument + END_TOKEN
def nil_preprocessor(text: str) -> str:
"""A preprocessor that does nothing, i.e. just returns the input."""
return text
def pp_tokenized(tokenized: str) -> str:
def prettyprint_tokenized(tokenized: str) -> str:
"""Returns a pretty-printable version of a document that contains tokens."""
return tokenized.replace('\x1b', '<').replace('\x1c', '|').replace('\x1d', '>')
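# For example (the token name 'INDENT' is chosen arbitrarily), a token produced
# by make_token and rendered with prettyprint_tokenized looks like this:

tok = make_token('INDENT', '    ')
assert tok == BEGIN_TOKEN + 'INDENT' + TOKEN_DELIMITER + '    ' + END_TOKEN
assert prettyprint_tokenized(tok + 'content') == '<INDENT|    >content'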
......@@ -98,6 +155,7 @@ def tokenized_to_original_mapping(tokenized_source: str) -> SourceMap:
positions, offsets = [0], [0]
o = 0
i = tokenized_source.find(BEGIN_TOKEN)
e = -1
while i >= 0:
d = tokenized_source.find(TOKEN_DELIMITER, i)
e = tokenized_source.find(END_TOKEN, i)
......@@ -114,9 +172,9 @@ def tokenized_to_original_mapping(tokenized_source: str) -> SourceMap:
assert len(positions) == len(offsets), '\n' + str(positions) + '\n' + str(offsets)
assert positions[0] == 0
assert all(positions[i] < positions[i + 1] for i in range(len(positions) - 1))
assert all(offsets[i] >= offsets[i + 1] for i in range(len(offsets) - 1))
assert all(offsets[i] > offsets[i + 1] for i in range(len(offsets) - 2))
return SourceMap(positions, offsets, len(positions))
return SourceMap(positions, offsets)
def source_map(position: int, srcmap: SourceMap) -> int:
......@@ -136,5 +194,17 @@ def source_map(position: int, srcmap: SourceMap) -> int:
return min(position + srcmap.offsets[i - 1], srcmap.positions[i] + srcmap.offsets[i])
raise ValueError
# TODO: allow preprocessors to return their own source map (really a map or a function (easier)?)
# TODO: apply source maps in sequence.
def with_source_mapping(result: PreprocessorResult) -> Tuple[str, SourceMapFunc]:
"""
Normalizes preprocessor results by adding a mapping if a preprocessor
returns only the transformed source code and no mapping of its own. It is
assumed that in this case the preprocessor has just enriched the source
code with tokens, so that a source mapping can be derived automatically
with `tokenized_to_original_mapping` (see above).
"""
if isinstance(result, str):
srcmap = tokenized_to_original_mapping(result)
mapping_func = functools.partial(source_map, srcmap=srcmap)
return result, mapping_func
return result
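# Sketch of the two cases that with_source_mapping normalizes (both
# preprocessors below are invented for illustration):

def token_only_preprocessor(text):
    # returns only the enriched text; a mapping is derived automatically from
    # the inserted token delimiters via tokenized_to_original_mapping
    return make_token('BLOCK') + text

def mapping_preprocessor(text):
    # returns the text together with an explicit (here: identity) mapping
    return text, lambda i: i

text1, map1 = with_source_mapping(token_only_preprocessor('source'))
text2, map2 = with_source_mapping(mapping_preprocessor('source'))
assert callable(map1) and callable(map2)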
......@@ -326,7 +326,7 @@ def replace_by_child(context: List[Node], criteria: CriteriaType=single_child):
@transformation_factory(int, str, Callable)
def content_from_child(context: List[None], criteria: CriteriaType=single_child):
def content_from_child(context: List[Node], criteria: CriteriaType = single_child):
"""
Reduces a node by transferring the result of the first of its
immediate descendants that meets the `criteria` to this node,
......
......@@ -77,7 +77,7 @@ Match-test "m2"
""
""
)
(:RE
(:Whitespace
"// Kommentar"
)
)
......@@ -253,19 +253,6 @@ Match-test "m4"
)
)
)
(LZ
(:RegExp
""
""
)
(:RegExp
"// Kommentar"
)
(:RegExp
""
""
)
)
)
Match-test "m5"
......@@ -304,7 +291,7 @@ Match-test "m5"
""
""
)
(:RE
(:Whitespace
"// Kommentar"
)
)
......@@ -546,12 +533,6 @@ Match-test "m3"
"// Kommentar"
)
)
(LZ
(:RegExp
""
""
)
)
)
Match-test "m4"
......@@ -571,13 +552,6 @@ Match-test "m4"
""
)
)
(LZ
(:RegExp
""
""
""
)
)
)
Fail-test "f1"
......
......@@ -14,9 +14,7 @@ Match-test "1"
(Lemma
(LemmaWort
(LAT_WORT
(:RegExp
"facitergula"
)
"facitergula"
)
)
)
......@@ -44,9 +42,7 @@ Match-test "1"
### AST
(LemmaVarianten
(LAT_WORT
(:RegExp
"fascitergula"
)
"fascitergula"
)
(:ZeroOrMore
(:Series
......@@ -59,9 +55,7 @@ Match-test "1"
)
)
(LAT_WORT
(:RegExp
"facietergula"
)
"facietergula"
)
)
(:Series
......@@ -74,9 +68,7 @@ Match-test "1"
)
)
(LAT_WORT
(:RegExp
"facistergula"
)
"facistergula"
)
)
(:Series
......@@ -89,9 +81,7 @@ Match-test "1"
)
)
(LAT_WORT
(:RegExp
"farcutergula"
)
"farcutergula"
)
)
)
......@@ -106,9 +96,7 @@ Match-test "2"
### AST
(LemmaVarianten
(LAT_WORT
(:RegExp
"fascitergula"
)
"fascitergula"
)
)
......@@ -138,20 +126,8 @@ Match-test "3"
)
)
(Zusatz
(:Series
(:Token
"{"
)
(DEU_WORT
(DEU_KLEIN
(:RegExp
"sim."
)
)
)
(:Token
"}"
)
(DEU_WORT
"sim."
)
)
)
......@@ -166,60 +142,24 @@ Match-test "4"
### AST
(LemmaVarianten
(LAT_WORT
(:RegExp
"fascitergula"
)
"fascitergula"
)
(:ZeroOrMore
(:Series
(:Token
(:RegExp
","
)
(:Whitespace
" "
)
)
(LAT_WORT
(:RegExp
"facietergula"
)
)
(LAT_WORT
"facietergula"
)
(:Series
(:Token
(LemmaVariante
(LAT_WORT
(:RegExp
","
"fascistergula"
)
(:Whitespace
" "
)
)
(LemmaVariante
(LAT_WORT
(:RegExp
"fascistergula"
)
(:Whitespace
" "
)
)
(Zusatz
(:Series
(:Token
"{"
)
(DEU_WORT
(DEU_KLEIN
(:RegExp
"sim."
)
)
)
(:Token
"}"
)
)
(Zusatz
(DEU_WORT
"sim."
)
)
)
......@@ -253,42 +193,16 @@ Match-test "1"
### AST
(LemmaPosition
(:Token
(:RegExp
"LEMMA"
)
(:Whitespace
" "
)
)
(Lemma
(LemmaWort
(LAT_WORT
(:RegExp
"facitergula"
)
)
)
)
(ZWW
(ZEILENSPRUNG
(:RegExp
""
""
)
)
(LZ
(:RegExp
""
""
"facitergula"
)
)
)
(LemmaVarianten
(LAT_WORT
(:RegExp
"fascitergula"
)
"fascitergula"
)
(:ZeroOrMore
(:Series
......@@ -301,9 +215,7 @@ Match-test "1"
)
)
(LAT_WORT
(:RegExp
"facietergula"
)
"facietergula"
)
)
(:Series
......@@ -316,9 +228,7 @@ Match-test "1"
)
)
(LAT_WORT
(:RegExp
"facistergula"
)
"facistergula"
)
)
(:Series
......@@ -340,20 +250,8 @@ Match-test "1"
)
)
(Zusatz
(:Series
(:Token
"{"
)
(DEU_WORT
(DEU_KLEIN
(:RegExp
"sim."
)
)
)
(:Token
"}"
)
(DEU_WORT
"sim."
)
)
)
......@@ -361,49 +259,15 @@ Match-test "1"
)
)
(GrammatikPosition
(ZWW
(ZEILENSPRUNG
(:RegExp
""
""
)
)
(LZ
(flexion
(FLEX
(:RegExp
""
""
"-ae"
)
)
)
(:Token
"GRAMMATIK"
)
(LZ
(:RegExp
""
""
)
)
(Grammatik
(wortart
"nomen"
)
(ABS
"; "
)
(flexion
(FLEX
(:RegExp
"-ae"
)
(:Whitespace
" "
)
(:Whitespace
" "
)
)
(genus
"f."
)
)
)
)
\ No newline at end of file
......@@ -22,10 +22,13 @@ limitations under the License.
# import sys
# sys.path.append('../')
from functools import partial
from DHParser.dsl import grammar_provider
from DHParser.preprocess import make_token, tokenized_to_original_mapping, source_map, \
BEGIN_TOKEN, END_TOKEN, TOKEN_DELIMITER
from DHParser.toolkit import lstrip_docstring
BEGIN_TOKEN, END_TOKEN, TOKEN_DELIMITER, SourceMapFunc, SourceMap, chain_preprocessors
from DHParser.toolkit import lstrip_docstring, typing
from typing import Tuple
class TestMakeToken:
......@@ -60,7 +63,8 @@ class TestSourceMapping:
assert len(positions) == len(offsets)