DHParser — Commit 319fdeac
authored Dec 28, 2017 by Eckhart Arnold

- proper source mapping of destination characters that are mapped onto the same source

parent 309c7376
9 changed files
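The gist of the commit: a preprocessor may now return a source-mapping function along with the transformed text, and all characters that a preprocessor inserts map back onto one and the same position in the original source. A minimal sketch of the idea, using the API as it looks after this commit (the token name and sample text are illustrative):

    from DHParser.preprocess import make_token, tokenized_to_original_mapping, source_map

    tokenized = make_token('TOK', 'abc')   # '\x1bTOK\x1c' + 'abc' + '\x1d'
    srcmap = tokenized_to_original_mapping(tokenized)
    # every character of the inserted token prelude lies before 'a' in the
    # tokenized text, yet all of them must map back onto the same source
    # position -- which is what this commit fixes:
    print(source_map(tokenized.find('a'), srcmap))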
DHParser/dsl.py

@@ -157,7 +157,7 @@ def error_str(messages: Iterable[Error]) -> str:
     Returns all true errors (i.e. not just warnings) from the
     `messages` as a concatenated multiline string.
     """
-    return '\n\n'.join(str(m) for m in messages if is_error(m.level))
+    return '\n\n'.join(str(m) for m in messages if is_error(m.code))
 
 
 def grammar_instance(grammar_representation) -> Tuple[Grammar, str]:
@@ -287,7 +287,8 @@ def grammar_provider(ebnf_src: str, branding="DSL") -> Grammar:
 def load_compiler_suite(compiler_suite: str) -> \
-        Tuple[PreprocessorFactoryFunc, ParserFactoryFunc, TransformerFactoryFunc, CompilerFactoryFunc]:
+        Tuple[PreprocessorFactoryFunc, ParserFactoryFunc,
+              TransformerFactoryFunc, CompilerFactoryFunc]:
     """
     Extracts a compiler suite from file or string ``compiler suite``
     and returns it as a tuple (preprocessor, parser, ast, compiler).
DHParser/ebnf.py

@@ -417,7 +417,7 @@ class EBNFCompiler(Compiler):
         the previously compiled formal language.
         """
         name = self.grammar_name + "Preprocessor"
-        return "def %s(text):\n    return text\n" % name \
+        return "def %s(text):\n    return text, lambda i: i\n" % name \
                + PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)
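In effect, a preprocessor skeleton generated by the EBNF compiler now returns the identity mapping alongside the unchanged text. A sketch of the emitted code, assuming a grammar named "Arithmetic" (the factory boilerplate from PREPROCESSOR_FACTORY is omitted):

    def ArithmeticPreprocessor(text):
        return text, lambda i: i   # unchanged text plus identity source mapping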
DHParser/error.py

@@ -43,6 +43,7 @@ class Error:
     # warning codes
 
     REDEFINED_DIRECTIVE_WARNING = 101
+    REDECLARED_TOKEN_WARNING = 102
 
     # error codes
@@ -106,7 +107,7 @@ def only_errors(messages: Iterable[Error], level: int = Error.ERROR) -> Iterator
     Returns an Iterator that yields only those messages that have
     at least the given error level.
     """
-    return (err for err in messages if err.level >= level)
+    return (err for err in messages if err.code >= level)
 
 
 def linebreaks(text: Union[StringView, str]) -> List[int]:
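The filter in only_errors now compares the numeric error code rather than the former level attribute. A minimal sketch of the semantics with a stand-in record type (the real Error class carries more fields, and the threshold value here is an assumption, not DHParser's actual constant):

    from collections import namedtuple

    Msg = namedtuple('Msg', ['code', 'text'])
    ERROR_THRESHOLD = 1000   # assumed stand-in for Error.ERROR

    messages = [Msg(101, 'a mere warning'), Msg(1000, 'a true error')]
    print([m for m in messages if m.code >= ERROR_THRESHOLD])   # only the true error survives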
DHParser/parse.py

@@ -65,7 +65,7 @@ from DHParser.stringview import StringView, EMPTY_STRING_VIEW
 from DHParser.syntaxtree import Node, TransformationFunc, ParserBase, WHITESPACE_PTYPE, \
     TOKEN_PTYPE, ZOMBIE_PARSER
 from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME, \
-    PreprocessorFunc
+    PreprocessorFunc, with_source_mapping
 from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name, \
     escape_control_characters, load_if_file, re, typing
 from typing import Any, Callable, cast, Dict, List, Set, Tuple, Union, Optional
@@ -859,7 +859,8 @@ class Grammar:
         Returns:
             Node: The root node of the parse tree.
         """
-        def tail_pos(predecessors: List[Node]) -> int:
+        def tail_pos(predecessors: Union[List[Node],
+                                         Tuple[Node, ...]]) -> int:
             """Adds the position after the last node in the list of
             predecessors to the node."""
             return predecessors[-1].pos + len(predecessors[-1]) if predecessors else 0
@@ -1007,7 +1008,9 @@ class Grammar:
             log_file_name = name[:-7] if name.lower().endswith('grammar') else name
         elif log_file_name.lower().endswith('.log'):
             log_file_name = log_file_name[:-4]
-        full_history, match_history, errors_only = [], [], []
+        full_history = []   # type: List[str]
+        match_history = []  # type: List[str]
+        errors_only = []    # type: List[str]
         for record in self.history__:
             line = record.as_html_tr() if html else str(record)
             append_line(full_history, line)
@@ -1359,8 +1362,7 @@ class Option(UnaryOperator):
         super(Option, self).__init__(parser, name)
         # assert isinstance(parser, Parser)
         assert not isinstance(parser, Option), \
-            "Redundant nesting of options: %s(%s)" % \
-            (str(name), str(parser.name))
+            "Redundant nesting of options: %s(%s)" % (str(name), str(parser.name))
         # assert not isinstance(parser, Required), \
         #     "Nesting options with required elements is contradictory: " \
         #     "%s(%s)" % (str(name), str(parser.name))
@@ -2218,7 +2220,7 @@ def compile_source(source: str,
     source_text = load_if_file(source)
     log_file_name = logfile_basename(source, compiler)
     if preprocessor is not None:
-        source_text = preprocessor(source_text)
+        source_text, source_mapping = with_source_mapping(preprocessor(source_text))
     syntax_tree = parser(source_text)
     if is_logging():
         syntax_tree.log(log_file_name + '.cst')
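The change to compile_source is the consumer side of the new protocol: whatever the preprocessor returns, with_source_mapping normalizes it to a (text, mapping) pair. A hypothetical sketch of that flow (not DHParser's actual function, whose signature is longer):

    from DHParser.preprocess import with_source_mapping

    def compile_flow_sketch(source_text, preprocessor, parser):
        # accepts both legacy preprocessors (bare string result) and new
        # ones that return a (text, mapping) pair themselves
        source_text, source_mapping = with_source_mapping(preprocessor(source_text))
        syntax_tree = parser(source_text)
        # source_mapping(pos) translates a position in the preprocessed
        # text back to the corresponding position in the original source
        return syntax_tree, source_mapping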
DHParser/preprocess.py

@@ -19,7 +19,7 @@ permissions and limitations under the License.
 
 import bisect
 import collections
 import functools
-from typing import Union, Callable
+from typing import Union, Callable, Tuple, List
 
 from DHParser.toolkit import re
@@ -27,12 +27,17 @@ __all__ = ('RX_TOKEN_NAME',
            'BEGIN_TOKEN',
            'TOKEN_DELIMITER',
            'END_TOKEN',
+           'SourceMapFunc',
            'PreprocessorFunc',
+           'PreprocessorResult',
            'make_token',
            'nil_preprocessor',
-           'pp_tokenized',
+           'chain_preprocessors',
+           'prettyprint_tokenized',
            'SourceMap',
            'tokenized_to_original_mapping',
-           'source_map')
+           'source_map',
+           'with_source_mapping')
 
 
 BEGIN_TOKEN = '\x1b'
 TOKEN_DELIMITER = '\x1c'
@@ -43,7 +48,64 @@ RX_TOKEN_NAME = re.compile(r'\w+')
 RX_TOKEN_ARGUMENT = re.compile(r'[^\x1b\x1c\x1d]*')
 RX_TOKEN = re.compile(r'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d')
 
-PreprocessorFunc = Union[Callable[[str], str], functools.partial]
+SourceMapFunc = Union[Callable[[int], int], functools.partial]
+PreprocessorResult = Union[str, Tuple[str, SourceMapFunc]]
+PreprocessorFunc = Union[Callable[[str], PreprocessorResult], functools.partial]
+
+
+def nil_preprocessor(text: str) -> Tuple[str, SourceMapFunc]:
+    """
+    A preprocessor that does nothing, i.e. just returns the input.
+    """
+    return text, lambda i: i
+
+
+def _apply_mappings(position: int, mappings: List[SourceMapFunc]) -> int:
+    """
+    Sequentially apply a number of mapping functions to a source position.
+    In the context of source mapping, the source position usually is a
+    position within a preprocessed source text and `mappings` should therefore
+    be a list of reverse-mappings in reversed order.
+    """
+    for mapping in mappings:
+        position = mapping(position)
+    return position
+
+
+def _apply_preprocessors(text: str, preprocessors: Tuple[PreprocessorFunc, ...]) \
+        -> Tuple[str, SourceMapFunc]:
+    """
+    Applies several preprocessing functions sequentially to a source text
+    and returns the preprocessed text as well as a function that maps text-
+    positions in the processed text onto the corresponding position in the
+    original source text.
+    """
+    processed = text
+    mapping_chain = []
+    for prep in preprocessors:
+        processed, mapping_func = with_source_mapping(prep(processed))
+        mapping_chain.append(mapping_func)
+    mapping_chain.reverse()
+    return processed, functools.partial(_apply_mappings, mappings=mapping_chain)
+
+
+def chain_preprocessors(*preprocessors) -> PreprocessorFunc:
+    """
+    Merges a sequence of preprocessor functions in to a single function.
+    """
+    return functools.partial(_apply_preprocessors, preprocessors=preprocessors)
+
+
+#######################################################################
+#
+# Tokenization support
+#
+# In DHParser the source text is usually not tokenized, but,
+# optionally, it can be enriched by tokens (or parts of it replaced
+# by tokens) to, say indicate beginnings and endings of indented
+# or quoted blocks that are difficult to capture with an EBNF-parser.
+#
+######################################################################
 
 
 def make_token(token: str, argument: str = '') -> str:
@@ -60,12 +122,7 @@ def make_token(token: str, argument: str = '') -> str:
     return BEGIN_TOKEN + token + TOKEN_DELIMITER + argument + END_TOKEN
 
 
-def nil_preprocessor(text: str) -> str:
-    """A preprocessor that does nothing, i.e. just returns the input."""
-    return text
-
-
-def pp_tokenized(tokenized: str) -> str:
+def prettyprint_tokenized(tokenized: str) -> str:
     """Returns a pretty-printable version of a document that contains tokens."""
     return tokenized.replace('\x1b', '<').replace('\x1c', '|').replace('\x1d', '>')
@@ -98,6 +155,7 @@ def tokenized_to_original_mapping(tokenized_source: str) -> SourceMap:
     positions, offsets = [0], [0]
     o = 0
     i = tokenized_source.find(BEGIN_TOKEN)
+    e = -1
     while i >= 0:
         d = tokenized_source.find(TOKEN_DELIMITER, i)
         e = tokenized_source.find(END_TOKEN, i)
@@ -114,9 +172,9 @@ def tokenized_to_original_mapping(tokenized_source: str) -> SourceMap:
     assert len(positions) == len(offsets), '\n' + str(positions) + '\n' + str(offsets)
     assert positions[0] == 0
     assert all(positions[i] < positions[i + 1] for i in range(len(positions) - 1))
-    assert all(offsets[i] >= offsets[i + 1] for i in range(len(offsets) - 1))
+    assert all(offsets[i] > offsets[i + 1] for i in range(len(offsets) - 2))
 
-    return SourceMap(positions, offsets, len(positions))
+    return SourceMap(positions, offsets)
 
 
 def source_map(position: int, srcmap: SourceMap) -> int:
@@ -136,5 +194,17 @@ def source_map(position: int, srcmap: SourceMap) -> int:
         return min(position + srcmap.offsets[i - 1], srcmap.positions[i] + srcmap.offsets[i])
     raise ValueError
 
-# TODO: allow preprocessors to return their own source map (really a map or a function (easier)?)
-# TODO: apply source maps in sequence.
+
+def with_source_mapping(result: PreprocessorResult) -> Tuple[str, SourceMapFunc]:
+    """
+    Normalizes preprocessors results, by adding a mapping if a preprocessor
+    only returns the transformed source code and no mapping by itself. It is
+    assumed that in this case the preprocessor has just enriched the source
+    code with tokens, so that a source mapping can be derived automatically
+    with `tokenized_to_original_mapping` (see above).
+    """
+    if isinstance(result, str):
+        srcmap = tokenized_to_original_mapping(result)
+        mapping_func = functools.partial(source_map, srcmap=srcmap)
+        return result, mapping_func
+    return result
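Taken together, the new functions compose like this. A small usage sketch against the API introduced above (the two toy preprocessors and the sample text are made up for illustration):

    from DHParser.preprocess import chain_preprocessors, with_source_mapping

    def shout_pp(text: str):
        # supplies its own (text, mapping) pair; positions are unchanged
        return text.upper(), lambda i: i

    def bang_pp(text: str):
        # prepends two characters and maps positions back accordingly
        return '!!' + text, lambda i: max(i - 2, 0)

    # a single preprocessor result, normalized to a (text, mapping) pair;
    # tuples pass through with_source_mapping() untouched:
    text, mapping = with_source_mapping(shout_pp('dummy source'))

    # two preprocessors merged into one; the combined mapping applies the
    # individual reverse-mappings in reversed order:
    combined = chain_preprocessors(shout_pp, bang_pp)
    processed, mapping = combined('dummy source')
    print(processed)    # -> !!DUMMY SOURCE
    print(mapping(2))   # -> 0: the two inserted '!' map back onto position 0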
DHParser/transform.py

@@ -326,7 +326,7 @@ def replace_by_child(context: List[Node], criteria: CriteriaType=single_child):
 @transformation_factory(int, str, Callable)
-def content_from_child(context: List[None], criteria: CriteriaType=single_child):
+def content_from_child(context: List[Node], criteria: CriteriaType=single_child):
     """
     Reduces a node, by transferring the result of the first of its
     immediate descendants that meets the `criteria` to this node,
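For instance, as the regenerated test reports below show, such a content-transferring transformation turns

    (LAT_WORT
        (:RegExp
            "facitergula"
        )
    )

into the flatter

    (LAT_WORT
        "facitergula"
    )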
examples/MLW/grammar_tests/REPORT/01_test_regexes.md

@@ -77,7 +77,7 @@ Match-test "m2"
         ""
         ""
       )
-      (:RE
+      (:Whitespace
         "// Kommentar"
       )
     )
@@ -253,19 +253,6 @@ Match-test "m4"
         )
       )
     )
-    (LZ
-      (:RegExp
-        ""
-        ""
-      )
-      (:RegExp
-        "// Kommentar"
-      )
-      (:RegExp
-        ""
-        ""
-      )
-    )
 )
 
 Match-test "m5"
@@ -304,7 +291,7 @@ Match-test "m5"
         ""
         ""
       )
-      (:RE
+      (:Whitespace
         "// Kommentar"
       )
     )
@@ -546,12 +533,6 @@ Match-test "m3"
         "// Kommentar"
       )
     )
-    (LZ
-      (:RegExp
-        ""
-        ""
-      )
-    )
 )
 
 Match-test "m4"
@@ -571,13 +552,6 @@ Match-test "m4"
         ""
       )
     )
-    (LZ
-      (:RegExp
-        ""
-        ""
-        ""
-      )
-    )
 )
 
 Fail-test "f1"
examples/MLW/grammar_tests/REPORT/02_test_lemmaposition.md

@@ -14,9 +14,7 @@ Match-test "1"
 (Lemma
     (LemmaWort
         (LAT_WORT
-            (:RegExp
-                "facitergula"
-            )
+            "facitergula"
         )
     )
 )
@@ -44,9 +42,7 @@ Match-test "1"
 ### AST
 (LemmaVarianten
     (LAT_WORT
-        (:RegExp
-            "fascitergula"
-        )
+        "fascitergula"
     )
     (:ZeroOrMore
         (:Series
@@ -59,9 +55,7 @@ Match-test "1"
             )
         )
         (LAT_WORT
-            (:RegExp
-                "facietergula"
-            )
+            "facietergula"
         )
     )
     (:Series
@@ -74,9 +68,7 @@ Match-test "1"
             )
         )
         (LAT_WORT
-            (:RegExp
-                "facistergula"
-            )
+            "facistergula"
         )
     )
     (:Series
@@ -89,9 +81,7 @@ Match-test "1"
             )
         )
         (LAT_WORT
-            (:RegExp
-                "farcutergula"
-            )
+            "farcutergula"
         )
     )
 )
@@ -106,9 +96,7 @@ Match-test "2"
 ### AST
 (LemmaVarianten
     (LAT_WORT
-        (:RegExp
-            "fascitergula"
-        )
+        "fascitergula"
     )
 )
@@ -138,20 +126,8 @@ Match-test "3"
         )
     )
     (Zusatz
-        (:Series
-            (:Token
-                "{"
-            )
-            (DEU_WORT
-                (DEU_KLEIN
-                    (:RegExp
-                        "sim."
-                    )
-                )
-            )
-            (:Token
-                "}"
-            )
-        )
+        (DEU_WORT
+            "sim."
+        )
     )
 )
@@ -166,60 +142,24 @@ Match-test "4"
 ### AST
 (LemmaVarianten
     (LAT_WORT
-        (:RegExp
-            "fascitergula"
-        )
+        "fascitergula"
     )
-    (:ZeroOrMore
-        (:Series
-            (:Token
-                (:RegExp
-                    ","
-                )
-                (:Whitespace
-                    " "
-                )
-            )
-            (LAT_WORT
-                (:RegExp
-                    "facietergula"
-                )
-            )
-        )
-        (:Series
-            (:Token
-                (:RegExp
-                    ","
-                )
-                (:Whitespace
-                    " "
-                )
-            )
-            (LAT_WORT
-                (:RegExp
-                    "fascistergula"
-                )
-                (:Whitespace
-                    " "
-                )
-            )
-            (Zusatz
-                (:Series
-                    (:Token
-                        "{"
-                    )
-                    (DEU_WORT
-                        (DEU_KLEIN
-                            (:RegExp
-                                "sim."
-                            )
-                        )
-                    )
-                    (:Token
-                        "}"
-                    )
-                )
-            )
-        )
-    )
+    (LAT_WORT
+        "facietergula"
+    )
+    (LemmaVariante
+        (LAT_WORT
+            (:RegExp
+                "fascistergula"
+            )
+            (:Whitespace
+                " "
+            )
+        )
+        (Zusatz
+            (DEU_WORT
+                "sim."
+            )
+        )
+    )
 )
@@ -253,42 +193,16 @@ Match-test "1"
 ### AST
 (LemmaPosition
-    (:Token
-        (:RegExp
-            "LEMMA"
-        )
-        (:Whitespace
-            " "
-        )
-    )
     (Lemma
         (LemmaWort
             (LAT_WORT
-                (:RegExp
-                    "facitergula"
-                )
+                "facitergula"
             )
         )
     )
-    (ZWW
-        (ZEILENSPRUNG
-            (:RegExp
-                ""
-                ""
-            )
-        )
-        (LZ
-            (:RegExp
-                ""
-                ""
-            )
-        )
-    )
     (LemmaVarianten
         (LAT_WORT
-            (:RegExp
-                "fascitergula"
-            )
+            "fascitergula"
         )
         (:ZeroOrMore
             (:Series
@@ -301,9 +215,7 @@ Match-test "1"
                 )
             )
             (LAT_WORT
-                (:RegExp
-                    "facietergula"
-                )
+                "facietergula"
             )
         )
         (:Series
@@ -316,9 +228,7 @@ Match-test "1"
                 )
             )
             (LAT_WORT
-                (:RegExp
-                    "facistergula"
-                )
+                "facistergula"
            )
         )
         (:Series
@@ -340,20 +250,8 @@ Match-test "1"
             )
         )
         (Zusatz
-            (:Series
-                (:Token
-                    "{"
-                )
-                (DEU_WORT
-                    (DEU_KLEIN
-                        (:RegExp
-                            "sim."
-                        )
-                    )
-                )
-                (:Token
-                    "}"
-                )
-            )
+            (DEU_WORT
+                "sim."
+            )
         )
     )
@@ -361,49 +259,15 @@ Match-test "1"
     )
 )
 (GrammatikPosition
-    (ZWW
-        (ZEILENSPRUNG
-            (:RegExp
-                ""
-                ""
-            )
-        )
-        (LZ
-            (:RegExp
-                ""
-                ""
-            )
-        )
-    )
-    (:Token
-        "GRAMMATIK"
-    )
-    (LZ
-        (:RegExp
-            ""
-            ""
-        )
-    )
-    (Grammatik
-        (wortart
-            "nomen"
-        )
-        (ABS
-            "; "
-        )
-        (flexion
-            (FLEX
-                (:RegExp
-                    "-ae"
-                )
-                (:Whitespace
-                    " "
-                )
-            )
-        )
-        (genus
-            "f."
-        )
-    )
+    (wortart
+        "nomen"
+    )
+    (ABS
+        "; "
+    )
+    (flexion
+        (FLEX
+            "-ae"
+        )
+        (:Whitespace
+            " "
+        )
+    )
+    (genus
+        "f."
+    )
 )
\ No newline at end of file
test/test_preprocess.py

@@ -22,10 +22,13 @@ limitations under the License.
 # import sys
 # sys.path.append('../')
 
+from functools import partial
+
 from DHParser.dsl import grammar_provider
 from DHParser.preprocess import make_token, tokenized_to_original_mapping, source_map, \
-    BEGIN_TOKEN, END_TOKEN, TOKEN_DELIMITER
-from DHParser.toolkit import lstrip_docstring
+    BEGIN_TOKEN, END_TOKEN, TOKEN_DELIMITER, SourceMapFunc, SourceMap, chain_preprocessors
+from DHParser.toolkit import lstrip_docstring, typing
+from typing import Tuple
 
 
 class TestMakeToken:
@@ -60,7 +63,8 @@ class TestSourceMapping:
         assert len(positions) == len(offsets)
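A sketch of an additional test in the spirit of this commit (hypothetical, not part of the committed test suite): several destination characters inserted by a preprocessor should all map back onto the same source position, even across chained preprocessors.

    def prefix_pp(text: str):
        # inserts one character up front and supplies the reverse mapping
        return '#' + text, lambda i: max(i - 1, 0)

    class TestChainedPreprocessors:
        def test_inserted_chars_map_onto_same_source(self):
            text, mapping = chain_preprocessors(prefix_pp, prefix_pp)('abc')
            assert text == '##abc'
            # both inserted characters and the first original character
            # map back onto position 0 of the original source:
            assert all(mapping(i) == 0 for i in range(3))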