Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
badw-it
DHParser
Commits
a985e61e
Commit
a985e61e
authored
May 23, 2021
by
Eckhart Arnold
Browse files
preprocess.py: algorithm for includes reworked
parent
d4e21b3e
Changes
5
Hide whitespace changes
Inline
Side-by-side
DHParser/error.py
View file @
a985e61e
...
...
@@ -48,9 +48,9 @@ column-number
"""
import
os
from
typing
import
Iterable
,
Iterator
,
Union
,
List
,
Any
,
Sequence
,
Tuple
from
typing
import
Iterable
,
Iterator
,
Union
,
List
,
Optional
,
Sequence
,
Tuple
from
DHParser.preprocess
import
SourceMapFunc
,
neutral_mapping
from
DHParser.preprocess
import
SourceMapFunc
,
SourceLocation
from
DHParser.stringview
import
StringView
from
DHParser.toolkit
import
linebreaks
,
line_col
,
is_filename
...
...
@@ -195,8 +195,6 @@ class Error:
:ivar orig_doc: the name or path or url of the original source file to
which ``orig_pos`` is related. This is relevant, if the preprocessed
document has been plugged together from several source files.
:ivar orig_offset: the offset of the included ``orig_doc`` within the
outermost including document.
:ivar line: the line number where the error occurred in the original text.
Lines are counted from 1 onward.
:ivar column: the column where the error occurred in the original text.
...
...
@@ -211,7 +209,7 @@ class Error:
__slots__
=
[
'message'
,
'code'
,
'_pos'
,
'line'
,
'column'
,
'length'
,
'end_line'
,
'end_column'
,
'related'
,
'orig_pos'
,
'orig_doc'
,
'orig_offset'
,
'relatedUri'
]
'relatedUri'
]
def
__init__
(
self
,
message
:
str
,
pos
:
int
,
code
:
ErrorCode
=
ERROR
,
line
:
int
=
-
1
,
column
:
int
=
-
1
,
length
:
int
=
1
,
...
...
@@ -229,7 +227,6 @@ class Error:
self
.
code
=
code
# type: ErrorCode
self
.
orig_pos
=
orig_pos
# type: int
self
.
orig_doc
=
orig_doc
# type: str
self
.
orig_offset
=
0
# type: int
self
.
line
=
line
# type: int
self
.
column
=
column
# type: int
# support for Language Server Protocol Diagnostics
...
...
@@ -382,7 +379,7 @@ def only_errors(messages: Iterable[Error], level: int = ERROR) -> Iterator[Error
def
adjust_error_locations
(
errors
:
List
[
Error
],
original_text
:
Union
[
StringView
,
str
],
source_mapping
:
SourceMapFunc
=
ne
utral_mapping
):
source_mapping
:
Optional
[
SourceMapFunc
]
=
No
ne
):
"""Adds (or adjusts) line and column numbers of error messages inplace.
Args:
...
...
@@ -406,17 +403,18 @@ def adjust_error_locations(errors: List[Error],
return
1
,
c
-
base_c
+
1
line_breaks
=
linebreaks
(
original_text
)
if
not
source_mapping
:
source_mapping
=
lambda
pos
:
SourceLocation
(
''
,
line_breaks
,
pos
)
for
err
in
errors
:
assert
err
.
pos
>=
0
err
.
orig_doc
,
err
.
orig_offset
,
err
.
orig_pos
=
source_mapping
(
err
.
pos
)
err
.
line
,
err
.
column
=
relative_lc
(
line_breaks
,
err
.
orig_pos
,
err
.
orig_offset
)
err
.
orig_doc
,
lbreaks
,
err
.
orig_pos
=
source_mapping
(
err
.
pos
)
err
.
line
,
err
.
column
=
line_col
(
line_breaks
,
err
.
orig_pos
)
# adjust length in case it exceeds the text size. As this is non-fatal
# it should be adjusted rather than an error raised to avoid
# unnecessary special-case treatments in other places
if
err
.
orig_pos
+
err
.
length
>
len
(
err
.
orig_doc
):
err
.
length
=
len
(
err
.
orig_doc
)
-
err
.
orig_pos
err
.
end_line
,
err
.
end_column
=
relative_lc
(
line_breaks
,
err
.
orig_pos
+
err
.
length
,
err
.
orig_offset
)
err
.
end_line
,
err
.
end_column
=
line_col
(
lbreaks
,
err
.
orig_pos
+
err
.
length
)
# def canonical_error_strings(errors: List[Error], source_file_name: str = '') -> List[str]:
# """Returns the list of error strings in canonical form that can be parsed by most
...
...
DHParser/preprocess.py
View file @
a985e61e
...
...
@@ -31,9 +31,9 @@ cannot completely be described entirely with context-free grammars.
import
bisect
import
functools
import
os
from
typing
import
Union
,
Optional
,
Callable
,
Tuple
,
NamedTuple
,
List
,
Any
from
typing
import
Union
,
Optional
,
Callable
,
Tuple
,
NamedTuple
,
List
,
Dict
,
Any
from
DHParser.toolkit
import
re
,
dataclasse
s
from
DHParser.toolkit
import
re
,
linebreak
s
__all__
=
(
'RX_TOKEN_NAME'
,
...
...
@@ -50,7 +50,6 @@ __all__ = ('RX_TOKEN_NAME',
'nil_preprocessor'
,
'chain_preprocessors'
,
'prettyprint_tokenized'
,
'neutral_mapping'
,
'tokenized_to_original_mapping'
,
'source_map'
,
'with_source_mapping'
,
...
...
@@ -74,17 +73,28 @@ RX_TOKEN_ARGUMENT = re.compile(r'[^\x1b\x1c\x1d]*')
RX_TOKEN
=
re
.
compile
(
r
'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d'
)
@
dataclasses
.
dataclass
class
SourceMap
:
source_name
:
str
# name or path or uri of the original source file
positions
:
List
[
int
]
# a list of locations
offsets
:
List
[
int
]
# the corresponding offsets to be added from these locations onward
class
IncludeInfo
(
NamedTuple
):
begin
:
int
length
:
int
file_name
:
str
class
SourceMap
(
NamedTuple
):
source_name
:
str
# name or path or uri of the original source file
positions
:
List
[
int
]
# a list of locations
offsets
:
List
[
int
]
# the corresponding offsets to be added from these locations onward
file_names
:
List
[
str
]
# list of file_names to which the source locations relate
lbreaks_dict
:
Dict
[
str
,
List
[
int
]]
# line breaks of the included texts
def
has_includes
(
sm
:
SourceMap
)
->
bool
:
return
any
(
fname
!=
sm
.
source_name
for
fname
in
sm
.
file_names
)
class
SourceLocation
(
NamedTuple
):
source_name
:
str
# the file name (or path or uri) of the source code
source_offset
:
int
# the offset of this file within the complet
e source
text
pos
:
int
# a position within this file
source_name
:
str
# the file name (or path or uri) of the source code
lbreaks
:
List
[
int
]
# positions of the line-breaks in th
e source
file
pos
:
int
# a position within this file
SourceMapFunc
=
Union
[
Callable
[[
int
],
SourceLocation
],
...
...
@@ -96,23 +106,7 @@ class Preprocessed(NamedTuple):
back_mapping
:
SourceMapFunc
@
dataclasses
.
dataclass
class
IncludeMap
(
SourceMap
):
file_names
:
List
[
str
]
# list of file_names to which the source locations relate
def
has_includes
(
self
)
->
bool
:
return
any
(
fname
!=
self
.
source_name
for
fname
in
self
.
file_names
)
class
IncludeInfo
(
NamedTuple
):
begin
:
int
length
:
int
file_name
:
str
PreprocessorResult
=
Union
[
str
,
Preprocessed
]
FindIncludeFunc
=
Union
[
Callable
[[
str
,
int
],
IncludeInfo
],
# (document: str, start: int)
functools
.
partial
]
PreprocessorFunc
=
Union
[
Callable
[[
str
,
str
],
PreprocessorResult
],
# text: str, filename: str
...
...
@@ -130,7 +124,8 @@ def nil_preprocessor(source_text: str, source_name: str) -> Preprocessed:
"""
A preprocessor that does nothing, i.e. just returns the input.
"""
return
Preprocessed
(
source_text
,
lambda
i
:
SourceLocation
(
source_name
,
0
,
i
))
lbreaks
=
linebreaks
(
source_text
)
return
Preprocessed
(
source_text
,
lambda
i
:
SourceLocation
(
source_name
,
lbreaks
,
i
))
def
_apply_mappings
(
position
:
int
,
mappings
:
List
[
SourceMapFunc
])
->
SourceLocation
:
...
...
@@ -140,10 +135,10 @@ def _apply_mappings(position: int, mappings: List[SourceMapFunc]) -> SourceLocat
position within a preprocessed source text and mappings should therefore
be a list of reverse-mappings in reversed order.
"""
filename
=
''
filename
,
lbreaks
=
''
,
[]
for
mapping
in
mappings
:
filename
,
offset
,
position
=
mapping
(
position
)
return
SourceLocation
(
filename
,
offset
,
position
)
filename
,
lbreaks
,
position
=
mapping
(
position
)
return
SourceLocation
(
filename
,
lbreaks
,
position
)
def
_apply_preprocessors
(
source_text
:
str
,
source_name
:
str
,
...
...
@@ -228,12 +223,6 @@ def strip_tokens(tokenized: str) -> str:
#######################################################################
def
neutral_mapping
(
pos
:
int
)
->
SourceLocation
:
'''Maps source locations on itself and sets the source file name
to the empty string.'''
return
SourceLocation
(
''
,
0
,
pos
)
def
tokenized_to_original_mapping
(
tokenized_text
:
str
,
source_name
:
str
=
'UNKNOWN_FILE'
)
->
SourceMap
:
"""
Generates a source map for mapping positions in a text that has
...
...
@@ -271,7 +260,9 @@ def tokenized_to_original_mapping(tokenized_text: str, source_name: str='UNKNOWN
# specific condition for preprocessor tokens
assert
all
(
offsets
[
i
]
>
offsets
[
i
+
1
]
for
i
in
range
(
len
(
offsets
)
-
2
))
return
SourceMap
(
source_name
,
positions
,
offsets
)
lbreaks
=
linebreaks
(
tokenized_text
)
L
=
len
(
positions
)
return
SourceMap
(
source_name
,
positions
,
offsets
,
[
source_name
]
*
L
,
{
source_name
:
lbreaks
})
def
source_map
(
position
:
int
,
srcmap
:
SourceMap
)
->
SourceLocation
:
...
...
@@ -281,13 +272,15 @@ def source_map(position: int, srcmap: SourceMap) -> SourceLocation:
:param position: the position in the processed text
:param srcmap: the source map, i.e. a mapping of locations to offset values
and source texts.
:returns: the mapped position
"""
i
=
bisect
.
bisect_right
(
srcmap
.
positions
,
position
)
if
i
:
source_name
=
srcmap
.
file_names
[
i
-
1
]
return
SourceLocation
(
srcmap
.
source_name
,
0
,
source_name
,
srcmap
.
lbreaks_dict
[
source_name
]
,
min
(
position
+
srcmap
.
offsets
[
i
-
1
],
srcmap
.
positions
[
i
]
+
srcmap
.
offsets
[
i
]))
raise
ValueError
...
...
@@ -366,12 +359,12 @@ def gen_find_include_func(rx: Union[str, Any],
def
generate_include_map
(
source_name
:
str
,
source_text
:
str
,
find_next_include
:
FindIncludeFunc
)
->
Tuple
[
Includ
eMap
,
str
]:
find_next_include
:
FindIncludeFunc
)
->
Tuple
[
Sourc
eMap
,
str
]:
file_names
:
set
=
set
()
def
generate_map
(
source_name
,
source_text
,
find_next
)
->
Tuple
[
Includ
eMap
,
str
]:
def
generate_map
(
source_name
,
source_text
,
find_next
)
->
Tuple
[
Sourc
eMap
,
str
]:
nonlocal
file_names
map
=
Includ
eMap
(
source_name
,
[
0
],
[
0
],
[
source_name
])
map
=
Sourc
eMap
(
source_name
,
[
0
],
[
0
],
[
source_name
]
,
{
source_name
:
linebreaks
(
source_text
)}
)
result
=
[]
if
source_name
in
file_names
:
...
...
@@ -393,19 +386,21 @@ def generate_include_map(source_name: str,
with
open
(
include_name
,
'r'
,
encoding
=
'utf-8'
)
as
f
:
included_text
=
f
.
read
()
inner_map
,
inner_text
=
generate_map
(
include_name
,
included_text
,
find_next
)
inner_map
.
positions
=
[
pos
+
result_pointer
for
pos
in
inner_map
.
positions
]
inner_map
.
offsets
=
[
offset
-
result_pointer
for
offset
in
inner_map
.
offsets
]
assert
len
(
inner_map
.
positions
)
==
len
(
inner_map
.
offsets
)
==
len
(
inner_map
.
file_names
)
for
i
in
range
(
len
(
inner_map
.
positions
)):
inner_map
.
positions
[
i
]
+=
result_pointer
inner_map
.
offsets
[
i
]
-=
result_pointer
if
source_delta
==
0
:
map
.
file_names
=
map
.
file_names
[:
-
1
]
+
inner_map
.
file_names
[:
-
1
]
map
.
positions
=
map
.
positions
[:
-
1
]
+
inner_map
.
positions
[:
-
1
]
map
.
offsets
=
map
.
offsets
[:
-
1
]
+
inner_map
.
offsets
[:
-
1
]
result
.
append
(
inner_text
)
map
.
file_names
.
pop
()
map
.
positions
.
pop
()
map
.
offsets
.
pop
()
else
:
result
.
append
(
source_text
[
source_pointer
-
source_delta
:
source_pointer
])
map
.
file_names
+=
inner_map
.
file_names
[:
-
1
]
map
.
positions
+=
inner_map
.
positions
[:
-
1
]
map
.
offsets
+=
inner_map
.
offsets
[:
-
1
]
result
.
append
(
inner_text
)
map
.
file_names
.
extend
(
inner_map
.
file_names
[:
-
1
])
map
.
positions
.
extend
(
inner_map
.
positions
[:
-
1
])
map
.
offsets
.
extend
(
inner_map
.
offsets
[:
-
1
])
map
.
lbreaks_dict
.
update
(
inner_map
.
lbreaks_dict
)
result
.
append
(
inner_text
)
inner_length
=
len
(
inner_text
)
result_pointer
+=
inner_length
map
.
file_names
.
append
(
source_name
)
...
...
@@ -422,19 +417,20 @@ def generate_include_map(source_name: str,
map
.
offsets
.
append
(
source_offset
)
map
.
file_names
.
append
(
source_name
)
file_names
.
remove
(
source_name
)
# map.file_offsets = [-offset for offset in map.offsets] # only for debugging!
return
map
,
''
.
join
(
result
)
return
generate_map
(
source_name
,
source_text
,
find_next_include
)
def
srcmap_includes
(
position
:
int
,
inclmap
:
Includ
eMap
)
->
SourceLocation
:
def
srcmap_includes
(
position
:
int
,
inclmap
:
Sourc
eMap
)
->
SourceLocation
:
i
=
bisect
.
bisect_right
(
inclmap
.
positions
,
position
)
if
i
:
offset
=
inclmap
.
offset
s
[
i
-
1
]
source_name
=
inclmap
.
file_name
s
[
i
-
1
]
return
SourceLocation
(
inclmap
.
file_names
[
i
-
1
]
,
-
offset
,
position
+
offset
)
source_name
,
inclmap
.
lbreaks_dict
[
source_name
]
,
position
+
inclmap
.
offsets
[
i
-
1
]
)
raise
ValueError
...
...
DHParser/syntaxtree.py
View file @
a985e61e
...
...
@@ -592,7 +592,7 @@ from typing import Callable, cast, Iterator, Sequence, List, Set, Union, \
from
DHParser.configuration
import
get_config_value
,
ALLOWED_PRESET_VALUES
from
DHParser.error
import
Error
,
ErrorCode
,
ERROR
,
PARSER_STOPPED_BEFORE_END
,
\
adjust_error_locations
from
DHParser.preprocess
import
SourceMapFunc
,
neutral_mapping
from
DHParser.preprocess
import
SourceMapFunc
,
SourceLocation
from
DHParser.stringview
import
StringView
# , real_indices
from
DHParser.toolkit
import
re
,
cython
,
linebreaks
,
line_col
,
JSONnull
,
\
validate_XML_attribute_value
,
fix_XML_attribute_value
,
lxml_XML_attribute_value
,
\
...
...
@@ -2688,7 +2688,7 @@ class RootNode(Node):
def
__init__
(
self
,
node
:
Optional
[
Node
]
=
None
,
source
:
Union
[
str
,
StringView
]
=
''
,
source_mapping
:
SourceMapFunc
=
ne
utral_mapping
):
source_mapping
:
Optional
[
SourceMapFunc
]
=
No
ne
):
super
().
__init__
(
'__not_yet_ready__'
,
''
)
self
.
errors
=
[]
# type: List[Error]
self
.
error_nodes
=
dict
()
# type: Dict[int, List[Error]] # id(node) -> error list
...
...
@@ -2696,7 +2696,11 @@ class RootNode(Node):
self
.
error_flag
=
0
# info on source code (to be carried along all stages of tree-processing)
self
.
source
=
source
# type: str
self
.
source_mapping
=
source_mapping
# type: SourceMapFunc
if
source_mapping
is
None
:
line_breaks
=
linebreaks
(
source
)
self
.
source_mapping
=
lambda
pos
:
SourceLocation
(
''
,
line_breaks
,
pos
)
else
:
self
.
source_mapping
=
source_mapping
# type: SourceMapFunc
self
.
lbreaks
=
linebreaks
(
source
)
# List[int]
# customization for XML-Representation
self
.
inline_tags
=
set
()
# type: Set[str]
...
...
examples/LaTeX/LaTeXParser.py
View file @
a985e61e
...
...
@@ -49,7 +49,7 @@ from DHParser import start_logging, suspend_logging, resume_logging, is_filename
positions_of
,
replace_tag_names
,
add_attributes
,
delimit_children
,
merge_connected
,
\
has_attr
,
has_parent
,
ThreadLocalSingletonFactory
,
Error
,
canonical_error_strings
,
\
has_errors
,
apply_unless
,
WARNING
,
ERROR
,
FATAL
,
EMPTY_NODE
,
TreeReduction
,
CombinedParser
,
\
Preprocessed
,
neutral_mapping
,
preprocess_includes
,
gen_find_include_func
,
flatten_sxpr
,
\
Preprocessed
,
preprocess_includes
,
gen_find_include_func
,
flatten_sxpr
,
\
replace_content_with
...
...
tests/test_preprocess.py
View file @
a985e61e
...
...
@@ -130,7 +130,7 @@ def preprocess_comments(src: str, src_name: str) -> Tuple[str, SourceMapFunc]:
positions
.
append
(
pos
)
offsets
.
append
(
offsets
[
-
1
])
return
'
\n
'
.
join
(
lines
),
\
partial
(
source_map
,
srcmap
=
SourceMap
(
'
FILE_NAME_
DUMMY'
,
positions
,
offsets
))
partial
(
source_map
,
srcmap
=
SourceMap
(
'DUMMY'
,
positions
,
offsets
,
[
'DUMMY'
]
*
len
(
positions
),
{
'DUMMY'
:
[]}
))
class
TestTokenParsing
:
...
...
@@ -284,7 +284,7 @@ class TestIncludes:
name
,
offset
,
k
=
mapping
(
i
)
# print(i, k, name)
txt
=
main
if
name
==
'main.txt'
else
sub
assert
text
[
i
]
==
txt
[
k
],
f
'
{
i
}
:
{
text
[
i
]
}
!=
{
txt
[
k
]
}
in
{
name
}
'
assert
text
[
i
]
==
txt
[
k
],
f
'
{
i
}
,
{
k
}
:
{
text
[
i
]
}
!=
{
txt
[
k
]
}
in
{
name
}
'
perform
(
'include(sub.txt)xyz'
,
'abc'
)
perform
(
'012include(sub.txt)xyz'
,
'abc'
)
...
...
@@ -310,8 +310,10 @@ class TestIncludes:
assert
text
==
substrings
[
'main'
]
# print(text)
for
i
in
range
(
len
(
text
)):
name
,
offset
,
k
=
mapping
(
i
)
name
,
lbreaks
,
k
=
mapping
(
i
)
txt
=
ensemble
[
name
]
# print(name, substrings[name], text[offset:offset + len(substrings[name])])
# assert text[offset:offset + len(substrings[name])] == substrings[name]
# print(name, txt, i, k)
assert
text
[
i
]
==
txt
[
k
],
f
'
{
i
}
:
{
text
[
i
]
}
!=
{
txt
[
k
]
}
in
{
name
}
'
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment