badw-it / DHParser

Commit d18f157c
Authored Dec 26, 2017 by Eckhart Arnold

    - preprocessing tests + some bug fixes

parent f2162cfb, 22 changed files
DHParser/__init__.py

...
@@ -18,19 +18,20 @@ implied. See the License for the specific language governing
 permissions and limitations under the License.
 """
 # Flat namespace for the DHParser Package. Is this a good idea...?
 from .error import *
-from .dsl import *
-from .ebnf import *
-from .parsers import *
-from .testing import *
+from .parse import *
+from .preprocess import *
+from .stringview import *
+from .syntaxtree import *
+from .testing import *
+from .toolkit import *
+from .transform import *
 from .versionnumber import __version__

 __author__ = "Eckhart Arnold <arnold@badw.de>"
 __copyright__ = "http://www.apache.org/licenses/LICENSE-2.0"
-# __all__ = ['toolkit', 'stringview', 'error', 'syntaxtree', 'parser', 'transform', 'ebnf', 'dsl', 'testing',
-#            'versionnumber']  # flat namespace
+# __all__ = ['toolkit', 'stringview', 'error', 'syntaxtree', 'preprocess', 'parse',
+#            'transform', 'ebnf', 'dsl', 'testing', 'versionnumber']
DHParser/dsl.py

...
@@ -20,18 +20,20 @@ compilation of domain specific languages based on an EBNF-grammar.
 """

 import os
-from typing import Any, cast, List, Tuple, Union, Iterator, Iterable

 from DHParser.ebnf import EBNFCompiler, grammar_changed, \
     get_ebnf_preprocessor, get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, \
     PreprocessorFactoryFunc, ParserFactoryFunc, TransformerFactoryFunc, CompilerFactoryFunc
 from DHParser.error import Error, is_error, has_errors, only_errors
-from DHParser.parsers import Grammar, Compiler, compile_source, nil_preprocessor, PreprocessorFunc
+from DHParser.parse import Grammar, Compiler, compile_source
+from DHParser.preprocess import nil_preprocessor, PreprocessorFunc
 from DHParser.syntaxtree import Node, TransformationFunc
-from DHParser.toolkit import logging, load_if_file, is_python_code, compile_python_object, \
-    re, typing
+from DHParser.toolkit import logging, load_if_file, is_python_code, compile_python_object, re
+from typing import Any, cast, List, Tuple, Union, Iterator, Iterable

-__all__ = ('GrammarError',
+__all__ = ('DHPARSER_IMPORTS',
+           'GrammarError',
            'CompilationError',
            'load_compiler_suite',
            'compileDSL',
...
@@ -70,7 +72,7 @@ try:
 except ImportError:
     import re
 from DHParser import logging, is_filename, load_if_file, \\
-    Grammar, Compiler, nil_preprocessor, \\
+    Grammar, Compiler, nil_preprocessor, PreprocessorToken, \\
     Lookbehind, Lookahead, Alternative, Pop, Token, Synonym, AllOf, SomeOf, Unordered, \\
     Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, RE, Capture, \\
     ZeroOrMore, Forward, NegativeLookahead, mixin_comment, compile_source, \\
...
@@ -495,14 +497,15 @@ def compile_on_disk(source_file: str, compiler_suite="", extension=".xml") -> It
                 + "\n# ".join(str(error).split('\n')))
         print(result)
     finally:
         if f:
             f.close()
     return messages


 def recompile_grammar(ebnf_filename, force=False) -> bool:
     """
-    Recompiles an ebnf-grammar if necessary, that is, if either no
+    Re-compiles an EBNF-grammar if necessary, that is, if either no
     corresponding 'XXXXCompiler.py'-file exists or if that file is
     outdated.
...
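A minimal usage sketch for recompile_grammar() (the grammar file name is hypothetical, and reading the boolean result as a success flag is an assumption):

    from DHParser.dsl import recompile_grammar

    # 'arithmetic.ebnf' is a made-up grammar file. recompile_grammar() regenerates
    # the corresponding 'arithmeticCompiler.py' only if it is missing or outdated;
    # force=True would recompile unconditionally.
    if not recompile_grammar('arithmetic.ebnf', force=False):
        print('arithmetic.ebnf could not be compiled without errors')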
DHParser/ebnf.py

...
@@ -19,18 +19,19 @@ permissions and limitations under the License.

 import keyword
 from collections import OrderedDict
 from functools import partial
-from typing import Callable, Dict, List, Set, Tuple

 from DHParser.error import Error
-from DHParser.parsers import Grammar, mixin_comment, nil_preprocessor, Forward, RegExp, RE, \
+from DHParser.parse import Grammar, mixin_comment, Forward, RegExp, RE, \
     NegativeLookahead, Alternative, Series, Option, OneOrMore, ZeroOrMore, Token, \
-    Compiler, PreprocessorFunc
+    Compiler
+from DHParser.preprocess import nil_preprocessor, PreprocessorFunc
 from DHParser.syntaxtree import Node, TransformationFunc, WHITESPACE_PTYPE, TOKEN_PTYPE
-from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name, re, typing
+from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name, re
 from DHParser.transform import traverse, remove_brackets, \
     reduce_single_child, replace_by_single_child, remove_expendables, \
     remove_tokens, flatten, forbid, assert_content, remove_infix_operator
 from DHParser.versionnumber import __version__
+from typing import Callable, Dict, List, Set, Tuple

 __all__ = ('get_ebnf_preprocessor',
            'get_ebnf_grammar',
...
@@ -332,7 +333,7 @@ class EBNFCompiler(Compiler):
                     `alternative = a | b`
-            Now `[str(node) for node in self.rules['alternative']]`
+            Now `[node.content for node in self.rules['alternative']]`
             yields `['alternative = a | b', 'a', 'b']`

         symbols:  A mapping of symbol names to their first usage (not
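The docstring change documents the pattern that recurs throughout this commit: `str(node)` is replaced by the explicit `node.content` accessor. A minimal sketch with a hypothetical stand-in for DHParser's Node class:

    class Node:
        # hypothetical stand-in; the real Node lives in DHParser/syntaxtree.py
        def __init__(self, result):
            self.result = result

        @property
        def content(self) -> str:
            # string content of the node (and, in the real class, of its children)
            return str(self.result)

    assert Node('alternative = a | b').content == 'alternative = a | b'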
...
@@ -597,7 +598,7 @@ class EBNFCompiler(Compiler):

     def on_definition(self, node: Node) -> Tuple[str, str]:
-        rule = str(node.children[0])
+        rule = node.children[0].content

         if rule in self.rules:
             first = self.rules[rule][0]
             if not first._errors:
...
@@ -652,7 +653,7 @@ class EBNFCompiler(Compiler):

     def on_directive(self, node: Node) -> str:
-        key = str(node.children[0]).lower()
+        key = node.children[0].content.lower()
         assert key not in self.directives['tokens']

         if key not in self.REPEATABLE_DIRECTIVES:
...
@@ -674,8 +675,9 @@ class EBNFCompiler(Compiler):
             else:
                 node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
         else:
-            value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
-            if value != str(node.children[1]):  # cast(str, node.children[1].result):
+            value = node.children[1].content.strip("~")
+            # cast(str, node.children[1].result).strip("~")
+            if value != node.children[1].content:  # cast(str, node.children[1].result):
                 node.add_error("Whitespace marker '~' not allowed in definition of "
                                "%s regular expression." % key)
             if value[0] + value[-1] in {'""', "''"}:
...
@@ -688,11 +690,11 @@ class EBNFCompiler(Compiler):
             self.directives[key] = value

         elif key == 'ignorecase':
-            if str(node.children[1]).lower() not in {"off", "false", "no"}:
+            if node.children[1].content.lower() not in {"off", "false", "no"}:
                 self.re_flags.add('i')

         # elif key == 'testing':
-        #     value = str(node.children[1])
+        #     value = node.children[1].content
         #     self.directives['testing'] = value.lower() not in {"off", "false", "no"}

         elif key == 'literalws':
...
@@ -708,7 +710,7 @@ class EBNFCompiler(Compiler):

         elif key in {'tokens', 'preprocessor_tokens'}:
             tokens = self.compile(node.children[1])
-            redeclared = self.directives['tokes'] & tokens
+            redeclared = self.directives['tokens'] & tokens
             if redeclared:
                 node.add_error('Tokens %s have already been declared earlier. '
                                % str(redeclared) + 'Later declaration will be ignored',
...
@@ -752,7 +754,7 @@ class EBNFCompiler(Compiler):
         filtered_children = []
         i = 0
         for nd in node.children:
-            if nd.parser.ptype == TOKEN_PTYPE and str(nd) == "§":
+            if nd.parser.ptype == TOKEN_PTYPE and nd.content == "§":
                 mandatory_marker.append(i)
                 if i == 0:
                     nd.add_error('First item of a series should not be mandatory.',
...
@@ -774,7 +776,7 @@ class EBNFCompiler(Compiler):

     def on_factor(self, node: Node) -> str:
         assert node.children
         assert len(node.children) >= 2, node.as_sxpr()
-        prefix = str(node.children[0])  # cast(str, node.children[0].result)
+        prefix = node.children[0].content  # cast(str, node.children[0].result)
         custom_args = []   # type: List[str]

         if prefix in {'::', ':'}:
...
@@ -806,15 +808,15 @@ class EBNFCompiler(Compiler):
                 if len(nd.children) >= 1:
                     nd = nd.children[0]
                 while nd.parser.name == "symbol":
-                    symlist = self.rules.get(str(nd), [])
+                    symlist = self.rules.get(nd.content, [])
                     if len(symlist) == 2:
                         nd = symlist[1]
                     else:
                         if len(symlist) == 1:
                             nd = symlist[0].children[1]
                         break
-                if (nd.parser.name != "regexp" or str(nd)[:1] != '/'
-                        or str(nd)[-1:] != '/'):
+                if (nd.parser.name != "regexp" or nd.content[:1] != '/'
+                        or nd.content[-1:] != '/'):
                     node.add_error("Lookbehind-parser can only be used with plain RegExp-"
                                    "parsers, not with: " + nd.parser.name + nd.parser.ptype)
...
@@ -838,10 +840,6 @@ class EBNFCompiler(Compiler):
         return self.non_terminal(node, 'OneOrMore')

-    def on_regexchain(self, node) -> str:
-        raise EBNFCompilerError("Not yet implemented!")
-
     def on_group(self, node) -> str:
         raise EBNFCompilerError("Group nodes should have been eliminated by "
                                 "AST transformation!")
...
@@ -851,7 +849,7 @@ class EBNFCompiler(Compiler):
         assert len(node.children) == 1
         nd = node.children[0]
         for child in nd.children:
-            if child.parser.ptype == TOKEN_PTYPE and str(nd) == "§":
+            if child.parser.ptype == TOKEN_PTYPE and nd.content == "§":
                 node.add_error("Unordered parser lists cannot contain mandatory (§) items.")
         args = ', '.join(self.compile(child) for child in nd.children)
         if nd.parser.name == "term":
...
@@ -863,7 +861,7 @@ class EBNFCompiler(Compiler):
         return ""

     def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
-        symbol = str(node)  # ; assert result == cast(str, node.result)
+        symbol = node.content  # ; assert result == cast(str, node.result)
         if symbol in self.directives['tokens']:
             return 'PreprocessorToken("' + symbol + '")'
         else:
...
@@ -878,11 +876,12 @@ class EBNFCompiler(Compiler):

     def on_literal(self, node) -> str:
-        return 'Token(' + str(node).replace('\\', r'\\') + ')'  # return 'Token(' + ', '.merge_children([node.result]) + ')' ?
+        return 'Token(' + node.content.replace('\\', r'\\') + ')'
+        # return 'Token(' + ', '.merge_children([node.result]) + ')' ?

     def on_regexp(self, node: Node) -> str:
-        rx = str(node)
+        rx = node.content
         name = []   # type: List[str]
         if rx[0] == '/' and rx[-1] == '/':
             parser = 'RegExp('
...

DHParser/error.py

...
@@ -18,11 +18,9 @@ permissions and limitations under the License.

 import bisect
 import functools
-from typing import Iterable, Iterator, Union, Tuple, List

 from DHParser.stringview import StringView
-from DHParser.toolkit import typing
+from typing import Hashable, Iterable, Iterator, Union, Tuple, List

 __all__ = ('Error',
            'is_error',
...
@@ -71,10 +69,16 @@ class Error:

     @property
     def level_str(self):
-        """Returns a string representation of the error level, e.g. "warning".
-        """
+        """Returns a string representation of the error level, e.g. "warning"."""
         return "Warning" if is_warning(self.code) else "Error"

+    def visualize(self, document: str) -> str:
+        """Shows the line of the document and the position where the error
+        occurred."""
+        start = document.rfind('\n', 0, self.pos) + 1
+        stop = document.find('\n', self.pos)
+        return document[start:stop] + '\n' + ' ' * (self.pos - start) + '^\n'
+
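The effect of the new visualize() method can be replayed standalone; the following mirrors its body on a small sample document:

    document = 'alpha\nbeta\ngamma'
    pos = 8                                    # error position, inside 'beta'
    start = document.rfind('\n', 0, pos) + 1   # first character of the error's line
    stop = document.find('\n', pos)            # end of the error's line
    print(document[start:stop] + '\n' + ' ' * (pos - start) + '^')
    # output: the line 'beta', with a caret under the character at `pos`:
    # beta
    #   ^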
 def is_warning(code: int) -> bool:
     """Returns True, if error is merely a warning."""
...
DHParser/parsers.py → DHParser/parse.py (renamed)

...
@@ -59,26 +59,20 @@ import collections
 import copy
 import html
 import os

 from functools import partial

 from DHParser.error import Error, is_error, has_errors, linebreaks, line_col
 from DHParser.stringview import StringView, EMPTY_STRING_VIEW
 from DHParser.syntaxtree import Node, TransformationFunc, ParserBase, WHITESPACE_PTYPE, \
     TOKEN_PTYPE, ZOMBIE_PARSER
+from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME, \
+    PreprocessorFunc
 from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name, \
-    load_if_file, re, typing
+    escape_control_characters, load_if_file, re, typing
 from typing import Any, Callable, cast, Dict, List, Set, Tuple, Union, Optional

-__all__ = ('PreprocessorFunc',
-           'HistoryRecord',
+__all__ = ('HistoryRecord',
            'Parser',
            'Grammar',
-           'RX_PREPROCESSOR_TOKEN',
-           'BEGIN_TOKEN',
-           'END_TOKEN',
-           'make_token',
-           'nil_preprocessor',
            'PreprocessorToken',
            'RegExp',
            'RE',
...
@@ -117,9 +111,6 @@ __all__ = ('PreprocessorFunc',

 ########################################################################

-PreprocessorFunc = Union[Callable[[str], str], partial]
-

 LEFT_RECURSION_DEPTH = 8  # type: int
 # because of python's recursion depth limit, this value ought not to be
 # set too high. PyPy allows higher values than CPython
...
@@ -242,7 +233,7 @@ class HistoryRecord:
     def excerpt(self):
         length = len(self.node) if self.node else len(self.text)
         excerpt = str(self.node)[:min(length, 20)] if self.node else self.text[:20]
-        excerpt = excerpt.replace('\n', '\\n')
+        excerpt = escape_control_characters(excerpt)
         if length > 20:
             excerpt += '...'
         return excerpt
...
@@ -1007,27 +998,28 @@ class Grammar:
             if html and len(log) % 100 == 0:
                 log.append('\n</table>\n<table>\n' + HistoryRecord.COLGROUP)

-        if is_logging():
-            assert self.history__, \
-                "Parser did not yet run or logging was turned off when running parser!"
-            if not log_file_name:
-                name = self.__class__.__name__
-                log_file_name = name[:-7] if name.lower().endswith('grammar') else name
-            elif log_file_name.lower().endswith('.log'):
-                log_file_name = log_file_name[:-4]
-            full_history, match_history, errors_only = [], [], []
-            for record in self.history__:
-                line = record.as_html_tr() if html else str(record)
-                append_line(full_history, line)
-                if record.node and record.node.parser.ptype != WHITESPACE_PTYPE:
-                    append_line(match_history, line)
-                    if record.node.error_flag:
-                        append_line(errors_only, line)
-            write_log(full_history, log_file_name + '_full')
-            if len(full_history) > 250:
-                write_log(full_history[-200:], log_file_name + '_full.tail')
-            write_log(match_history, log_file_name + '_match')
-            write_log(errors_only, log_file_name + '_errors')
+        if not is_logging():
+            raise AssertionError("Cannot log history when logging is turned off!")
+        assert self.history__, \
+            "Parser did not yet run or logging was turned off when running parser!"
+        if not log_file_name:
+            name = self.__class__.__name__
+            log_file_name = name[:-7] if name.lower().endswith('grammar') else name
+        elif log_file_name.lower().endswith('.log'):
+            log_file_name = log_file_name[:-4]
+        full_history, match_history, errors_only = [], [], []
+        for record in self.history__:
+            line = record.as_html_tr() if html else str(record)
+            append_line(full_history, line)
+            if record.node and record.node.parser.ptype != WHITESPACE_PTYPE:
+                append_line(match_history, line)
+                if record.node.error_flag:
+                    append_line(errors_only, line)
+        write_log(full_history, log_file_name + '_full')
+        if len(full_history) > 250:
+            write_log(full_history[-200:], log_file_name + '_full.tail')
+        write_log(match_history, log_file_name + '_match')
+        write_log(errors_only, log_file_name + '_errors')


 def dsl_error_msg(parser: Parser, error_str: str) -> str:
...
@@ -1059,31 +1051,6 @@ def dsl_error_msg(parser: Parser, error_str: str) -> str:

 ########################################################################

-RX_PREPROCESSOR_TOKEN = re.compile(r'\w+')
-BEGIN_TOKEN = '\x1b'
-END_TOKEN = '\x1c'
-
-
-def make_token(token: str, argument: str = '') -> str:
-    """
-    Turns the ``token`` and ``argument`` into a special token that
-    will be caught by the `PreprocessorToken`-parser.
-
-    This function is a support function that should be used by
-    preprocessors to inject preprocessor tokens into the source text.
-    """
-    assert RX_PREPROCESSOR_TOKEN.match(token)
-    assert argument.find(BEGIN_TOKEN) < 0
-    assert argument.find(END_TOKEN) < 0
-
-    return BEGIN_TOKEN + token + argument + END_TOKEN
-
-
-def nil_preprocessor(text: str) -> str:
-    """A preprocessor that does nothing, i.e. just returns the input."""
-    return text
-

 class PreprocessorToken(Parser):
     """
     Parses tokens that have been inserted by a preprocessor.
...
@@ -1097,7 +1064,7 @@ class PreprocessorToken(Parser):

     def __init__(self, token: str) -> None:
         assert token and token.isupper()
-        assert RX_PREPROCESSOR_TOKEN.match(token)
+        assert RX_TOKEN_NAME.match(token)
         super(PreprocessorToken, self).__init__(token)

     def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
...
@@ -1121,8 +1088,7 @@ class PreprocessorToken(Parser):
                                '(Most likely due to a preprocessor bug!)')
                     return node, text[end:]
         if text[1:len(self.name) + 1] == self.name:
-            return Node(self, text[len(self.name) + 1:end]), \
-                   text[end + 1:]
+            return Node(self, text[len(self.name) + 2:end]), text[end + 1:]
         return None, text
...
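The changed slice offset follows from the new token layout defined in preprocess.py below (BEGIN_TOKEN + name + TOKEN_DELIMITER + argument + END_TOKEN): the argument now starts at len(name) + 2 rather than len(name) + 1. A standalone check with a made-up token:

    text = '\x1bINDENT\x1c    \x1drest'        # token 'INDENT' with a four-blank argument
    name = 'INDENT'
    end = text.find('\x1d')                    # position of END_TOKEN
    assert text[len(name) + 2:end] == '    '   # argument begins after name and delimiter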
...
@@ -1157,15 +1123,21 @@ class RegExp(Parser):
         return RegExp(regexp, self.name)

     def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
-        if text[0:1] != BEGIN_TOKEN:  # ESC starts a preprocessor token.
-            match = text.match(self.regexp)
-            if match:
-                end = text.index(match.end())
-                return Node(self, match.group(0), True), text[end:]
+        match = text.match(self.regexp)
+        if match:
+            capture = match.group(0)
+            end = text.index(match.end())
+            # regular expresseion must never match preprocessor-tokens!
+            # TODO: Find a better solution here, e.g. static checking/re-mangling at compile time
+            i = capture.find(BEGIN_TOKEN)
+            if i >= 0:
+                capture = capture[:i]
+                end = i
+            return Node(self, capture, True), text[end:]
         return None, text

     def __repr__(self):
-        return '/%s/' % self.regexp.pattern
+        return escape_control_characters('/%s/' % self.regexp.pattern)


 class Whitespace(RegExp):
...
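Why the rewritten RegExp.__call__ truncates its match: a greedy regular expression could otherwise swallow a preprocessor token whole. A standalone illustration (BEGIN_TOKEN is '\x1b', as defined in preprocess.py):

    import re

    BEGIN_TOKEN = '\x1b'
    text = 'word\x1bTOKEN\x1carg\x1d tail'
    capture = re.match(r'[^ ]+', text).group(0)  # greedily matches into the token
    i = capture.find(BEGIN_TOKEN)
    if i >= 0:
        capture = capture[:i]                    # cut the match off at the token boundary
    assert capture == 'word'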
DHParser/preprocess.py (new file, mode 100644)
""" preprocess.py - preprocessing of source files for DHParser
Copyright 2016 by Eckhart Arnold (arnold@badw.de)
Bavarian Academy of Sciences an Humanities (badw.de)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
"""
import
bisect
import
collections
import
functools
from
DHParser.toolkit
import
typing
,
re
from
typing
import
Union
,
Callable
__all__
=
(
'RX_TOKEN_NAME'
,
'BEGIN_TOKEN'
,
'TOKEN_DELIMITER'
,
'END_TOKEN'
,
'PreprocessorFunc'
,
'make_token'
,
'nil_preprocessor'
,
'pp_tokenized'
,
'tokenized_to_original_mapping'
,
'source_map'
)
BEGIN_TOKEN
=
'
\x1b
'
TOKEN_DELIMITER
=
'
\x1c
'
END_TOKEN
=
'
\x1d
'
RESERVED_TOKEN_CHARS
=
BEGIN_TOKEN
+
TOKEN_DELIMITER
+
END_TOKEN
RX_TOKEN_NAME
=
re
.
compile
(
r
'\w+'
)
RX_TOKEN_ARGUMENT
=
re
.
compile
(
r
'[^\x1b\x1c\x1d]*'
)
RX_TOKEN
=
re
.
compile
(
r
'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d'
)
PreprocessorFunc
=
Union
[
Callable
[[
str
],
str
],
functools
.
partial
]
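The three control characters frame a token as BEGIN_TOKEN + name + TOKEN_DELIMITER + argument + END_TOKEN, which RX_TOKEN parses back apart. A quick standalone check (token name and argument are made up):

    import re

    RX_TOKEN = re.compile(r'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d')
    m = RX_TOKEN.match('\x1bCITE\x1cp.42\x1d remaining text')
    assert m.group('name') == 'CITE' and m.group('argument') == 'p.42'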
def make_token(token: str, argument: str = '') -> str:
    """
    Turns the ``token`` and ``argument`` into a special token that
    will be caught by the `PreprocessorToken`-parser.

    This function is a support function that should be used by
    preprocessors to inject preprocessor tokens into the source text.
    """
    assert RX_TOKEN_NAME.match(token)
    assert RX_TOKEN_ARGUMENT.match(argument)

    return BEGIN_TOKEN + token + TOKEN_DELIMITER + argument + END_TOKEN
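For instance (hypothetical token name and argument; assumes this module is importable):

    from DHParser.preprocess import make_token

    token = make_token('INDENT', '    ')
    assert token == '\x1bINDENT\x1c    \x1d'   # BEGIN_TOKEN + name + TOKEN_DELIMITER + argument + END_TOKEN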
def nil_preprocessor(text: str) -> str:
    """A preprocessor that does nothing, i.e. just returns the input."""
    return text
def pp_tokenized(tokenized: str) -> str:
    """Returns a pretty-printable version of a document that contains tokens."""
    return tokenized.replace('\x1b', '<').replace('\x1c', '|').replace('\x1d', '>')
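Since the delimiters are unprintable control characters, pp_tokenized() maps them to visible brackets, e.g.:

    from DHParser.preprocess import make_token, pp_tokenized

    print(pp_tokenized(make_token('INDENT', '    ') + 'rest'))   # prints: <INDENT|    >rest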
#######################################################################
#
# Source Maps - mapping source code positions between different
# transformations of the source text
#
#######################################################################

SourceMap = collections.namedtuple('SourceMap', ['positions', 'offsets'])
def tokenized_to_original_mapping(tokenized_source: str) -> SourceMap:
    """
    Generates a source map for mapping positions in a text that has
    been enriched with token markers to their original positions.

    Args:
        tokenized_source: the source text enriched with token markers

    Returns:
        a source map, i.e. a list of positions and a list of corresponding
        offsets. The list of positions is ordered from smallest to highest.
        An offset is valid for its associated position and all following
        positions until (and excluding) the next position in the list of
        positions.
    """
    positions, offsets = [0], [0]
    o = 0
    i = tokenized_source.find(BEGIN_TOKEN)
    while i >= 0:
        d = tokenized_source.find(TOKEN_DELIMITER, i)
        e = tokenized_source.find(END_TOKEN, i)
        assert 0 <= d < e
        o -= (d - i + 2)
        positions.extend([d + 1, e + 1])
        offsets.extend([o, o - 1])
        i = tokenized_source.find(BEGIN_TOKEN, e + 1)

    # post conditions
    assert len(positions) == len(offsets), '\n' + str(positions) + '\n' + str(offsets)
    assert positions[0] == 0
    assert all(positions[i] < positions[i + 1] for i in range(len(positions) - 1))
    assert all(offsets[i] > offsets[i + 1] for i in range(len(offsets) - 1))

    return SourceMap(positions, offsets)
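A sketch of how such a map is generated (the token is made up; the exact numeric values depend on the arithmetic above):

    from DHParser.preprocess import make_token, tokenized_to_original_mapping

    tokenized = make_token('CITE', 'p.42') + ' more text'
    srcmap = tokenized_to_original_mapping(tokenized)
    print(srcmap.positions)   # ascending positions in the tokenized text
    print(srcmap.offsets)     # strictly decreasing offsets, one per position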
def source_map(position: int, srcmap: SourceMap) -> int:
    """
    Maps a position in a (pre-)processed text to its corresponding
    position in the original document according to the given source map.

    Args:
        position: the position in the processed text
        srcmap:   the source map, i.e. a mapping of locations to offset values

    Returns:
        the mapped position
    """
    i = bisect.bisect_right(srcmap[0], position)
    if i:
        return position + srcmap[1][i - 1]
    raise ValueError
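source_map() is a plain bisect lookup. With a hand-built map (values chosen only for illustration) it behaves as follows:

    from DHParser.preprocess import SourceMap, source_map

    srcmap = SourceMap(positions=[0, 10], offsets=[0, -4])
    assert source_map(5, srcmap) == 5    # offset 0 applies to positions below 10
    assert source_map(12, srcmap) == 8   # offset -4 applies from position 10 onward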
DHParser/stringview.py

-"""cstringview.pyx - a cython-version of the stringview class for speedup
-slicing strings without copying
+"""stringview.py - a string class where slices are views not copies as
+with the standard Python strings.
+
+stringview.pxd - declarations for the cython Python to C compiler
+to speed up handling of StringViews.

 Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                 Bavarian Academy of Sciences an Humanities (badw.de)
...
@@ -24,6 +27,7 @@ time) when parsing longer documents. Unfortunately, Python's `memoryview`
 does not work for unicode strings. Hence, the StringView class.
 """

 import collections
+from DHParser.toolkit import typing
...
@@ -81,7 +85,7 @@ def real_indices(begin: Optional[int],
 class StringView(collections.abc.Sized):
     """
     A rudimentary StringView class, just enough for the use cases
-    in parsers.py. The difference between a StringView and the python
+    in parse.py. The difference between a StringView and the python
     builtin strings is that StringView-objects do slicing without
     copying, i.e. slices are just a view on a section of the sliced
     string.
...
DHParser/syntaxtree.py

...
@@ -399,7 +399,6 @@ class Node(collections.abc.Sized):