Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
badw-it
DHParser
Commits
b5fd9558
Commit
b5fd9558
authored
Apr 14, 2017
by
Eckhart Arnold
Browse files
- rudimentary support for semantic validation in syntaxtree.py; more unittests; bug fixes
parent
84e7061a
Changes
6
Hide whitespace changes
Inline
Side-by-side
DHParser/EBNFcompiler.py
View file @
b5fd9558
...
...
@@ -32,7 +32,7 @@ from .parsercombinators import GrammarBase, mixin_comment, Forward, RE, Negative
Alternative
,
Sequence
,
Optional
,
Required
,
OneOrMore
,
ZeroOrMore
,
Token
,
CompilerBase
from
.syntaxtree
import
Node
,
remove_enclosing_delimiters
,
reduce_single_child
,
\
replace_by_single_child
,
TOKEN_KEYWORD
,
remove_expendables
,
remove_tokens
,
flatten
,
\
WHITESPACE_KEYWORD
forbid
,
assert_content
,
WHITESPACE_KEYWORD
__all__
=
[
'EBNFGrammar'
,
...
...
@@ -61,6 +61,7 @@ class EBNFGrammar(GrammarBase):
| [flowmarker] literal
| [flowmarker] regexp
| [flowmarker] group
| [flowmarker] regexchain
| [flowmarker] oneormore
| repetition
| option
...
...
@@ -70,9 +71,12 @@ class EBNFGrammar(GrammarBase):
retrieveop = "::" | ":" # '::' pop, ':' retrieve
group = "(" expression §")"
option = "["
expression
§"]"
regexchain = "<" expression §">" # compiles "expression" into a singular regular
expression
oneormore = "{" expression "}+"
repetition = "{" expression §"}"
option = "[" expression §"]"
link = regexp | symbol | literal # semantic restriction: symbol must evaluate to a regexp or chain
symbol = /(?!\d)\w+/~ # e.g. expression, factor, parameter_list
literal = /"(?:[^"]|\\")*?"/~ # e.g. "(", '+', 'while'
...
...
@@ -80,36 +84,39 @@ class EBNFGrammar(GrammarBase):
regexp = /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
# '~' is a whitespace-marker, if present leading or trailing
# whitespace of a regular expression will be ignored tacitly.
list_ = /\w+
\s*(?:,\s*\w+\s*)*/~
# comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
list_ = /\w+
/~ { "," /\w+/~ }
# comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
# BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an exmaple
EOF = !/./
"""
expression
=
Forward
()
source_hash__
=
"
1065c2e43262a5cb3aa438ec4d347c32
"
source_hash__
=
"
a410e1727fb7575e98ff8451dbf8f3bd
"
parser_initialization__
=
"upon instatiation"
wsp__
=
mixin_comment
(
whitespace
=
r
'\s*'
,
comment
=
r
'#.*(?:\n|$)'
)
COMMENT__
=
r
'#.*(?:\n|$)'
WSP__
=
mixin_comment
(
whitespace
=
r
'\s*'
,
comment
=
r
'#.*(?:\n|$)'
)
wspL__
=
''
wspR__
=
wsp
__
wspR__
=
WSP
__
EOF
=
NegativeLookahead
(
RE
(
'.'
,
wR
=
''
))
list_
=
RE
(
'
\\
w+
\\
s*(?:,
\\
s*
\\
w+
\\
s*)*'
)
list_
=
Sequence
(
RE
(
'
\\
w+'
),
ZeroOrMore
(
Sequence
(
Token
(
","
),
RE
(
'
\\
w+'
)))
)
regexp
=
RE
(
'~?/(?:[^/]|(?<=
\\\\
)/)*/~?'
)
literal
=
Alternative
(
RE
(
'"(?:[^"]|
\\\\
")*?"'
),
RE
(
"'(?:[^']|
\\\\
')*?'"
))
symbol
=
RE
(
'(?!
\\
d)
\\
w+'
)
link
=
Alternative
(
regexp
,
symbol
,
literal
)
option
=
Sequence
(
Token
(
"["
),
expression
,
Required
(
Token
(
"]"
)))
repetition
=
Sequence
(
Token
(
"{"
),
expression
,
Required
(
Token
(
"}"
)))
oneormore
=
Sequence
(
Token
(
"{"
),
expression
,
Token
(
"}+"
))
optio
n
=
Sequence
(
Token
(
"
[
"
),
expression
,
Required
(
Token
(
"
]
"
)))
regexchai
n
=
Sequence
(
Token
(
"
<
"
),
expression
,
Required
(
Token
(
"
>
"
)))
group
=
Sequence
(
Token
(
"("
),
expression
,
Required
(
Token
(
")"
)))
retrieveop
=
Alternative
(
Token
(
"::"
),
Token
(
":"
))
flowmarker
=
Alternative
(
Token
(
"!"
),
Token
(
"&"
),
Token
(
"§"
),
Token
(
"-!"
),
Token
(
"-&"
))
factor
=
Alternative
(
Sequence
(
Optional
(
flowmarker
),
Optional
(
retrieveop
),
symbol
,
NegativeLookahead
(
Token
(
"="
))),
Sequence
(
Optional
(
flowmarker
),
literal
),
Sequence
(
Optional
(
flowmarker
),
regexp
),
Sequence
(
Optional
(
flowmarker
),
group
),
Sequence
(
Optional
(
flowmarker
),
oneormore
),
repetition
,
option
)
Sequence
(
Optional
(
flowmarker
),
group
),
Sequence
(
Optional
(
flowmarker
),
regexchain
)
,
Sequence
(
Optional
(
flowmarker
),
oneormore
),
repetition
,
option
)
term
=
OneOrMore
(
factor
)
expression
.
set
(
Sequence
(
term
,
ZeroOrMore
(
Sequence
(
Token
(
"|"
),
term
))))
directive
=
Sequence
(
Token
(
"@"
),
Required
(
symbol
),
Required
(
Token
(
"="
)),
Alternative
(
regexp
,
literal
,
list_
))
definition
=
Sequence
(
symbol
,
Required
(
Token
(
"="
)),
expression
)
syntax
=
Sequence
(
Optional
(
RE
(
''
,
wR
=
''
,
wL
=
wsp
__
)),
ZeroOrMore
(
Alternative
(
definition
,
directive
)),
Required
(
EOF
))
syntax
=
Sequence
(
Optional
(
RE
(
''
,
wR
=
''
,
wL
=
WSP
__
)),
ZeroOrMore
(
Alternative
(
definition
,
directive
)),
Required
(
EOF
))
root__
=
syntax
...
...
@@ -140,8 +147,14 @@ EBNF_ASTTransform = {
[
remove_expendables
,
replace_by_single_child
]
}
EBNF_semantic_validation
=
{
# Semantic validation on the AST
"repetition, option, oneormore"
:
[
partial
(
forbid
,
child_tags
=
[
'repetition'
,
'option'
,
'oneormore'
]),
partial
(
assert_content
,
regex
=
r
'(?!§)'
)],
}
EBNF_ASTPipeline
=
[
EBNF_ASTTransform
]
EBNF_ASTPipeline
=
[
EBNF_ASTTransform
,
EBNF_semantic_validation
]
class
EBNFCompilerError
(
Exception
):
...
...
DHParser/parsercombinators.py
View file @
b5fd9558
...
...
@@ -59,7 +59,7 @@ try:
except
ImportError
:
import
re
from
.toolkit
import
IS_LOGGING
,
LOGS_DIR
,
escape_re
,
sane_parser_name
,
s
equence
from
.toolkit
import
IS_LOGGING
,
LOGS_DIR
,
escape_re
,
sane_parser_name
,
s
mart_list
from
.syntaxtree
import
WHITESPACE_KEYWORD
,
TOKEN_KEYWORD
,
ZOMBIE_PARSER
,
Node
,
\
traverse
from
DHParser.toolkit
import
error_messages
...
...
@@ -460,6 +460,13 @@ class ScannerToken(Parser):
class
RegExp
(
Parser
):
"""Regular expression parser.
The RegExp-parser parses text that matches a regular expression.
RegExp can also be considered as the "atomic parser", because all
other parsers delegate part of the parsing job to other parsers,
but do not match text directly.
"""
def
__init__
(
self
,
regexp
,
name
=
None
):
super
(
RegExp
,
self
).
__init__
(
name
)
self
.
regexp
=
re
.
compile
(
regexp
)
if
isinstance
(
regexp
,
str
)
else
regexp
...
...
@@ -482,9 +489,33 @@ class RegExp(Parser):
class
RE
(
Parser
):
"""Regular Expressions with optional leading or trailing whitespace.
The RE-parser parses pieces of text that match a given regular
expression. Other than the ``RegExp``-Parser it can also skip
"implicit whitespace" before or after the matched text.
The whitespace is in turn defined by a regular expression. It
should be made sure that this expression also matches the empty
string, e.g. use r'\s*' or r'[ \t]*', but not r'\s+'. If the
respective parameters in the constructor are set to ``None`` the
default whitespace expression from the Grammar object will be used.
"""
def
__init__
(
self
,
regexp
,
wL
=
None
,
wR
=
None
,
name
=
None
):
"""Constructor for class RE.
Args:
regexp (str or regex object): The regular expression to be
used for parsing.
wL (str or regexp): Left whitespace regular expression,
i.e. either ``None``, the empty string or a regular
expression (e.g. "\s*") that defines whitespace. An
empty string means no whitespace will be skipped,
``None`` means that the default whitespace will be
used.
wR (str or regexp): Right whitespace regular expression.
See above.
name: The optional name of the parser.
"""
super
(
RE
,
self
).
__init__
(
name
)
# assert wR or regexp == '.' or isinstance(self, Token)
self
.
wL
=
wL
...
...
@@ -520,6 +551,7 @@ class RE(Parser):
def
_grammar_assigned_notifier
(
self
):
if
self
.
grammar
:
# use default whitespace parsers if not otherwise specified
if
self
.
wL
is
None
:
self
.
wspLeft
=
self
.
grammar
.
wsp_left_parser__
if
self
.
wR
is
None
:
...
...
@@ -535,11 +567,24 @@ class RE(Parser):
def Token(token, wL=None, wR=None, name=None):
    """Build an RE-parser that matches the plain string ``token``.

    The token text is passed through ``escape_re`` before being handed
    to ``RE``, and ``wL``/``wR`` are forwarded unchanged. When ``name``
    is empty (or otherwise falsy) the parser is named TOKEN_KEYWORD
    instead, which makes tokens easy to identify in the abstract syntax
    tree transformation and compilation stage.
    """
    pattern = escape_re(token)
    parser_name = name if name else TOKEN_KEYWORD
    return RE(pattern, wL, wR, parser_name)
def mixin_comment(whitespace, comment):
    """Return a regular expression that merges comment and whitespace
    regexps. Thus comments can occur wherever whitespace is allowed
    and will be skipped just as implicit whitespace.

    Note, that because this works on the level of regular expressions,
    nesting comments is not possible. It also makes it much harder to
    use directives inside comments (which isn't recommended, anyway).

    Args:
        whitespace (str): Source of the regular expression for whitespace.
        comment (str): Source of the regular expression for one comment.

    Returns:
        str: The combined expression
            ``(?:<whitespace>(?:<comment><whitespace>)*)``, i.e. whitespace
            followed by any number of comments, each trailed by whitespace.
    """
    wspc = '(?:' + whitespace + '(?:' + comment + whitespace + ')*)'
    return wspc
...
...
@@ -868,7 +913,10 @@ class CompilerBase:
return
None
else
:
compiler
=
self
.
__getattribute__
(
elem
)
# TODO Add support for python keyword attributes
return
compiler
(
node
)
result
=
compiler
(
node
)
for
child
in
node
.
children
:
node
.
error_flag
|=
child
.
error_flag
return
result
def
full_compilation
(
source
,
grammar_base
,
AST_pipeline
,
compiler
):
...
...
@@ -879,7 +927,7 @@ def full_compilation(source, grammar_base, AST_pipeline, compiler):
The compilations stage is only invoked if no errors occurred in
either of the two previous stages.
Paraemter
s:
Arg
s:
source (str): The input text for compilation
grammar_base (GrammarBase): The GrammarBase object
AST_pipeline (dict or list of dicts): A syntax-tree processing
...
...
@@ -912,12 +960,15 @@ def full_compilation(source, grammar_base, AST_pipeline, compiler):
# likely that error list gets littered with compile error messages
if
syntax_tree
.
error_flag
:
result
=
None
errors
=
syntax_tree
.
collect_errors
()
else
:
for
processing_table
in
s
equence
(
AST_pipeline
):
for
processing_table
in
s
mart_list
(
AST_pipeline
):
traverse
(
syntax_tree
,
processing_table
)
syntax_tree
.
log
(
log_file_name
,
ext
=
'.ast'
)
result
=
compiler
.
compile__
(
syntax_tree
)
errors
=
syntax_tree
.
collect_errors
()
errors
=
syntax_tree
.
collect_errors
()
if
not
errors
:
result
=
compiler
.
compile__
(
syntax_tree
)
errors
=
syntax_tree
.
collect_errors
()
messages
=
error_messages
(
source
,
errors
)
return
result
,
messages
,
syntax_tree
...
...
DHParser/syntaxtree.py
View file @
b5fd9558
...
...
@@ -28,7 +28,7 @@ except ImportError:
import
re
from
typing
import
NamedTuple
from
.toolkit
import
IS_LOGGING
,
LOGS_DIR
,
expand_table
,
line_col
,
s
equence
from
.toolkit
import
IS_LOGGING
,
LOGS_DIR
,
expand_table
,
line_col
,
s
mart_list
__all__
=
[
'WHITESPACE_KEYWORD'
,
...
...
@@ -50,7 +50,10 @@ __all__ = ['WHITESPACE_KEYWORD',
'remove_expendables'
,
'remove_tokens'
,
'flatten'
,
'remove_enclosing_delimiters'
]
'remove_enclosing_delimiters'
,
'forbid'
,
'require'
,
'assert_content'
]
class
ZombieParser
:
...
...
@@ -297,6 +300,7 @@ class Node:
of a set of tuples (position, error_message), where position
is always relative to this node.
"""
errors
=
[]
if
self
.
error_flag
:
errors
=
self
.
errors
if
clear_errors
:
...
...
@@ -305,8 +309,7 @@ class Node:
if
self
.
children
:
for
child
in
self
.
result
:
errors
.
extend
(
child
.
collect_errors
(
clear_errors
))
return
errors
return
[]
return
errors
def
log
(
self
,
log_file_name
,
ext
):
if
IS_LOGGING
():
...
...
@@ -393,13 +396,14 @@ def traverse(root_node, processing_table):
"""
# normalize processing_table entries by turning single values into lists
# with a single value
table
=
{
name
:
s
equence
(
call
)
for
name
,
call
in
list
(
processing_table
.
items
())}
table
=
{
name
:
s
mart_list
(
call
)
for
name
,
call
in
list
(
processing_table
.
items
())}
table
=
expand_table
(
table
)
def
traverse_recursive
(
node
):
if
node
.
children
:
for
child
in
node
.
result
:
traverse_recursive
(
child
)
node
.
error_flag
|=
child
.
error_flag
# propagate error flag
sequence
=
table
.
get
(
node
.
parser
.
name
,
table
.
get
(
'~'
,
[]))
+
table
.
get
(
'*'
,
[])
for
call
in
sequence
:
...
...
@@ -531,3 +535,29 @@ def remove_enclosing_delimiters(node):
node
.
result
=
node
.
result
[
1
:
-
1
]
########################################################################
#
# syntax tree validation functions
#
########################################################################
def require(node, child_tags):
    """Flag every child of ``node`` whose tag is not in ``child_tags``.

    For each offending child one error message is added to ``node``
    (not to the child) via ``node.add_error``.

    Args:
        node: The node whose ``children`` are checked.
        child_tags: Container of the tag names that are permitted.
    """
    for child in node.children:
        if child.tag_name not in child_tags:
            node.add_error('Element "%s" is not allowed inside "%s".' %
                           (child.tag_name, node.tag_name))
def forbid(node, child_tags):
    """Flag every child of ``node`` whose tag is listed in ``child_tags``.

    For each offending child one error message is added to ``node``
    (not to the child) via ``node.add_error``.

    Args:
        node: The node whose ``children`` are checked.
        child_tags: Container of the tag names that must not occur.
    """
    for child in node.children:
        if child.tag_name in child_tags:
            node.add_error('Element "%s" cannot be nested inside "%s".' %
                           (child.tag_name, node.tag_name))
def assert_content(node, regex):
    """Add an error to ``node`` if its string content does not match ``regex``.

    The content is ``str(node)``. Matching is done with ``re.match``, so
    the expression is anchored at the beginning of the content only; it
    need not consume the whole string.

    Args:
        node: The node to validate; errors are reported via
            ``node.add_error``.
        regex: Regular expression (string or compiled pattern) that the
            content must match.
    """
    content = str(node)
    if not re.match(regex, content):
        node.add_error('Element "%s" violates %s on %s' %
                       (node.tag_name, str(regex), content))
DHParser/toolkit.py
View file @
b5fd9558
...
...
@@ -52,7 +52,7 @@ __all__ = ['logging_on',
'is_python_code'
,
'md5'
,
'expand_table'
,
's
equence
'
,
's
mart_list
'
,
'sane_parser_name'
]
...
...
@@ -189,6 +189,47 @@ def md5(*txt):
return
md5_hash
.
hexdigest
()
def smart_list(arg):
    """Returns the argument as list, depending on its type and content.

    If the argument is a string, it will be interpreted as a list of
    delimiter separated values, trying ';', ',', ' ' as possible
    delimiters in this order. The items are stripped of surrounding
    whitespace and returned as a real list (not a lazy generator), e.g.
    >>> smart_list("1; 2, 3; 4")
    ['1', '2, 3', '4']
    >>> smart_list("2, 3")
    ['2', '3']
    >>> smart_list("a b cd")
    ['a', 'b', 'cd']

    A string without any delimiter, including the empty string, yields
    a list with that single (stripped) string, e.g.
    >>> smart_list("")
    ['']

    If the argument is a collection other than a string, it will be
    returned as is, e.g.
    >>> smart_list((1, 2, 3))
    (1, 2, 3)
    >>> smart_list({1, 2, 3})
    {1, 2, 3}

    Note that mappings are collections, too: a dict is returned
    unchanged and is not wrapped in a list.

    If the argument is another iterable than a collection, it will
    be converted into a list, e.g.
    >>> smart_list(i for i in (1, 2, 3))
    [1, 2, 3]

    Finally, if none of the above is true, the argument will be
    wrapped in a list and returned, e.g.
    >>> smart_list(125)
    [125]
    """
    if isinstance(arg, str):
        for delimiter in (';', ','):
            lst = arg.split(delimiter)
            if len(lst) > 1:
                return [s.strip() for s in lst]
        # Deliberately split(' ') rather than split(): a delimiter-free
        # string, even the empty one, must come back as a one-item list.
        return [s.strip() for s in arg.strip().split(' ')]
    elif isinstance(arg, collections.abc.Collection):
        return arg
    elif isinstance(arg, collections.abc.Iterable):
        return list(arg)
    else:
        return [arg]
def
expand_table
(
compact_table
):
"""Expands a table by separating keywords that are tuples or strings
containing comma separated words into single keyword entries with
...
...
@@ -201,22 +242,11 @@ def expand_table(compact_table):
keys
=
list
(
compact_table
.
keys
())
for
key
in
keys
:
value
=
compact_table
[
key
]
if
isinstance
(
key
,
str
):
parts
=
(
s
.
strip
()
for
s
in
key
.
split
(
','
))
else
:
assert
isinstance
(
key
,
collections
.
abc
.
Iterable
)
parts
=
key
for
p
in
parts
:
expanded_table
[
p
]
=
value
for
k
in
smart_list
(
key
):
expanded_table
[
k
]
=
value
return
expanded_table
def
sequence
(
arg
):
"""Returns the argument if it is a sequence, otherwise returns a
list containing the argument as sole item."""
return
arg
if
isinstance
(
arg
,
collections
.
abc
.
Sequence
)
else
[
arg
]
def
sane_parser_name
(
name
):
"""Checks whether given name is an acceptable parser name. Parser names
must not be preceeded or succeeded by a double underscore '__'!
...
...
tests/test_EBNFcompiler.py
View file @
b5fd9558
...
...
@@ -23,6 +23,9 @@ limitations under the License.
import
os
import
sys
sys
.
path
.
append
(
os
.
path
.
abspath
(
'../../'
))
from
DHParser.syntaxtree
import
traverse
from
DHParser.parsercombinators
import
full_compilation
from
DHParser.EBNFcompiler
import
EBNFGrammar
,
EBNF_ASTPipeline
,
EBNFCompiler
from
DHParser.DSLsupport
import
compileEBNF
...
...
@@ -59,6 +62,7 @@ class TestDirectives:
syntax_tree
=
parser
.
parse
(
"3 + 4
\n
* 12"
)
assert
syntax_tree
.
collect_errors
()
class
TestPopRetrieve
:
mini_language
=
"""
document = { text | codeblock }
...
...
@@ -78,6 +82,9 @@ class TestPopRetrieve:
teststr
=
"Anfang ```code block `` <- keine Ende-Zeichen ! ``` Ende"
syntax_tree
=
self
.
minilang_parser
.
parse
(
teststr
)
assert
not
syntax_tree
.
collect_errors
()
delim
=
str
(
next
(
syntax_tree
.
find
(
lambda
node
:
node
.
tag_name
==
"delimiter"
)))
pop
=
str
(
next
(
syntax_tree
.
find
(
lambda
node
:
node
.
tag_name
==
"Pop"
)))
assert
delim
==
pop
if
WRITE_LOGS
:
syntax_tree
.
log
(
"test_PopRetrieve_single_line"
,
'.cst'
)
# self.minilang_parser.log_parsing_history("test_PopRetrieve_single_line")
...
...
@@ -93,11 +100,45 @@ class TestPopRetrieve:
"""
syntax_tree
=
self
.
minilang_parser
.
parse
(
teststr
)
assert
not
syntax_tree
.
collect_errors
()
delim
=
str
(
next
(
syntax_tree
.
find
(
lambda
node
:
node
.
tag_name
==
"delimiter"
)))
pop
=
str
(
next
(
syntax_tree
.
find
(
lambda
node
:
node
.
tag_name
==
"Pop"
)))
assert
delim
==
pop
if
WRITE_LOGS
:
syntax_tree
.
log
(
"test_PopRetrieve_multi_line"
,
'.cst'
)
# self.minilang_parser.log_parsing_history("test_PopRetrieve_multi_line")
class TestSemanticValidation:
    """Checks the semantic validation stage of the EBNF AST pipeline."""

    def check(self, minilang, bool_filter=lambda x: x):
        """Parse ``minilang`` as EBNF, run all pipeline tables over the
        tree and assert that ``bool_filter`` accepts the collected errors.

        Parsing itself must not produce any errors. With the default
        filter the check passes only if the pipeline reported errors.
        """
        syntax_tree = EBNFGrammar().parse(minilang)
        parse_errors = syntax_tree.collect_errors()
        assert not parse_errors
        for processing_table in EBNF_ASTPipeline:
            traverse(syntax_tree, processing_table)
        pipeline_errors = syntax_tree.collect_errors()
        assert bool_filter(pipeline_errors)

    def test_illegal_nesting(self):
        source = 'impossible = { [ "an optional requirement" ] }'
        self.check(source)

    def test_illegal_nesting_option_required(self):
        source = 'impossible = [ §"an optional requirement" ]'
        self.check(source)

    def test_illegal_nesting_oneormore_option(self):
        source = 'impossible = { [ "no use"] }+'
        self.check(source)

    def test_legal_nesting(self):
        # Here the pipeline is expected to report no errors at all.
        source = 'possible = { [ "+" ] "1" }'
        self.check(source, lambda x: not x)
class TestCompilerErrors:
    """Checks that errors of a full compilation run reach the caller."""

    def test_error_propagation(self):
        # Full compilation of this directive is expected to yield messages.
        ebnf = "@ literalws = wrongvalue  # testing error propagation"
        compiler = EBNFCompiler('ErrorPropagationTest')
        outcome = full_compilation(ebnf, EBNFGrammar(), EBNF_ASTPipeline, compiler)
        result, messages, st = outcome
        assert messages
if
__name__
==
"__main__"
:
from
run
import
run_tests
run_tests
(
"TestDirectives TestPopRetrieve"
,
globals
())
run_tests
(
"TestCompilerErrors"
,
globals
())
tests/test_syntaxtree.py
View file @
b5fd9558
...
...
@@ -25,7 +25,8 @@ import re
import
sys
sys
.
path
.
append
(
os
.
path
.
abspath
(
'../../'
))
from
DHParser.toolkit
import
compact_sexpr
from
DHParser.syntaxtree
import
Node
from
DHParser.syntaxtree
import
Node
,
traverse
class
DummyParser
:
def
__init__
(
self
,
name
=
''
):
...
...
@@ -115,6 +116,21 @@ class TestNode:
assert
found
[
0
].
result
==
'x'
and
found
[
1
].
result
==
'y'
class TestErrorHandling:
    """Checks that the error flag reaches the root node during traversal."""

    def test_error_propagations(self):
        tree = from_sexpr('(a (b c) (d (e (f (g h)))))')

        def find_h(node):
            # Attach an error to the node whose result is "h".
            if node.result == "h":
                node.add_error("an error deep inside the syntax tree")

        assert not tree.error_flag
        processing_table = {"*": find_h}
        traverse(tree, processing_table)
        assert tree.error_flag
if
__name__
==
"__main__"
:
from
run
import
run_tests
run_tests
(
"TestSExpr TestNode"
,
globals
())
run_tests
(
"TestErrorHandling"
,
globals
())
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment