Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
9.2.2023: Due to updates, GitLab will be unavailable for a few minutes between 9:00 and 11:00.
Open sidebar
badw-it
DHParser
Commits
2cb7b778
Commit
2cb7b778
authored
Jul 10, 2017
by
Eckhart Arnold
Browse files
- regex bug in EBNF.ebnf and ebnf.EBNFGrammar fixed
parent
f277041a
Changes
6
Hide whitespace changes
Inline
Side-by-side
DHParser/ebnf.py
View file @
2cb7b778
...
...
@@ -124,7 +124,7 @@ class EBNFGrammar(Grammar):
wspR__
=
WSP__
EOF
=
NegativeLookahead
(
RE
(
'.'
,
wR
=
''
))
list_
=
Series
(
RE
(
'
\\
w+'
),
ZeroOrMore
(
Series
(
Token
(
","
),
RE
(
'
\\
w+'
))))
regexp
=
RE
(
'~?/(?:[^/]|(?<=
\\\\
)/)*/~?'
)
regexp
=
RE
(
r
'~?/(?:\\/|[^/])*?/~?'
)
#
RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
literal
=
Alternative
(
RE
(
'"(?:[^"]|
\\\\
")*?"'
),
RE
(
"'(?:[^']|
\\\\
')*?'"
))
symbol
=
RE
(
'(?!
\\
d)
\\
w+'
)
option
=
Series
(
Token
(
"["
),
expression
,
Required
(
Token
(
"]"
)))
...
...
@@ -300,7 +300,8 @@ class EBNFCompilerError(Exception):
#TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrieval?!? Is this possible at compile time?
class
EBNFCompiler
(
Compiler
):
"""Generates a Parser from an abstract syntax tree of a grammar specified
"""
Generates a Parser from an abstract syntax tree of a grammar specified
in EBNF-Notation.
"""
COMMENT_KEYWORD
=
"COMMENT__"
...
...
@@ -316,10 +317,12 @@ class EBNFCompiler(Compiler):
'linefeed'
:
r
'[ \t]*\n?(?!\s*\n)[ \t]*'
,
'vertical'
:
r
'\s*'
}
def
__init__
(
self
,
grammar_name
=
""
,
grammar_source
=
""
):
super
(
EBNFCompiler
,
self
).
__init__
(
grammar_name
,
grammar_source
)
self
.
_reset
()
def
_reset
(
self
):
self
.
_result
=
''
# type: str
self
.
rules
=
OrderedDict
()
# type: OrderedDict[str, List[Node]]
...
...
@@ -340,11 +343,15 @@ class EBNFCompiler(Compiler):
def
result
(
self
)
->
str
:
return
self
.
_result
# methods for generating skeleton code for scanner, transformer, and compiler
def
gen_scanner_skeleton
(
self
)
->
str
:
name
=
self
.
grammar_name
+
"Scanner"
return
"def %s(text):
\n
return text
\n
"
%
name
\
+
SCANNER_FACTORY
.
format
(
NAME
=
self
.
grammar_name
)
def
gen_transformer_skeleton
(
self
)
->
str
:
if
not
self
.
rules
:
raise
EBNFCompilerError
(
'Compiler must be run before calling '
...
...
@@ -363,6 +370,7 @@ class EBNFCompiler(Compiler):
transtable
+=
[
TRANSFORMER_FACTORY
.
format
(
NAME
=
self
.
grammar_name
)]
return
'
\n
'
.
join
(
transtable
)
def
gen_compiler_skeleton
(
self
)
->
str
:
if
not
self
.
rules
:
raise
EBNFCompilerError
(
'Compiler has not been run before calling '
...
...
@@ -387,9 +395,12 @@ class EBNFCompiler(Compiler):
compiler
+=
[
COMPILER_FACTORY
.
format
(
NAME
=
self
.
grammar_name
)]
return
'
\n
'
.
join
(
compiler
)
def
assemble_parser
(
self
,
definitions
:
List
[
Tuple
[
str
,
str
]],
root_node
:
Node
)
->
str
:
# fix capture of variables that have been defined before usage [sic!]
def
assemble_parser
(
self
,
definitions
:
List
[
Tuple
[
str
,
str
]],
root_node
:
Node
)
->
str
:
"""
Creates the Python code for the parser after compilation of
the EBNF-Grammar
"""
if
self
.
variables
:
for
i
in
range
(
len
(
definitions
)):
if
definitions
[
i
][
0
]
in
self
.
variables
:
...
...
@@ -470,6 +481,9 @@ class EBNFCompiler(Compiler):
+
GRAMMAR_FACTORY
.
format
(
NAME
=
self
.
grammar_name
)
return
self
.
_result
## compilation methods
def
on_syntax
(
self
,
node
:
Node
)
->
str
:
self
.
_reset
()
definitions
=
[]
...
...
@@ -489,6 +503,7 @@ class EBNFCompiler(Compiler):
return
self
.
assemble_parser
(
definitions
,
node
)
def
on_definition
(
self
,
node
:
Node
)
->
Tuple
[
str
,
str
]:
rule
=
str
(
node
.
children
[
0
])
if
rule
in
self
.
rules
:
...
...
@@ -520,9 +535,11 @@ class EBNFCompiler(Compiler):
rule
,
defn
=
rule
+
':error'
,
'"'
+
errmsg
+
'"'
return
rule
,
defn
@
staticmethod
def
_check_rx
(
node
:
Node
,
rx
:
str
)
->
str
:
"""Checks whether the string `rx` represents a valid regular
"""
Checks whether the string `rx` represents a valid regular
expression. Makes sure that multiline regular expressions are
prepended by the multiline-flag. Returns the regular expression string.
"""
...
...
@@ -534,6 +551,7 @@ class EBNFCompiler(Compiler):
(
repr
(
rx
),
str
(
re_error
)))
return
rx
def
on_directive
(
self
,
node
:
Node
)
->
str
:
key
=
str
(
node
.
children
[
0
]).
lower
()
assert
key
not
in
self
.
directives
[
'tokens'
]
...
...
@@ -593,19 +611,24 @@ class EBNFCompiler(Compiler):
', '
.
join
(
list
(
self
.
directives
.
keys
()))))
return
""
def
non_terminal
(
self
,
node
:
Node
,
parser_class
:
str
,
custom_args
:
List
[
str
]
=
[])
->
str
:
"""Compiles any non-terminal, where `parser_class` indicates the Parser class
"""
Compiles any non-terminal, where `parser_class` indicates the Parser class
name for the particular non-terminal.
"""
arguments
=
[
self
.
_compile
(
r
)
for
r
in
node
.
children
]
+
custom_args
return
parser_class
+
'('
+
', '
.
join
(
arguments
)
+
')'
def
on_expression
(
self
,
node
)
->
str
:
return
self
.
non_terminal
(
node
,
'Alternative'
)
def
on_term
(
self
,
node
)
->
str
:
return
self
.
non_terminal
(
node
,
'Series'
)
def
on_factor
(
self
,
node
:
Node
)
->
str
:
assert
node
.
children
assert
len
(
node
.
children
)
>=
2
,
node
.
as_sxpr
()
...
...
@@ -639,22 +662,28 @@ class EBNFCompiler(Compiler):
node
.
add_error
(
'Unknown prefix "%s".'
%
prefix
)
return
""
def
on_option
(
self
,
node
)
->
str
:
return
self
.
non_terminal
(
node
,
'Optional'
)
def
on_repetition
(
self
,
node
)
->
str
:
return
self
.
non_terminal
(
node
,
'ZeroOrMore'
)
def
on_oneormore
(
self
,
node
)
->
str
:
return
self
.
non_terminal
(
node
,
'OneOrMore'
)
def
on_regexchain
(
self
,
node
)
->
str
:
raise
EBNFCompilerError
(
"Not yet implemented!"
)
def
on_group
(
self
,
node
)
->
str
:
raise
EBNFCompilerError
(
"Group nodes should have been eliminated by "
"AST transformation!"
)
def
on_symbol
(
self
,
node
:
Node
)
->
str
:
# called only for symbols on the right hand side!
symbol
=
str
(
node
)
# ; assert result == cast(str, node.result)
if
symbol
in
self
.
directives
[
'tokens'
]:
...
...
@@ -667,9 +696,11 @@ class EBNFCompiler(Compiler):
self
.
recursive
.
add
(
symbol
)
return
symbol
def
on_literal
(
self
,
node
)
->
str
:
return
'Token('
+
str
(
node
).
replace
(
'
\\
'
,
r
'\\'
)
+
')'
# return 'Token(' + ', '.join([node.result]) + ')' ?
def
on_regexp
(
self
,
node
:
Node
)
->
str
:
rx
=
str
(
node
)
name
=
[]
# type: List[str]
...
...
@@ -694,6 +725,7 @@ class EBNFCompiler(Compiler):
return
'"'
+
errmsg
+
'"'
return
'RE('
+
', '
.
join
([
arg
]
+
name
)
+
')'
def
on_list_
(
self
,
node
)
->
Set
[
str
]:
assert
node
.
children
return
set
(
item
.
result
.
strip
()
for
item
in
node
.
children
)
...
...
dhparser.py
View file @
2cb7b778
...
...
@@ -52,7 +52,7 @@ def selftest(file_name):
else
:
# compile the grammar again using the result of the previous
# compilation as parser
for
i
in
range
(
1
00
):
for
i
in
range
(
1
):
result
=
compileDSL
(
grammar
,
nil_scanner
,
result
,
transformer
,
compiler
)
print
(
result
)
return
result
...
...
examples/EBNF/EBNF.ebnf
View file @
2cb7b778
...
...
@@ -30,7 +30,7 @@ option = "[" expression §"]"
symbol = /(?!\d)\w+/~ # e.g. expression, factor, parameter_list
literal = /"(?:[^"]|\\")*?"/~ # e.g. "(", '+', 'while'
| /'(?:[^']|\\')*?'/~ # whitespace following literals will be ignored tacitly.
regexp = /~?\/(?:
[^\/]|(?<=\\)
\/)*\/~?/~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
regexp = /~?\/(?:
\\\/|[^
\/
]
)*
?
\/~?/~
# e.g. /\w+/, ~/#.*(?:\n|$)/~
# '~' is a whitespace-marker; if present, leading or trailing
# whitespace of a regular expression will be ignored tacitly.
list_ = /\w+/~ { "," /\w+/~ } # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
...
...
examples/LaTeX/LaTeX.ebnf
View file @
2cb7b778
...
...
@@ -113,10 +113,10 @@ text = { cfgtext | (BRACKETS //~) }+
cfgtext = { word_sequence | (ESCAPED //~) }+
word_sequence = { TEXTCHUNK //~ }+
blockcmd = /
\
/ ("begin{" ("enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item")
blockcmd = /
[\\]
/ ("begin{" ("enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item")
#######################################################################
...
...
examples/LaTeX/LaTeXCompiler.py
View file @
2cb7b778
...
...
@@ -163,10 +163,10 @@ class LaTeXGrammar(Grammar):
cfgtext = { word_sequence | (ESCAPED //~) }+
word_sequence = { TEXTCHUNK //~ }+
blockcmd = /
A
/ ("begin{" ("enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item")
blockcmd = /
[\\]
/ ("begin{" ("enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item")
#######################################################################
...
...
@@ -192,7 +192,7 @@ class LaTeXGrammar(Grammar):
block_enrivonment
=
Forward
()
block_of_paragraphs
=
Forward
()
text_elements
=
Forward
()
source_hash__
=
"
7ef00020ebbb2b82e36d38460de56370
"
source_hash__
=
"
9f1579db1994211dc53dd4a8f317bfb6
"
parser_initialization__
=
"upon instantiation"
COMMENT__
=
r
'%.*(?:\n|$)'
WSP__
=
mixin_comment
(
whitespace
=
r
'[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?'
,
comment
=
r
'%.*(?:\n|$)'
)
...
...
@@ -208,7 +208,7 @@ class LaTeXGrammar(Grammar):
MATH
=
RE
(
'[
\\
w_^{}[
\\
]]*'
)
NAME
=
Capture
(
RE
(
'
\\
w+'
))
CMDNAME
=
RE
(
'
\\\\
(?:(?!_)
\\
w)+'
)
blockcmd
=
Series
(
RE
(
'
A
'
,
wR
=
''
),
Alternative
(
Series
(
Token
(
"begin{"
),
Alternative
(
Token
(
"enumerate"
),
Token
(
"itemize"
),
Token
(
"figure"
),
Token
(
"quote"
),
Token
(
"quotation"
),
Token
(
"tabular"
)),
Token
(
"}"
)),
Token
(
"subsection"
),
Token
(
"section"
),
Token
(
"chapter"
),
Token
(
"subsubsection"
),
Token
(
"paragraph"
),
Token
(
"subparagraph"
),
Token
(
"item"
)))
blockcmd
=
Series
(
RE
(
'
[
\\\\
]
'
,
wR
=
''
),
Alternative
(
Series
(
Token
(
"begin{"
),
Alternative
(
Token
(
"enumerate"
),
Token
(
"itemize"
),
Token
(
"figure"
),
Token
(
"quote"
),
Token
(
"quotation"
),
Token
(
"tabular"
)),
Token
(
"}"
)),
Token
(
"subsection"
),
Token
(
"section"
),
Token
(
"chapter"
),
Token
(
"subsubsection"
),
Token
(
"paragraph"
),
Token
(
"subparagraph"
),
Token
(
"item"
)))
word_sequence
=
OneOrMore
(
Series
(
TEXTCHUNK
,
RE
(
''
)))
cfgtext
=
OneOrMore
(
Alternative
(
word_sequence
,
Series
(
ESCAPED
,
RE
(
''
))))
text
=
OneOrMore
(
Alternative
(
cfgtext
,
Series
(
BRACKETS
,
RE
(
''
))))
...
...
test/test_ebnf.py
View file @
2cb7b778
...
...
@@ -20,6 +20,10 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
try
:
import
regex
as
re
except
ImportError
:
import
re
import
sys
from
functools
import
partial
from
multiprocessing
import
Pool
...
...
@@ -102,8 +106,11 @@ class TestEBNFParser:
def
test_RE
(
self
):
gr
=
get_ebnf_grammar
()
m
=
gr
.
regexp
.
main
.
regexp
.
match
(
r
'/\\/ xxx /'
)
assert
m
.
group
().
find
(
'x'
)
<
0
,
m
.
group
()
m
=
gr
.
regexp
.
main
.
regexp
.
match
(
r
'/[\\\\]/ xxx /'
)
rs
=
m
.
group
()
assert
rs
.
find
(
'x'
)
<
0
,
rs
.
group
()
rx
=
re
.
compile
(
rs
[
1
:
-
1
])
assert
rx
.
match
(
r
'\\'
)
def
test_literal
(
self
):
snippet
=
'"literal" '
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment