Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
The container registry cleanup task is now completed and the registry can be used normally.
Open sidebar
badw-it
DHParser
Commits
756e677e
Commit
756e677e
authored
Feb 27, 2021
by
Eckhart Arnold
Browse files
ebnf - full documentation started
parent
878cf515
Changes
4
Hide whitespace changes
Inline
Side-by-side
DHParser/ebnf.py
View file @
756e677e
...
...
@@ -40,6 +40,69 @@ However, here we will show how to compile an EBNF-specified grammar
from within Python-code and how to execute the parser that was
generated by compiling the grammar.
As an example, we will realize a json-parser (https://www.json.org/).
Let's start with creating some test-data::
>>> testobj = {'list': [1,2,"string"], 'int': 3, 'bool': False}
>>> import json
>>> testdata = json.dumps(testobj)
>>> testdata
'{"list": [1, 2, "string"], "int": 3, "bool": false}'
We define the json-Grammar (see https://www.json.org/) in
top-down manner in EBNF. We'll use a regular-expression look-alike
syntax. EBNF, as you may recall, consists of a sequence of symbol
definitions. The definiens of each definition is either a string
literal, a regular expression, other symbols, or a combination
of these joined by four different operators: 1. sequences
2. alternatives 3. options and 4. repetitions. Here is how these
elements are denoted in classical and regex-like EBNF-syntax:
.. table::
======================== ============== ==============
element classical EBNF regex-like
======================== ============== ==============
insignificant whitespace ~ ~
string literal "..." or `...` "..." or `...`
regular expr. /.../ /.../
sequences A B C A B C
alternatives A | B | C A | B | C
options [ ... ] ...?
repetitions { ... } ...*
one or more ...+
grouping (...) (...)
======================== ============== ==============
"Insignificant whitespace" is a speciality of DHParser. Denoting
insignificant whitespace with a particular sign `~` makes it possible to
eliminate it during the parsing process itself, without burdening later
syntax-tree-processing stages with this common task. DHParser offers
several more facilities to restrain the verbosity of the concrete
syntax tree, so that the outcome of the parsing stage comes close (or
at least closer) to the intended abstract-syntax-tree, already.
JSON consists of two complex data types, 1) associative arrays,
called "object", and 2) sequences of heterogeneous data, called "array";
and of four simple data types, 1) string, 2) number, 3) bool and 4) null.
The structure of a JSON file can easily be described in EBNF::
>>> grammar = 'json = ~ _element _EOF
\\
n'
\
' _EOF = /$/
\\
n'
\
'_element = object | array | string | number | bool | null
\\
n'
\
'object = "{" ~ member ( "," ~ §member )* §"}" ~
\\
n'
\
'member = string §":" ~ _element
\\
n'
\
'array = "[" ~ ( _element ( "," ~ _element )* )? §"]" ~
\\
n'
\
'string = `"` §_CHARS `"` ~
\\
n'
\
' _CHARS = /[^"
\\
]+/ | /
\\
[\/bnrt
\\
]/
\\
n'
\
'number = _INT _FRAC? _EXP? ~
\\
n'
\
' _INT = `-` /[1-9][0-9]+/ | /[0-9]/
\\
n'
\
' _FRAC = `.` /[0-9]+/
\\
n'
\
' _EXP = (`E`|`e`) [`+`|`-`] /[0-9]+/
\\
n'
\
'bool = "true" ~ | "false" ~
\\
n'
\
'null = "null" ~
\\
n'
Let's try this on our test-string
"""
...
...
@@ -63,7 +126,7 @@ from DHParser.parse import Parser, Grammar, mixin_comment, mixin_nonempty, Forwa
INFINITE
,
matching_bracket
,
ParseFunc
,
update_scanner
from
DHParser.preprocess
import
nil_preprocessor
,
PreprocessorFunc
from
DHParser.syntaxtree
import
Node
,
RootNode
,
WHITESPACE_PTYPE
,
TOKEN_PTYPE
from
DHParser.toolkit
import
load_if_file
,
escape_re
,
escape_c
on
tr
o
l_char
acter
s
,
md5
,
\
from
DHParser.toolkit
import
load_if_file
,
escape_re
,
escape_ctrl_chars
,
md5
,
\
sane_parser_name
,
re
,
expand_table
,
unrepr
,
compile_python_object
,
DHPARSER_PARENTDIR
,
\
RX_NEVER_MATCH
,
cython
from
DHParser.transform
import
TransformationFunc
,
traverse
,
remove_brackets
,
\
...
...
@@ -1981,10 +2044,10 @@ class EBNFCompiler(Compiler):
def on_directive(self, node: Node) -> str:
for child in node.children:
if child.tag_name == "
literal
":
child.result = escape_c
on
tr
o
l_char
acter
s(child.content)
child.result = escape_ctrl_chars(child.content)
elif child.tag_name == "
literals
":
self.join_literals(child)
child.result = escape_c
on
tr
o
l_char
acter
s(child.content)
child.result = escape_ctrl_chars(child.content)
key = node.children[0].content
assert key not in self.directives.tokens
...
...
@@ -2504,16 +2567,16 @@ class EBNFCompiler(Compiler):
return symbol
def TEXT_PARSER(self, text):
if DROP_STRINGS in self.directives.drop and self.context[-2].tag_name != "
definition
":
return 'Drop(Text(' + text + '))'
return 'Text(' + text + ')'
def drop_on(self, category):
return category in self.directives.drop and self.context[-2].tag_name != "
definition
"
def REGEXP_PARSER(self, regexp):
if DROP_REGEXP in self.directives.drop and self.context[-2].tag_name != "
definition
":
return 'Drop(RegExp(' + regexp + '))'
return 'RegExp(' + regexp + ')'
def TEXT_PARSER(self, text, drop):
return 'Drop(Text(' + text + '))' if drop else 'Text(' + text + ')'
def REGEXP_PARSER(self, regexp, drop):
return 'Drop(RegExp(' + regexp + '))' if drop else 'RegExp(' + regexp + ')'
def WSPC_PARSER(self, force_drop=False):
...
...
@@ -2530,7 +2593,7 @@ class EBNFCompiler(Compiler):
def on_literal(self, node: Node) -> str:
center = self.TEXT_PARSER(escape_c
on
tr
o
l_char
acter
s(node.content))
center = self.TEXT_PARSER(escape_ctrl_chars(node.content)
, self.drop_on(DROP_STRINGS)
)
force = DROP_STRINGS in self.directives.drop
left = self.WSPC_PARSER(force) if 'left' in self.directives.literalws else ''
right = self.WSPC_PARSER(force) if 'right' in self.directives.literalws else ''
...
...
@@ -2540,13 +2603,13 @@ class EBNFCompiler(Compiler):
def on_plaintext(self, node: Node) -> str:
tk = escape_c
on
tr
o
l_char
acter
s(node.content)
tk = escape_ctrl_chars(node.content)
rpl = '"' if tk.find('"') < 0 else "'" if tk.find("'") < 0 else ''
if rpl:
tk = rpl + tk[1:-1] + rpl
else:
tk = rpl + tk.replace('"', '
\\
"')[1:-1] + rpl
return self.TEXT_PARSER(tk)
return self.TEXT_PARSER(tk
, self.drop_on(DROP_STRINGS)
)
def on_regexp(self, node: Node) -> str:
...
...
@@ -2560,7 +2623,7 @@ class EBNFCompiler(Compiler):
% (EBNFCompiler.AST_ERROR, str(error), trace, node.as_sxpr())
self.tree.new_error(node, errmsg)
return '"' + errmsg + '"'
return self.REGEXP_PARSER(arg)
return self.REGEXP_PARSER(arg
, self.drop_on(DROP_REGEXP)
)
def on_char_range(self, node) -> str:
...
...
DHParser/log.py
View file @
756e677e
...
...
@@ -62,7 +62,7 @@ from DHParser.configuration import access_presets, finalize_presets, get_config_
from
DHParser.error
import
Error
from
DHParser.stringview
import
StringView
from
DHParser.syntaxtree
import
Node
,
FrozenNode
,
ZOMBIE_TAG
,
EMPTY_PTYPE
from
DHParser.toolkit
import
escape_c
on
tr
o
l_char
acter
s
,
abbreviate_middle
,
cython
from
DHParser.toolkit
import
escape_ctrl_chars
,
abbreviate_middle
,
cython
__all__
=
(
'CallItem'
,
'start_logging'
,
...
...
@@ -443,7 +443,7 @@ class HistoryRecord:
else
:
s
=
self
.
text
excerpt
=
s
[:
36
]
+
' ...'
if
len
(
s
)
>
36
else
s
excerpt
=
escape_c
on
tr
o
l_char
acter
s
(
str
(
excerpt
))
excerpt
=
escape_ctrl_chars
(
str
(
excerpt
))
return
excerpt
# @property
...
...
DHParser/parse.py
View file @
756e677e
...
...
@@ -49,7 +49,7 @@ from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME
from
DHParser.stringview
import
StringView
,
EMPTY_STRING_VIEW
from
DHParser.syntaxtree
import
ChildrenType
,
Node
,
RootNode
,
WHITESPACE_PTYPE
,
\
TOKEN_PTYPE
,
ZOMBIE_TAG
,
EMPTY_NODE
,
ResultType
from
DHParser.toolkit
import
sane_parser_name
,
escape_c
on
tr
o
l_char
acter
s
,
re
,
cython
,
\
from
DHParser.toolkit
import
sane_parser_name
,
escape_ctrl_chars
,
re
,
cython
,
\
abbreviate_middle
,
RX_NEVER_MATCH
,
RxPatternType
,
linebreaks
,
line_col
,
identity
...
...
@@ -1841,7 +1841,7 @@ class RegExp(Parser):
return
'whitespace__'
except
(
AttributeError
,
NameError
):
pass
return
'/'
+
escape_c
on
tr
o
l_char
acter
s
(
'%s'
%
abbreviate_middle
(
pattern
,
118
))
\
return
'/'
+
escape_ctrl_chars
(
'%s'
%
abbreviate_middle
(
pattern
,
118
))
\
.
replace
(
'/'
,
'
\\
/'
)
+
'/'
...
...
DHParser/toolkit.py
View file @
756e677e
...
...
@@ -81,7 +81,7 @@ __all__ = ('typing',
'RxPatternType'
,
're_find'
,
'escape_re'
,
'escape_c
on
tr
o
l_char
acter
s'
,
'escape_ctrl_chars'
,
'is_filename'
,
'relative_path'
,
'concurrent_ident'
,
...
...
@@ -391,7 +391,7 @@ def escape_re(strg: str) -> str:
return
re
.
escape
(
strg
)
def
escape_c
on
tr
o
l_char
acter
s
(
strg
:
str
)
->
str
:
def
escape_ctrl_chars
(
strg
:
str
)
->
str
:
r
"""
Replace all control characters (e.g. `\n` `\t`) in a string
by their back-slashed representation and replace backslash by
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment