Commit 072c5d19 authored by Eckhart Arnold's avatar Eckhart Arnold
Browse files

Merge remote-tracking branch 'origin/development' into development

parents c13f63ea 6b11e2aa
......@@ -1622,14 +1622,12 @@ class EBNFGrammar(Grammar):
EBNF-definition of the Grammar::
@ comment = /(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/|\(\*(?:.|\n)*?\*\)/
# comments can be either C-Style: /* ... */
# or pascal/modula/oberon-style: (* ... *)
# or python-style: # ... \n,
# excluding, however, character markers: #x20
@ whitespace = /\s*/ # whitespace includes linefeeds
# comments can be either C-Style: /* ... */
# or pascal/modula/oberon-style: (* ... *)
# or python-style: # ... \n, excluding, however, character markers: #x20
@ whitespace = /\s*/ # whitespace includes linefeed
@ literalws = right # trailing whitespace of literals will be ignored tacitly
@ disposable = pure_elem, countable, FOLLOW_UP, SYM_REGEX, ANY_SUFFIX, EOF
@ disposable = component, pure_elem, countable, FOLLOW_UP, SYM_REGEX, ANY_SUFFIX, EOF
@ drop = whitespace, EOF # do not include these even in the concrete syntax tree
@ RNG_BRACE_filter = matching_bracket() # filter or transform content of RNG_BRACE on retrieve
......@@ -1638,6 +1636,7 @@ class EBNFGrammar(Grammar):
@ definition_resume = /\n\s*(?=@|\w+\w*\s*=)/
@ directive_resume = /\n\s*(?=@|\w+\w*\s*=)/
# specialized error messages for certain cases
@ definition_error = /,/, 'Delimiter "," not expected in definition!\nEither this was meant to '
......@@ -1651,10 +1650,10 @@ class EBNFGrammar(Grammar):
syntax = ~ { definition | directive } EOF
definition = symbol §:DEF~ [ :OR~ ] expression :ENDL~ & FOLLOW_UP # [:OR~] to support v. Rossum's syntax
directive = "@" §symbol "=" (regexp | literals | procedure | symbol !DEF)
{ "," (regexp | literals | procedure | symbol !DEF) } & FOLLOW_UP
literals = { literal }+ # string chaining, only allowed in directives!
procedure = SYM_REGEX "()" # procedure name, only allowed in directives!
directive = "@" §symbol "=" component { "," component } & FOLLOW_UP
component = literals | procedure | expression
literals = { literal }+ # string chaining, only allowed in directives!
procedure = SYM_REGEX "()" # procedure name, only allowed in directives!
FOLLOW_UP = `@` | symbol | EOF
......@@ -1663,7 +1662,7 @@ class EBNFGrammar(Grammar):
expression = sequence { :OR~ sequence }
sequence = ["§"] ( interleave | lookaround ) # "§" means all following terms mandatory
{ :AND~ ["§"] ( interleave | lookaround ) }
{ !`@` !(symbol :DEF) :AND~ ["§"] ( interleave | lookaround ) }
interleave = difference { "°" ["§"] difference }
lookaround = flowmarker § (oneormore | pure_elem)
difference = term ["-" § (oneormore | pure_elem)]
......@@ -1691,7 +1690,7 @@ class EBNFGrammar(Grammar):
#: flow-operators
flowmarker = "!" | "&" # '!' negative lookahead, '&' positive lookahead
| "<-!" | "<-&" # '<-' negative lookbehind, '<-&' positive lookbehind
| "<-!" | "<-&" # '<-!' negative lookbehind, '<-&' positive lookbehind
retrieveop = "::" | ":?" | ":" # '::' pop, ':?' optional pop, ':' retrieve
......@@ -1729,7 +1728,7 @@ class EBNFGrammar(Grammar):
EOF = !/./ [:?DEF] [:?OR] [:?AND] [:?ENDL] # [:?DEF], [:?OR], ... clear stack by eating stored value
[:?RNG_DELIM] [:?BRACE_SIGN] [:?CH_LEADIN] [:?TIMES] [:?RE_LEADIN] [:?RE_LEADOUT]
DEF = `=` | `:=` | `::=` | `<-` | /:\n/ | `: ` # if `: `, retrieve marker mustn't be followed by blank!
DEF = `=` | `:=` | `::=` | `<-` | /:\n/ | `: ` # with `: `, retrieve markers mustn't be followed by a blank!
OR = `|` | `/` !regex_heuristics
AND = `,` | ``
ENDL = `;` | ``
......@@ -1766,17 +1765,11 @@ class EBNFGrammar(Grammar):
countable = Forward()
element = Forward()
expression = Forward()
source_hash__ = "3bda01686407a47a9fd0a709bda53ae3"
source_hash__ = "c76fcc24e5077d4e150b771e6b60f0a1"
disposable__ = re.compile('component$|pure_elem$|countable$|FOLLOW_UP$|SYM_REGEX$|ANY_SUFFIX$|EOF$')
static_analysis_pending__ = [] # type: List[bool]
parser_initialization__ = ["upon instantiation"]
error_messages__ = {'definition': [
(re.compile(r','),
'Delimiter "," not expected in definition!\\nEither this was meant to be a directive '
'and the directive symbol @ is missing\\nor the error is due to inconsistent use of the '
'comma as a delimiter\\nfor the elements of a sequence.')]}
resume_rules__ = {'definition': [re.compile(r'\n\s*(?=@|\w+\w*\s*=)')],
'directive': [re.compile(r'\n\s*(?=@|\w+\w*\s*=)')]}
error_messages__ = {'definition': [(re.compile(r','), 'Delimiter "," not expected in definition!\\nEither this was meant to be a directive and the directive symbol @ is missing\\nor the error is due to inconsistent use of the comma as a delimiter\\nfor the elements of a sequence.')]}
COMMENT__ = r'(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/|\(\*(?:.|\n)*?\*\)'
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
......@@ -1787,15 +1780,8 @@ class EBNFGrammar(Grammar):
SYM_REGEX = RegExp('(?!\\d)\\w+')
RE_CORE = RegExp('(?:(?<!\\\\)\\\\(?:/)|[^/])*')
regex_heuristics = Alternative(RegExp('[^ ]'), RegExp('[^/\\n*?+\\\\]*[*?+\\\\][^/\\n]/'))
literal_heuristics = Alternative(RegExp('~?\\s*"(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^"]*)*"'),
RegExp("~?\\s*'(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^']*)*'"),
RegExp('~?\\s*`(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^`]*)*`'),
RegExp('~?\\s*´(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^´]*)*´'),
RegExp('~?\\s*/(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^/]*)*/'))
char_range_heuristics = NegativeLookahead(Alternative(
RegExp('[\\n\\t ]'), Series(dwsp__, literal_heuristics),
Series(Option(Alternative(Text("::"), Text(":?"), Text(":"))),
SYM_REGEX, RegExp('\\s*\\]'))))
literal_heuristics = Alternative(RegExp('~?\\s*"(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^"]*)*"'), RegExp("~?\\s*'(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^']*)*'"), RegExp('~?\\s*`(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^`]*)*`'), RegExp('~?\\s*´(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^´]*)*´'), RegExp('~?\\s*/(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^/]*)*/'))
char_range_heuristics = NegativeLookahead(Alternative(RegExp('[\\n\\t ]'), Series(dwsp__, literal_heuristics), Series(Option(Alternative(Text("::"), Text(":?"), Text(":"))), SYM_REGEX, RegExp('\\s*\\]'))))
CH_LEADIN = Capture(Alternative(Text("0x"), Text("#x")))
RE_LEADOUT = Capture(Text("/"))
RE_LEADIN = Capture(Alternative(Series(Text("/"), Lookahead(regex_heuristics)), Text("^/")))
......@@ -1806,87 +1792,46 @@ class EBNFGrammar(Grammar):
ENDL = Capture(Alternative(Text(";"), Text("")))
AND = Capture(Alternative(Text(","), Text("")))
OR = Capture(Alternative(Text("|"), Series(Text("/"), NegativeLookahead(regex_heuristics))))
DEF = Capture(Alternative(Text("="), Text(":="), Text("::="),
Text("<-"), RegExp(':\\n'), Text(": ")))
EOF = Drop(Drop(Series(Drop(NegativeLookahead(RegExp('.'))),
Drop(Option(Drop(Pop(DEF, match_func=optional_last_value)))),
Drop(Option(Drop(Pop(OR, match_func=optional_last_value)))),
Drop(Option(Drop(Pop(AND, match_func=optional_last_value)))),
Drop(Option(Drop(Pop(ENDL, match_func=optional_last_value)))),
Drop(Option(Drop(Pop(RNG_DELIM, match_func=optional_last_value)))),
Drop(Option(Drop(Pop(BRACE_SIGN, match_func=optional_last_value)))),
Drop(Option(Drop(Pop(CH_LEADIN, match_func=optional_last_value)))),
Drop(Option(Drop(Pop(TIMES, match_func=optional_last_value)))),
Drop(Option(Drop(Pop(RE_LEADIN, match_func=optional_last_value)))),
Drop(Option(Drop(Pop(RE_LEADOUT, match_func=optional_last_value)))))))
DEF = Capture(Alternative(Text("="), Text(":="), Text("::="), Text("<-"), RegExp(':\\n'), Text(": ")))
EOF = Drop(Series(Drop(NegativeLookahead(RegExp('.'))), Drop(Option(Drop(Pop(DEF, match_func=optional_last_value)))), Drop(Option(Drop(Pop(OR, match_func=optional_last_value)))), Drop(Option(Drop(Pop(AND, match_func=optional_last_value)))), Drop(Option(Drop(Pop(ENDL, match_func=optional_last_value)))), Drop(Option(Drop(Pop(RNG_DELIM, match_func=optional_last_value)))), Drop(Option(Drop(Pop(BRACE_SIGN, match_func=optional_last_value)))), Drop(Option(Drop(Pop(CH_LEADIN, match_func=optional_last_value)))), Drop(Option(Drop(Pop(TIMES, match_func=optional_last_value)))), Drop(Option(Drop(Pop(RE_LEADIN, match_func=optional_last_value)))), Drop(Option(Drop(Pop(RE_LEADOUT, match_func=optional_last_value))))))
whitespace = Series(RegExp('~'), dwsp__)
any_char = Series(Text("."), dwsp__)
free_char = Alternative(RegExp('[^\\n\\[\\]\\\\]'), RegExp('\\\\[nrt`´\'"(){}\\[\\]/\\\\]'))
character = Series(Retrieve(CH_LEADIN), HEXCODE)
char_range = Series(Text("["), Lookahead(char_range_heuristics), Option(Text("^")),
Alternative(character, free_char),
ZeroOrMore(Alternative(Series(Option(Text("-")), character), free_char)),
Series(Text("]"), dwsp__))
char_range = Series(Text("["), Lookahead(char_range_heuristics), Option(Text("^")), Alternative(character, free_char), ZeroOrMore(Alternative(Series(Option(Text("-")), character), free_char)), Series(Text("]"), dwsp__))
regexp = Series(Retrieve(RE_LEADIN), RE_CORE, Retrieve(RE_LEADOUT), dwsp__)
plaintext = Alternative(Series(RegExp('`(?:(?<!\\\\)\\\\`|[^`])*?`'), dwsp__),
Series(RegExp('´(?:(?<!\\\\)\\\\´|[^´])*?´'), dwsp__))
literal = Alternative(Series(RegExp('"(?:(?<!\\\\)\\\\"|[^"])*?"'), dwsp__),
Series(RegExp("'(?:(?<!\\\\)\\\\'|[^'])*?'"), dwsp__))
plaintext = Alternative(Series(RegExp('`(?:(?<!\\\\)\\\\`|[^`])*?`'), dwsp__), Series(RegExp('´(?:(?<!\\\\)\\\\´|[^´])*?´'), dwsp__))
literal = Alternative(Series(RegExp('"(?:(?<!\\\\)\\\\"|[^"])*?"'), dwsp__), Series(RegExp("'(?:(?<!\\\\)\\\\'|[^'])*?'"), dwsp__))
symbol = Series(SYM_REGEX, dwsp__)
multiplier = Series(RegExp('[1-9]\\d*'), dwsp__)
no_range = Alternative(NegativeLookahead(multiplier),
Series(Lookahead(multiplier), Retrieve(TIMES)))
range = Series(RNG_BRACE, dwsp__, multiplier,
Option(Series(Retrieve(RNG_DELIM), dwsp__, multiplier)),
Pop(RNG_BRACE, match_func=matching_bracket), dwsp__)
counted = Alternative(Series(countable, range),
Series(countable, Retrieve(TIMES), dwsp__, multiplier),
Series(multiplier, Retrieve(TIMES), dwsp__, countable, mandatory=3))
option = Alternative(Series(NegativeLookahead(char_range), Series(Text("["), dwsp__),
expression, Series(Text("]"), dwsp__), mandatory=2),
Series(element, Series(Text("?"), dwsp__)))
repetition = Alternative(Series(Series(Text("{"), dwsp__), no_range,
expression, Series(Text("}"), dwsp__), mandatory=2),
Series(element, Series(Text("*"), dwsp__), no_range))
oneormore = Alternative(Series(Series(Text("{"), dwsp__), no_range, expression,
Series(Text("}+"), dwsp__)),
Series(element, Series(Text("+"), dwsp__)))
group = Series(Series(Text("("), dwsp__), no_range,
expression, Series(Text(")"), dwsp__), mandatory=2)
retrieveop = Alternative(Series(Text("::"), dwsp__),
Series(Text(":?"), dwsp__),
Series(Text(":"), dwsp__))
flowmarker = Alternative(Series(Text("!"), dwsp__), Series(Text("&"), dwsp__),
Series(Text("<-!"), dwsp__), Series(Text("<-&"), dwsp__))
no_range = Alternative(NegativeLookahead(multiplier), Series(Lookahead(multiplier), Retrieve(TIMES)))
range = Series(RNG_BRACE, dwsp__, multiplier, Option(Series(Retrieve(RNG_DELIM), dwsp__, multiplier)), Pop(RNG_BRACE, match_func=matching_bracket), dwsp__)
counted = Alternative(Series(countable, range), Series(countable, Retrieve(TIMES), dwsp__, multiplier), Series(multiplier, Retrieve(TIMES), dwsp__, countable, mandatory=3))
option = Alternative(Series(NegativeLookahead(char_range), Series(Text("["), dwsp__), expression, Series(Text("]"), dwsp__), mandatory=2), Series(element, Series(Text("?"), dwsp__)))
repetition = Alternative(Series(Series(Text("{"), dwsp__), no_range, expression, Series(Text("}"), dwsp__), mandatory=2), Series(element, Series(Text("*"), dwsp__), no_range))
oneormore = Alternative(Series(Series(Text("{"), dwsp__), no_range, expression, Series(Text("}+"), dwsp__)), Series(element, Series(Text("+"), dwsp__)))
group = Series(Series(Text("("), dwsp__), no_range, expression, Series(Text(")"), dwsp__), mandatory=2)
retrieveop = Alternative(Series(Text("::"), dwsp__), Series(Text(":?"), dwsp__), Series(Text(":"), dwsp__))
flowmarker = Alternative(Series(Text("!"), dwsp__), Series(Text("&"), dwsp__), Series(Text("<-!"), dwsp__), Series(Text("<-&"), dwsp__))
ANY_SUFFIX = RegExp('[?*+]')
element.set(Alternative(Series(Option(retrieveop), symbol, NegativeLookahead(Retrieve(DEF))),
literal, plaintext, regexp, char_range, Series(character, dwsp__),
any_char, whitespace, group))
literals = OneOrMore(literal)
pure_elem = Series(element, NegativeLookahead(ANY_SUFFIX), mandatory=1)
countable.set(Alternative(option, oneormore, element))
procedure = Series(SYM_REGEX, Series(Text("()"), dwsp__))
term = Alternative(oneormore, counted, repetition, option, pure_elem)
difference = Series(term, Option(Series(Series(Text("-"), dwsp__),
Alternative(oneormore, pure_elem), mandatory=1)))
difference = Series(term, Option(Series(Series(Text("-"), dwsp__), Alternative(oneormore, pure_elem), mandatory=1)))
lookaround = Series(flowmarker, Alternative(oneormore, pure_elem), mandatory=1)
interleave = Series(difference, ZeroOrMore(Series(Series(Text("°"), dwsp__),
Option(Series(Text("§"), dwsp__)),
difference)))
sequence = Series(Option(Series(Text("§"), dwsp__)), Alternative(interleave, lookaround),
ZeroOrMore(Series(Retrieve(AND), dwsp__, Option(Series(Text("§"), dwsp__)),
Alternative(interleave, lookaround))))
expression.set(Series(sequence, ZeroOrMore(Series(Retrieve(OR), dwsp__, sequence))))
interleave = Series(difference, ZeroOrMore(Series(Series(Text("°"), dwsp__), Option(Series(Text("§"), dwsp__)), difference)))
sequence = Series(Option(Series(Text("§"), dwsp__)), Alternative(interleave, lookaround), ZeroOrMore(Series(NegativeLookahead(Text("@")), NegativeLookahead(Series(symbol, Retrieve(DEF))), Retrieve(AND), dwsp__, Option(Series(Text("§"), dwsp__)), Alternative(interleave, lookaround))))
FOLLOW_UP = Alternative(Text("@"), symbol, EOF)
procedure = Series(SYM_REGEX, Series(Text("()"), dwsp__))
literals = OneOrMore(literal)
component = Alternative(regexp, literals, procedure, Series(symbol, NegativeLookahead(DEF)))
directive = Series(
Series(Text("@"), dwsp__), symbol, Series(Text("="), dwsp__),
Alternative(Series(component, ZeroOrMore(Series(Series(Text(","), dwsp__), component))),
expression),
Lookahead(FOLLOW_UP), mandatory=1)
definition = Series(symbol, Retrieve(DEF), dwsp__, Option(Series(Retrieve(OR), dwsp__)),
expression, Retrieve(ENDL), dwsp__, Lookahead(FOLLOW_UP), mandatory=1)
definition = Series(symbol, Retrieve(DEF), dwsp__, Option(Series(Retrieve(OR), dwsp__)), expression, Retrieve(ENDL), dwsp__, Lookahead(FOLLOW_UP), mandatory=1)
component = Alternative(literals, procedure, expression)
directive = Series(Series(Text("@"), dwsp__), symbol, Series(Text("="), dwsp__), component, ZeroOrMore(Series(Series(Text(","), dwsp__), component)), Lookahead(FOLLOW_UP), mandatory=1)
element.set(Alternative(Series(Option(retrieveop), symbol, NegativeLookahead(Retrieve(DEF))), literal, plaintext, regexp, char_range, Series(character, dwsp__), any_char, whitespace, group))
countable.set(Alternative(option, oneormore, element))
expression.set(Series(sequence, ZeroOrMore(Series(Retrieve(OR), dwsp__, sequence))))
syntax = Series(dwsp__, ZeroOrMore(Alternative(definition, directive)), EOF)
resume_rules__ = {'definition': [re.compile(r'\n\s*(?=@|\w+\w*\s*=)')],
'directive': [re.compile(r'\n\s*(?=@|\w+\w*\s*=)')]}
root__ = syntax
def __init__(self, root: Parser = None, static_analysis: Optional[bool] = None) -> None:
......@@ -1968,21 +1913,18 @@ class FixedEBNFGrammar(Grammar):
@ comment = /(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/|\(\*(?:.|\n)*?\*\)/
# comments can be either C-Style: /* ... */
# or pascal/modula/oberon-style: (* ... *)
# or python-style: # ... \n,
# excluding, however, character markers: #x20
@ whitespace = /\s*/ # whitespace includes linefeeds
# or python-style: # ... \n, excluding, however, character markers: #x20
@ whitespace = /\s*/ # whitespace includes linefeed
@ literalws = right # trailing whitespace of literals will be ignored tacitly
@ disposable = component, pure_elem, countable, FOLLOW_UP, SYM_REGEX, ANY_SUFFIX, EOF
@ drop = whitespace, EOF # do not include these even in the concrete syntax tree
@ RNG_BRACE_filter = matching_bracket() # filter or transform content of RNG_BRACE on retrieve
# re-entry-rules for resuming after parsing-error
@ definition_resume = /\n\s*(?=@|\w+\w*\s*=)/
@ directive_resume = /\n\s*(?=@|\w+\w*\s*=)/
# specialized error messages for certain cases
@ definition_error = /,/, 'Delimiter "," not expected in definition!\nEither this was meant to '
......@@ -1990,14 +1932,13 @@ class FixedEBNFGrammar(Grammar):
'due to inconsistent use of the comma as a delimiter\nfor the elements '
'of a sequence.'
#: top-level
syntax = ~ { definition | directive } EOF
definition = symbol §DEF~ [ OR~ ] expression ENDL~ & FOLLOW_UP # [OR~] to support v. Rossum's syntax
directive = "@" §symbol "=" ( component { "," component } | expression ) & FOLLOW_UP
component = (regexp | literals | procedure | symbol !DEF)
directive = "@" §symbol "=" component { "," component } & FOLLOW_UP
component = literals | procedure | expression
literals = { literal }+ # string chaining, only allowed in directives!
procedure = SYM_REGEX "()" # procedure name, only allowed in directives!
......@@ -2111,19 +2052,11 @@ class FixedEBNFGrammar(Grammar):
countable = Forward()
element = Forward()
expression = Forward()
source_hash__ = "d0735678e82e6d7cbf75958080a607ff"
source_hash__ = "d39bd97362e79f1a15bdca37c067d78b"
disposable__ = re.compile('component$|pure_elem$|countable$|FOLLOW_UP$|SYM_REGEX$|ANY_SUFFIX$|EOF$')
static_analysis_pending__ = [] # type: List[bool]
parser_initialization__ = ["upon instantiation"]
error_messages__ = {
'definition':
[(re.compile(r','),
'Delimiter "," not expected in definition!\\n'
'Either this was meant to be a directive and the directive symbol @ is missing\\n'
'or the error is due to inconsistent use of the comma as a delimiter\\n'
'for the elements of a sequence.')]}
resume_rules__ = {'definition': [re.compile(r'\n\s*(?=@|\w+\w*\s*=)')],
'directive': [re.compile(r'\n\s*(?=@|\w+\w*\s*=)')]}
error_messages__ = {'definition': [(re.compile(r','), 'Delimiter "," not expected in definition!\\nEither this was meant to be a directive and the directive symbol @ is missing\\nor the error is due to inconsistent use of the comma as a delimiter\\nfor the elements of a sequence.')]}
COMMENT__ = r'(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/|\(\*(?:.|\n)*?\*\)'
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
......@@ -2133,6 +2066,9 @@ class FixedEBNFGrammar(Grammar):
HEXCODE = RegExp('[A-Fa-f0-9]{1,8}')
SYM_REGEX = RegExp('(?!\\d)\\w+')
RE_CORE = RegExp('(?:(?<!\\\\)\\\\(?:/)|[^/])*')
regex_heuristics = Alternative(RegExp('[^ ]'), RegExp('[^/\\n*?+\\\\]*[*?+\\\\][^/\\n]/'))
literal_heuristics = Alternative(RegExp('~?\\s*"(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^"]*)*"'), RegExp("~?\\s*'(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^']*)*'"), RegExp('~?\\s*`(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^`]*)*`'), RegExp('~?\\s*´(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^´]*)*´'), RegExp('~?\\s*/(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^/]*)*/'))
char_range_heuristics = NegativeLookahead(Alternative(RegExp('[\\n\\t ]'), Series(dwsp__, literal_heuristics), Series(Option(Alternative(Text("::"), Text(":?"), Text(":"))), SYM_REGEX, RegExp('\\s*\\]'))))
CH_LEADIN = Text("0x")
RE_LEADOUT = Text("/")
RE_LEADIN = Text("/")
......@@ -2144,69 +2080,45 @@ class FixedEBNFGrammar(Grammar):
AND = Text("")
OR = Text("|")
DEF = Text("=")
EOF = Drop(Drop(NegativeLookahead(RegExp('.'))))
EOF = Drop(NegativeLookahead(RegExp('.')))
whitespace = Series(RegExp('~'), dwsp__)
any_char = Series(Text("."), dwsp__)
free_char = Alternative(RegExp('[^\\n\\[\\]\\\\]'), RegExp('\\\\[nrt`´\'"(){}\\[\\]/\\\\]'))
character = Series(CH_LEADIN, HEXCODE)
char_range = Series(Text("["), Lookahead(char_range_heuristics), Option(Text("^")), Alternative(character, free_char), ZeroOrMore(Alternative(Series(Option(Text("-")), character), free_char)), Series(Text("]"), dwsp__))
regexp = Series(RE_LEADIN, RE_CORE, RE_LEADOUT, dwsp__)
plaintext = Alternative(Series(RegExp('`(?:(?<!\\\\)\\\\`|[^`])*?`'), dwsp__),
Series(RegExp('´(?:(?<!\\\\)\\\\´|[^´])*?´'), dwsp__))
literal = Alternative(Series(RegExp('"(?:(?<!\\\\)\\\\"|[^"])*?"'), dwsp__),
Series(RegExp("'(?:(?<!\\\\)\\\\'|[^'])*?'"), dwsp__))
plaintext = Alternative(Series(RegExp('`(?:(?<!\\\\)\\\\`|[^`])*?`'), dwsp__), Series(RegExp('´(?:(?<!\\\\)\\\\´|[^´])*?´'), dwsp__))
literal = Alternative(Series(RegExp('"(?:(?<!\\\\)\\\\"|[^"])*?"'), dwsp__), Series(RegExp("'(?:(?<!\\\\)\\\\'|[^'])*?'"), dwsp__))
symbol = Series(SYM_REGEX, dwsp__)
multiplier = Series(RegExp('[1-9]\\d*'), dwsp__)
no_range = Alternative(NegativeLookahead(multiplier),
Series(Lookahead(multiplier), TIMES))
range = Series(RNG_OPEN, dwsp__, multiplier, Option(Series(RNG_DELIM, dwsp__, multiplier)),
RNG_CLOSE, dwsp__)
counted = Alternative(Series(countable, range), Series(countable, TIMES, dwsp__, multiplier),
Series(multiplier, TIMES, dwsp__, countable, mandatory=3))
option = Alternative(
Series(Series(Text("["), dwsp__), expression, Series(Text("]"), dwsp__), mandatory=1),
Series(element, Series(Text("?"), dwsp__)))
repetition = Alternative(
Series(Series(Text("{"), dwsp__), no_range, expression,
Series(Text("}"), dwsp__), mandatory=2),
Series(element, Series(Text("*"), dwsp__), no_range))
oneormore = Alternative(
Series(Series(Text("{"), dwsp__), no_range, expression, Series(Text("}+"), dwsp__)),
Series(element, Series(Text("+"), dwsp__)))
group = Series(Series(Text("("), dwsp__), no_range, expression,
Series(Text(")"), dwsp__), mandatory=2)
retrieveop = Alternative(
Series(Text("::"), dwsp__), Series(Text(":?"), dwsp__), Series(Text(":"), dwsp__))
flowmarker = Alternative(
Series(Text("!"), dwsp__), Series(Text("&"), dwsp__),
Series(Text("<-!"), dwsp__), Series(Text("<-&"), dwsp__))
no_range = Alternative(NegativeLookahead(multiplier), Series(Lookahead(multiplier), TIMES))
range = Series(RNG_OPEN, dwsp__, multiplier, Option(Series(RNG_DELIM, dwsp__, multiplier)), RNG_CLOSE, dwsp__)
counted = Alternative(Series(countable, range), Series(countable, TIMES, dwsp__, multiplier), Series(multiplier, TIMES, dwsp__, countable, mandatory=3))
option = Alternative(Series(Series(Text("["), dwsp__), expression, Series(Text("]"), dwsp__), mandatory=1), Series(element, Series(Text("?"), dwsp__)))
repetition = Alternative(Series(Series(Text("{"), dwsp__), no_range, expression, Series(Text("}"), dwsp__), mandatory=2), Series(element, Series(Text("*"), dwsp__), no_range))
oneormore = Alternative(Series(Series(Text("{"), dwsp__), no_range, expression, Series(Text("}+"), dwsp__)), Series(element, Series(Text("+"), dwsp__)))
group = Series(Series(Text("("), dwsp__), no_range, expression, Series(Text(")"), dwsp__), mandatory=2)
retrieveop = Alternative(Series(Text("::"), dwsp__), Series(Text(":?"), dwsp__), Series(Text(":"), dwsp__))
flowmarker = Alternative(Series(Text("!"), dwsp__), Series(Text("&"), dwsp__), Series(Text("<-!"), dwsp__), Series(Text("<-&"), dwsp__))
ANY_SUFFIX = RegExp('[?*+]')
element.set(Alternative(
Series(Option(retrieveop), symbol, NegativeLookahead(DEF)),
literal, plaintext, regexp, Series(character, dwsp__), any_char, whitespace, group))
literals = OneOrMore(literal)
pure_elem = Series(element, NegativeLookahead(ANY_SUFFIX), mandatory=1)
countable.set(Alternative(option, oneormore, element))
procedure = Series(SYM_REGEX, Series(Text("()"), dwsp__))
term = Alternative(oneormore, counted, repetition, option, pure_elem)
difference = Series(term, Option(Series(
Series(Text("-"), dwsp__), Alternative(oneormore, pure_elem), mandatory=1)))
difference = Series(term, Option(Series(Series(Text("-"), dwsp__), Alternative(oneormore, pure_elem), mandatory=1)))
lookaround = Series(flowmarker, Alternative(oneormore, pure_elem), mandatory=1)
interleave = Series(difference, ZeroOrMore(
Series(Series(Text("°"), dwsp__), Option(Series(Text("§"), dwsp__)), difference)))
sequence = Series(
Option(Series(Text("§"), dwsp__)), Alternative(interleave, lookaround),
ZeroOrMore(Series(AND, dwsp__, Option(Series(Text("§"), dwsp__)),
Alternative(interleave, lookaround))))
expression.set(Series(sequence, ZeroOrMore(Series(OR, dwsp__, sequence))))
interleave = Series(difference, ZeroOrMore(Series(Series(Text("°"), dwsp__), Option(Series(Text("§"), dwsp__)), difference)))
sequence = Series(Option(Series(Text("§"), dwsp__)), Alternative(interleave, lookaround), ZeroOrMore(Series(AND, dwsp__, Option(Series(Text("§"), dwsp__)), Alternative(interleave, lookaround))))
FOLLOW_UP = Alternative(Text("@"), symbol, EOF)
procedure = Series(SYM_REGEX, Series(Text("()"), dwsp__))
literals = OneOrMore(literal)
component = Alternative(regexp, literals, procedure, Series(symbol, NegativeLookahead(DEF)))
directive = Series(
Series(Text("@"), dwsp__), symbol, Series(Text("="), dwsp__),
Alternative(Series(component, ZeroOrMore(Series(Series(Text(","), dwsp__), component))),
expression),
Lookahead(FOLLOW_UP), mandatory=1)
definition = Series(symbol, DEF, dwsp__, Option(Series(OR, dwsp__)), expression, ENDL, dwsp__,
Lookahead(FOLLOW_UP), mandatory=1)
definition = Series(symbol, DEF, dwsp__, Option(Series(OR, dwsp__)), expression, ENDL, dwsp__, Lookahead(FOLLOW_UP), mandatory=1)
component = Alternative(literals, procedure, expression)
directive = Series(Series(Text("@"), dwsp__), symbol, Series(Text("="), dwsp__), component, ZeroOrMore(Series(Series(Text(","), dwsp__), component)), Lookahead(FOLLOW_UP), mandatory=1)
element.set(Alternative(Series(Option(retrieveop), symbol, NegativeLookahead(DEF)), literal, plaintext, regexp, Series(character, dwsp__), any_char, whitespace, group))
countable.set(Alternative(option, oneormore, element))
expression.set(Series(sequence, ZeroOrMore(Series(OR, dwsp__, sequence))))
syntax = Series(dwsp__, ZeroOrMore(Alternative(definition, directive)), EOF)
resume_rules__ = {'definition': [re.compile(r'\n\s*(?=@|\w+\w*\s*=)')],
'directive': [re.compile(r'\n\s*(?=@|\w+\w*\s*=)')]}
root__ = syntax
......
# ebnf.py - EBNF -> Python-Parser compilation for DHParser
#
# Copyright 2016 by Eckhart Arnold (arnold@badw.de)
# Bavarian Academy of Sciences and Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.
"""
Module ``ebnf`` provides an EBNF-parser-generator that compiles an
EBNF-Grammar into Python-code that can be executed to parse source text
conforming to this grammar into concrete syntax trees.
Specifying Grammars with EBNF
-----------------------------
With DHParser, Grammars can be specified either directly in Python-code
(see :py:mod:`parse`) or in one of several EBNF-dialects. (Yes,
DHParser supports several different variants of EBNF! This makes it easy
to create a parser directly from Grammars found in external sources.)
"EBNF" stands for the "Extended-Backus-Naur-Form" which is a common
formalism for specifying Grammars for context-free-languages.
(see https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form)
The recommended way of compiling grammars with DHParser is to either
write the EBNF-specification for that Grammar into a text-file and then
compile EBNF-source to an executable as well as importable Python-module
with the help of the "dhparser"-script. Or, for bigger projects, to
create a new domain-specific-language-project with the DHParser-script
as described in the step-by-step-guide.
However, here we will show how to compile an EBNF-specified grammar
from within Python-code and how to execute the parser that was
generated by compiling the grammar.
As an example, we will realize a json-parser (https://www.json.org/).
Let's start with creating some test-data::
>>> testobj = {'array': [1, 2.0, "a string"], 'number': -1.3e+25, 'bool': False}
>>> import json
>>> testdata = json.dumps(testobj)
>>> testdata
'{"array": [1, 2.0, "a string"], "number": -1.3e+25, "bool": false}'
We define the json-Grammar (see https://www.json.org/) in
top-down manner in EBNF. We'll use a regular-expression look-alike
syntax. EBNF, as you may recall, consists of a sequence of symbol
definitions. The definiens of those definitions either is a string
literal or regular expression or other symbols or a combination
of these with four different operators: 1. sequences
2. alternatives 3. options and 4. repetitions. Here is how these
elements are denoted in classical and regex-like EBNF-syntax:
======================== ================== ================
element classical EBNF regex-like
======================== ================== ================
insignificant whitespace ~ ~
string literal "..." or \\`...\\` "..." or \\`...\\`
regular expr. /.../ /.../
sequences A B C A B C
alternatives A | B | C A | B | C
options [ ... ] ...?
repetitions              { ... }            ...*
one or more ...+
grouping (...) (...)
======================== ================== ================
"insignificant whitespace" is a speciality of DHParser. Denoting
insignificant whitespace with a particular sign ``~`` makes it possible to
eliminate it already during the parsing process without burdening later
syntax-tree-processing stages with this common task. DHParser offers
several more facilities to restrain the verbosity of the concrete
syntax tree, so that the outcome of the parsing stage comes close (or
at least closer) to the intended abstract-syntax-tree, already.
JSON consists of two complex data types, 1) associative arrays,
called "object" and sequences of heterogeneous data, called array; and
of four simple data types, 1) string, 2) number, 3) bool and 4) null.
The structure of a JSON file can easily be described in EBNF::
>>> grammar = '''
... json = ~ _element _EOF
... _EOF = /$/
... _element = object | array | string | number | bool | null
... object = "{" ~ member ( "," ~ §member )* "}" ~
... member = string ":" ~ _element
... array = "[" ~ ( _element ( "," ~ _element )* )? "]" ~
... string = `"` _CHARS `"` ~
... _CHARS = /[^"\\\\\\]+/ | /\\\\\\[\\\\/bnrt\\\\\\]/
... number = _INT _FRAC? _EXP? ~
... _INT = `-`? ( /[1-9][0-9]+/ | /[0-9]/ )
... _FRAC = `.` /[0-9]+/
... _EXP = (`E`|`e`) [`+`|`-`] /[0-9]+/
... bool = "true" ~ | "false" ~
... null = "null" ~ '''
This is a rather common EBNF-grammar. A few peculiarities are noteworthy, though:
First of all you might notice that some components of the grammar
(or "production rules" as they are commonly called) have names with a leading
underscore ``_``. It is a convention to mark those elements in which we are not
interested on their own account with an underscore ``_``. When moving from the
concrete syntax-tree to a more abstract syntax-tree, these elements could be
substituted by their content, to simplify the tree.
Secondly, some production rules carry a name written in capital letters. This is also
a convention to mark those symbols which, with other parser-generators, would
represent tokens delivered by a lexical scanner. DHParser is a "scanner-less"
parser, which means that the breaking down of the string into meaningful tokens
is done in place with regular expressions (like in the definition of ``_EOF``)
or simple combinations of regular expressions (see the definition of ``_INT`` above).
There is no sharp distinction between tokens and other symbols in DHParser,
but we keep it as a loose convention. Regular expressions are enclosed in forward
slashes and follow the standard syntax of Perl-style regular expression that is
also used by the "re"-module of the Python standard library. (Don't worry about
the number of backslashes in the line defining ``_CHARS`` for now!)
Finally, it is another helpful convention to indent the definitions of symbols
that have only been introduced to simplify an otherwise unnecessarily
complicated definition (e.g. the definition of ``number``, above) or to make
it more understandable by giving names to its components (like ``_EOF``).
Let's try this grammar on our test-string. In order to compile
this grammar into executable Python-code, we use the high-level-function
:py:func:`~dsl.create_parser` from the :py:mod:`dsl`-module.
>>> from DHParser.dsl import create_parser
>>> # from DHParser.dsl import compileEBNF
>>> # print(compileEBNF(grammar))
>>> parser = create_parser(grammar, branding="JSON")
>>> syntax_tree = parser(testdata)
>>> syntax_tree.content
'{"array": [1, 2.0, "a string"], "number": -1.3e+25, "bool": false}'
As expected serializing the content of the resulting syntax-tree yields exactly
the input-string of the parsing process. What we cannot see here, is that the
parser has structured the string into the individual elements described in the
grammar. Since the concrete syntax-tree that the parser yields is rather
verbose, it would not make sense to print it out. We'll just look at a small
part of it, to see what it looks like. Let's just pick the sub-tree that
captures the first json-array within the syntax-tree::
>>> print(syntax_tree.pick('array').as_sxpr())
(array
(:Text "[")
(_element
(number
(_INT "1")))
(:Text ",")
(:Whitespace " ")
(_element
(number
(_INT "2")
(_FRAC
(:Text ".")
(:RegExp "0"))))
(:Text ",")
(:Whitespace " ")
(_element
(string
(:Text '"')
(_CHARS "a string")
(:Text '"')))
(:Text "]"))
The nodes of the syntax-tree carry the names of the production rules
by which they have been generated. Nodes, that have been created by
components of a production receive the name of the parser-type
that has created the node (see :py:mod:`parse`) prefixed
with a colon ":". In DHParser, these nodes are called "anonymous",
because they lack the name of a proper grammatical component.
.. _simplifying_syntax_trees:
Simplifying Syntax-Trees while Parsing
--------------------------------------
Usually, anonymous nodes are what you want to get rid of in the course
of transforming the concrete syntax-tree into an abstract syntax-tree.