Commit b00c2671 authored by eckhart's avatar eckhart
Browse files

ebnf.py: added a fast-path EBNF parser for use when less flexibility
regarding the variants of EBNF syntax is acceptable.
parent 157bd90e
......@@ -336,33 +336,59 @@ CONFIG_PRESET['default_literalws'] = "none"
# Default value for the brand of EBNF that DHParser accepts
# 'fixed' - Allows to use suffix syntax (?, +, *) as well as classic
# EBNF-syntax ([], {}). The delimiters are fixed before first use to
# the DHParser-standard and will not be read from configuration-value
# "delimiter_set".
# 'classic' - relatively closest to the ISO-standard, i.e. uses [] and {}
# for optional and zero or more elements, respectively. Does not allow
# the ?, +, * suffixes. Allows the specification of character-ranges
# within square brackets only with the ordinal unicode numbers,
# not with the characters itself, i.e. [0x41-0x5A]
# 'regex-like' - similar to regular expression syntax, allows ?, +, *
# suffixes for optional, one or more repetitions, zero or more
# repetitions, but not {} or []. Allows character-ranges within
# square bracket in any form.
# 'peg-like' - like regex-like, but uses / instead of | for the
# alternative-parser. Does not allow regular expressions between, i.e.
# / ... / within the EBNF-code!
# not with the characters itself, i.e. [0x41-0x5A]. Delimiters will
# be configured on first use.
# 'strict' - allows both classic and regex-like syntax to be mixed, but
# allows character ranges within square brackets with ordinal values,
# only. Uses | as delimiter for alternatives.
# 'configurable' - like fixed, but the delimiter constants will be configured
# from the configuration-value 'delimiter_set' (see below).
# 'heuristic' - the most liberal mode, allows about everything. However,
# because it employs heuristics to distinguish ambiguous cases, it
# may lead to unexpected errors and require the user to resolve the
# ambiguities
# 'regex-like' - similar to regular expression syntax, allows ?, +, *
# suffixes for optional, one or more repetitions, zero or more
# repetitions, but not {} or []. Allows character-ranges within
# square bracket in any form.
# 'peg-like' - like regex-like, but uses / instead of | for the
# alternative-parser. Does not allow regular expressions delimited by
# slashes, i.e. / ... /, within the EBNF-code!
# Default value: "fixed"
# Symbolic names for the supported EBNF-syntax variants (see the
# explanation of each variant in the comment block above).
EBNF_FIXED_SYNTAX = "fixed"
EBNF_CLASSIC_SYNTAX = "classic"
EBNF_ANY_SYNTAX_STRICT = "strict"
EBNF_CONFIGURABLE_SYNTAX = "configurable"
EBNF_ANY_SYNTAX_HEURISTICAL = "heuristic"
EBNF_REGULAR_EXPRESSION_SYNTAX = "regex-like"
EBNF_PARSING_EXPRESSION_GRAMMAR_SYNTAX = "peg-like"
# Default variant. (An earlier preset of EBNF_ANY_SYNTAX_STRICT on the
# same key was a dead store, immediately overwritten; it has been removed.)
CONFIG_PRESET['syntax_variant'] = EBNF_FIXED_SYNTAX

# Set of delimiters used when the 'configurable' grammar variant is active.
# Keys name the delimiter parsers of the grammar; values are the plain-text
# tokens (or regex sources) they are set to via parse.update_scanner().
CONFIG_PRESET['delimiter_set'] = {
    'DEF': '=',
    'OR': '|',
    'AND': '',
    'ENDL': '',
    'RNG_OPEN': '{',
    'RNG_CLOSE': '}',
    'RNG_DELIM': ',',
    'TIMES': '*',
    'RE_LEADIN': '/',
    'RE_LEADOUT': '/',
    'CH_LEADIN': '0x'
}
########################################################################
......
......@@ -41,7 +41,7 @@ from DHParser.error import Error, AMBIGUOUS_ERROR_HANDLING, WARNING, REDECLARED_
from DHParser.parse import Parser, Grammar, mixin_comment, mixin_nonempty, Forward, RegExp, \
Drop, Lookahead, NegativeLookahead, Alternative, Series, Option, ZeroOrMore, OneOrMore, \
Text, Capture, Retrieve, Pop, optional_last_value, GrammarError, Whitespace, Always, Never, \
INFINITE, matching_bracket, ParseFunc
INFINITE, matching_bracket, ParseFunc, update_scanner
from DHParser.preprocess import nil_preprocessor, PreprocessorFunc
from DHParser.syntaxtree import Node, WHITESPACE_PTYPE, TOKEN_PTYPE, EMPTY_NODE
from DHParser.toolkit import load_if_file, escape_re, escape_control_characters, md5, \
......@@ -505,6 +505,129 @@ class EBNFGrammar(Grammar):
)))
class FixedEBNFGrammar(Grammar):
r"""Faster version of EBNF, where delimiters are not determined on
first use, but defined as constant Text-parsers. They can still be
adjusted with function `parse.update_scanner()`.
"""
# forward declarations for mutually recursive parsers; bound via .set() below
countable = Forward()
element = Forward()
expression = Forward()
# bookkeeping fields -- presumably auto-generated from "FixedEBNF.ebnf";
# regenerate rather than hand-edit (source_hash__ guards against staleness)
source_hash__ = "d0735678e82e6d7cbf75958080a607ff"
anonymous__ = re.compile('pure_elem$|countable$|FOLLOW_UP$|SYM_REGEX$|ANY_SUFFIX$|EOF$')
static_analysis_pending__ = [] # type: List[bool]
parser_initialization__ = ["upon instantiation"]
# special error message for a common mistake: comma used inside a definition
error_messages__ = {
'definition':
[(re.compile(r','),
'Delimiter "," not expected in definition!\\n'
'Either this was meant to be a directive and the directive symbol @ is missing\\n'
'or the error is due to inconsistent use of the comma as a delimiter\\n'
'for the elements of a sequence.')]}
# error recovery: resume at the next directive or symbol-definition line
resume_rules__ = {'definition': [re.compile(r'\n\s*(?=@|\w+\w*\s*=)')],
'directive': [re.compile(r'\n\s*(?=@|\w+\w*\s*=)')]}
# comment styles: '# ...' (excluding character markers like #x20),
# C-style /* ... */ and pascal/oberon-style (* ... *)
COMMENT__ = r'(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/|\(\*(?:.|\n)*?\*\)'
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = Drop(Whitespace(WSP_RE__))  # whitespace dropped from the syntax tree
# leaf regular expressions
HEXCODE = RegExp('[A-Fa-f0-9]{1,8}')
SYM_REGEX = RegExp('(?!\\d)\\w+')
RE_CORE = RegExp('(?:(?<!\\\\)\\\\(?:/)|[^/])*')
# heuristics for telling regexes, string literals and character ranges apart
regex_heuristics = Alternative(RegExp('[^ ]'), RegExp('[^/\\n*?+\\\\]*[*?+\\\\][^/\\n]/'))
literal_heuristics = Alternative(RegExp('~?\\s*"(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^"]*)*"'),
RegExp("~?\\s*'(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^']*)*'"),
RegExp('~?\\s*`(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^`]*)*`'),
RegExp('~?\\s*´(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^´]*)*´'),
RegExp('~?\\s*/(?:[\\\\]\\]|[^\\]]|[^\\\\]\\[[^/]*)*/'))
char_range_heuristics = NegativeLookahead(
Alternative(RegExp('[\\n\\t ]'), Series(dwsp__, literal_heuristics),
Series(Option(Alternative(Text("::"), Text(":?"), Text(":"))),
SYM_REGEX, RegExp('\\s*\\]'))))
# fixed delimiter parsers; unlike in EBNFGrammar these are constant
# Text-parsers, adjustable afterwards via parse.update_scanner()
CH_LEADIN = Text("0x")
RE_LEADOUT = Text("/")
RE_LEADIN = Text("/")
TIMES = Text("*")
RNG_DELIM = Text(",")
RNG_CLOSE = Text("}")
RNG_OPEN = Text("{")
ENDL = Text("")
AND = Text("")
OR = Text("|")
DEF = Text("=")
# NOTE(review): the doubled Drop(Drop(...)) looks redundant -- presumably a
# generator artifact; harmless, but confirm before "fixing" by hand
EOF = Drop(Drop(NegativeLookahead(RegExp('.'))))
# leaf elements of the grammar
whitespace = Series(RegExp('~'), dwsp__)
any_char = Series(Text("."), dwsp__)
free_char = Alternative(RegExp('[^\\n\\[\\]\\\\]'), RegExp('\\\\[nrt`´\'"(){}\\[\\]/\\\\]'))
character = Series(CH_LEADIN, HEXCODE)
char_range = Series(Text("["), Lookahead(char_range_heuristics), Option(Text("^")),
Alternative(character, free_char),
ZeroOrMore(Alternative(Series(Option(Text("-")), character), free_char)),
Series(Text("]"), dwsp__))
regexp = Series(RE_LEADIN, RE_CORE, RE_LEADOUT, dwsp__)
plaintext = Alternative(Series(RegExp('`(?:(?<!\\\\)\\\\`|[^`])*?`'), dwsp__),
Series(RegExp('´(?:(?<!\\\\)\\\\´|[^´])*?´'), dwsp__))
literal = Alternative(Series(RegExp('"(?:(?<!\\\\)\\\\"|[^"])*?"'), dwsp__),
Series(RegExp("'(?:(?<!\\\\)\\\\'|[^'])*?'"), dwsp__))
symbol = Series(SYM_REGEX, dwsp__)
# repetition counts: {n}, {n,m}, element * n, n * element
multiplier = Series(RegExp('[1-9]\\d*'), dwsp__)
no_range = Alternative(NegativeLookahead(multiplier),
Series(Lookahead(multiplier), TIMES))
# `range` mirrors the grammar symbol of the same name (shadows the builtin)
range = Series(RNG_OPEN, dwsp__, multiplier, Option(Series(RNG_DELIM, dwsp__, multiplier)),
RNG_CLOSE, dwsp__)
counted = Alternative(Series(countable, range), Series(countable, TIMES, dwsp__, multiplier),
Series(multiplier, TIMES, dwsp__, countable, mandatory=3))
# structural combinators: [..] or ?, {..} or *, {..}+ or +, (..)
option = Alternative(
Series(Series(Text("["), dwsp__), expression, Series(Text("]"), dwsp__), mandatory=1),
Series(element, Series(Text("?"), dwsp__)))
repetition = Alternative(
Series(Series(Text("{"), dwsp__), no_range, expression,
Series(Text("}"), dwsp__), mandatory=2),
Series(element, Series(Text("*"), dwsp__), no_range))
oneormore = Alternative(
Series(Series(Text("{"), dwsp__), no_range, expression, Series(Text("}+"), dwsp__)),
Series(element, Series(Text("+"), dwsp__)))
group = Series(Series(Text("("), dwsp__), no_range, expression,
Series(Text(")"), dwsp__), mandatory=2)
# flow operators: retrieve/pop (::, :?, :) and lookahead/-behind markers
retrieveop = Alternative(
Series(Text("::"), dwsp__), Series(Text(":?"), dwsp__), Series(Text(":"), dwsp__))
flowmarker = Alternative(
Series(Text("!"), dwsp__), Series(Text("&"), dwsp__),
Series(Text("<-!"), dwsp__), Series(Text("<-&"), dwsp__))
ANY_SUFFIX = RegExp('[?*+]')
# binding of the forward-declared parsers
element.set(Alternative(
Series(Option(retrieveop), symbol, NegativeLookahead(DEF)),
literal, plaintext, regexp, Series(character, dwsp__), any_char, whitespace, group))
pure_elem = Series(element, NegativeLookahead(ANY_SUFFIX), mandatory=1)
countable.set(Alternative(option, oneormore, element))
term = Alternative(oneormore, counted, repetition, option, pure_elem)
difference = Series(term, Option(Series(
Series(Text("-"), dwsp__), Alternative(oneormore, pure_elem), mandatory=1)))
lookaround = Series(flowmarker, Alternative(oneormore, pure_elem), mandatory=1)
interleave = Series(difference, ZeroOrMore(
Series(Series(Text("°"), dwsp__), Option(Series(Text("§"), dwsp__)), difference)))
sequence = Series(
Option(Series(Text("§"), dwsp__)), Alternative(interleave, lookaround),
ZeroOrMore(Series(AND, dwsp__, Option(Series(Text("§"), dwsp__)),
Alternative(interleave, lookaround))))
expression.set(Series(sequence, ZeroOrMore(Series(OR, dwsp__, sequence))))
# top level: directives (@ key = value, ...) and symbol definitions
FOLLOW_UP = Alternative(Text("@"), symbol, EOF)
procedure = Series(SYM_REGEX, Series(Text("()"), dwsp__))
literals = OneOrMore(literal)
directive = Series(
Series(Text("@"), dwsp__), symbol, Series(Text("="), dwsp__),
Alternative(regexp, literals, procedure, Series(symbol, NegativeLookahead(DEF))),
ZeroOrMore(Series(
Series(Text(","), dwsp__),
Alternative(regexp, literals, procedure, Series(symbol, NegativeLookahead(DEF))))),
Lookahead(FOLLOW_UP), mandatory=1)
definition = Series(symbol, DEF, dwsp__, Option(Series(OR, dwsp__)), expression, ENDL, dwsp__,
Lookahead(FOLLOW_UP), mandatory=1, err_msgs=error_messages__["definition"])
syntax = Series(dwsp__, ZeroOrMore(Alternative(definition, directive)), EOF)
root__ = syntax
def grammar_changed(grammar_class, grammar_source: str) -> bool:
"""
Returns ``True`` if ``grammar_class`` does not reflect the latest
......@@ -540,12 +663,30 @@ def grammar_changed(grammar_class, grammar_source: str) -> bool:
def get_ebnf_grammar() -> EBNFGrammar:
    """Returns a thread-local EBNF-Grammar-object for parsing EBNF sources.

    Depending on the configuration value 'syntax_variant' either the fast
    ``FixedEBNFGrammar`` (variants 'fixed' and 'configurable') or the more
    flexible ``EBNFGrammar`` is returned. The grammar object is cached as a
    thread-local singleton and re-created whenever the cached object's class
    no longer matches the configured variant.

    Note: the unmodified merged-diff version of this function still contained
    the pre-commit lines that unconditionally replaced the singleton with a
    plain ``EBNFGrammar``, shadowing the fixed/configurable branch; they have
    been removed here.
    """
    THREAD_LOCALS = access_thread_locals()
    mode = get_config_value('syntax_variant')
    try:
        grammar = THREAD_LOCALS.ebnf_grammar_singleton
        # invalidate the cached singleton if its class does not fit the
        # currently configured syntax variant
        if mode in ('fixed', 'configurable'):
            if not isinstance(grammar, FixedEBNFGrammar):
                raise AttributeError
        else:
            if not isinstance(grammar, EBNFGrammar):
                raise AttributeError
    except AttributeError:
        if mode in ('fixed', 'configurable'):
            grammar = FixedEBNFGrammar()
            if mode == "fixed":
                # configure the delimiters only once, upon instantiation
                update_scanner(grammar, get_config_value('delimiter_set'))
            THREAD_LOCALS.ebnf_grammar_singleton = grammar
        else:
            grammar = EBNFGrammar()
            THREAD_LOCALS.ebnf_grammar_singleton = grammar
    if mode == 'configurable':
        # re-configure the delimiters on each request of the grammar object
        update_scanner(grammar, get_config_value('delimiter_set'))
    elif mode != 'fixed':
        grammar.mode = mode
    return grammar
......@@ -600,7 +741,7 @@ EBNF_AST_transformation_table = {
"counted":
[remove_children('TIMES')],
"range":
[remove_children('BRACE_SIGN', 'RNG_BRACE', 'RNG_DELIM')],
[remove_children('BRACE_SIGN', 'RNG_BRACE', 'RNG_DELIM', 'RNG_OPEN', 'RNG_CLOSE')],
"symbol, literal, any_char":
[reduce_single_child],
"plaintext":
......@@ -615,8 +756,8 @@ EBNF_AST_transformation_table = {
[],
(TOKEN_PTYPE, WHITESPACE_PTYPE, "whitespace"):
[reduce_single_child],
"EOF, DEF, OR, AND, ENDL, BRACE_SIGN, RNG_BRACE, RNG_DELIM, TIMES, "
"RE_LEADIN, RE_CORE, RE_LEADOUT, CH_LEADIN":
"EOF, DEF, OR, AND, ENDL, BRACE_SIGN, RNG_BRACE, RNG_DELIM, RNG_OPEN, "
"RNG_CLOSE, TIMES, RE_LEADIN, RE_CORE, RE_LEADOUT, CH_LEADIN":
[],
"*":
[replace_by_single_child]
......@@ -1243,6 +1384,7 @@ class EBNFCompiler(Compiler):
for i in range(len(definitions)):
if definitions[i][0] in self.variables:
definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])
self.definitions[definitions[i][0]] = definitions[i][1]
# add special fields for Grammar class
......@@ -1430,6 +1572,27 @@ class EBNFCompiler(Compiler):
'Rule "%s" is not connected to parser root "%s" !' %
(leftover, self.root_symbol), WARNING)
# check for filters assigned to non-existing or uncaptured symbols
def directive_node(tree, directive) -> Node:
"""Returns the node, where the given directive was stated in the
EBNF-source."""
for dr in tree.select('directive'):
if dr.pick('symbol').content == directive:
return dr
return tree
for symbol in self.directives.filter:
if symbol not in self.symbols:
self.tree.new_error(directive_node(self.tree, symbol + '_filter'),
'Filter declared for non-existent symbol "%s"' % symbol,
WARNING)
else:
if not self.definitions[symbol].startswith('Capture('):
self.tree.new_error(directive_node(self.tree, symbol + '_filter'),
'Filter declared for uncaptured symbol "%s"' % symbol,
WARNING)
# set root_symbol parser and assemble python grammar definition
if self.root_symbol and 'root__' not in self.rules:
......
......@@ -57,7 +57,6 @@ __all__ = ('ParserError',
'FlagFunc',
'ParseFunc',
'Parser',
'UnknownParserError',
'AnalysisError',
'GrammarError',
'Grammar',
......@@ -68,6 +67,7 @@ __all__ = ('ParserError',
'Text',
'DropText',
'RegExp',
'update_scanner',
'RE',
'TKN',
'Whitespace',
......@@ -777,12 +777,6 @@ def mixin_nonempty(whitespace: str) -> str:
return whitespace
class UnknownParserError(KeyError):
    """UnknownParserError is raised if a Grammar object is called with a
    parser that does not exist or if in the course of parsing a parser
    is referred to that does not exist.

    NOTE(review): this change-set removes the class from ``__all__`` and
    switches the raise-sites and except-clauses to plain ``AttributeError``,
    so this class appears to be slated for removal -- confirm before
    adding new references to it.
    """
AnalysisError = Tuple[str, Parser, Error] # pname, parser, error
# TODO: replace with a named tuple?
......@@ -1125,7 +1119,14 @@ class Grammar:
return duplicate
def __init__(self, root: Parser = None) -> None:
def __init__(self, root: Parser = None, static_analysis: Optional[bool] = None) -> None:
"""Constructor of class Grammar.
:param root: Overrides default root parser. By default the root parser
is the parser assigned to the class field `root__`. This is useful for
executing or testing certain parts of a complex parser ensemble.
:param static_analysis: If not None, this overrides the config value
"static_analysis".
"""
self.all_parsers__ = set() # type: Set[Parser]
# add compiled regular expression for comments, if it does not already exist
if not hasattr(self, 'comment_rx__') or self.comment_rx__ is None:
......@@ -1171,9 +1172,10 @@ class Grammar:
assert 'root_parser__' in self.__dict__
assert self.root_parser__ == self.__dict__['root_parser__']
if self.static_analysis_pending__ \
and get_config_value('static_analysis') in {'early', 'late'}:
# try:
if (self.static_analysis_pending__
and (static_analysis
or (static_analysis is None
and get_config_value('static_analysis') in {'early', 'late'}))):
result = self.static_analysis()
# clears any stored errors without overwriting the pointer
while self.static_analysis_errors__:
......@@ -1183,8 +1185,6 @@ class Grammar:
if has_errors:
raise GrammarError(result)
self.static_analysis_pending__.pop()
# except (NameError, AttributeError) as e:
# pass # don't fail the initialization of PLACEHOLDER
def __str__(self):
return self.__class__.__name__
......@@ -1202,7 +1202,7 @@ class Grammar:
parser.apply(self._add_parser__)
assert self[key] == parser
return self[key]
raise UnknownParserError('Unknown parser "%s" !' % key)
raise AttributeError('Unknown parser "%s" !' % key)
def __contains__(self, key):
......@@ -1815,6 +1815,35 @@ class Whitespace(RegExp):
return '~'
def update_scanner(grammar: Grammar, leaf_parsers: Dict[str, str]):
    """Updates the "scanner" of a grammar by overwriting the `text`- or
    `regexp`-fields of some of or all of its leaf parsers with new values.
    This works, of course, only for those parsers that are assigned
    to a symbol in the Grammar class.

    :param grammar: The grammar-object for which the leaf parsers
        shall be updated.
    :param leaf_parsers: A mapping of parser names to strings that
        are interpreted as plain text (if the parser name refers to
        a `Text`-parser) or as regular expressions (if the parser
        name refers to a `RegExp`-parser).
    :raises AttributeError: in case a leaf parser name in the
        dictionary does not exist or does not refer to a `Text`-
        or `RegExp`-parser.
    """
    for pname, t in leaf_parsers.items():
        parser = grammar[pname]  # lookup may itself raise for unknown names
        if isinstance(parser, Text):
            assert isinstance(t, str)
            cast(Text, parser).text = t
        elif isinstance(parser, RegExp):
            # plain strings are compiled; pre-compiled patterns pass through
            cast(RegExp, parser).regexp = re.compile(t) if isinstance(t, str) else t
        else:
            # fixed stray German "oder" in the (English) error message
            raise AttributeError('Parser %s is not a Text- or RegExp-Parser, but %s'
                                 % (pname, type(parser)))
########################################################################
#
# Meta parser classes, i.e. parsers that contain other parsers
......
......@@ -38,7 +38,7 @@ templatedir = os.path.join(os.path.dirname(scriptdir.rstrip('/')), 'templates')
from DHParser.compile import compile_source
from DHParser.configuration import access_presets, finalize_presets, \
EBNF_ANY_SYNTAX_HEURISTICAL
EBNF_ANY_SYNTAX_HEURISTICAL, EBNF_ANY_SYNTAX_STRICT, EBNF_FIXED_SYNTAX
from DHParser.dsl import compileDSL, compile_on_disk
from DHParser.error import is_error
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler
......
......@@ -41,7 +41,7 @@ from DHParser.configuration import get_config_value
from DHParser.error import Error, is_error, adjust_error_locations, PARSER_LOOKAHEAD_MATCH_ONLY, \
PARSER_LOOKAHEAD_FAILURE_ONLY, MANDATORY_CONTINUATION_AT_EOF, AUTORETRIEVED_SYMBOL_NOT_CLEARED
from DHParser.log import is_logging, clear_logs, local_log_dir, log_parsing_history
from DHParser.parse import UnknownParserError, Lookahead
from DHParser.parse import Lookahead
from DHParser.syntaxtree import Node, RootNode, parse_tree, flatten_sxpr, ZOMBIE_TAG
from DHParser.trace import set_tracer, all_descendants, trace_history
from DHParser.transform import traverse, remove_children
......@@ -383,7 +383,7 @@ def grammar_unit(test_unit, parser_factory, transformer_factory, report='REPORT'
if has_lookahead(parser_name):
set_tracer(all_descendants(parser[parser_name]), trace_history)
track_history = True
except UnknownParserError:
except AttributeError:
pass
assert parser_name, "Missing parser name in test %s!" % unit_name
......@@ -416,7 +416,7 @@ def grammar_unit(test_unit, parser_factory, transformer_factory, report='REPORT'
errflag = len(errata)
try:
cst = parser(test_code, parser_name)
except UnknownParserError as upe:
except AttributeError as upe:
cst = RootNode()
cst = cst.new_error(Node(ZOMBIE_TAG, "").with_pos(0), str(upe))
clean_test_name = str(test_name).replace('*', '')
......@@ -497,7 +497,7 @@ def grammar_unit(test_unit, parser_factory, transformer_factory, report='REPORT'
errflag = len(errata)
try:
cst = parser(test_code, parser_name)
except UnknownParserError as upe:
except AttributeError as upe:
node = Node(ZOMBIE_TAG, "").with_pos(0)
cst = RootNode(node).new_error(node, str(upe))
errata.append('Unknown parser "{}" in fail test "{}"!'.format(
......
......@@ -17,7 +17,7 @@
# - replace the regex_heuristics by an always matching parser
#
# Ambiguities can also be avoided by NOT using all the syntactic variants
# made possible by this EBNF-grammar within one and the same EBNF-document
@ comment = /(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/|\(\*(?:.|\n)*?\*\)/
# comments can be either C-Style: /* ... */
......
......@@ -76,7 +76,7 @@ class EBNFGrammar(Grammar):
countable = Forward()
element = Forward()
expression = Forward()
source_hash__ = "09b88d557e08f59db56613dadff966e3"
source_hash__ = "94480ce7a73ec2c5f878ecb207b43073"
anonymous__ = re.compile('pure_elem$|countable$|FOLLOW_UP$|SYM_REGEX$|ANY_SUFFIX$|EOF$')
static_analysis_pending__ = [] # type: List[bool]
parser_initialization__ = ["upon instantiation"]
......
# EBNF-Grammar in EBNF
# This is a faster version of EBNF relying on fixed constants for delimiters,
# rather than variables that are captured on first use as in "EBNF.ebnf".
# Different syntactical variants are not detected by the grammar itself,
# but need to be configured, either by adjusting the definitions of DEF, OR,
# AND, ENDL, RNG_OPEN, RNG_CLOSE, RNG_DELIM, CH_LEADIN, TIMES, RE_LEADIN and
# RE_LEADOUT within this grammar definition, or by changing the `text`-field
# of the respective parser objects in the Grammar-object.
@ comment = /(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/|\(\*(?:.|\n)*?\*\)/
# comments can be either C-Style: /* ... */
# or pascal/modula/oberon-style: (* ... *)
# or python-style: # ... \n, excluding, however, character markers: #x20
@ whitespace = /\s*/ # whitespace includes linefeed
@ literalws = right # trailing whitespace of literals will be ignored tacitly
@ anonymous = pure_elem, countable, FOLLOW_UP, SYM_REGEX, ANY_SUFFIX, EOF
@ drop = whitespace, EOF # do not include these even in the concrete syntax tree
@ RNG_BRACE_filter = matching_bracket() # filter or transform content of RNG_BRACE on retrieve
# re-entry-rules for resuming after parsing-error
@ definition_resume = /\n\s*(?=@|\w+\w*\s*=)/
@ directive_resume = /\n\s*(?=@|\w+\w*\s*=)/
# specialized error messages for certain cases
@ definition_error = /,/, 'Delimiter "," not expected in definition!\nEither this was meant to '
'be a directive and the directive symbol @ is missing\nor the error is '
'due to inconsistent use of the comma as a delimiter\nfor the elements '
'of a sequence.'
#: top-level
syntax = ~ { definition | directive } EOF
definition = symbol §DEF~ [ OR~ ] expression ENDL~ & FOLLOW_UP # [OR~] to support v. Rossum's syntax
directive = "@" §symbol "=" (regexp | literals | procedure | symbol !DEF)
{ "," (regexp | literals | procedure | symbol !DEF) } & FOLLOW_UP
literals = { literal }+ # string chaining, only allowed in directives!
procedure = SYM_REGEX "()" # procedure name, only allowed in directives!
FOLLOW_UP = `@` | symbol | EOF
#: components
expression = sequence { OR~ sequence }
sequence = ["§"] ( interleave | lookaround ) # "§" means all following terms mandatory
{ AND~ ["§"] ( interleave | lookaround ) }
interleave = difference { "°" ["§"] difference }
lookaround = flowmarker § (oneormore | pure_elem)
difference = term ["-" § (oneormore | pure_elem)]
term = oneormore | counted | repetition | option | pure_elem
#: elements
countable = option | oneormore | element
pure_elem = element § !ANY_SUFFIX # element strictly without a suffix
element = [retrieveop] symbol !DEF # negative lookahead to be sure it's not a definition
| literal
| plaintext
| regexp
# | char_range
| character ~
| any_char
| whitespace
| group
ANY_SUFFIX = /[?*+]/
#: flow-operators
flowmarker = "!" | "&" # '!' negative lookahead, '&' positive lookahead
| "<-!" | "<-&" # '<-!' negative lookbehind, '<-&' positive lookbehind
retrieveop = "::" | ":?" | ":" # '::' pop, ':?' optional pop, ':' retrieve
#: groups
group = "(" no_range §expression ")"
oneormore = "{" no_range expression "}+" | element "+"
repetition = "{" no_range §expression "}" | element "*" no_range
option = # !char_range
"[" §expression "]" | element "?"
counted = countable range | countable TIMES~ multiplier | multiplier TIMES~ §countable
range = RNG_OPEN~ multiplier [ RNG_DELIM~ multiplier ] RNG_CLOSE~
no_range = !multiplier | &multiplier TIMES
multiplier = /[1-9]\d*/~
#: leaf-elements
symbol = SYM_REGEX ~ # e.g. expression, term, parameter_list
literal = /"(?:(?<!\\)\\"|[^"])*?"/~ # e.g. "(", '+', 'while'
| /'(?:(?<!\\)\\'|[^'])*?'/~ # whitespace following literals will be ignored tacitly.
plaintext = /`(?:(?<!\\)\\`|[^`])*?`/~ # like literal but does not eat whitespace
| /´(?:(?<!\\)\\´|[^´])*?´/~
regexp = RE_LEADIN RE_CORE RE_LEADOUT ~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
# regexp = /\/(?:(?<!\\)\\(?:\/)|[^\/])*?\//~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
char_range = `[` &char_range_heuristics
[`^`] (character | free_char) { [`-`] character | free_char } "]"
character = CH_LEADIN HEXCODE
free_char = /[^\n\[\]\\]/ | /\\[nrt`´'"(){}\[\]\/\\]/
any_char = "."
whitespace = /~/~ # insignificant whitespace
#: delimiters
EOF = !/./
DEF = `=`
OR = `|`
AND = ``
ENDL = ``
RNG_OPEN = `{`
RNG_CLOSE = `}`
RNG_DELIM = `,`
TIMES = `*`
RE_LEADIN = `/`
RE_LEADOUT = `/`
CH_LEADIN = `0x`
#: heuristics
char_range_heuristics = ! ( /[\n\t ]/
| ~ literal_heuristics
| [`::`|`:?`|`:`] SYM_REGEX /\s*\]/ )
literal_heuristics = /~?\s*"(?:[\\]\]|[^\]]|[^\\]\[[^"]*)*"/
| /~?\s*'(?:[\\]\]|[^\]]|[^\\]\[[^']*)*'/
| /~?\s*`(?:[\\]\]|[^\]]|[^\\]\[[^`]*)*`/
| /~?\s*´(?:[\\]\]|[^\]]|[^\\]\[[^´]*)*´/
| /~?\s*\/(?:[\\]\]|[^\]]|[^\\]\[[^\/]*)*\//
regex_heuristics = /[^ ]/ | /[^\/\n*?+\\]*[*?+\\][^\/\n]\//