Commit c666cc16 authored by eckhart

parse.py: Infinite-loop protection for Counted- and Interleave-parsers

parent df5ef70f
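The same guard pattern recurs throughout the hunks below: the remaining text length is recorded before each call to the sub-parser, and the repetition loop is broken off as soon as a successful match has consumed no input. A minimal, self-contained sketch of that idea (illustration only, not code from this commit; `parse` stands for any callable returning a `(node, rest)` pair):

from typing import Callable, List, Optional, Tuple

def repeat_with_guard(parse: Callable[[str], Tuple[Optional[object], str]],
                      text: str) -> Tuple[List[object], str]:
    """Apply `parse` repeatedly, stopping once a match consumes no input."""
    results: List[object] = []
    length = len(text)
    while True:
        node, text = parse(text)
        if node is None:
            break                 # no match: regular end of the repetition
        results.append(node)
        n, length = length, len(text)
        if length == n:
            break                 # zero-length match: avoid an infinite loop
    return results, text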
......@@ -84,6 +84,7 @@ __all__ = ('ErrorCode',
'BAD_MANDATORY_SETUP',
'DUPLICATE_PARSERS_IN_ALTERNATIVE',
'BAD_ORDER_OF_ALTERNATIVES',
'BAD_REPETITION_COUNT',
'TREE_PROCESSING_CRASH',
'COMPILER_CRASH',
'AST_TRANSFORM_CRASH')
......@@ -142,6 +143,7 @@ NARY_WITHOUT_PARSERS = ErrorCode(1540)
BAD_MANDATORY_SETUP = ErrorCode(1550)
DUPLICATE_PARSERS_IN_ALTERNATIVE = ErrorCode(1560)
BAD_ORDER_OF_ALTERNATIVES = ErrorCode(1570)
BAD_REPETITION_COUNT = ErrorCode(1580)
# fatal errors
......
......@@ -42,7 +42,7 @@ from DHParser.error import Error, ErrorCode, is_error, MANDATORY_CONTINUATION, \
MALFORMED_ERROR_STRING, MANDATORY_CONTINUATION_AT_EOF, DUPLICATE_PARSERS_IN_ALTERNATIVE, \
CAPTURE_WITHOUT_PARSERNAME, CAPTURE_DROPPED_CONTENT_WARNING, LOOKAHEAD_WITH_OPTIONAL_PARSER, \
BADLY_NESTED_OPTIONAL_PARSER, BAD_ORDER_OF_ALTERNATIVES, BAD_MANDATORY_SETUP, \
OPTIONAL_REDUNDANTLY_NESTED_WARNING, NARY_WITHOUT_PARSERS, CAPTURE_STACK_NOT_EMPTY
OPTIONAL_REDUNDANTLY_NESTED_WARNING, NARY_WITHOUT_PARSERS, CAPTURE_STACK_NOT_EMPTY, BAD_REPETITION_COUNT
from DHParser.log import CallItem, HistoryRecord
from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME
from DHParser.stringview import StringView, EMPTY_STRING_VIEW
......@@ -60,6 +60,8 @@ __all__ = ('ParserError',
'AnalysisError',
'GrammarError',
'Grammar',
'Always',
'Never',
'PreprocessorToken',
'Token',
'DropToken',
......@@ -1536,10 +1538,21 @@ GRAMMAR_PLACEHOLDER = Grammar()
########################################################################
#
# _Token and Regular Expression parser classes (i.e. leaf classes)
# Special parser classes: Always, Never, PreprocessorToken (leaf classes)
#
########################################################################
class Always(Parser):
"""A parser that always matches, but does not capture anything."""
def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
return EMPTY_NODE, text
class Never(Parser):
"""A parser that never matches."""
def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
return None, text
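For reference, the intended semantics of the two new leaf parsers, exercised directly through the `_parse()` method declared above (a sketch only: constructing and calling the parsers outside of a running `Grammar` is an assumption made for illustration):

from DHParser.parse import Always, Never
from DHParser.stringview import StringView

text = StringView("abc")
node, rest = Always()._parse(text)   # matches without consuming anything
assert node is not None and str(rest) == "abc"
node, rest = Never()._parse(text)    # never matches, the text is left untouched
assert node is None and str(rest) == "abc"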
class PreprocessorToken(Parser):
"""
......@@ -1599,6 +1612,12 @@ class PreprocessorToken(Parser):
return None, text
########################################################################
#
# _Token and Regular Expression parser classes (leaf classes)
#
########################################################################
class Token(Parser):
"""
Parses plain text strings. (Could be done by RegExp as well, but is faster.)
......@@ -1817,7 +1836,10 @@ class CombinedParser(Parser):
nr.extend(child.children)
elif child._result or child.tag_name[0] != ':':
nr.append(child)
return Node(self.tag_name, tuple(nr))
if nr or not self.anonymous:
return Node(self.tag_name, tuple(nr))
else:
return EMPTY_NODE
return Node(self.tag_name, results) # unoptimized code
elif N == 1:
return self._return_value(results[0])
......@@ -1974,17 +1996,17 @@ class ZeroOrMore(Option):
@cython.locals(n=cython.int)
def _parse(self, text: StringView) -> Tuple[Optional[Node], StringView]:
results = () # type: Tuple[Node, ...]
len = text.__len__()
n = len + 1 # type: int
while len < n: # text and len(text) < n:
n = len
length = text.__len__()
n = length + 1 # type: int
while length < n: # text and length(text) < n:
n = length
node, text = self.parser(text)
len = text.__len__()
length = text.__len__()
if node is None:
break
if node._result or not node.tag_name.startswith(':'): # drop anonymous empty nodes
results += (node,)
if len == n:
if length == n:
break # avoid infinite loop
nd = self._return_values(results) # type: Node
return nd, text
......@@ -2023,18 +2045,18 @@ class OneOrMore(UnaryParser):
results = () # type: Tuple[Node, ...]
text_ = text # type: StringView
match_flag = False
len = text.__len__()
n = len + 1 # type: int
while len < n: # text_ and len(text_) < n:
n = len
length = text.__len__()
n = length + 1 # type: int
while length < n: # text_ and len(text_) < n:
n = length
node, text_ = self.parser(text_)
len = text_.__len__()
length = text_.__len__()
if node is None:
break
match_flag = True
if node._result or not node.tag_name.startswith(':'): # drop anonymous empty nodes
results += (node,)
if len == n:
if length == n:
break # avoid infinite loop
if not match_flag:
return None, text
......@@ -2107,16 +2129,25 @@ class Counted(UnaryParser):
def _parse(self, text: StringView):
results = () # Tuple[Node, ...]
text_ = text
length = text_.__len__()
for _ in range(self.repetitions[0]):
node, text_ = self.parser(text_)
if node is None:
return None, text
results += (node,)
n = length
length = text_.__len__()
if length == n:
break # avoid infinite loop
for _ in range(self.repetitions[1] - self.repetitions[0]):
node, text_ = self.parser(text_)
if node is None:
break
results += (node,)
n = length
length = text_.__len__()
if length == n:
break # avoid infinite loop
return self._return_values(results), text_
def is_optional(self) -> Optional[bool]:
......@@ -2130,8 +2161,8 @@ class Counted(UnaryParser):
if a < 0 or b < 0 or a > b or a > INFINITE or b > INFINITE:
return [self.static_error(
'Repetition count [a=%i, b=%i] for parser %s violates requirement '
'0 <= a <= b <= infinity = 2^30 !' % (a, b, str(self))
)]
'0 <= a <= b <= infinity = 2^30' % (a, b, str(self)),
BAD_REPETITION_COUNT)]
return None
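A brief illustration of the check above (the `Counted(parser, (min, max))` signature is inferred from the `self.repetitions` accesses in the diff and should be treated as an assumption; the check itself runs during the grammar's static analysis):

from DHParser.parse import Counted, Token

# a > b violates 0 <= a <= b <= INFINITE and is now reported with the
# dedicated BAD_REPETITION_COUNT code instead of an uncoded error.
suspicious = Counted(Token("a"), (3, 2))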
......@@ -2457,9 +2488,6 @@ class Alternative(NaryParser):
return errors or None
class Interleave(MandatoryNary):
r"""Parse elements in arbitrary order.
......@@ -2512,6 +2540,7 @@ class Interleave(MandatoryNary):
counter = [0] * len(self.parsers)
consumed = set() # type: Set[Parser]
error = None # type: Optional[Error]
length = text_.__len__()
while True:
# there is an order of testing, but no promise about the order of testing, here!
for i, parser in enumerate(self.parsers):
......@@ -2541,6 +2570,10 @@ class Interleave(MandatoryNary):
results += (err_node,)
if reloc < 0:
break
n = length
length = text_.__len__()
if length == n:
break # avoid infinite loop
nd = self._return_values(results) # type: Node
if error and reloc < 0:
raise ParserError(nd.with_pos(self.grammar.document_length__ - len(text)),
......@@ -2610,8 +2643,8 @@ class Interleave(MandatoryNary):
if a < 0 or b < 0 or a > b or a > INFINITE or b > INFINITE:
return [self.static_error(
'Repetition count [a=%i, b=%i] for parser %s violates requirement '
'0 <= a <= b <= infinity = 2^30 !' % (a, b, str(parser))
)]
'0 <= a <= b <= infinity = 2^30' % (a, b, str(parser)),
BAD_REPETITION_COUNT)]
return None
......
# EBNF-Grammar in EBNF
@ comment = /(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\//
# comments can be either C-Style: /* ... */ or
# python-style: # ... \n, excluding, however, character markers: #x20
# This grammar is tuned for flexibility, that is, it supports as many
# different flavors of EBNF as possible. However, this flexibility
# comes at the cost of some ambiguities. In particular:
#
# 1. the alternative OR-operator / could be mistaken for the start
# of a regular expression and vice versa, and
# 2. character ranges [a-z] can be mistaken for optional blocks
#    and vice versa.
#
# A strategy to avoid these ambiguities is to replace the `free_char`-
# and the `no_regex_heuristics`-parser by a never matching parser.
# As a consequence, you lose the alternative OR-operator, and character
# ranges can only be specified with character codes (e.g. 0x41)
# rather than the characters themselves (e.g. A). Of course, ambiguities
# can and should also be avoided by not using all the syntactic variants
# made possible by this EBNF-grammar within one and the same EBNF-
# document. However, by setting the free_char- and no_regex_heuristics-
# parser to "never match", the parser can still help you avoid mistakes
# arising from ambiguities that you would otherwise have to watch out
# for yourself.
@ comment = /(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/|\(\*(?:.|\n)*?\*\)/
# comments can be either C-Style: /* ... */
# or pascal/modula/oberon-style: (* ... *)
# or python-style: # ... \n, excluding, however, character markers: #x20
@ whitespace = /\s*/ # whitespace includes linefeed
@ literalws = right # trailing whitespace of literals will be ignored tacitly
@ anonymous = pure_elem, FOLLOW_UP, SYM_REGEX, EOF
......@@ -29,8 +52,8 @@
syntax = [~//] { definition | directive } EOF
definition = symbol §:DEF~ expression :ENDL~ & FOLLOW_UP
directive = "@" §symbol "=" (regexp | literals | procedure | symbol)
{ "," (regexp | literals | procedure | symbol) } & FOLLOW_UP
directive = "@" §symbol "=" (regexp | literals | procedure | symbol !DEF)
{ "," (regexp | literals | procedure | symbol !DEF) } & FOLLOW_UP
literals = { literal }+ # string chaining, only allowed in directives!
procedure = SYM_REGEX "()" # procedure name, only allowed in directives!
......@@ -51,7 +74,7 @@ term = oneormore | counted | repetition | option | pure_elem
#: elements
pure_elem = element § !/[?*+]/ # element strictly without a suffix
element = [retrieveop] symbol !DEF # negative lookahead to be sure it's not a definition
element = [retrieveop] symbol !:DEF # negative lookahead to be sure it's not a definition
| literal
| plaintext
| regexp
......@@ -70,13 +93,14 @@ retrieveop = "::" | ":?" | ":" # '::' pop, ':?' optional pop, '
#: groups
group = "(" !multiplier §expression ")"
oneormore = "{" !multiplier expression "}+" | element "+"
repetition = "{" !multiplier §expression "}" | element "*" !multiplier
group = "(" no_range §expression ")"
oneormore = "{" no_range expression "}+" | element "+"
repetition = "{" no_range §expression "}" | element "*" no_range
option = !char_range "[" §expression "]" | element "?"
counted = element RANGE | element :TIMES~ multiplier | multiplier :TIMES~ §element
counted = element range | element :TIMES~ multiplier | multiplier :TIMES~ §element
RANGE = RNG_BRACE~ multiplier [:RNG_DELIM~ multiplier] ::RNG_BRACE~
range = RNG_BRACE~ multiplier [:RNG_DELIM~ multiplier] ::RNG_BRACE~
no_range = !multiplier | &multiplier :TIMES
multiplier = /\d+/~
......@@ -86,28 +110,38 @@ symbol = SYM_REGEX ~ # e.g. expression, term, paramet
literal = /"(?:(?<!\\)\\"|[^"])*?"/~ # e.g. "(", '+', 'while'
| /'(?:(?<!\\)\\'|[^'])*?'/~ # whitespace following literals will be ignored tacitly.
plaintext = /`(?:(?<!\\)\\`|[^`])*?`/~ # like literal but does not eat whitespace
regexp = /\/(?:(?<!\\)\\(?:\/)|[^\/])*?\//~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
regexp = :RE_LEADIN RE_CORE :RE_LEADOUT ~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
# regexp = /\/(?:(?<!\\)\\(?:\/)|[^\/])*?\//~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
char_range = `[` !(literal | plaintext | regexp )
(character | free_char)
[ `-` (character | free_char) { character | free_char } ] `]`
character = :CH_LEADIN HEXCODE
char_range = '[' character '-' character ']'
free_char = /[^\n(){}\[\]\/\\]/ | /\\[\n(){}\[\]\/\\]/
whitespace = /~/~ # insignificant whitespace
RE_CORE = /(?:(?<!\\)\\(?:\/)|[^\/])*/ # core of a regular expression, i.e. the dots in /.../
SYM_REGEX = /(?!\d)\w+/ # regular expression for symbols
HEXCODE = /[A-Fa-f0-9]/+
#: delimiters
DEF = `=` | `:=` | `::=`
OR = `|` # removed due to ambiguity: `/`
DEF = `=` | `:=` | `::=` | `<-`
OR = `|` | `/` &no_regex_heuristics
AND = `,` | ``
ENDL = `;` | ``
RNG_BRACE = :BRACE_SIGN
BRACE_SIGN = `{`
BRACE_SIGN = `{` | `(`
RNG_DELIM = `,`
TIMES = `*`
RE_LEADIN = `/` | `^/`
RE_LEADOUT = `/`
CH_LEADIN = `0x` | `#x`
no_regex_heuristics = &` ` !/[^\/\n*?+\\]*[*?+\\][^\/\n]\//
EOF = !/./ [:?DEF] [:?OR] [:?AND] [:?ENDL] # [:?DEF], [:?OR], ... clear stack by eating stored value
[:?RNG_DELIM] [:?BRACE_SIGN] [:?CH_LEADIN] [:?TIMES]
[:?RNG_DELIM] [:?BRACE_SIGN] [:?CH_LEADIN] [:?TIMES] [:?RE_LEADIN] [:?RE_LEADOUT]
......@@ -77,53 +77,63 @@ class FlexibleEBNFGrammar(Grammar):
DEF = Forward()
ENDL = Forward()
OR = Forward()
RE_LEADIN = Forward()
RE_LEADOUT = Forward()
RNG_DELIM = Forward()
TIMES = Forward()
character = Forward()
element = Forward()
expression = Forward()
source_hash__ = "962e48ea1622c9b397ef94805c4588ad"
literal = Forward()
plaintext = Forward()
regexp = Forward()
source_hash__ = "f39460f9496f3c1309339053226affd4"
anonymous__ = re.compile('pure_elem$|FOLLOW_UP$|SYM_REGEX$|EOF$')
static_analysis_pending__ = [] # type: List[bool]
parser_initialization__ = ["upon instantiation"]
error_messages__ = {'definition': [(re.compile(r','), 'Delimiter "," not expected in definition!\\nEither this was meant to be a directive and the directive symbol @ is missing\\nor the error is due to inconsistent use of the comma as a delimiter\\nfor the elements of a sequence.')]}
resume_rules__ = {'definition': [re.compile(r'\n\s*(?=@|\w+\w*\s*=)')],
'directive': [re.compile(r'\n\s*(?=@|\w+\w*\s*=)')]}
COMMENT__ = r'(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/'
COMMENT__ = r'(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/|\(\*(?:.|\n)*?\*\)'
comment_rx__ = re.compile(COMMENT__)
WHITESPACE__ = r'\s*'
WSP_RE__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
wsp__ = Whitespace(WSP_RE__)
dwsp__ = Drop(Whitespace(WSP_RE__))
EOF = Drop(Drop(Series(Drop(NegativeLookahead(RegExp('.'))), Drop(Option(Drop(Pop(DEF, match_func=optional_last_value)))), Drop(Option(Drop(Pop(OR, match_func=optional_last_value)))), Drop(Option(Drop(Pop(AND, match_func=optional_last_value)))), Drop(Option(Drop(Pop(ENDL, match_func=optional_last_value)))), Drop(Option(Drop(Pop(RNG_DELIM, match_func=optional_last_value)))), Drop(Option(Drop(Pop(BRACE_SIGN, match_func=optional_last_value)))), Drop(Option(Drop(Pop(CH_LEADIN, match_func=optional_last_value)))), Drop(Option(Drop(Pop(TIMES, match_func=optional_last_value)))))))
EOF = Drop(Drop(Series(Drop(NegativeLookahead(RegExp('.'))), Drop(Option(Drop(Pop(DEF, match_func=optional_last_value)))), Drop(Option(Drop(Pop(OR, match_func=optional_last_value)))), Drop(Option(Drop(Pop(AND, match_func=optional_last_value)))), Drop(Option(Drop(Pop(ENDL, match_func=optional_last_value)))), Drop(Option(Drop(Pop(RNG_DELIM, match_func=optional_last_value)))), Drop(Option(Drop(Pop(BRACE_SIGN, match_func=optional_last_value)))), Drop(Option(Drop(Pop(CH_LEADIN, match_func=optional_last_value)))), Drop(Option(Drop(Pop(TIMES, match_func=optional_last_value)))), Drop(Option(Drop(Pop(RE_LEADIN, match_func=optional_last_value)))), Drop(Option(Drop(Pop(RE_LEADOUT, match_func=optional_last_value)))))))
no_regex_heuristics = Series(Lookahead(Token(" ")), NegativeLookahead(RegExp('[^/\\n*?+\\\\]*[*?+\\\\][^/\\n]/')))
CH_LEADIN.set(Capture(Alternative(Token("0x"), Token("#x"))))
RE_LEADOUT.set(Capture(Token("/")))
RE_LEADIN.set(Capture(Alternative(Token("/"), Token("^/"))))
TIMES.set(Capture(Token("*")))
RNG_DELIM.set(Capture(Token(",")))
BRACE_SIGN.set(Capture(Token("{")))
BRACE_SIGN.set(Capture(Alternative(Token("{"), Token("("))))
RNG_BRACE = Capture(Retrieve(BRACE_SIGN))
ENDL.set(Capture(Alternative(Token(";"), Token(""))))
AND.set(Capture(Alternative(Token(","), Token(""))))
OR.set(Capture(Token("|")))
DEF.set(Capture(Alternative(Token("="), Token(":="), Token("::="))))
OR.set(Capture(Alternative(Token("|"), Series(Token("/"), Lookahead(no_regex_heuristics)))))
DEF.set(Capture(Alternative(Token("="), Token(":="), Token("::="), Token("<-"))))
HEXCODE = OneOrMore(RegExp('[A-Fa-f0-9]'))
SYM_REGEX = RegExp('(?!\\d)\\w+')
RE_CORE = RegExp('(?:(?<!\\\\)\\\\(?:/)|[^/])*')
whitespace = Series(RegExp('~'), dwsp__)
char_range = Series(Series(Token('['), dwsp__), character, Series(Token('-'), dwsp__), character, Series(Token(']'), dwsp__))
character.set(Series(Retrieve(CH_LEADIN), HEXCODE))
regexp = Series(RegExp('/(?:(?<!\\\\)\\\\(?:/)|[^/])*?/'), dwsp__)
plaintext = Series(RegExp('`(?:(?<!\\\\)\\\\`|[^`])*?`'), dwsp__)
literal = Alternative(Series(RegExp('"(?:(?<!\\\\)\\\\"|[^"])*?"'), dwsp__), Series(RegExp("'(?:(?<!\\\\)\\\\'|[^'])*?'"), dwsp__))
free_char = Alternative(RegExp('[^\\n(){}\\[\\]/\\\\]'), RegExp('\\\\[\\n(){}\\[\\]/\\\\]'))
character = Series(Retrieve(CH_LEADIN), HEXCODE)
char_range = Series(Token("["), NegativeLookahead(Alternative(literal, plaintext, regexp)), Alternative(character, free_char), Option(Series(Token("-"), Alternative(character, free_char), ZeroOrMore(Alternative(character, free_char)))), Token("]"))
regexp.set(Series(Retrieve(RE_LEADIN), RE_CORE, Retrieve(RE_LEADOUT), dwsp__))
plaintext.set(Series(RegExp('`(?:(?<!\\\\)\\\\`|[^`])*?`'), dwsp__))
literal.set(Alternative(Series(RegExp('"(?:(?<!\\\\)\\\\"|[^"])*?"'), dwsp__), Series(RegExp("'(?:(?<!\\\\)\\\\'|[^'])*?'"), dwsp__)))
symbol = Series(SYM_REGEX, dwsp__)
multiplier = Series(RegExp('\\d+'), dwsp__)
RANGE = Series(RNG_BRACE, dwsp__, multiplier, Option(Series(Retrieve(RNG_DELIM), dwsp__, multiplier)), Pop(RNG_BRACE, match_func=matching_bracket), dwsp__)
counted = Alternative(Series(element, RANGE), Series(element, Retrieve(TIMES), dwsp__, multiplier), Series(multiplier, Retrieve(TIMES), dwsp__, element, mandatory=3))
no_range = Alternative(NegativeLookahead(multiplier), Series(Lookahead(multiplier), Retrieve(TIMES)))
range = Series(RNG_BRACE, dwsp__, multiplier, Option(Series(Retrieve(RNG_DELIM), dwsp__, multiplier)), Pop(RNG_BRACE, match_func=matching_bracket), dwsp__)
counted = Alternative(Series(element, range), Series(element, Retrieve(TIMES), dwsp__, multiplier), Series(multiplier, Retrieve(TIMES), dwsp__, element, mandatory=3))
option = Alternative(Series(NegativeLookahead(char_range), Series(Token("["), dwsp__), expression, Series(Token("]"), dwsp__), mandatory=2), Series(element, Series(Token("?"), dwsp__)))
repetition = Alternative(Series(Series(Token("{"), dwsp__), NegativeLookahead(multiplier), expression, Series(Token("}"), dwsp__), mandatory=2), Series(element, Series(Token("*"), dwsp__), NegativeLookahead(multiplier)))
oneormore = Alternative(Series(Series(Token("{"), dwsp__), NegativeLookahead(multiplier), expression, Series(Token("}+"), dwsp__)), Series(element, Series(Token("+"), dwsp__)))
group = Series(Series(Token("("), dwsp__), NegativeLookahead(multiplier), expression, Series(Token(")"), dwsp__), mandatory=2)
repetition = Alternative(Series(Series(Token("{"), dwsp__), no_range, expression, Series(Token("}"), dwsp__), mandatory=2), Series(element, Series(Token("*"), dwsp__), no_range))
oneormore = Alternative(Series(Series(Token("{"), dwsp__), no_range, expression, Series(Token("}+"), dwsp__)), Series(element, Series(Token("+"), dwsp__)))
group = Series(Series(Token("("), dwsp__), no_range, expression, Series(Token(")"), dwsp__), mandatory=2)
retrieveop = Alternative(Series(Token("::"), dwsp__), Series(Token(":?"), dwsp__), Series(Token(":"), dwsp__))
flowmarker = Alternative(Series(Token("!"), dwsp__), Series(Token("&"), dwsp__), Series(Token("<-!"), dwsp__), Series(Token("<-&"), dwsp__))
element.set(Alternative(Series(Option(retrieveop), symbol, NegativeLookahead(DEF)), literal, plaintext, regexp, char_range, character, whitespace, group))
element.set(Alternative(Series(Option(retrieveop), symbol, NegativeLookahead(Retrieve(DEF))), literal, plaintext, regexp, char_range, character, whitespace, group))
pure_elem = Series(element, NegativeLookahead(RegExp('[?*+]')), mandatory=1)
term = Alternative(oneormore, counted, repetition, option, pure_elem)
difference = Series(term, Option(Series(Series(Token("-"), dwsp__), Alternative(oneormore, pure_elem), mandatory=1)))
......@@ -134,7 +144,7 @@ class FlexibleEBNFGrammar(Grammar):
FOLLOW_UP = Alternative(Token("@"), symbol, EOF)
procedure = Series(SYM_REGEX, Series(Token("()"), dwsp__))
literals = OneOrMore(literal)
directive = Series(Series(Token("@"), dwsp__), symbol, Series(Token("="), dwsp__), Alternative(regexp, literals, procedure, symbol), ZeroOrMore(Series(Series(Token(","), dwsp__), Alternative(regexp, literals, procedure, symbol))), Lookahead(FOLLOW_UP), mandatory=1)
directive = Series(Series(Token("@"), dwsp__), symbol, Series(Token("="), dwsp__), Alternative(regexp, literals, procedure, Series(symbol, NegativeLookahead(DEF))), ZeroOrMore(Series(Series(Token(","), dwsp__), Alternative(regexp, literals, procedure, Series(symbol, NegativeLookahead(DEF))))), Lookahead(FOLLOW_UP), mandatory=1)
definition = Series(symbol, Retrieve(DEF), dwsp__, expression, Retrieve(ENDL), dwsp__, Lookahead(FOLLOW_UP), mandatory=1, err_msgs=error_messages__["definition"])
syntax = Series(Option(Series(dwsp__, RegExp(''))), ZeroOrMore(Alternative(definition, directive)), EOF)
root__ = syntax
......
# EBNF-Grammar in EBNF
@ comment = /#.*(?:\n|$)/ # comments start with '#' and eat all chars up to and including '\n'
@ comment = /(?!#x[A-Fa-f0-9])#.*(?:\n|$)|\/\*(?:.|\n)*?\*\/|\(\*(?:.|\n)*?\*\)/
# comments can be either C-Style: /* ... */
# or pascal/modula/oberon-style: (* ... *)
# or python-style: # ... \n, excluding, however, character markers: #x20
@ whitespace = /\s*/ # whitespace includes linefeed
@ literalws = right # trailing whitespace of literals will be ignored tacitly
@ anonymous = pure_elem, EOF
@ anonymous = pure_elem, FOLLOW_UP, SYM_REGEX, EOF
@ drop = whitespace, EOF # do not include these even in the concrete syntax tree
@ RNG_BRACE_filter = matching_bracket() # filter or transform content of RNG_BRACE on retrieve
# re-entry-rules for resuming after parsing-error
@ definition_resume = /\n\s*(?=@|\w+\w*\s*=)/
@ directive_resume = /\n\s*(?=@|\w+\w*\s*=)/
# specialized error messages for certain cases
@ definition_error = /,/, 'Delimiter "," not expected in definition!\nEither this was meant to '
'be a directive and the directive symbol @ is missing\nor the error is '
'due to inconsistent use of the comma as a delimiter\nfor the elements '
'of a sequence.'
#: top-level
syntax = [~//] { definition | directive } §EOF
definition = symbol §:DEF~ expression :ENDL~
directive = "@" §symbol "="
(regexp | literal | symbol)
{ "," (regexp | literal | symbol) }
syntax = [~//] { definition | directive } EOF
definition = symbol §:DEF~ expression :ENDL~ & FOLLOW_UP
directive = "@" §symbol "=" (regexp | literals | procedure | symbol)
{ "," (regexp | literals | procedure | symbol) } & FOLLOW_UP
literals = { literal }+ # string chaining, only allowed in directives!
procedure = SYM_REGEX "()" # procedure name, only allowed in directives!
FOLLOW_UP = `@` | symbol | EOF
#: components
expression = sequence { :OR~ sequence }
sequence = ["§"] ( interleave | lookaround ) # "§" means all following terms mandatory
{ :AND~ ["§"] ( interleave | lookaround ) }
interleave = term { "°" ["§"] term }
lookaround = flowmarker (oneormore | pure_elem)
term = oneormore | repetition | option | pure_elem
interleave = difference { "°" ["§"] difference }
lookaround = flowmarker § (oneormore | pure_elem)
difference = term ["-" § (oneormore | pure_elem)]
term = oneormore | counted | repetition | option | pure_elem
#: elements
......@@ -31,36 +56,66 @@ element = [retrieveop] symbol !DEF # negative lookahead to be sure
| literal
| plaintext
| regexp
| char_range
| character
| whitespace
| group
#: flow-operators
flowmarker = "!" | "&" # '!' negative lookahead, '&' positive lookahead
| "<-!" | "<-&" # '<-' negative lookbehind, '<-&' positive lookbehind
retrieveop = "::" | ":?" | ":" # '::' pop, ':?' optional pop, ':' retrieve
#: groups
group = "(" §expression ")"
oneormore = "{" expression "}+" | element "+"
repetition = "{" §expression "}" | element "*"
option = "[" §expression "]" | element "?"
group = "(" no_range §expression ")"
oneormore = "{" no_range expression "}+" | element "+"
repetition = "{" no_range §expression "}" | element "*" no_range
option = !char_range "[" §expression "]" | element "?"
counted = element range | element :TIMES~ multiplier | multiplier :TIMES~ §element
range = RNG_BRACE~ multiplier [:RNG_DELIM~ multiplier] ::RNG_BRACE~
no_range = !multiplier | &multiplier :TIMES
multiplier = /\d+/~
#: leaf-elements
symbol = /(?!\d)\w+/~ # e.g. expression, term, parameter_list
symbol = SYM_REGEX ~ # e.g. expression, term, parameter_list
literal = /"(?:(?<!\\)\\"|[^"])*?"/~ # e.g. "(", '+', 'while'
| /'(?:(?<!\\)\\'|[^'])*?'/~ # whitespace following literals will be ignored tacitly.
plaintext = /`(?:(?<!\\)\\`|[^`])*?`/~ # like literal but does not eat whitespace
regexp = /\/(?:(?<!\\)\\(?:\/)|[^\/])*?\//~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
char_range = '[' (character | free_char) '-' (character | free_char) ']'
character = :CH_LEADIN HEXCODE | '[' free_char ']'
free_char = /[0-9]/ | [^\w\n(){}\[\]/\\`'"] | `\` /[\w(){}\[\]/\\`'"]/
whitespace = /~/~ # insignificant whitespace
SYM_REGEX = /(?!\d)\w+/ # regular expression for symbols
HEXCODE = /[A-Fa-f0-9]/+
#: delimiters
DEF = `=` | `:=` | `::=`
OR = `|`
DEF = `=` | `:=` | `::=` | `<-`
OR = `|` | `/` &` ` no_regex_heuristics
AND = `,` | ``
ENDL = `;` | ``
RNG_BRACE = :BRACE_SIGN
BRACE_SIGN = `{` | `(`
RNG_DELIM = `,`
TIMES = `*`
RE_LEADIN = `/` | `%/`
RE_LEADOUT = `/`
CH_LEADIN = `0x` | `#x`
no_regex_heuristics = !regexp
EOF = !/./ [:?DEF] [:?OR] [:?AND] [:?ENDL] # [:?DEF], [:?OR], ... clear stack by eating stored value
[:?RNG_DELIM] [:?BRACE_SIGN] [:?CH_LEADIN] [:?TIMES] [:?RE_LEADIN] [:?RE_LEADOUT]
# Hierarchical syntax
Grammar <- Spacing Definition+ EndOfFile
Definition <- Identifier LEFTARROW Expression
Expression <- Sequence (SLASH Sequence)*
Sequence <- Prefix*
Prefix <- (AND / NOT)? Suffix
Suffix <- Primary (QUESTION / STAR / PLUS)?
Primary <- Identifier !LEFTARROW
/ OPEN Expression CLOSE
/ Literal / Class / DOT
# Lexical syntax
Identifier <- IdentStart IdentCont* Spacing
IdentStart <- [a-zA-Z_]
IdentCont <- IdentStart / [0-9]
Literal <- [’] (![’] Char)* [’] Spacing
/ ["] (!["] Char)* ["] Spacing
Class <- ’[’ (!’]’ Range)* ’]’ Spacing
Range <- Char ’-’ Char / Char
Char <- ’\\’ [nrt’"\[\]\\]
/ ’\\’ [0-2][0-7][0-7]
/ ’\\’ [0-7][0-7]?
/ !’\\’ .
LEFTARROW <- ’<-’ Spacing
SLASH <- ’/’ Spacing
AND <- ’&’ Spacing
NOT <- ’!’ Spacing
QUESTION <- ’?’ Spacing
STAR <- ’*’ Spacing
PLUS <- ’+’ Spacing
OPEN <- ’(’ Spacing
CLOSE <- ’)’ Spacing
DOT <- ’.’ Spacing
Spacing <- (Space / Comment)*
Comment <- ’#’ (!EndOfLine Char)* EndOfLine
Space <- ’ ’ / ’\t’ / EndOfLine
EndOfLine <- ’\r\n’ / ’\n’ / ’\r’
EndOfFile <- !.