The expiration time for new job artifacts in CI/CD pipelines is now 30 days (GitLab default). Previously generated artifacts in already completed jobs will not be affected by the change. The latest artifacts for all jobs in the latest successful pipelines will be kept. More information: https://gitlab.lrz.de/help/user/admin_area/settings/continuous_integration.html#default-artifacts-expiration

Commit 2cb7b778 authored by Eckhart Arnold's avatar Eckhart Arnold
Browse files

- regex bug in EBNF.ebnf and ebnf.EBNFGrammar fixed

parent f277041a
......@@ -124,7 +124,7 @@ class EBNFGrammar(Grammar):
wspR__ = WSP__
EOF = NegativeLookahead(RE('.', wR=''))
list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
regexp = RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
regexp = RE(r'~?/(?:\\/|[^/])*?/~?') # RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
symbol = RE('(?!\\d)\\w+')
option = Series(Token("["), expression, Required(Token("]")))
......@@ -300,7 +300,8 @@ class EBNFCompilerError(Exception):
#TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrieval?!? Is this possible at compile time?
class EBNFCompiler(Compiler):
"""Generates a Parser from an abstract syntax tree of a grammar specified
"""
Generates a Parser from an abstract syntax tree of a grammar specified
in EBNF-Notation.
"""
COMMENT_KEYWORD = "COMMENT__"
......@@ -316,10 +317,12 @@ class EBNFCompiler(Compiler):
'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
'vertical': r'\s*'}
def __init__(self, grammar_name="", grammar_source=""):
    """Create an EBNF compiler for the given grammar and initialize
    all per-compilation-run state via ``_reset()``."""
    super().__init__(grammar_name, grammar_source)
    self._reset()
def _reset(self):
self._result = '' # type: str
self.rules = OrderedDict() # type: OrderedDict[str, List[Node]]
......@@ -340,11 +343,15 @@ class EBNFCompiler(Compiler):
def result(self) -> str:
    """Return the result of the compilation run (the generated parser
    source as stored in ``self._result``)."""
    return self._result
# methods for generating skeleton code for scanner, transformer, and compiler
def gen_scanner_skeleton(self) -> str:
    """Return skeleton Python code for a scanner: an identity function
    named '<grammar_name>Scanner' followed by the scanner-factory
    boilerplate from ``SCANNER_FACTORY``."""
    scanner_name = self.grammar_name + "Scanner"
    stub = "def %s(text):\n return text\n" % scanner_name
    return stub + SCANNER_FACTORY.format(NAME=self.grammar_name)
def gen_transformer_skeleton(self) -> str:
if not self.rules:
raise EBNFCompilerError('Compiler must be run before calling '
......@@ -363,6 +370,7 @@ class EBNFCompiler(Compiler):
transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
return '\n'.join(transtable)
def gen_compiler_skeleton(self) -> str:
if not self.rules:
raise EBNFCompilerError('Compiler has not been run before calling '
......@@ -387,9 +395,12 @@ class EBNFCompiler(Compiler):
compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
return '\n'.join(compiler)
def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
# fix capture of variables that have been defined before usage [sic!]
def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
"""
Creates the Python code for the parser after compilation of
the EBNF-Grammar
"""
if self.variables:
for i in range(len(definitions)):
if definitions[i][0] in self.variables:
......@@ -470,6 +481,9 @@ class EBNFCompiler(Compiler):
+ GRAMMAR_FACTORY.format(NAME=self.grammar_name)
return self._result
## compilation methods
def on_syntax(self, node: Node) -> str:
self._reset()
definitions = []
......@@ -489,6 +503,7 @@ class EBNFCompiler(Compiler):
return self.assemble_parser(definitions, node)
def on_definition(self, node: Node) -> Tuple[str, str]:
rule = str(node.children[0])
if rule in self.rules:
......@@ -520,9 +535,11 @@ class EBNFCompiler(Compiler):
rule, defn = rule + ':error', '"' + errmsg + '"'
return rule, defn
@staticmethod
def _check_rx(node: Node, rx: str) -> str:
"""Checks whether the string `rx` represents a valid regular
"""
Checks whether the string `rx` represents a valid regular
expression. Makes sure that multiline regular expressions are
prepended by the multiline-flag. Returns the regular expression string.
"""
......@@ -534,6 +551,7 @@ class EBNFCompiler(Compiler):
(repr(rx), str(re_error)))
return rx
def on_directive(self, node: Node) -> str:
key = str(node.children[0]).lower()
assert key not in self.directives['tokens']
......@@ -593,19 +611,24 @@ class EBNFCompiler(Compiler):
', '.join(list(self.directives.keys()))))
return ""
def non_terminal(self, node: Node, parser_class: str, custom_args: List[str] = None) -> str:
    """
    Compiles any non-terminal, where `parser_class` indicates the Parser class
    name for the particular non-terminal.

    Args:
        node:  the AST node whose children are compiled into the
            parser's arguments.
        parser_class:  name of the Parser class to instantiate in the
            generated code (e.g. 'Alternative', 'Series').
        custom_args:  optional extra argument strings appended after the
            compiled children.  ``None`` (the default) means no extras.

    Returns:
        A Python source snippet, e.g. ``"Series(a, b, c)"``.
    """
    # NOTE(review): the default used to be a mutable ``[]``; ``None`` is
    # used as sentinel to avoid the shared-mutable-default pitfall.
    if custom_args is None:
        custom_args = []
    arguments = [self._compile(r) for r in node.children] + custom_args
    return parser_class + '(' + ', '.join(arguments) + ')'
def on_expression(self, node) -> str:
    """Compile an `expression` node into an `Alternative` parser call."""
    return self.non_terminal(node, 'Alternative')
def on_term(self, node) -> str:
    """Compile a `term` node into a `Series` parser call."""
    return self.non_terminal(node, 'Series')
def on_factor(self, node: Node) -> str:
assert node.children
assert len(node.children) >= 2, node.as_sxpr()
......@@ -639,22 +662,28 @@ class EBNFCompiler(Compiler):
node.add_error('Unknown prefix "%s".' % prefix)
return ""
def on_option(self, node) -> str:
    """Compile an `option` node ("[...]") into an `Optional` parser call."""
    return self.non_terminal(node, 'Optional')
def on_repetition(self, node) -> str:
    """Compile a `repetition` node ("{...}") into a `ZeroOrMore` parser call."""
    return self.non_terminal(node, 'ZeroOrMore')
def on_oneormore(self, node) -> str:
    """Compile a `oneormore` node ("{...}+") into a `OneOrMore` parser call."""
    return self.non_terminal(node, 'OneOrMore')
def on_regexchain(self, node) -> str:
    """Placeholder for regex-chain compilation; always raises, since this
    feature has not been implemented yet."""
    raise EBNFCompilerError("Not yet implemented!")
def on_group(self, node) -> str:
    """Fail loudly on `group` nodes: groups are expected to have been
    flattened away by the AST transformation before compilation."""
    raise EBNFCompilerError("Group nodes should have been eliminated by "
"AST transformation!")
def on_symbol(self, node: Node) -> str: # called only for symbols on the right hand side!
symbol = str(node) # ; assert result == cast(str, node.result)
if symbol in self.directives['tokens']:
......@@ -667,9 +696,11 @@ class EBNFCompiler(Compiler):
self.recursive.add(symbol)
return symbol
def on_literal(self, node) -> str:
    """Compile a `literal` node into a `Token(...)` parser call.

    Backslashes in the literal are doubled so they survive being
    embedded in the generated Python source.
    """
    escaped = str(node).replace('\\', r'\\')
    return ''.join(('Token(', escaped, ')'))
def on_regexp(self, node: Node) -> str:
rx = str(node)
name = [] # type: List[str]
......@@ -694,6 +725,7 @@ class EBNFCompiler(Compiler):
return '"' + errmsg + '"'
return 'RE(' + ', '.join([arg] + name) + ')'
def on_list_(self, node) -> Set[str]:
    """Compile a `list_` node into the set of its items, with surrounding
    whitespace stripped from each item."""
    assert node.children
    items = set()
    for child in node.children:
        items.add(child.result.strip())
    return items
......
......@@ -52,7 +52,7 @@ def selftest(file_name):
else:
# compile the grammar again using the result of the previous
# compilation as parser
for i in range(100):
for i in range(1):
result = compileDSL(grammar, nil_scanner, result, transformer, compiler)
print(result)
return result
......
......@@ -30,7 +30,7 @@ option = "[" expression §"]"
symbol = /(?!\d)\w+/~ # e.g. expression, factor, parameter_list
literal = /"(?:[^"]|\\")*?"/~ # e.g. "(", '+', 'while'
| /'(?:[^']|\\')*?'/~ # whitespace following literals will be ignored tacitly.
regexp = /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
regexp = /~?\/(?:\\\/|[^\/])*?\/~?/~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
# '~' is a whitespace-marker, if present leading or trailing
# whitespace of a regular expression will be ignored tacitly.
list_ = /\w+/~ { "," /\w+/~ } # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
......
......@@ -113,10 +113,10 @@ text = { cfgtext | (BRACKETS //~) }+
cfgtext = { word_sequence | (ESCAPED //~) }+
word_sequence = { TEXTCHUNK //~ }+
blockcmd = /\/ ("begin{" ("enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item")
blockcmd = /[\\]/ ("begin{" ("enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item")
#######################################################################
......
......@@ -163,10 +163,10 @@ class LaTeXGrammar(Grammar):
cfgtext = { word_sequence | (ESCAPED //~) }+
word_sequence = { TEXTCHUNK //~ }+
blockcmd = /A/ ("begin{" ("enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item")
blockcmd = /[\\]/ ("begin{" ("enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item")
#######################################################################
......@@ -192,7 +192,7 @@ class LaTeXGrammar(Grammar):
block_enrivonment = Forward()
block_of_paragraphs = Forward()
text_elements = Forward()
source_hash__ = "7ef00020ebbb2b82e36d38460de56370"
source_hash__ = "9f1579db1994211dc53dd4a8f317bfb6"
parser_initialization__ = "upon instantiation"
COMMENT__ = r'%.*(?:\n|$)'
WSP__ = mixin_comment(whitespace=r'[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?', comment=r'%.*(?:\n|$)')
......@@ -208,7 +208,7 @@ class LaTeXGrammar(Grammar):
MATH = RE('[\\w_^{}[\\]]*')
NAME = Capture(RE('\\w+'))
CMDNAME = RE('\\\\(?:(?!_)\\w)+')
blockcmd = Series(RE('A', wR=''), Alternative(Series(Token("begin{"), Alternative(Token("enumerate"), Token("itemize"), Token("figure"), Token("quote"), Token("quotation"), Token("tabular")), Token("}")), Token("subsection"), Token("section"), Token("chapter"), Token("subsubsection"), Token("paragraph"), Token("subparagraph"), Token("item")))
blockcmd = Series(RE('[\\\\]', wR=''), Alternative(Series(Token("begin{"), Alternative(Token("enumerate"), Token("itemize"), Token("figure"), Token("quote"), Token("quotation"), Token("tabular")), Token("}")), Token("subsection"), Token("section"), Token("chapter"), Token("subsubsection"), Token("paragraph"), Token("subparagraph"), Token("item")))
word_sequence = OneOrMore(Series(TEXTCHUNK, RE('')))
cfgtext = OneOrMore(Alternative(word_sequence, Series(ESCAPED, RE(''))))
text = OneOrMore(Alternative(cfgtext, Series(BRACKETS, RE(''))))
......
......@@ -20,6 +20,10 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
try:
import regex as re
except ImportError:
import re
import sys
from functools import partial
from multiprocessing import Pool
......@@ -102,8 +106,11 @@ class TestEBNFParser:
def test_RE(self):
    """The regexp parser must stop at the first unescaped closing '/'
    and the captured pattern must itself be a valid regex matching a
    backslash."""
    gr = get_ebnf_grammar()
    m = gr.regexp.main.regexp.match(r'/[\\\\]/ xxx /')
    rs = m.group()
    # Match must end at the first unescaped '/', i.e. before ' xxx'.
    # Bug fix: the assert message was `rs.group()`, but `rs` is a str,
    # so a failing assertion would have raised AttributeError instead
    # of reporting the offending match.
    assert rs.find('x') < 0, rs
    # Strip the delimiting slashes and check the inner pattern compiles
    # and matches a literal backslash sequence.
    rx = re.compile(rs[1:-1])
    assert rx.match(r'\\')
def test_literal(self):
snippet = '"literal" '
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment