Commit 2cb7b778 authored by Eckhart Arnold

- regex bug in EBNF.ebnf and ebnf.EBNFGrammar fixed

parent f277041a
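
In essence (a minimal sketch, not part of the commit; the names OLD_RX and NEW_RX are ad hoc, the two patterns are the removed and the added version of the regexp rule below): the old pattern relied on a lookbehind to accept escaped slashes and could run past the closing slash of a regexp literal whose body ends in a backslash, whereas the new pattern treats \/ as an explicit escape and matches non-greedily.

    import re

    OLD_RX = re.compile(r'~?/(?:[^/]|(?<=\\)/)*/~?')   # removed: lookbehind-based escape handling
    NEW_RX = re.compile(r'~?/(?:\\/|[^/])*?/~?')       # added: explicit \/ alternative, non-greedy

    print(OLD_RX.match(r'/\\/ xxx /').group())      # prints /\\/ xxx /  -- runs past the closing slash
    print(NEW_RX.match(r'/[\\\\]/ xxx /').group())  # prints /[\\\\]/    -- stops at the first unescaped slash
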
@@ -124,7 +124,7 @@ class EBNFGrammar(Grammar):
wspR__ = WSP__
EOF = NegativeLookahead(RE('.', wR=''))
list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
- regexp = RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
+ regexp = RE(r'~?/(?:\\/|[^/])*?/~?') # RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
symbol = RE('(?!\\d)\\w+')
option = Series(Token("["), expression, Required(Token("]")))
@@ -300,7 +300,8 @@ class EBNFCompilerError(Exception):
#TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrival?!? Is this possible at compile time?
class EBNFCompiler(Compiler):
"""Generates a Parser from an abstract syntax tree of a grammar specified
"""
Generates a Parser from an abstract syntax tree of a grammar specified
in EBNF-Notation.
"""
COMMENT_KEYWORD = "COMMENT__"
@@ -316,10 +317,12 @@ class EBNFCompiler(Compiler):
'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
'vertical': r'\s*'}
def __init__(self, grammar_name="", grammar_source=""):
super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
self._reset()
def _reset(self):
self._result = '' # type: str
self.rules = OrderedDict() # type: OrderedDict[str, List[Node]]
@@ -340,11 +343,15 @@ class EBNFCompiler(Compiler):
def result(self) -> str:
return self._result
# methods for generating skeleton code for scanner, transformer, and compiler
def gen_scanner_skeleton(self) -> str:
name = self.grammar_name + "Scanner"
return "def %s(text):\n return text\n" % name \
+ SCANNER_FACTORY.format(NAME=self.grammar_name)
def gen_transformer_skeleton(self) -> str:
if not self.rules:
raise EBNFCompilerError('Compiler must be run before calling '
@@ -363,6 +370,7 @@ class EBNFCompiler(Compiler):
transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
return '\n'.join(transtable)
def gen_compiler_skeleton(self) -> str:
if not self.rules:
raise EBNFCompilerError('Compiler has not been run before calling '
@@ -387,9 +395,12 @@ class EBNFCompiler(Compiler):
compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
return '\n'.join(compiler)
- def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
- # fix capture of variables that have been defined before usage [sic!]
+ def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
+ """
+ Creates the Python code for the parser after compilation of
+ the EBNF-Grammar
+ """
if self.variables:
for i in range(len(definitions)):
if definitions[i][0] in self.variables:
@@ -470,6 +481,9 @@ class EBNFCompiler(Compiler):
+ GRAMMAR_FACTORY.format(NAME=self.grammar_name)
return self._result
## compilation methods
def on_syntax(self, node: Node) -> str:
self._reset()
definitions = []
@@ -489,6 +503,7 @@ class EBNFCompiler(Compiler):
return self.assemble_parser(definitions, node)
def on_definition(self, node: Node) -> Tuple[str, str]:
rule = str(node.children[0])
if rule in self.rules:
@@ -520,9 +535,11 @@ class EBNFCompiler(Compiler):
rule, defn = rule + ':error', '"' + errmsg + '"'
return rule, defn
@staticmethod
def _check_rx(node: Node, rx: str) -> str:
"""Checks whether the string `rx` represents a valid regular
"""
Checks whether the string `rx` represents a valid regular
expression. Makes sure that multiline regular expressions are
prepended by the multiline-flag. Returns the regular expression string.
"""
@@ -534,6 +551,7 @@ class EBNFCompiler(Compiler):
(repr(rx), str(re_error)))
return rx
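
For illustration, a standalone sketch of the kind of check the docstring describes (an assumption, not the collapsed body of _check_rx, which takes the node as well and presumably reports errors on it rather than raising): validate the expression and prepend the multiline flag to patterns that span several lines.

    import re

    def check_rx_sketch(rx: str) -> str:
        if rx.find('\n') >= 0 and not rx.startswith('(?m)'):
            rx = '(?m)' + rx                 # multi-line patterns need the multiline flag
        try:
            re.compile(rx)                   # re.error signals a malformed expression
        except re.error as re_error:
            raise ValueError("%s is not a valid regular expression: %s"
                             % (repr(rx), str(re_error)))
        return rx
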
def on_directive(self, node: Node) -> str:
key = str(node.children[0]).lower()
assert key not in self.directives['tokens']
@@ -593,19 +611,24 @@ class EBNFCompiler(Compiler):
', '.join(list(self.directives.keys()))))
return ""
def non_terminal(self, node: Node, parser_class: str, custom_args: List[str]=[]) -> str:
"""Compiles any non-terminal, where `parser_class` indicates the Parser class
"""
Compiles any non-terminal, where `parser_class` indicates the Parser class
name for the particular non-terminal.
"""
arguments = [self._compile(r) for r in node.children] + custom_args
return parser_class + '(' + ', '.join(arguments) + ')'
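
A quick illustration of the string assembly above (the compiled children are made up for the example, not taken from the repository): the compiled child parsers are joined and wrapped into a constructor call, yielding plain Python source text.

    compiled_children = ["RE('[a-z]+')", 'Token("+")']   # hypothetical results of self._compile(r)
    custom_args = []
    print('Alternative' + '(' + ', '.join(compiled_children + custom_args) + ')')
    # -> Alternative(RE('[a-z]+'), Token("+"))
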
def on_expression(self, node) -> str:
return self.non_terminal(node, 'Alternative')
def on_term(self, node) -> str:
return self.non_terminal(node, 'Series')
def on_factor(self, node: Node) -> str:
assert node.children
assert len(node.children) >= 2, node.as_sxpr()
@@ -639,22 +662,28 @@ class EBNFCompiler(Compiler):
node.add_error('Unknown prefix "%s".' % prefix)
return ""
def on_option(self, node) -> str:
return self.non_terminal(node, 'Optional')
def on_repetition(self, node) -> str:
return self.non_terminal(node, 'ZeroOrMore')
def on_oneormore(self, node) -> str:
return self.non_terminal(node, 'OneOrMore')
def on_regexchain(self, node) -> str:
raise EBNFCompilerError("Not yet implemented!")
def on_group(self, node) -> str:
raise EBNFCompilerError("Group nodes should have been eliminated by "
"AST transformation!")
def on_symbol(self, node: Node) -> str: # called only for symbols on the right hand side!
symbol = str(node) # ; assert result == cast(str, node.result)
if symbol in self.directives['tokens']:
@@ -667,9 +696,11 @@ class EBNFCompiler(Compiler):
self.recursive.add(symbol)
return symbol
def on_literal(self, node) -> str:
return 'Token(' + str(node).replace('\\', r'\\') + ')' # return 'Token(' + ', '.join([node.result]) + ')' ?
def on_regexp(self, node: Node) -> str:
rx = str(node)
name = [] # type: List[str]
@@ -694,6 +725,7 @@ class EBNFCompiler(Compiler):
return '"' + errmsg + '"'
return 'RE(' + ', '.join([arg] + name) + ')'
def on_list_(self, node) -> Set[str]:
assert node.children
return set(item.result.strip() for item in node.children)
......
@@ -52,7 +52,7 @@ def selftest(file_name):
else:
# compile the grammar again using the result of the previous
# compilation as parser
- for i in range(100):
+ for i in range(1):
result = compileDSL(grammar, nil_scanner, result, transformer, compiler)
print(result)
return result
......
@@ -30,7 +30,7 @@ option = "[" expression §"]"
symbol = /(?!\d)\w+/~ # e.g. expression, factor, parameter_list
literal = /"(?:[^"]|\\")*?"/~ # e.g. "(", '+', 'while'
| /'(?:[^']|\\')*?'/~ # whitespace following literals will be ignored tacitly.
- regexp = /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
+ regexp = /~?\/(?:\\\/|[^\/])*?\/~?/~ # e.g. /\w+/, ~/#.*(?:\n|$)/~
# '~' is a whitespace-marker, if present leading or trailing
# whitespace of a regular expression will be ignored tacitly.
list_ = /\w+/~ { "," /\w+/~ } # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
......
@@ -113,10 +113,10 @@ text = { cfgtext | (BRACKETS //~) }+
cfgtext = { word_sequence | (ESCAPED //~) }+
word_sequence = { TEXTCHUNK //~ }+
blockcmd = /\/ ("begin{" ("enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item")
blockcmd = /[\\]/ ("begin{" ("enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item")
#######################################################################
......
@@ -163,10 +163,10 @@ class LaTeXGrammar(Grammar):
cfgtext = { word_sequence | (ESCAPED //~) }+
word_sequence = { TEXTCHUNK //~ }+
blockcmd = /A/ ("begin{" ("enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item")
blockcmd = /[\\]/ ("begin{" ("enumerate" | "itemize" | "figure" | "quote"
| "quotation" | "tabular") "}"
| "subsection" | "section" | "chapter" | "subsubsection"
| "paragraph" | "subparagraph" | "item")
#######################################################################
@@ -192,7 +192,7 @@ class LaTeXGrammar(Grammar):
block_enrivonment = Forward()
block_of_paragraphs = Forward()
text_elements = Forward()
source_hash__ = "7ef00020ebbb2b82e36d38460de56370"
source_hash__ = "9f1579db1994211dc53dd4a8f317bfb6"
parser_initialization__ = "upon instantiation"
COMMENT__ = r'%.*(?:\n|$)'
WSP__ = mixin_comment(whitespace=r'[ \t]*(?:\n(?![ \t]*\n)[ \t]*)?', comment=r'%.*(?:\n|$)')
@@ -208,7 +208,7 @@ class LaTeXGrammar(Grammar):
MATH = RE('[\\w_^{}[\\]]*')
NAME = Capture(RE('\\w+'))
CMDNAME = RE('\\\\(?:(?!_)\\w)+')
- blockcmd = Series(RE('A', wR=''), Alternative(Series(Token("begin{"), Alternative(Token("enumerate"), Token("itemize"), Token("figure"), Token("quote"), Token("quotation"), Token("tabular")), Token("}")), Token("subsection"), Token("section"), Token("chapter"), Token("subsubsection"), Token("paragraph"), Token("subparagraph"), Token("item")))
+ blockcmd = Series(RE('[\\\\]', wR=''), Alternative(Series(Token("begin{"), Alternative(Token("enumerate"), Token("itemize"), Token("figure"), Token("quote"), Token("quotation"), Token("tabular")), Token("}")), Token("subsection"), Token("section"), Token("chapter"), Token("subsubsection"), Token("paragraph"), Token("subparagraph"), Token("item")))
word_sequence = OneOrMore(Series(TEXTCHUNK, RE('')))
cfgtext = OneOrMore(Alternative(word_sequence, Series(ESCAPED, RE(''))))
text = OneOrMore(Alternative(cfgtext, Series(BRACKETS, RE(''))))
......
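
A small sanity check, not from the repository, that the corrected blockcmd prefix accepts a leading backslash; '[\\\\]' is the very pattern string passed to RE() in the new blockcmd line above, i.e. the regex [\\], a character class containing a single backslash.

    import re

    blockcmd_prefix = re.compile('[\\\\]')               # same pattern string as in RE('[\\\\]', wR='')
    assert blockcmd_prefix.match(r'\begin{itemize}')     # a LaTeX block command starts with a backslash
    assert not blockcmd_prefix.match('begin{itemize}')   # plain text does not match
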
@@ -20,6 +20,10 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
+ try:
+     import regex as re
+ except ImportError:
+     import re
import sys
from functools import partial
from multiprocessing import Pool
@@ -102,8 +106,11 @@ class TestEBNFParser:
def test_RE(self):
gr = get_ebnf_grammar()
- m = gr.regexp.main.regexp.match(r'/\\/ xxx /')
- assert m.group().find('x') < 0, m.group()
+ m = gr.regexp.main.regexp.match(r'/[\\\\]/ xxx /')
+ rs = m.group()
+ assert rs.find('x') < 0, rs
+ rx = re.compile(rs[1:-1])
+ assert rx.match(r'\\')
def test_literal(self):
snippet = '"literal" '
......