
Commit c75f9148 authored by Eckhart Arnold

Regexp handling greatly simplified

parent c9195402
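
Editorial gloss (not part of the commit): the simplification replaces the old RE factory function, which concatenated the left whitespace, a capturing group around the core pattern, and the right whitespace into one compiled regular expression and then had RegExp.__call__ pick the pieces apart again via match groups, with a dedicated RE parser class that simply chains three plain RegExp parsers. A rough sketch, with hypothetical values:

# Before: one big pattern; groups untangled after matching
#     RE(r'[a-z]+', wL=ws, wR=ws)  ~  RegExp(ws + '(' + '[a-z]+' + ')' + ws, orig_re='[a-z]+')
# After: plain composition; no group bookkeeping in RegExp.__call__
#     RE(r'[a-z]+', wL=ws, wR=ws)  ~  RegExp(ws), RegExp('[a-z]+'), RegExp(ws) in sequence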
@@ -132,6 +132,12 @@ def line_col(text, pos):
return line, column
+class ZOMBIE_PARSER:
+"""Serves as a substitute for a Parser instance. Required by
+`Node`-objects."""
+name = "ZOMBIE"
class Node:
def __init__(self, parser, result):
# self.children = False # will be set by the following assignment
@@ -164,7 +170,7 @@ class Node:
assert (isinstance(result, tuple) and
all(isinstance(child, Node) for child in result)) or \
isinstance(result, Node) or \
-isinstance(result, str)
+isinstance(result, str), str(result)
if isinstance(result, Node):
result = (result,)
self._result = result or ''
@@ -219,7 +225,6 @@ class Node:
return head + '\n'.join([tab + dataF(s)
for s in str(self.result).split('\n')]) + tail
def as_sexpr(self, src=None):
"""Returns content as S-expression, i.e. in lisp-like form.
@@ -237,11 +242,13 @@ class Node:
s += " '(err '(%s))" % ' '.join(str(err).replace('"', r'\"')
for err in node.errors)
return s
-return self.as_tree(' ', opening, lambda node: ')')
-# lambda s: '"' + s.replace('"', r'\"') + '"')
+def pretty(s):
+return '"%s"' % s if s.find('"') < 0 \
+else "'%s'" % s if s.find("'") < 0 \
+else '"%s"' % s.replace('"', r'\"')
+return self.as_tree(' ', opening, lambda node: ')', pretty)
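Editorial aside (not part of the commit): the new pretty helper picks whichever quote style avoids escaping, for example:
# pretty('abc')       ->  "abc"        (double quotes, nothing to escape)
# pretty('say "hi"')  ->  'say "hi"'   (single quotes, because " occurs)
# pretty('a"b\'c')    ->  "a\"b'c"     (both occur: escape the double quotes)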
def as_xml(self, src=None):
"""Returns content as S-expression, i.e. in lisp-like form.
"""Returns content as XML-tree.
Args:
src: The source text or `None`. In case the source text is given
@@ -537,10 +544,6 @@ class ParserHeadquarter:
return result if not stitches else Node(None, tuple(stitches))
-ZOMBIE_PARSER = Parser() # zombie object to avoid distinction of cases
-# for the Node.parser variable
##############################################################################
#
# Token and Regular Expression parser classes (i.e. leaf classes)
@@ -581,16 +584,55 @@ class ScannerToken(Parser):
return None, text
-RE_GRP = Parser(name="RE_group") # Parser to indicate re groups
-RE_WS = Parser(WHITESPACE_KEYWORD)
+# class RegExp(Parser):
+# def __init__(self, regexp, orig_re = '', name=None):
+# super(RegExp, self).__init__(name)
+# # self.name = name
+# self.regexp = re.compile(regexp) if isinstance(regexp, str) else regexp
+# self.orig_re = orig_re
+#
+# def __deepcopy__(self, memo):
+# # this method is obsolete with the new `regex` module!
+# try:
+# regexp = copy.deepcopy(self.regexp)
+# except TypeError:
+# regexp = self.regexp.pattern
+# duplicate = RegExp(self.name, regexp, self.orig_re)
+# duplicate.name = self.name # this ist needed!!!!
+# duplicate.regexp = self.regexp
+# duplicate.orig_re = self.orig_re
+# duplicate.headquarter = self.headquarter
+# duplicate.visited = copy.deepcopy(self.visited, memo)
+# duplicate.recursion_counter = copy.deepcopy(self.recursion_counter,
+# memo)
+# return duplicate
+#
+# def __call__(self, text):
+# match = text[0:1] != BEGIN_SCANNER_TOKEN and self.regexp.match(text) # ESC starts a scanner token.
+# if match:
+# end = match.end()
+# groups = set(match.groups())
+# if len(groups) >= 1:
+# split = sorted([i for i in reduce(lambda s, r: s | set(r),
+# match.regs, set()) if i >= 0])
+# parts = (text[i:j] for i, j in zip(split[:-1], split[1:]))
+# result = tuple(Node(None if part in groups else RE_WS, part)
+# for part in parts)
+# if all(r.parser == RE_WS for r in result):
+# return Node(RE_WS, text[:end]), text[end:]
+# return Node(self, result), text[end:]
+# return Node(self, match.group()), text[end:]
+# return None, text
+#
+# def __str__(self):
+# pattern = self.orig_re or self.regexp.pattern # for readability of error messages !
+# return Parser.__str__(self) + "/" + pattern + "/"
class RegExp(Parser):
-def __init__(self, regexp, orig_re = '', name=None):
+def __init__(self, regexp, name=None):
super(RegExp, self).__init__(name)
# self.name = name
self.regexp = re.compile(regexp) if isinstance(regexp, str) else regexp
-self.orig_re = orig_re
def __deepcopy__(self, memo):
# this method is obsolete with the new `regex` module!
@@ -598,10 +640,9 @@ class RegExp(Parser):
regexp = copy.deepcopy(self.regexp)
except TypeError:
regexp = self.regexp.pattern
-duplicate = RegExp(self.name, regexp, self.orig_re)
+duplicate = RegExp(self.name, regexp)
duplicate.name = self.name # this is needed!
duplicate.regexp = self.regexp
-duplicate.orig_re = self.orig_re
duplicate.headquarter = self.headquarter
duplicate.visited = copy.deepcopy(self.visited, memo)
duplicate.recursion_counter = copy.deepcopy(self.recursion_counter,
@@ -612,22 +653,42 @@ class RegExp(Parser):
match = text[0:1] != BEGIN_SCANNER_TOKEN and self.regexp.match(text) # ESC starts a scanner token.
if match:
end = match.end()
-groups = set(match.groups())
-if len(groups) >= 1:
-split = sorted([i for i in reduce(lambda s, r: s | set(r),
-match.regs, set()) if i >= 0])
-parts = (text[i:j] for i, j in zip(split[:-1], split[1:]))
-result = tuple(Node(None if part in groups else RE_WS, part)
-for part in parts)
-if all(r.parser == RE_WS for r in result):
-return Node(RE_WS, text[:end]), text[end:]
-return Node(self, result), text[end:]
-return Node(self, match.group()), text[end:]
+return Node(self, text[:end]), text[end:]
return None, text
+def __str__(self):
+return Parser.__str__(self) + '/' + self.regexp.pattern + '/'
+class RE(Parser):
+def __init__(self, regexp, wL='', wR='', name=None):
+super(RE, self).__init__(name)
+self.wL = RegExp(wL, WHITESPACE_KEYWORD) if wL else ''
+self.wR = RegExp(wR, WHITESPACE_KEYWORD) if wR else ''
+self.main = RegExp(regexp)
+def __call__(self, text):
+# assert self.main.regexp.pattern != "@"
+t = text
+wL, t = self.wL(t) if self.wL else (None, t)
+main, t = self.main(t)
+if main:
+wR, t = self.wR(t) if self.wR else (None, t)
+result = tuple(nd for nd in (wL, main, wR) if nd)
+return Node(self, result), t
+return None, text
def __str__(self):
-pattern = self.orig_re or self.regexp.pattern # for readability of error messages !
-return Parser.__str__(self) + "/" + pattern + "/"
+return Parser.__str__(self) + ('~' if self.wL else '') + \
+'/' + self.main.regexp.pattern + '/' + ('~' if self.wR else '')
+def apply(self, func):
+if super(RE, self).apply(func):
+if self.wL:
+self.wL.apply(func)
+if self.wR:
+self.wR.apply(func)
+self.main.apply(func)
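Editorial sketch (not part of the commit) of how the new RE class behaves; hypothetical values, assuming the Parser/Node machinery above:
# ws = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
# word = RE('[a-z]+', wL=ws, wR=ws, name="word")
# node, rest = word('  hello   world')
# node's children are the wL whitespace, the 'hello' match and the wR
# whitespace; rest == 'world' is left over for the next parser.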
def escape_re(s):
@@ -646,17 +707,17 @@ def mixin_comment(whitespace, comment):
wspc = '(?:' + whitespace + '(?:' + comment + whitespace + ')*)'
return wspc
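For concreteness (editorial, derived from the code above): with the arguments used by the generated MLWGrammar further down,
# mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
# returns r'(?:\s*(?:#.*(?:\n|$)\s*)*)'
# i.e. optional whitespace, optionally interleaved with '#' line comments.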
-def RE(regexp, wL='', wR='', name=None):
-rA = '('
-rB = '\n)' if regexp.find('(?x)') >= 0 else ')' # otherwise the closing bracket might erroneously
-# be append to the end of a line comment!
-return RegExp(wL + rA + regexp + rB + wR, regexp,
-name or TOKEN_KEYWORD)
+#
+# def RE(regexp, wL='', wR='', name=None):
+# rA = '('
+# rB = '\n)' if regexp.find('(?x)') >= 0 else ')' # otherwise the closing bracket might erroneously
+# # be appended to the end of a line comment!
+# return RegExp(wL + rA + regexp + rB + wR, regexp,
+# name or TOKEN_KEYWORD)
def Token(token, wL='', wR='', name=None):
-return RE(escape_re(token), wL, wR, name)
+return RE(escape_re(token), wL, wR, name or TOKEN_KEYWORD)
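Editorial note (not part of the commit): Token is now routed through the RE class, so literal tokens get the same optional-whitespace handling as regular expressions; e.g. (hypothetical):
# comma = Token(",", wL=wsp__, wR=wsp__)
# behaves like RE(escape_re(","), wL=wsp__, wR=wsp__, name=TOKEN_KEYWORD),
# where escape_re presumably backslash-escapes regexp metacharacters.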
##############################################################################
@@ -1081,7 +1142,7 @@ def remove_children_if(node, condition):
is_whitespace = lambda node: not node.result or (isinstance(node.result, str)
and not node.result.strip())
-is_comment = lambda node: node.parser == RE_WS
+is_comment = lambda node: node.name == WHITESPACE_KEYWORD
is_scanner_token = lambda node: isinstance(node.parser, ScannerToken)
is_expendable = lambda node: is_whitespace(node) or is_comment(node) or \
is_scanner_token(node)
@@ -1105,6 +1166,7 @@ def flatten(node):
new_result = []
for child in node.result:
if not child.parser.name:
+assert isinstance(child.result, tuple), node.as_sexpr()
flatten(child)
new_result.extend(child.result)
else:
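Editorial aside (not part of the commit): flatten lifts the children of anonymous (unnamed) nodes into their parent, which is what the added assert guards; schematically:
# before:  (expr (<unnamed> (term "1") (op "+")) (term "2"))
# after:   (expr (term "1") (op "+") (term "2"))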
@@ -1149,8 +1211,8 @@ AST_SYMBOLS = {'replace_by_single_child', 'reduce_single_child',
'is_comment', 'is_scanner_token', 'is_expendable',
'remove_whitespace', 'remove_comments',
'remove_scanner_tokens', 'remove_expendables', 'flatten',
-'remove_tokens', 'remove_enclosing_delimiters', 'partial',
-'TOKEN_KEYWORD', 'WHITESPACE_KEYWORD', 'RE_GRP', 'RE_WS'}
+'remove_tokens', 'remove_enclosing_delimiters',
+'TOKEN_KEYWORD', 'WHITESPACE_KEYWORD', 'partial'}
##############################################################################
@@ -1296,6 +1358,10 @@ class EBNFGrammar(ParserHeadquarter):
root__ = syntax
+def TTTest(node):
+# assert not (str(node.parser).startswith("RE") and node.children[0].result == '"-&"'), node.as_sexpr()
+return node
EBNFTransTable = {
# AST Transformations for EBNF-grammar
"syntax":
@@ -1318,7 +1384,7 @@ EBNFTransTable = {
(TOKEN_KEYWORD, WHITESPACE_KEYWORD):
[remove_expendables, reduce_single_child],
"":
-[remove_expendables, replace_by_single_child]
+[TTTest, remove_expendables, replace_by_single_child]
}
@@ -1937,11 +2003,14 @@ def test(file_name):
# print(syntax_tree.as_xml())
# print(result)
# print(syntax_tree.as_sexpr(grammar))
-# print(errors)
# print(compiler.gen_AST_Skeleton())
# print(compiler.gen_Compiler_Skeleton())
-result = compileDSL(grammar, result, EBNFTransTable, compiler)
-print(result)
+if errors:
+print(errors)
+sys.exit(1)
+else:
+result = compileDSL(grammar, result, EBNFTransTable, compiler)
+print(result)
return result
......
@@ -100,7 +100,7 @@ class MLWGrammar(ParserHeadquarter):
DATEI_ENDE = !/./
NIEMALS = /(?!.)/
"""
source_hash__ = "6b1abff4990e942856fc354a2c965e0b"
source_hash__ = "7a55cb4440d934ce0300c8610a3b4c33"
parser_initialization__ = "upon instantiation"
wsp__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
NIEMALS = RE('(?!.)')
@@ -112,37 +112,37 @@ class MLWGrammar(ParserHeadquarter):
WORT_GROSS = RE('[A-ZÄÖÜ][a-zäöüß]+', wR=wsp__)
WORT = RE('[A-ZÄÖÜ]?[a-zäöüß]+', wR=wsp__)
Name = Sequence(WORT, ZeroOrMore(Alternative(WORT, RE('[A-ZÄÖÜÁÀ]\\.'))))
Autorinfo = Sequence(Alternative(Token("AUTORIN", wL=wsp__, wR=wsp__), Token("AUTOR", wL=wsp__, wR=wsp__)), Name)
Zusatz = Sequence(Token("ZUSATZ", wL=wsp__, wR=wsp__), RE('\\s?.*'))
EinBeleg = Sequence(OneOrMore(Sequence(NegativeLookahead(Sequence(RE('\\s*'), Alternative(Token("*", wL=wsp__, wR=wsp__), Token("BEDEUTUNG", wL=wsp__, wR=wsp__), Token("AUTOR", wL=wsp__, wR=wsp__), Token("NAME", wL=wsp__, wR=wsp__), Token("ZUSATZ", wL=wsp__, wR=wsp__)))), RE('\\s?.*'))), Optional(Zusatz))
Belege = Sequence(Token("BELEGE", wL=wsp__, wR=wsp__), ZeroOrMore(Sequence(Token("*", wL=wsp__, wR=wsp__), EinBeleg)))
DeutscheBedeutung = Sequence(Token("DEU", wL=wsp__, wR=wsp__), RE('(?:(?![A-ZÄÖÜ][A-ZÄÖÜ]).)+', wR=wsp__))
LateinischeBedeutung = Sequence(Token("LAT", wL=wsp__, wR=wsp__), RE('(?:(?![A-ZÄÖÜ][A-ZÄÖÜ]).)+', wR=wsp__))
Autorinfo = Sequence(Alternative(Token("AUTORIN", wR=wsp__, wL=wsp__), Token("AUTOR", wR=wsp__, wL=wsp__)), Name)
Zusatz = Sequence(Token("ZUSATZ", wR=wsp__, wL=wsp__), RE('\\s?.*'))
EinBeleg = Sequence(OneOrMore(Sequence(NegativeLookahead(Sequence(RE('\\s*'), Alternative(Token("*", wR=wsp__, wL=wsp__), Token("BEDEUTUNG", wR=wsp__, wL=wsp__), Token("AUTOR", wR=wsp__, wL=wsp__), Token("NAME", wR=wsp__, wL=wsp__), Token("ZUSATZ", wR=wsp__, wL=wsp__)))), RE('\\s?.*'))), Optional(Zusatz))
Belege = Sequence(Token("BELEGE", wR=wsp__, wL=wsp__), ZeroOrMore(Sequence(Token("*", wR=wsp__, wL=wsp__), EinBeleg)))
DeutscheBedeutung = Sequence(Token("DEU", wR=wsp__, wL=wsp__), RE('(?:(?![A-ZÄÖÜ][A-ZÄÖÜ]).)+', wR=wsp__))
LateinischeBedeutung = Sequence(Token("LAT", wR=wsp__, wL=wsp__), RE('(?:(?![A-ZÄÖÜ][A-ZÄÖÜ]).)+', wR=wsp__))
Interpretamente = Sequence(LateinischeBedeutung, DeutscheBedeutung, Optional(Belege))
Bedeutungskategorie = RE('(?:(?![A-ZÄÖÜ][A-ZÄÖÜ]).)+', wR=wsp__)
Bedeutung = Alternative(Interpretamente, Bedeutungskategorie)
BedeutungsPosition = OneOrMore(Sequence(Token("BEDEUTUNG", wL=wsp__, wR=wsp__), Bedeutung))
BedeutungsPosition = OneOrMore(Sequence(Token("BEDEUTUNG", wR=wsp__, wL=wsp__), Bedeutung))
VerweisZiel = RE('<\\w+>', wR=wsp__, wL=wsp__)
Verweis = RE('>>\\w+', wR=wsp__, wL=wsp__)
Beleg = Verweis
Schreibweise = Alternative(Token("vizreg-", wL=wsp__, wR=wsp__), Token("festregel(a)", wL=wsp__, wR=wsp__), Token("fezdregl(a)", wL=wsp__, wR=wsp__), Token("fat-", wL=wsp__, wR=wsp__))
SWVariante = Sequence(Schreibweise, Token(":", wL=wsp__, wR=wsp__), Beleg)
SWTyp = Alternative(Token("script.", wL=wsp__, wR=wsp__), Token("script. fat-", wL=wsp__, wR=wsp__))
SchreibweisenPosition = Sequence(Token("SCHREIBWEISE", wL=wsp__, wR=wsp__), Required(SWTyp), Token(":", wL=wsp__, wR=wsp__), Required(SWVariante), ZeroOrMore(Sequence(Token(",", wL=wsp__, wR=wsp__), Required(SWVariante))))
Schreibweise = Alternative(Token("vizreg-", wR=wsp__, wL=wsp__), Token("festregel(a)", wR=wsp__, wL=wsp__), Token("fezdregl(a)", wR=wsp__, wL=wsp__), Token("fat-", wR=wsp__, wL=wsp__))
SWVariante = Sequence(Schreibweise, Token(":", wR=wsp__, wL=wsp__), Beleg)
SWTyp = Alternative(Token("script.", wR=wsp__, wL=wsp__), Token("script. fat-", wR=wsp__, wL=wsp__))
SchreibweisenPosition = Sequence(Token("SCHREIBWEISE", wR=wsp__, wL=wsp__), Required(SWTyp), Token(":", wR=wsp__, wL=wsp__), Required(SWVariante), ZeroOrMore(Sequence(Token(",", wR=wsp__, wL=wsp__), Required(SWVariante))))
ArtikelKopf = SchreibweisenPosition
_genus = Alternative(Token("maskulinum", wL=wsp__, wR=wsp__), Token("m.", wL=wsp__, wR=wsp__), Token("femininum", wL=wsp__, wR=wsp__), Token("f.", wL=wsp__, wR=wsp__), Token("neutrum", wL=wsp__, wR=wsp__), Token("n.", wL=wsp__, wR=wsp__))
_genus = Alternative(Token("maskulinum", wR=wsp__, wL=wsp__), Token("m.", wR=wsp__, wL=wsp__), Token("femininum", wR=wsp__, wL=wsp__), Token("f.", wR=wsp__, wL=wsp__), Token("neutrum", wR=wsp__, wL=wsp__), Token("n.", wR=wsp__, wL=wsp__))
Flexion = RE('-?[a-z]+', wR=wsp__)
Flexionen = Sequence(Flexion, ZeroOrMore(Sequence(Token(",", wL=wsp__, wR=wsp__), Required(Flexion))))
GVariante = Sequence(Flexionen, Optional(_genus), Token(":", wL=wsp__, wR=wsp__), Beleg)
GrammatikVarianten = Sequence(Token(";", wL=wsp__, wR=wsp__), Required(GVariante))
_wortart = Alternative(Token("nomen", wL=wsp__, wR=wsp__), Token("n.", wL=wsp__, wR=wsp__), Token("verb", wL=wsp__, wR=wsp__), Token("v.", wL=wsp__, wR=wsp__), Token("adverb", wL=wsp__, wR=wsp__), Token("adv.", wL=wsp__, wR=wsp__), Token("adjektiv", wL=wsp__, wR=wsp__), Token("adj.", wL=wsp__, wR=wsp__))
GrammatikPosition = Sequence(Token("GRAMMATIK", wL=wsp__, wR=wsp__), Required(_wortart), Required(Token(";", wL=wsp__, wR=wsp__)), Required(Flexionen), Optional(_genus), ZeroOrMore(GrammatikVarianten), Optional(Alternative(Token(";", wL=wsp__, wR=wsp__), Token(".", wL=wsp__, wR=wsp__))))
LVZusatz = Token("sim.", wL=wsp__, wR=wsp__)
Flexionen = Sequence(Flexion, ZeroOrMore(Sequence(Token(",", wR=wsp__, wL=wsp__), Required(Flexion))))
GVariante = Sequence(Flexionen, Optional(_genus), Token(":", wR=wsp__, wL=wsp__), Beleg)
GrammatikVarianten = Sequence(Token(";", wR=wsp__, wL=wsp__), Required(GVariante))
_wortart = Alternative(Token("nomen", wR=wsp__, wL=wsp__), Token("n.", wR=wsp__, wL=wsp__), Token("verb", wR=wsp__, wL=wsp__), Token("v.", wR=wsp__, wL=wsp__), Token("adverb", wR=wsp__, wL=wsp__), Token("adv.", wR=wsp__, wL=wsp__), Token("adjektiv", wR=wsp__, wL=wsp__), Token("adj.", wR=wsp__, wL=wsp__))
GrammatikPosition = Sequence(Token("GRAMMATIK", wR=wsp__, wL=wsp__), Required(_wortart), Required(Token(";", wR=wsp__, wL=wsp__)), Required(Flexionen), Optional(_genus), ZeroOrMore(GrammatikVarianten), Optional(Alternative(Token(";", wR=wsp__, wL=wsp__), Token(".", wR=wsp__, wL=wsp__))))
LVZusatz = Token("sim.", wR=wsp__, wL=wsp__)
LVariante = RE('(?:[a-z]|-)+', wR=wsp__, wL=wsp__)
LemmaVarianten = Sequence(Token("VARIANTEN", wL=wsp__, wR=wsp__), Required(LVariante), ZeroOrMore(Sequence(Token(",", wL=wsp__, wR=wsp__), Required(LVariante))), Optional(Sequence(Token(";", wL=wsp__, wR=wsp__), Required(LVZusatz))))
_tll = Token("*", wL=wsp__, wR=wsp__)
LemmaVarianten = Sequence(Token("VARIANTEN", wR=wsp__, wL=wsp__), Required(LVariante), ZeroOrMore(Sequence(Token(",", wR=wsp__, wL=wsp__), Required(LVariante))), Optional(Sequence(Token(";", wR=wsp__, wL=wsp__), Required(LVZusatz))))
_tll = Token("*", wR=wsp__, wL=wsp__)
Lemma = Sequence(Optional(_tll), WORT_KLEIN)
LemmaPosition = Sequence(Token("LEMMA", wL=wsp__, wR=wsp__), Required(Lemma), Optional(LemmaVarianten), Required(GrammatikPosition))
LemmaPosition = Sequence(Token("LEMMA", wR=wsp__, wL=wsp__), Required(Lemma), Optional(LemmaVarianten), Required(GrammatikPosition))
Artikel = Sequence(Optional(LEER), Required(LemmaPosition), Optional(ArtikelKopf), Required(BedeutungsPosition), Required(Autorinfo), Optional(LEER), DATEI_ENDE)
root__ = Artikel
......