In January 2021 we will introduce a 10 GB quota for project repositories. Higher limits for individual projects will be available on request. Please see https://doku.lrz.de/display/PUBLIC/GitLab for more information.

Commit 8e3b05a7 authored by di68kap's avatar di68kap

- transform.py: weitere Transformationsregeln

- MLW: AST-Transformations
parent e64661d8
......@@ -616,20 +616,25 @@ def mock_syntax_tree(sxpr):
that does not match an opening bracket matched earlier within the same
package."""
s = s.strip()
while s[0] != ')':
if s[0] != '(':
raise ValueError('"(" expected, not ' + s[:10])
# assert s[0] == '(', s
level = 1
k = 1
while level > 0:
if s[k] == '(':
level += 1
elif s[k] == ')':
level -= 1
k += 1
yield s[:k]
s = s[k:].strip()
try:
while s[0] != ')':
if s[0] != '(':
raise ValueError('"(" expected, not ' + s[:10])
# assert s[0] == '(', s
level = 1
k = 1
while level > 0:
if s[k] == '(':
level += 1
elif s[k] == ')':
level -= 1
k += 1
yield s[:k]
s = s[k:].strip()
except IndexError:
errmsg = ('Malformed S-expression. Unprocessed part: "%s"' % s) if s \
else 'Malformed S-expression. Closing bracket(s) ")" missing.'
raise AssertionError(errmsg)
sxpr = sxpr.strip()
if sxpr[0] != '(':
......@@ -637,6 +642,9 @@ def mock_syntax_tree(sxpr):
# assert sxpr[0] == '(', sxpr
sxpr = sxpr[1:].strip()
match = re.match(r'[\w:]+', sxpr)
if match is None:
raise AssertionError('Malformed S-expression Node-tagname or identifier expected, '
'not "%s"' % sxpr[:40].replace('\n', ''))
name, class_name = (sxpr[:match.end()].split(':') + [''])[:2]
sxpr = sxpr[match.end():].strip()
if sxpr[0] == '(':
......
......@@ -130,7 +130,7 @@ def logging(dirname="LOGS"):
save = LOGGING
except NameError:
save = ""
LOGGING = dirname
LOGGING = dirname or ""
yield
LOGGING = save
......
......@@ -44,6 +44,7 @@ __all__ = ('TransformationDict',
'merge_children',
'replace_content',
'apply_if',
'traverse_locally',
'is_anonymous',
'is_whitespace',
'is_empty',
......@@ -51,6 +52,14 @@ __all__ = ('TransformationDict',
'is_token',
'is_one_of',
'has_content',
'lstrip',
'rstrip',
'strip',
'keep_children',
'keep_children_if',
'keep_tokens',
'keep_nodes',
'keep_content',
'remove_children_if',
'remove_nodes',
'remove_content',
......@@ -63,7 +72,6 @@ __all__ = ('TransformationDict',
'remove_infix_operator',
'remove_single_child',
'remove_tokens',
'keep_children',
'flatten',
'forbid',
'require',
......@@ -508,10 +516,60 @@ def is_expendable(context: List[Node]) -> bool:
return is_empty(context) or is_whitespace(context)
@transformation_factory(Callable)
def lstrip(context: List[Node], condition: Callable = is_expendable):
    """Recursively removes all leading child-nodes that fulfill a given condition."""
    node = context[-1]
    removed = True
    while removed and node.children:
        # Strip inside the leftmost child first; this may empty it so that
        # the condition (e.g. is_expendable) now holds for it as well.
        lstrip(context + [node.children[0]], condition)
        total = len(node.children)
        count = 0
        # Count the run of leading children that fulfill the condition.
        while count < total and condition(context + [node.children[count]]):
            count += 1
        removed = count > 0
        if removed:
            node.result = node.children[count:]
@transformation_factory(Callable)
def rstrip(context: List[Node], condition: Callable = is_expendable):
    """Recursively removes all trailing child-nodes that fulfill a given condition."""
    node = context[-1]
    i, L = 0, len(node.children)
    # Loop again whenever the previous pass removed at least one child
    # (i.e. kept fewer children than there were before: i < L).
    while i < L and node.children:
        # Strip inside the last child first; this may empty it so that the
        # condition now holds for it as well.
        rstrip(context + [node.children[-1]], condition)
        L = len(node.children)
        i = L
        # Scan backwards over the run of trailing children fulfilling the condition.
        while i > 0 and condition(context + [node.children[i-1]]):
            i -= 1
        if i < L:
            node.result = node.children[:i]
@transformation_factory(Callable)
def strip(context: List[Node], condition: Callable = is_expendable) -> None:
    """Removes leading and trailing child-nodes that fulfill a given condition.

    Like all transformations in this module it works in place and returns
    nothing (the former ``-> str`` annotation was incorrect)."""
    lstrip(context, condition)
    rstrip(context, condition)
@transformation_factory(AbstractSet[str])
def is_token(context: List[Node], tokens: AbstractSet[str] = frozenset()) -> bool:
    """Checks whether the last node in the context has `ptype == TOKEN_PTYPE`
    and its content - disregarding leading or trailing whitespace child-nodes -
    matches one of the given tokens. If no tokens are given, any token is a
    match.
    """
    def stripped(nd: Node) -> str:
        """Returns the node's content with the content of leading and
        trailing whitespace child-nodes removed."""
        # assert nd.parser.ptype == TOKEN_PTYPE
        if nd.children:
            i, k = 0, len(nd.children)
            # Skip leading whitespace children. NOTE: the original compared the
            # child node itself (not its parser.ptype) to WHITESPACE_PTYPE,
            # which can never be true for a Node vs. a string.
            while i < len(nd.children) and nd.children[i].parser.ptype == WHITESPACE_PTYPE:
                i += 1
            # Skip trailing whitespace children.
            while k > 0 and nd.children[k - 1].parser.ptype == WHITESPACE_PTYPE:
                k -= 1
            # Original referenced the outer `node` here instead of `nd`.
            return "".join(child.content for child in nd.children[i:k])
        return nd.content

    node = context[-1]
    return node.parser.ptype == TOKEN_PTYPE and (not tokens or stripped(node) in tokens)
@transformation_factory(AbstractSet[str])
......@@ -526,13 +584,44 @@ def has_content(context: List[Node], regexp: str) -> bool:
return bool(re.match(regexp, context[-1].content))
@transformation_factory(AbstractSet[str])
def has_parent(context: List[Node], tag_name_set: AbstractSet[str]) -> bool:
    """Checks whether a node with one of the given tag names appears somewhere
    in the context before the last node in the context.

    :param context: chain of ancestors plus the node itself
    :param tag_name_set: tag names to look for among the ancestors
    :return: True if any ancestor's tag name is in ``tag_name_set``
    """
    # range(2, len(context) + 1) walks all ancestors context[-2] .. context[0].
    # The previous upper bound len(context) stopped one short and therefore
    # never examined the outermost (root) node, contradicting the docstring.
    for i in range(2, len(context) + 1):
        if context[-i].tag_name in tag_name_set:
            return True
    return False
@transformation_factory(Callable)
def apply_if(context: List[Node], transformation: Callable, condition: Callable):
    """Applies a transformation only if a certain condition is met.

    :param transformation: a transformation function taking the context
    :param condition: a predicate on the context; the transformation is
        applied only if it returns True
    """
    # The condition receives the whole context (not just the node), in line
    # with the other predicates of this module (is_token, has_content, ...).
    # A stale duplicate `if condition(node):` line from the diff was removed,
    # as was the then-unused local `node`.
    if condition(context):
        transformation(context)
# @transformation_factory(List[Callable])
# def apply_to_child(context: List[Node], transformations: List[Callable], condition: Callable):
# """Applies a list of transformations to those children that meet a specific condition."""
# node = context[-1]
# for child in node.children:
# context.append(child)
# if condition(context):
# for transform in transformations:
# transform(context)
# context.pop()
@transformation_factory(Dict)
def traverse_locally(context: List[Node],
                     processing_table: Dict,          # actually: ProcessingTableType
                     key_func: Callable = key_tag_name):  # actually: KeyFunc
    """Applies a processing table to the subtree rooted at the last node of
    the context. This allows restricting certain transformations to exactly
    those nodes that have the last node in the context as an ancestor.
    """
    local_root = context[-1]
    traverse(local_root, processing_table, key_func)
@transformation_factory(slice)
def keep_children(context: List[Node], section: slice = slice(None)):
......@@ -543,7 +632,35 @@ def keep_children(context: List[Node], section: slice = slice(None)):
@transformation_factory(Callable)
def keep_children_if(context: List[Node], condition: Callable):
    """Keeps only those children for which `condition()` returns `True`;
    all other children are removed.

    (The previous docstring was copy-pasted from ``remove_children_if`` and
    stated the opposite of the actual behavior; a stale duplicate ``def``
    line from the diff was removed as well.)
    """
    node = context[-1]
    if node.children:
        node.result = tuple(c for c in node.children if condition(context + [c]))
@transformation_factory
def keep_tokens(context: List[Node], tokens: AbstractSet[str] = frozenset()):
    """Keeps only those immediate descendants that are tokens contained in
    ``tokens``; all other children are removed. If ``tokens`` is the empty
    set, all tokens are kept (and only non-token children are removed)."""
    keep_children_if(context, partial(is_token, tokens=tokens))
@transformation_factory
def keep_nodes(context: List[Node], tag_names: AbstractSet[str]):
    """Keeps only those children whose tag name is contained in
    ``tag_names``; all other children are removed."""
    keep_children_if(context, partial(is_one_of, tag_name_set=tag_names))
@transformation_factory
def keep_content(context: List[Node], regexp: str):
    """Keeps only those children whose content matches ``regexp``;
    all other children are removed."""
    keep_children_if(context, partial(has_content, regexp=regexp))
@transformation_factory(Callable)
def remove_children_if(context: List[Node], condition: Callable):
"""Removes all children for which `condition()` returns `True`."""
node = context[-1]
if node.children:
......@@ -576,16 +693,16 @@ def remove_children_if(context: List[Node], condition: Callable): # , section:
# Ready-made transformations built from the factories above. The stale
# pre-diff duplicates (which passed the bare node instead of the context to
# the condition lambdas) have been removed; the conditions now consistently
# take the context, with the node at ctx[-1].
remove_whitespace = remove_children_if(is_whitespace)
remove_empty = remove_children_if(is_empty)
remove_expendables = remove_children_if(is_expendable)
remove_first = apply_if(keep_children(slice(1, None)), lambda ctx: len(ctx[-1].children) > 1)
remove_last = apply_if(keep_children(slice(None, -1)), lambda ctx: len(ctx[-1].children) > 1)
remove_brackets = apply_if(keep_children(slice(1, -1)), lambda ctx: len(ctx[-1].children) >= 2)
remove_infix_operator = keep_children(slice(0, None, 2))
remove_single_child = apply_if(keep_children(slice(0)), lambda ctx: len(ctx[-1].children) == 1)
@transformation_factory
def remove_tokens(context: List[Node], tokens: AbstractSet[str] = frozenset()):
    """Removes any among a particular set of tokens from the immediate
    descendants of a node. If ``tokens`` is the empty set, all tokens
    are removed."""
    # The doubled first docstring line (old version with the typo "Reomoves")
    # left over from the diff has been removed.
    remove_children_if(context, partial(is_token, tokens=tokens))
......
......@@ -22,7 +22,7 @@ SCHREIBWEISE
BEDEUTUNG
LAT pannus, faciale, sudarium
DEU Gesichts-, Schweißtuch {usu liturg.; de re v. {=> eintrag/ibi_X}}
DEU Gesichts-tuch, Schweißtuch {usu liturg.;; de re v. {=> eintrag/ibi_X}}
* Catal.: thes. Germ.; 28,11 (post 851) "-um III".
* Form.: Sangall.; {#ibi_2} 39 p. 421,16
......
......@@ -36,7 +36,7 @@ LemmaWort = LAT_WORT
LemmaVarianten = LemmaVariante { [";" | ","] [ZW] LemmaVariante } [ ABS Zusatz ]
LemmaVariante = LAT_WORT [Zusatz]
LemmaVariante = LAT_WORT [Zusatz] # Ist eine Lemma immer ein einzelnes Wort?
## GRAMMATIK-POSITION ##
......@@ -113,8 +113,10 @@ Bedeutungskategorie = { EINZEILER [LZ] [Zusatz] [LZ] } §":"
Interpretamente = LateinischeBedeutung (LZ | " " | "--") §DeutscheBedeutung [":"]
LateinischeBedeutung = LAT [ZW] LateinischerAusdruck { "," LateinischerAusdruck }
DeutscheBedeutung = DEU [ZW] DeutscherAusdruck { "," DeutscherAusdruck }
LateinischerAusdruck = { <(LAT_WORT | "(" { LAT_WORT }+ ")") [Zusatz]> }+
DeutscherAusdruck = { <(DEU_WORT | "(" { DEU_WORT }+ ")") [Zusatz]> }+
LateinischerAusdruck = { <LateinischesWort [Zusatz]> }+
DeutscherAusdruck = { <DeutschesWort [Zusatz]> }+
LateinischesWort = (LAT_WORT | "(" { LAT_WORT }+ ")")
DeutschesWort = (DEU_WORT | "(" { DEU_WORT }+ ")")
LAT = "LATEINISCH" | "LAT"
DEU = "DEUTSCH" | "DEU"
......
......@@ -24,8 +24,10 @@ from DHParser import logging, is_filename, load_if_file, \
traverse, remove_children_if, merge_children, is_anonymous, \
reduce_single_child, replace_by_single_child, replace_or_reduce, remove_whitespace, \
remove_expendables, remove_empty, remove_tokens, flatten, is_whitespace, \
is_empty, is_expendable, collapse, replace_content, remove_nodes, remove_content, remove_brackets, replace_parser, \
keep_children, is_one_of, has_content, apply_if, remove_first, remove_last
is_empty, is_expendable, collapse, replace_content, remove_nodes, remove_content, \
remove_brackets, replace_parser, traverse_locally, remove_nodes, \
keep_children, is_one_of, has_content, apply_if, remove_first, remove_last, \
lstrip, rstrip, strip, keep_nodes
#######################################################################
......@@ -88,7 +90,7 @@ class MLWGrammar(Grammar):
LemmaVarianten = LemmaVariante { [";" | ","] [ZW] LemmaVariante } [ ABS Zusatz ]
LemmaVariante = LAT_WORT [Zusatz]
LemmaVariante = LAT_WORT [Zusatz] # Ist eine Lemma immer ein einzelnes Wort?
## GRAMMATIK-POSITION ##
......@@ -165,8 +167,10 @@ class MLWGrammar(Grammar):
Interpretamente = LateinischeBedeutung (LZ | " " | "--") §DeutscheBedeutung [":"]
LateinischeBedeutung = LAT [ZW] LateinischerAusdruck { "," LateinischerAusdruck }
DeutscheBedeutung = DEU [ZW] DeutscherAusdruck { "," DeutscherAusdruck }
LateinischerAusdruck = { <(LAT_WORT | "(" { LAT_WORT }+ ")") [Zusatz]> }+
DeutscherAusdruck = { <(DEU_WORT | "(" { DEU_WORT }+ ")") [Zusatz]> }+
LateinischerAusdruck = { <LateinischesWort [Zusatz]> }+
DeutscherAusdruck = { <DeutschesWort [Zusatz]> }+
LateinischesWort = (LAT_WORT | "(" { LAT_WORT }+ ")")
DeutschesWort = (DEU_WORT | "(" { DEU_WORT }+ ")")
LAT = "LATEINISCH" | "LAT"
DEU = "DEUTSCH" | "DEU"
......@@ -305,7 +309,7 @@ class MLWGrammar(Grammar):
flexion = Forward()
genus = Forward()
wortart = Forward()
source_hash__ = "a01b075b877de8bc46f92fa3b3e5b028"
source_hash__ = "ded96803a4eb4164ea8d2cf18924172b"
parser_initialization__ = "upon instantiation"
COMMENT__ = r'(?:\/\/.*)|(?:\/\*(?:.|\n)*?\*\/)'
WHITESPACE__ = r'[\t ]*'
......@@ -388,8 +392,10 @@ class MLWGrammar(Grammar):
GRI = Alternative(Token("GRIECHISCH"), Token("GRIECH"), Token("GRIE"), Token("GRI"))
DEU = Alternative(Token("DEUTSCH"), Token("DEU"))
LAT = Alternative(Token("LATEINISCH"), Token("LAT"))
DeutscherAusdruck = OneOrMore(AllOf(Alternative(DEU_WORT, Series(Token("("), OneOrMore(DEU_WORT), Token(")"))), Option(Zusatz)))
LateinischerAusdruck = OneOrMore(AllOf(Alternative(LAT_WORT, Series(Token("("), OneOrMore(LAT_WORT), Token(")"))), Option(Zusatz)))
DeutschesWort = Alternative(DEU_WORT, Series(Token("("), OneOrMore(DEU_WORT), Token(")")))
LateinischesWort = Alternative(LAT_WORT, Series(Token("("), OneOrMore(LAT_WORT), Token(")")))
DeutscherAusdruck = OneOrMore(AllOf(DeutschesWort, Option(Zusatz)))
LateinischerAusdruck = OneOrMore(AllOf(LateinischesWort, Option(Zusatz)))
DeutscheBedeutung = Series(DEU, Option(ZW), DeutscherAusdruck, ZeroOrMore(Series(Token(","), DeutscherAusdruck)))
LateinischeBedeutung = Series(LAT, Option(ZW), LateinischerAusdruck, ZeroOrMore(Series(Token(","), LateinischerAusdruck)))
Interpretamente = Series(LateinischeBedeutung, Alternative(LZ, Token(" "), Token("--")), DeutscheBedeutung, Option(Token(":")), mandatory=2)
......@@ -454,54 +460,84 @@ def get_grammar() -> MLWGrammar:
#
#######################################################################
# Local processing table, applied to "LemmaVariante" subtrees via
# traverse_locally (see MLW_AST_transformation_table): normalizes the word
# nodes and reduces single-child "Zusatz" nodes.
LemmaVariante_table = {
    "LAT_WORT, DEU_WORT": [remove_whitespace, reduce_single_child],
    "Zusatz": [reduce_single_child]
}
MLW_AST_transformation_table = {
# AST Transformations for the MLW-grammar
"+": [remove_empty, remove_tokens,
remove_nodes('ZWW', 'LZ', 'DPP', 'COMMENT__', 'ABS', 'SEM')],
"+": [remove_empty, remove_nodes('ZWW', 'LZ', 'DPP', 'COMMENT__', 'ABS', 'SEM'),
remove_tokens(",", "{", "}", "=>")],
"Autor": [reduce_single_child],
"Artikel": [],
"LemmaPosition": [],
"LemmaPosition": [remove_first],
"Lemma": [],
"klassisch": [],
"gesichert": [],
"LemmaVarianten": [],
"LemmaWort": [],
"klassisch": [reduce_single_child],
"gesichert": [reduce_single_child],
"LemmaVariante": [reduce_single_child, traverse_locally(LemmaVariante_table)],
"LemmaVarianten": [flatten, remove_nodes("ZW")],
"LemmaWort": [reduce_single_child],
"LemmaZusatz": [],
"lzs_typ": [],
"GrammatikPosition": [],
"GrammatikPosition": [remove_first, flatten],
"wortart": [replace_or_reduce],
"GrammatikVarianten": [],
"flexion": [],
"FLEX": [],
"deklination": [],
"konjugation": [],
"FLEX": [remove_whitespace, reduce_single_child],
"genus": [replace_or_reduce],
"EtymologiePosition": [],
"EtymologieVarianten": [],
"EtymologieVariante": [],
"ArtikelKopf": [replace_by_single_child],
"SchreibweisenPosition": [],
"SchreibweisenPosition, StrukturPosition, VerwechselungsPosition": [remove_first],
"SWTyp": [replace_or_reduce],
"SWVariante": [],
"Schreibweise": [replace_by_single_child],
"BedeutungsPosition": [],
"Kategorie": [],
"Varianten": [flatten],
"Variante": [],
"Gegenstand": [reduce_single_child],
"Besonderheit": [reduce_single_child],
"BedeutungsPosition": [flatten, remove_tokens("BEDEUTUNG")],
"Bedeutung": [],
"U1Bedeutung, U2Bedeutung, U3Bedeutung, U4Bedeutung, U5Bedeutung":
[remove_first, flatten],
"Bedeutungskategorie": [],
"Beleg": [],
"BelegText": [partial(strip, condition=lambda context: is_expendable(context)
or has_content(context, '[".]')),
reduce_single_child],
"BelegStelle": [flatten],
"Interpretamente": [],
"LateinischeBedeutung": [],
"DeutscheBedeutung": [],
"Belege": [],
"LateinischeBedeutung": [remove_nodes("LAT"), flatten],
"DeutscheBedeutung": [remove_nodes("DEU"), flatten],
"LateinischerAusdruck": [flatten, reduce_single_child],
"DeutscherAusdruck": [flatten, reduce_single_child],
"LateinischesWort, DeutschesWort": [strip, collapse],
"Belege": [flatten, remove_tokens("*")],
"Beleg": [],
"EinBeleg": [],
"Zusatz": [],
"ArtikelVerfasser": [],
"Zitat": [flatten, remove_nodes("ZW")],
"Zusatz": [reduce_single_child, flatten, remove_tokens(";;", ";")],
"ArtikelVerfasser": [remove_first],
"Stellenverzeichnis": [remove_first],
"Verweisliste": [flatten, remove_tokens("*")],
"Stellenverweis": [flatten],
"Name": [],
"Stelle": [collapse],
"SW_LAT": [replace_or_reduce],
"SW_DEU": [replace_or_reduce],
"SW_GRIECH": [replace_or_reduce],
"Beleg": [replace_by_single_child],
"Verweis": [],
"Verweis": [remove_tokens("=>")],
"VerweisZiel": [],
"Anker": [remove_tokens("#"), reduce_single_child],
"Werk": [reduce_single_child],
"ZielName": [replace_by_single_child],
"URL": [flatten, keep_nodes('protokoll', 'domäne', 'pfad', 'ziel')],
"NAMENS_ABKÜRZUNG": [],
"NAME": [],
"DEU_WORT": [reduce_single_child],
......@@ -512,6 +548,7 @@ MLW_AST_transformation_table = {
"GROSSSCHRIFT": [],
"GROSSFOLGE": [],
"BUCHSTABENFOLGE": [],
"EINZEILER, FREITEXT, MEHRZEILER": [strip, collapse],
"ZEICHENFOLGE": [],
"TR": [replace_or_reduce],
"ABS": [replace_or_reduce],
......@@ -526,7 +563,7 @@ MLW_AST_transformation_table = {
"KOMMENTARZEILEN": [],
"DATEI_ENDE": [],
"NIEMALS": [],
":Token": [],
":Token": [remove_whitespace, reduce_single_child],
"RE": reduce_single_child,
"*": replace_by_single_child
}
......
......@@ -13,9 +13,7 @@ Match-test "1"
### AST
(Lemma
(LemmaWort
(LAT_WORT
"facitergula"
)
"facitergula"
)
)
......@@ -41,49 +39,17 @@ Match-test "1"
### AST
(LemmaVarianten
(LAT_WORT
(LemmaVariante
"fascitergula"
)
(:ZeroOrMore
(:Series
(ZW
(ZEILENSPRUNG
(:RegExp
""
""
)
)
)
(LAT_WORT
"facietergula"
)
)
(:Series
(ZW
(ZEILENSPRUNG
(:RegExp
""
""
)
)
)
(LAT_WORT
"facistergula"
)
)
(:Series
(ZW
(ZEILENSPRUNG
(:RegExp
""
""
)
)
)
(LAT_WORT
"farcutergula"
)
)
(LemmaVariante
"facietergula"
)
(LemmaVariante
"facistergula"
)
(LemmaVariante
"farcutergula"
)
)
......@@ -95,7 +61,7 @@ Match-test "2"
### AST
(LemmaVarianten
(LAT_WORT
(LemmaVariante
"fascitergula"
)
)
......@@ -108,7 +74,7 @@ Match-test "3"
### AST
(LemmaVarianten
(LAT_WORT
(LemmaVariante
(:RegExp
"fascitergula"
)
......@@ -118,17 +84,10 @@ Match-test "3"
)
(LemmaVariante
(LAT_WORT
(:RegExp
"facietergula"
)
(:Whitespace
" "
)
"facietergula"
)
(Zusatz
(DEU_WORT
"sim."
)
"sim."
)
)
)
......@@ -141,27 +100,18 @@ Match-test "4"
### AST
(LemmaVarianten
(LAT_WORT
(LemmaVariante
"fascitergula"
)
(:ZeroOrMore
(LemmaVariante
"facietergula"
)
(LemmaVariante
(LAT_WORT
"facietergula"
"fascistergula"
)
(LemmaVariante
(LAT_WORT
(:RegExp
"fascistergula"
)
(:Whitespace
" "
)
)
(Zusatz
(DEU_WORT
"sim."
)
)
(Zusatz
"sim."
)
)
)
......@@ -195,79 +145,43 @@ Match-test "1"
(LemmaPosition
(Lemma
(LemmaWort
(LAT_WORT
"facitergula"
)
"facitergula"
)
)
(LemmaVarianten
(LAT_WORT
(LemmaVariante
"fascitergula"
)
(:ZeroOrMore
(:Series
(ZW
(ZEILENSPRUNG
(:RegExp
""
""
)
)
)
(LAT_WORT
"facietergula"
)
)
(:Series
(ZW
(ZEILENSPRUNG
(:RegExp
""
""
)