Commit ec0ad8d7 authored by Eckhart Arnold's avatar Eckhart Arnold
Browse files

Merge remote-tracking branch 'origin/set_parser' into set_parser

# Conflicts:
#	DHParser/parser.py
parents d6437380 70a1728a
......@@ -1471,7 +1471,7 @@ class Series(NaryOperator):
class Alternative(NaryOperator):
"""
Matches if at least one of several alternatives matches. Returns
Matches if one of several alternatives matches. Returns
the first match.
This parser represents the EBNF-operator "|" with the qualification
......@@ -1491,13 +1491,12 @@ class Alternative(NaryOperator):
EBNF-Notation: `... | ...`
EBNF-Example: `sentence = /\d+\.\d+/ | /\d+/`
"""
def __init__(self, *parsers: Parser, name: str = '') -> None:
def __init__(self, *parsers: Parser, name: str='') -> None:
super(Alternative, self).__init__(*parsers, name=name)
assert len(self.parsers) >= 1
# only the last alternative may be optional. Could this be checked at compile time?
assert all(not isinstance(p, Option) for p in self.parsers[:-1]), \
"Parser-specification Error (EBNF): only the last alternative may be optional!"
self.been_here = dict() # type: Dict[int, int]
"Parser-specification Error: only the last alternative may be optional!"
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
for parser in self.parsers:
......@@ -1511,7 +1510,6 @@ class Alternative(NaryOperator):
def reset(self):
super(Alternative, self).reset()
self.been_here = {}
return self
# The following operator definitions add syntactical sugar, so one can write:
......@@ -1535,14 +1533,14 @@ class Alternative(NaryOperator):
return self
class FullSet(NaryOperator):
class AllOf(NaryOperator):
"""
Matches if all elemtns of a set of parsers match. Each parser must
Matches if all elements of a set of parsers match. Each parser must
match exactly once. Other than in a sequence, the order in which
the parsers match is arbitrary, however.
Example:
>>> prefixes = FullSet(Token("A"), Token("B"))
>>> prefixes = AllOf(Token("A"), Token("B"))
>>> Grammar(prefixes)('A B').content()
'A B'
>>> Grammar(prefixes)('B A').content()
......@@ -1551,65 +1549,88 @@ class FullSet(NaryOperator):
EBNF-Notation: `<... ...>` (sequence of parsers enclosed by angular brackets)
EBNF-Example: `set = <letter letter_or_digit>`
"""
# TODO: Implement set
RX_ARGUMENT = re.compile(r'\s(\S)')
NOPE = 1000
def __init__(self, *parsers: Parser, mandatory: int = NOPE, name: str = '') -> None:
super(Series, self).__init__(*parsers, name=name)
L = len(self.parsers)
assert 1 <= L < Series.NOPE, 'Length %i of series exceeds maximum length of %i' \
% (L, Series.NOPE)
if mandatory < 0: mandatory += L
assert 0 <= mandatory < L or mandatory == Series.NOPE
self.mandatory = mandatory
def __init__(self, *parsers: Parser, name: str = '') -> None:
    """
    Sets up the parser with the component parsers that all have to match.

    If exactly one positional argument is passed, it must be a `Series`;
    the component parsers of that series are then used in its place.
    """
    if len(parsers) == 1:
        sole_argument = parsers[0]
        assert isinstance(sole_argument, Series), \
            "Parser-specification Error: No single arguments other than a Series " \
            "allowed as arguments for AllOf-Parser !"
        # unwrap the series: its elements become the elements of this parser
        parsers = sole_argument.parsers
    super().__init__(*parsers, name=name)
def __deepcopy__(self, memo):
parsers = copy.deepcopy(self.parsers, memo)
return self.__class__(*parsers, mandatory=self.mandatory, name=self.name)
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
results = () # type: Tuple[Node, ...]
text_ = text # type: StringView
pos = 0
for parser in self.parsers:
node, text_ = parser(text_)
if not node:
if pos < self.mandatory:
return None, text
else:
# Provide useful error messages
m = text.search(Series.RX_ARGUMENT)
i = max(1, text.index(m.regs[1][0])) if m else 1
node = Node(self, text_[:i])
node.add_error('%s expected; "%s" found!' % (str(parser), text_[:10]),
code=Error.MANDATORY_CONTINUATION)
text_ = text_[i:]
results += (node,)
# if node.error_flag: # break on first error
# break
pos += 1
pset = set(self.parsers) # type: Set[Parser]
while pset:
# TODO: Ordnung berücksichtigen, kein SET verwenden!
for parser in pset:
node, text__ = parser(text_)
if node:
results += (node,)
text_ = text__
pset.remove(parser)
break
else:
return None, text
assert len(results) <= len(self.parsers)
return Node(self, results), text_
def __repr__(self):
return " ".join([parser.repr for parser in self.parsers[:self.mandatory]]
+ (['§'] if self.mandatory != Series.NOPE else [])
+ [parser.repr for parser in self.parsers[self.mandatory:]])
return '<' + ' '.join(parser.repr for parser in self.parsers) + '>'
# The following operator definitions add syntactical sugar, so one can write:
# `RE('\d+') + Optional(RE('\.\d+)` instead of `Series(RE('\d+'), Optional(RE('\.\d+))`
@staticmethod
def combined_mandatory(left, right):
left_mandatory, left_length = (left.mandatory, len(left.parsers)) \
if isinstance(left, Series) else (Series.NOPE, 1)
if left_mandatory != Series.NOPE:
return left_mandatory
right_mandatory = right.mandatory if isinstance(right, Series) else Series.NOPE
if right_mandatory != Series.NOPE:
return right_mandatory + left_length
return Series.NOPE
class SomeOf(NaryOperator):
    """
    Matches if at least one element of a set of parsers matches. No parser
    may match more than once. Other than in a sequence, the order in which
    the parsers match is arbitrary, however.

    At every position the parsers that have not matched yet are tried in
    the order in which they were passed to the constructor, so that the
    result is reproducible if more than one of them could match.

    Example:
    >>> prefixes = SomeOf(Token("A"), Token("B"))
    >>> Grammar(prefixes)('A B').content()
    'A B'
    >>> Grammar(prefixes)('B A').content()
    'B A'
    >>> Grammar(prefixes)('B').content()
    'B'

    EBNF-Notation: `<... | ...>` (alternatives enclosed by angular brackets)
    EBNF-Example:  `set = <letter | letter_or_digit>`
    """

    def __init__(self, *parsers: Parser, name: str = '') -> None:
        """
        Sets up the parser with its component parsers.

        If exactly one positional argument is passed, it must be an
        `Alternative`; the component parsers of that alternative are
        then used in its place.
        """
        if len(parsers) == 1:
            assert isinstance(parsers[0], Alternative), \
                "Parser-specification Error: No single arguments other than a Alternative " \
                "allowed as arguments for SomeOf-Parser !"
            parsers = parsers[0].parsers
        super().__init__(*parsers, name=name)

    def __call__(self, text: StringView) -> Tuple[Node, StringView]:
        """
        Applies the component parsers repeatedly until none of the
        remaining ones matches any more.

        Returns a node containing the results of all parsers that matched
        together with the rest of the text, or `(None, text)` if no parser
        matched at all.
        """
        results = ()  # type: Tuple[Node, ...]
        text_ = text  # type: StringView
        # Use an ordered, duplicate-free list rather than a set: iterating
        # over a set would try the parsers in an arbitrary order that may
        # differ from one run to the next.
        remaining = []
        for parser in self.parsers:
            if parser not in remaining:
                remaining.append(parser)
        while remaining:
            for i, parser in enumerate(remaining):
                node, text__ = parser(text_)
                if node:
                    results += (node,)
                    text_ = text__
                    # a parser that has matched must not match again
                    del remaining[i]
                    break
            else:
                # none of the remaining parsers matched at this position
                break
        assert len(results) <= len(self.parsers)
        if results:
            return Node(self, results), text_
        return None, text

    def __repr__(self):
        return '<' + ' | '.join(parser.repr for parser in self.parsers) + '>'
......
......@@ -65,8 +65,9 @@ a few drawbacks to this approach:
particular, there are no good open source XML-Editors.
On the other hand, there are good reasons why XML is used in the
humanities: Important encoding standards like TEI-XML are defined in
XML. Its strict syntax and the possibility to check data against a
humanities: Important encoding standards like
[TEI-XML](http://www.tei-c.org/index.xml) are defined in
XML. Its strict syntax and the possibility to check data against
schema help to detect and avoid encoding errors. If the schema is
well-defined, it is unambiguous, and it is easy to parse for a computer.
Most of these advantages, however, are on a technical level and few of
......
......@@ -13,8 +13,9 @@ term = { ["§"] factor }+ # "§" means all followin
factor = [flowmarker] [retrieveop] symbol !"=" # negative lookahead to be sure it's not a definition
| [flowmarker] literal
| [flowmarker] regexp
| [flowmarker] group
| [flowmarker] oneormore
| [flowmarker] group
# | [flowmarker] set
| repetition
| option
......@@ -23,6 +24,7 @@ flowmarker = "!" | "&" # '!' negative lookahead, '&' p
retrieveop = "::" | ":" # '::' pop, ':' retrieve
group = "(" §expression ")"
set = "<" §expression ">"
oneormore = "{" expression "}+"
repetition = "{" §expression "}"
option = "[" §expression "]"
......
......@@ -3,7 +3,7 @@
"""
[fail:LB]
1: """ """
10: """ """
[match:GAP]
......@@ -22,10 +22,10 @@
"""
[fail:GAP]
1: """
10: """
"""
2: """
11: """
% Comment
......@@ -61,15 +61,15 @@
% Comment"""
[fail:PARSEP]
1: " "
10: " "
2: """
11: """
"""
3: """
12: """
% Comment"""
4: """ % Comment
13: """ % Comment
% Comment
% Comment"""
......@@ -93,7 +93,7 @@
"""
[fail:WSPC]
1: "X"
10: "X"
[match:LFF]
......@@ -113,7 +113,7 @@
"""
[fail:LFF]
1: " "
10: " "
[match:LF]
......@@ -129,6 +129,6 @@
"""
[fail:LF]
1: """
10: """
"""
......@@ -3,13 +3,13 @@
[fail:text]
1 : Low-level text must not contain \& escaped characters.
10: Low-level text must not contain \& escaped characters.
2 : Low-level text must not contain ] [ brackets.
11: Low-level text must not contain ] [ brackets.
3 : Low-level text must not contain { environments }.
12: Low-level text must not contain { environments }.
4 : Low-level text must not contain any \commands.
13: Low-level text must not contain any \commands.
[match:text_element]
......
......@@ -43,17 +43,17 @@
[fail:paragraph]
1 : Paragraphs are separated by gaps.
20: Paragraphs are separated by gaps.
Like this one.
2 : \begin{enumerate}
21: \begin{enumerate}
3 : \item
22: \item
4 : und Vieh; \paragraph
23: und Vieh; \paragraph
5 : Paragraphs will end
24: Paragraphs will end
\begin{quotation}
at block environments
\end{quotation}
......@@ -86,4 +86,4 @@
\begin{quotation}
include block environments
\end{quotation}
like block quotes.
\ No newline at end of file
like block quotes.
......@@ -21,14 +21,14 @@
[fail:block_environment]
1 : """\begin{generic}inline environment\end{generic}
10: """\begin{generic}inline environment\end{generic}
"""
2 : """\begin{generic}inline environment
11: """\begin{generic}inline environment
\end{generic}
"""
3 : """\begin{generic}
12: """\begin{generic}
invalid enivronment \end{generic}
"""
......@@ -43,7 +43,7 @@
[fail:inline_environment]
3 : """\begin{generic}
10: """\begin{generic}
invalid enivronment \end{generic}
"""
......@@ -84,7 +84,7 @@
\end{itemize}
[fail:itemize]
1 : \begin{itemize}
11: \begin{itemize}
Free text is not allowed within an itemized environment!
\end{itemize}
......
......@@ -90,5 +90,3 @@ if __name__ == "__main__":
cpu_profile(tst_func)
# Transformation table for the abstract syntax tree of the MLW grammar.
# Each key is a name (mostly a symbol of the MLW grammar); each value is
# either a single callable or a list of callables. An empty list means that
# nothing is listed for that name.
# NOTE(review): the keys "+", "*" and ":Token, :RE" are not grammar symbols;
# presumably they are special keys interpreted by the code that consumes this
# table ("+" for all nodes, "*" as fallback, ":Token, :RE" for parser types)
# - confirm against the traversal function that uses the table.
MLW_AST_transformation_table = {
# AST Transformations for the MLW-grammar
"+": remove_empty,
"Artikel": [],
"LemmaPosition": [],
"Lemma": [],
"klassisch": [],
"gesichert": [],
"LemmaVarianten": [],
"LemmaWort": [],
"LemmaZusatz": [],
"lzs_typ": [],
"GrammatikPosition": [],
"wortart": [replace_or_reduce],
"GrammatikVarianten": [],
"flexion": [],
"FLEX": [],
"genus": [replace_or_reduce],
"EtymologiePosition": [],
"EtymologieVarianten": [],
"EtymologieVariante": [],
"ArtikelKopf": [replace_by_single_child],
"SchreibweisenPosition": [],
"SWTyp": [replace_or_reduce],
"SWVariante": [],
"Schreibweise": [replace_by_single_child],
"BedeutungsPosition": [],
"Bedeutung": [],
"Bedeutungskategorie": [],
"Interpretamente": [],
"LateinischeBedeutung": [],
"DeutscheBedeutung": [],
"Belege": [],
"EinBeleg": [],
"Zusatz": [],
"ArtikelVerfasser": [],
"Name": [],
"SW_LAT": [replace_or_reduce],
"SW_DEU": [replace_or_reduce],
"SW_GRIECH": [replace_or_reduce],
"Beleg": [replace_by_single_child],
"Verweis": [],
"VerweisZiel": [],
"ZielName": [replace_by_single_child],
"NAMENS_ABKÜRZUNG": [],
"NAME": [],
"DEU_WORT": [],
"DEU_GROSS": [],
"DEU_KLEIN": [],
"LAT_WORT": [],
"LAT_WORT_TEIL": [],
"GROSSSCHRIFT": [],
"GROSSFOLGE": [],
"BUCHSTABENFOLGE": [],
"ZEICHENFOLGE": [],
"TR": [replace_or_reduce],
"ABS": [replace_or_reduce],
"ZW": [],
"LZ": [],
"DATEI_ENDE": [],
"NIEMALS": [],
":Token, :RE": reduce_single_child,
"*": replace_by_single_child
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- DTD for MLW: capture of new articles -->
<!-- created by Ursula Welsch, BADW -->
<!-- Last revised: 6 July 2017 -->
<!-- Parameter entities -->
<!ENTITY % textauszeichnungen "kursiv | gesperrt | gerade" >
<!-- Lexicon -->
<!ELEMENT MLW-test (artikel)+ >
<!-- Article -->
<!-- NOTE(review): verweis-position is referenced in the content model below, but no ELEMENT declaration for it is visible in this DTD; confirm whether it is missing. -->
<!ELEMENT artikel (lemma-position, artikelkopf?, bedeutung-position, verweis-position?, artikel-verfasser) >
<!ATTLIST artikel xml:id ID #REQUIRED >
<!-- =========================-->
<!-- Lemma entry (headword) -->
<!-- =========================-->
<!ELEMENT lemma-position (((lemma, lemma-varianten?, grammatik-position) | (lemma-position | zusatz)+), etymologie-position?) >
<!-- Lemma -->
<!ELEMENT lemma (#PCDATA) >
<!ATTLIST lemma
klassisch (ja | nein) "ja"
gesichert (ja | nein) "ja"
>
<!-- Lemma variants -->
<!ELEMENT lemma-varianten (lemma-variante+, zusatz?) >
<!ELEMENT lemma-variante (#PCDATA) >
<!ATTLIST lemma-variante kurz CDATA #IMPLIED >
<!-- Grammar position -->
<!ELEMENT grammatik-position (grammatik, grammatik-varianten?) >
<!-- Grammatical information -->
<!ELEMENT grammatik (#PCDATA) >
<!ATTLIST grammatik
wortart (nomen | adjektiv | verb) #REQUIRED
klasse (us-i | a-ae | um-i | x-cis) #IMPLIED
genus (m | f | n) #IMPLIED
>
<!-- Grammar variants -->
<!ELEMENT grammatik-varianten (grammatik-variante+) >
<!ELEMENT grammatik-variante (grammatik, beleg) >
<!-- Etymology position -->
<!ELEMENT etymologie-position (etymologie-variante+) >
<!ELEMENT etymologie-variante (etymologie-besonderheit?, etymologie?, beleg) >
<!ELEMENT etymologie (#PCDATA) >
<!ATTLIST etymologie-variante sprache (griech | lat) #IMPLIED >
<!ELEMENT etymologie-besonderheit (#PCDATA) >
<!-- =========================-->
<!-- Article head -->
<!-- =========================-->
<!ELEMENT artikelkopf ((schreibweisen-position, struktur-position?, gebrauch-position?, metrik-position?, verwechslung-position?) |
(struktur-position, gebrauch-position?, metrik-position?, verwechslung-position?) |
(gebrauch-position, metrik-position?, verwechslung-position?) |
(metrik-position, verwechslung-position?) |
verwechslung-position) >
<!-- Spelling position -->
<!ELEMENT schreibweisen-position (schreibweisen-variante)+ >
<!ELEMENT schreibweisen-variante (schreibweise-besonderheit?, schreibweise, (zusatz, schreibweise)*, (beleg, (zusatz | beleg))*) >
<!ELEMENT schreibweise-besonderheit (#PCDATA) >
<!ELEMENT schreibweise (#PCDATA) >
<!ATTLIST schreibweise-besonderheit typ (script | form | script-form) #IMPLIED >
<!-- Position for structural/grammatical peculiarities -->
<!ELEMENT struktur-position (struktur-variante+) >
<!ELEMENT struktur-variante (struktur-besonderheit?, struktur?, beleg) >
<!ELEMENT struktur-besonderheit (#PCDATA) >
<!ELEMENT struktur (#PCDATA) >
<!ATTLIST struktur-besonderheit typ (pendet | struct | struct-nota) #IMPLIED >
<!-- Position for peculiarities of usage -->
<!ELEMENT gebrauch-position (gebrauch-variante+) >
<!ELEMENT gebrauch-variante (gebrauch-besonderheit?, gebrauch?, beleg) >
<!ELEMENT gebrauch-besonderheit (#PCDATA) >
<!ELEMENT gebrauch (#PCDATA) >
<!ATTLIST gebrauch-besonderheit typ (usu | partic) #IMPLIED >
<!-- Position for metrical / rhythmical peculiarities -->
<!ELEMENT metrik-position (metrik-variante+) >
<!ELEMENT metrik-variante (metrik-besonderheit?, metrik?, beleg) >
<!ELEMENT metrik-besonderheit (#PCDATA) >
<!ELEMENT metrik (#PCDATA) >
<!ATTLIST metrik-besonderheit typ (metr | rhythm) #IMPLIED >
<!-- Position for risks of confusion -->
<!ELEMENT verwechslung-position (verwechslung-variante+) >
<!ELEMENT verwechslung-variante (verwechslung-besonderheit?, verwechslung?, beleg) >
<!ELEMENT verwechslung-besonderheit (#PCDATA) >
<!ELEMENT verwechslung (#PCDATA) >
<!ATTLIST verwechslung-besonderheit typ (confunditur) "confunditur" >
<!-- =========================-->
<!-- Main part: meaning position -->
<!-- =========================-->
<!ELEMENT bedeutung-position (bedeutung+) >
<!ELEMENT bedeutung (((klassifikation, interpretament-zusatz?) | (interpretament-zusatz) | (interpretament, interpretament-deutsch, interpretament-zusatz*)), (bedeutung | beleg-position)+) >
<!ELEMENT klassifikation (#PCDATA) >
<!ATTLIST bedeutung nr CDATA #REQUIRED >
<!-- Interpretament -->
<!ELEMENT interpretament (#PCDATA) >
<!ELEMENT interpretament-deutsch (#PCDATA| verweis)* >
<!ELEMENT interpretament-zusatz (#PCDATA | verweis)* >
<!ATTLIST interpretament typ (lat | griech | botan) "lat" >
<!-- Attestation (Beleg) position -->
<!ELEMENT beleg-position (beleg+, zusatz?) >
<!-- Article author -->
<!ELEMENT artikel-verfasser (#PCDATA) >
<!-- =========================-->
<!-- Elements used at various places in the structure -->
<!-- =========================-->
<!-- Additions (Zusatz) at various places in the structure -->
<!ELEMENT zusatz (#PCDATA) >
<!ATTLIST zusatz typ (al | sim | saepe | vel | vel-rarius | OFFEN) #REQUIRED >
<!-- Cross-references (Verweis) at various places in the structure -->
<!ELEMENT verweis EMPTY >
<!ATTLIST verweis
typ (beleg | artikel | literatur) #REQUIRED
ziel CDATA #REQUIRED
>
<!-- Attestations (Beleg) at various places in the structure -->
<!--<!ELEMENT beleg ((beleg-quelle, beleg-text) | (verweis+, zusatz?)) >-->
<!ELEMENT beleg ((beleg-quelle, beleg-text) | verweis) >
<!-- Source of the attestation -->
<!ELEMENT beleg-quelle (autor, werk, stelle, datierung?) >
<!ELEMENT autor (#PCDATA) >
<!ELEMENT werk (#PCDATA) >
<!ELEMENT stelle (#PCDATA | hoch)* >
<!ELEMENT datierung (#PCDATA) >
<!ATTLIST beleg
id ID #REQUIRED
>
<!-- Text of the attestation -->
<!ELEMENT beleg-text (#PCDATA | lemma-beleg | redaktion-ergaenzung | lesart | hervorhebung | sigle | %textauszeichnungen;)* >
<!ELEMENT redaktion-ergaenzung (#PCDATA | %textauszeichnungen;)* > <!-- editorial additions by the article author; the round brackets are generated -->
<!ELEMENT lemma-beleg (#PCDATA) > <!-- occurrence of the lemma in the attestation text; the long form is recorded here, the short form goes into the attribute kurzform -->
<!ELEMENT lesart (#PCDATA | redaktion-ergaenzung)* > <!-- different readings / lemma variants that are listed here as well -->
<!ELEMENT hervorhebung (#PCDATA) > <!-- small corner marks in front of a round bracket that is designated as a reading -->
<!ELEMENT sigle (#PCDATA) > <!-- siglum of a literature reference -->
<!ATTLIST lemma-beleg kurzform CDATA #IMPLIED >
<!-- Text markup at various places -->
<!ELEMENT kursiv (#PCDATA | gerade)* >
<!ELEMENT gesperrt (#PCDATA) >
<!ELEMENT gerade (#PCDATA | kursiv)* >
<!ELEMENT hoch (#PCDATA) >
......@@ -11,21 +11,21 @@ LEMMA facitergul|a
GRAMMATIK
nomen; -ae f.
-us, -i m.: Verweis_001
-um, -i n.: Verweis_002
-us, -i m.: v. ibi
-um, -i n.: v. ibi
SCHREIBWEISE
script.:
vizreg-: Verweis_003
festregel(a): Verweis_004
fezdregl(a): Verweis_005
vizreg-: v. ibi
festregel(a): v. ibi