MLW_compiler.py 15.2 KB
Newer Older
di68kap's avatar
di68kap committed
1
#!/usr/bin/python
di68kap's avatar
di68kap committed
2 3 4 5 6 7 8 9

#######################################################################
#
# SYMBOLS SECTION - Can be edited. Changes will be preserved.
#
#######################################################################


di68kap's avatar
di68kap committed
10
import sys
11 12
from functools import partial

di68kap's avatar
di68kap committed
13 14 15 16
try:
    import regex as re
except ImportError:
    import re
17
from DHParser.parsers import Grammar, Compiler, Alternative, Required, Token, \
18 19 20
    Optional, OneOrMore, Sequence, RE, ZeroOrMore, NegativeLookahead, mixin_comment, compile_source
from DHParser.syntaxtree import traverse, reduce_single_child, replace_by_single_child, no_operation, \
    remove_expendables, remove_tokens, flatten, \
di68kap's avatar
di68kap committed
21
    WHITESPACE_KEYWORD, TOKEN_KEYWORD
di68kap's avatar
di68kap committed
22 23 24 25 26 27 28 29


#######################################################################
#
# SCANNER SECTION - Can be edited. Changes will be preserved.
#
#######################################################################

30 31 32 33
def MLWScanner(text):
    """Scanner (preprocessing) stage for MLW sources.

    No preprocessing is performed: the source text is handed back unchanged.
    """
    return text


di68kap's avatar
di68kap committed
34 35 36 37 38
#######################################################################
#
# PARSER SECTION - Don't edit! CHANGES WILL BE OVERWRITTEN!
#
#######################################################################
39

40
class MLWGrammar(Grammar):
    r"""Parser for a MLW source file, with this grammar:

    # EBNF syntax for MLW articles

    @ comment       =  /#.*(?:\n|$)/    # comments start with '#' and run to the end of the line
    @ whitespace    =  /[\t ]*/         # line breaks do not count as whitespace
    @ literalws     =  both             # whitespace before and after literals is removed automatically

    Artikel         = [LEER]
                      §LemmaPosition  [ArtikelKopf]  §BedeutungsPosition  §Autorinfo
                      [LEER]  DATEI_ENDE


    #### LEMMA POSITION ##########################################################

    LemmaPosition   = "LEMMA"  §Lemma  [LemmaVarianten]  §GrammatikPosition

    Lemma           = [_tll]  WORT_KLEIN [LEER]
    _tll            = "*"

    LemmaVarianten  = "VARIANTEN" [LEER] §LVariante  { TRENNER LVariante }
                      [TRENNER LVZusatz] [TRENNER]
    LVariante       = ~/(?:[a-z]|-)+/~      # sequence of letters with "-" as separator character
    LVZusatz        = "ZUSATZ" "sim."



    #### GRAMMAR POSITION ########################################################

    GrammatikPosition = "GRAMMATIK" [LEER] §_wortart §TRENNER §Flexionen [_genus]
                        {GrammatikVarianten} [TRENNER]

    _wortart        = "nomen"  | "n." |
                      "verb"   | "v." |
                      "adverb" | "adv." |
                      "adjektiv" | "adj."

    GrammatikVarianten = TRENNER GVariante
    GVariante       = Flexionen  [_genus]  ":"  Beleg

    Flexionen       = Flexion { "," §Flexion }
    Flexion         = /-?[a-z]+/~

    _genus          = "maskulinum" | "m." |
                      "femininum" | "f." |
                      "neutrum" | "n."



    #### ARTICLE HEAD ############################################################

    ArtikelKopf     = SchreibweisenPosition
    SchreibweisenPosition =  "SCHREIBWEISE" [LEER] §SWTyp ":" [LEER]
                             §SWVariante { TRENNER SWVariante} [LEER]
    SWTyp           = "script." | "script. fat-"
    SWVariante      = Schreibweise ":" Beleg
    Schreibweise    = "vizreg-" | "festregel(a)" | "fezdregl(a)" | "fat-"

    Beleg           = Verweis
    Verweis         = ~/\w+/~
    VerweisZiel     = ~/<\w+>/~


    #### MEANING POSITION ########################################################

    BedeutungsPosition = { "BEDEUTUNG" [LEER] §Bedeutung }+

    Bedeutung       = (Interpretamente | Bedeutungskategorie) [Belege]
    Bedeutungskategorie = /(?:(?![A-ZÄÖÜ][A-ZÄÖÜ]).)+/~ [LEER]
    Interpretamente = LateinischeBedeutung [LEER] §DeutscheBedeutung [LEER]
    LateinischeBedeutung = "LAT" /(?:(?![A-ZÄÖÜ][A-ZÄÖÜ]).)+/~
    DeutscheBedeutung = "DEU" /(?:(?![A-ZÄÖÜ][A-ZÄÖÜ]).)+/~
    Belege          = "BELEGE" [LEER] { "*" EinBeleg }
    EinBeleg        = { !([LEER] ("*" | "BEDEUTUNG" | "AUTOR" | "NAME" | "ZUSATZ"))
                        /\s*.*\s*/ }+
                      [Zusatz]
    Zusatz          = "ZUSATZ" /\s*.*/ TRENNER


    #### AUTHOR ##################################################################

    Autorinfo       = ("AUTORIN" | "AUTOR") Name
    Name            = WORT { WORT | NAMENS_ABKÜRZUNG }


    #### ATOMIC EXPRESSIONS ######################################################

    NAMENS_ABKÜRZUNG = /[A-ZÄÖÜÁÀ]\./

    WORT             = /[A-ZÄÖÜ]?[a-zäöüß]+/~
    WORT_GROSS       = /[A-ZÄÖÜ][a-zäöüß]+/~
    WORT_KLEIN       = /[a-zäöüß]+/~
    LAT_WORT         = /[a-z]+/~
    GROSSSCHRIFT     = /[A-ZÄÖÜ]+/~

    TRENNER          = /\s*;\s*/ | { ZSPRUNG }+
    ZSPRUNG          = /\n/~

    LEER             = /\s+/        # horizontal and(!) vertical whitespace
    DATEI_ENDE       = !/./
    NIEMALS          = /(?!.)/
    """
    # Bookkeeping values written by the generator; reproduced verbatim.
    source_hash__ = "9fce888d1b21b2d11a6228e0b97f9291"
    parser_initialization__ = "upon instatiation"

    # Comment and whitespace patterns, matching the @-directives of the grammar.
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'[\t ]*', comment=r'#.*(?:\n|$)')
    wspL__ = WSP__
    wspR__ = WSP__

    # The parsers are bound bottom-up: every definition only refers to names
    # that have already been assigned above it, so the order must be kept.

    # --- atomic expressions ------------------------------------------------
    NIEMALS = RE(r'(?!.)', wR='', wL='')
    DATEI_ENDE = NegativeLookahead(RE(r'.', wR='', wL=''))
    LEER = RE(r'\s+', wR='', wL='')
    ZSPRUNG = RE(r'\n', wL='')
    TRENNER = Alternative(
        RE(r'\s*;\s*', wR='', wL=''),
        OneOrMore(ZSPRUNG))
    GROSSSCHRIFT = RE(r'[A-ZÄÖÜ]+', wL='')
    LAT_WORT = RE(r'[a-z]+', wL='')
    WORT_KLEIN = RE(r'[a-zäöüß]+', wL='')
    WORT_GROSS = RE(r'[A-ZÄÖÜ][a-zäöüß]+', wL='')
    WORT = RE(r'[A-ZÄÖÜ]?[a-zäöüß]+', wL='')
    NAMENS_ABKÜRZUNG = RE(r'[A-ZÄÖÜÁÀ]\.', wR='', wL='')

    # --- author ------------------------------------------------------------
    Name = Sequence(
        WORT,
        ZeroOrMore(Alternative(WORT, NAMENS_ABKÜRZUNG)))
    Autorinfo = Sequence(
        Alternative(Token("AUTORIN"), Token("AUTOR")),
        Name)

    # --- meaning position --------------------------------------------------
    Zusatz = Sequence(
        Token("ZUSATZ"),
        RE(r'\s*.*', wR='', wL=''),
        TRENNER)
    EinBeleg = Sequence(
        OneOrMore(
            Sequence(
                NegativeLookahead(
                    Sequence(
                        Optional(LEER),
                        Alternative(
                            Token("*"),
                            Token("BEDEUTUNG"),
                            Token("AUTOR"),
                            Token("NAME"),
                            Token("ZUSATZ")))),
                RE(r'\s*.*\s*', wR='', wL=''))),
        Optional(Zusatz))
    Belege = Sequence(
        Token("BELEGE"),
        Optional(LEER),
        ZeroOrMore(Sequence(Token("*"), EinBeleg)))
    DeutscheBedeutung = Sequence(
        Token("DEU"),
        RE(r'(?:(?![A-ZÄÖÜ][A-ZÄÖÜ]).)+', wL=''))
    LateinischeBedeutung = Sequence(
        Token("LAT"),
        RE(r'(?:(?![A-ZÄÖÜ][A-ZÄÖÜ]).)+', wL=''))
    Interpretamente = Sequence(
        LateinischeBedeutung,
        Optional(LEER),
        Required(DeutscheBedeutung),
        Optional(LEER))
    Bedeutungskategorie = Sequence(
        RE(r'(?:(?![A-ZÄÖÜ][A-ZÄÖÜ]).)+', wL=''),
        Optional(LEER))
    Bedeutung = Sequence(
        Alternative(Interpretamente, Bedeutungskategorie),
        Optional(Belege))
    BedeutungsPosition = OneOrMore(
        Sequence(
            Token("BEDEUTUNG"),
            Optional(LEER),
            Required(Bedeutung)))

    # --- article head ------------------------------------------------------
    VerweisZiel = RE(r'<\w+>')
    Verweis = RE(r'\w+')
    Beleg = Verweis
    Schreibweise = Alternative(
        Token("vizreg-"),
        Token("festregel(a)"),
        Token("fezdregl(a)"),
        Token("fat-"))
    SWVariante = Sequence(Schreibweise, Token(":"), Beleg)
    SWTyp = Alternative(Token("script."), Token("script. fat-"))
    SchreibweisenPosition = Sequence(
        Token("SCHREIBWEISE"),
        Optional(LEER),
        Required(SWTyp),
        Token(":"),
        Optional(LEER),
        Required(SWVariante),
        ZeroOrMore(Sequence(TRENNER, SWVariante)),
        Optional(LEER))
    ArtikelKopf = SchreibweisenPosition

    # --- grammar position --------------------------------------------------
    _genus = Alternative(
        Token("maskulinum"), Token("m."),
        Token("femininum"), Token("f."),
        Token("neutrum"), Token("n."))
    Flexion = RE(r'-?[a-z]+', wL='')
    Flexionen = Sequence(
        Flexion,
        ZeroOrMore(Sequence(Token(","), Required(Flexion))))
    GVariante = Sequence(Flexionen, Optional(_genus), Token(":"), Beleg)
    GrammatikVarianten = Sequence(TRENNER, GVariante)
    _wortart = Alternative(
        Token("nomen"), Token("n."),
        Token("verb"), Token("v."),
        Token("adverb"), Token("adv."),
        Token("adjektiv"), Token("adj."))
    GrammatikPosition = Sequence(
        Token("GRAMMATIK"),
        Optional(LEER),
        Required(_wortart),
        Required(TRENNER),
        Required(Flexionen),
        Optional(_genus),
        ZeroOrMore(GrammatikVarianten),
        Optional(TRENNER))

    # --- lemma position ----------------------------------------------------
    LVZusatz = Sequence(Token("ZUSATZ"), Token("sim."))
    LVariante = RE(r'(?:[a-z]|-)+')
    LemmaVarianten = Sequence(
        Token("VARIANTEN"),
        Optional(LEER),
        Required(LVariante),
        ZeroOrMore(Sequence(TRENNER, LVariante)),
        Optional(Sequence(TRENNER, LVZusatz)),
        Optional(TRENNER))
    _tll = Token("*")
    Lemma = Sequence(Optional(_tll), WORT_KLEIN, Optional(LEER))
    LemmaPosition = Sequence(
        Token("LEMMA"),
        Required(Lemma),
        Optional(LemmaVarianten),
        Required(GrammatikPosition))

    # --- root --------------------------------------------------------------
    Artikel = Sequence(
        Optional(LEER),
        Required(LemmaPosition),
        Optional(ArtikelKopf),
        Required(BedeutungsPosition),
        Required(Autorinfo),
        Optional(LEER),
        DATEI_ENDE)
    root__ = Artikel
    

di68kap's avatar
di68kap committed
196 197 198 199 200
#######################################################################
#
# AST SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
201 202 203 204 205 206 207 208 209 210 211 212

def join_strings(node, delimiter='\n'):
    """Merge each run of adjacent leaf children of ``node`` into one child.

    A child counts as a leaf when its ``children`` attribute is empty. The
    first leaf of every run receives the ``delimiter``-joined results of the
    whole run and is kept; the remaining leaves of that run are dropped.
    Children that have children of their own are kept unchanged, but only
    if their parser is named "Zusatz".

    ``node.result`` is replaced by a tuple of the kept children; nothing is
    returned.

    Raises:
        AssertionError: if a child with children is not a "Zusatz" node. The
            message is that child's S-expression. ``node.result`` is left
            untouched in that case, although leaf runs preceding the
            offending child have already been merged.
    """
    kept = []
    run = []  # adjacent leaves collected since the last non-leaf child

    def close_run():
        # Fold the collected leaves into the first one and keep only that.
        if run:
            head = run[0]
            head.result = delimiter.join(leaf.result for leaf in run)
            kept.append(head)
            del run[:]

    for child in node.result:
        if not child.children:
            run.append(child)
            continue
        close_run()
        if child.parser.name != "Zusatz":
            raise AssertionError(child.as_sexpr())
        kept.append(child)
    close_run()
    node.result = tuple(kept)


di68kap's avatar
di68kap committed
221
# AST transformations for the MLW grammar.
#
# Each key names one grammar symbol, or several separated by ", ", or is a
# tuple of node-type keywords. Each value is a single transformation or a
# list of transformations. The table is handed to ``traverse`` below.
# NOTE(review): the special keys "*", "~" and "" are presumably interpreted
# by DHParser's ``traverse``; confirm their meaning against its docs.
MLW_AST_transformation_table = {
    # --- article and lemma position ---
    "Artikel": no_operation,
    "LemmaPosition": [partial(remove_tokens, tokens={"LEMMA"})],
    "Lemma": no_operation,
    "_tll, _wortart, _genus": [remove_expendables, reduce_single_child],
    "LemmaVarianten": [
        partial(remove_tokens, tokens={"VARIANTEN"}),
        flatten,
        partial(remove_tokens, tokens={",", ";"}),
    ],
    "LVariante, LVZusatz, Schreibweise, Name": [
        remove_expendables,
        reduce_single_child,
    ],
    "SWVariante": [remove_expendables, partial(remove_tokens, tokens={":"})],

    # --- grammar position ---
    "GrammatikPosition": [
        partial(remove_tokens, tokens={"GRAMMATIK", ";"}),
        flatten,
    ],
    "GrammatikVarianten": [
        partial(remove_tokens, tokens={";"}),
        replace_by_single_child,
    ],
    "GVariante": [partial(remove_tokens, tokens={":"})],
    "Flexionen": [flatten, partial(remove_tokens, tokens={",", ";"})],
    "Flexion, Verweis": [remove_expendables, reduce_single_child],
    "Zusatz": [remove_expendables, remove_tokens, reduce_single_child],

    # --- article head ---
    "ArtikelKopf": no_operation,
    "SchreibweisenPosition": [
        partial(remove_tokens, tokens={"SCHREIBWEISE", ":"}),
        flatten,
        partial(remove_tokens, tokens={","}),
    ],
    "SWTyp": no_operation,

    # --- meaning position ---
    "BedeutungsPosition": [
        flatten,
        partial(remove_tokens, tokens={"BEDEUTUNG"}),
    ],
    "Bedeutung": no_operation,
    "Bedeutungskategorie": no_operation,
    "Interpretamente": no_operation,
    "LateinischeBedeutung, DeutscheBedeutung": [
        remove_expendables,
        remove_tokens,
        reduce_single_child,
    ],
    "Belege": [flatten, remove_tokens],
    "EinBeleg": [
        flatten,
        remove_expendables,
        join_strings,
        reduce_single_child,
    ],
    "Beleg": no_operation,
    "VerweisZiel": no_operation,

    # --- author and atomic expressions ---
    "Autorinfo": [partial(remove_tokens, tokens={"AUTORIN", "AUTOR"})],
    "WORT, WORT_KLEIN, WORT_GROSS, GROSSSCHRIFT": [
        remove_expendables,
        reduce_single_child,
    ],
    "LEER": no_operation,
    "DATEI_ENDE": no_operation,
    "NIEMALS": no_operation,

    # --- keyword and special keys ---
    (TOKEN_KEYWORD, WHITESPACE_KEYWORD): [
        remove_expendables,
        reduce_single_child,
    ],
    "*": remove_expendables,
    "~": partial(remove_tokens, tokens={",", ";"}),
    "": [remove_expendables, replace_by_single_child],
}

# AST transformation stage: ``traverse`` with the table above pre-bound.
MLWTransform = partial(
    traverse,
    processing_table=MLW_AST_transformation_table,
)

286

di68kap's avatar
di68kap committed
287 288 289 290 291
#######################################################################
#
# COMPILER SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
292

293
class MLWCompiler(Compiler):
    """Compiler for the abstract-syntax-tree of a MLW source file.

    There is one ``on_<symbol>`` handler per symbol of the MLW grammar. All
    handlers are still stubs: ``on_Artikel`` returns its node unchanged and
    every other handler returns ``None``.

    NOTE(review): the handlers are presumably dispatched by DHParser's
    ``Compiler`` base class according to the name of the node's parser;
    confirm against the DHParser documentation.
    """

    def __init__(self, grammar_name="MLW"):
        """Initialise the base compiler and sanity-check ``grammar_name``.

        Args:
            grammar_name: name of the grammar; must consist of word
                characters only.

        Raises:
            AssertionError: if ``grammar_name`` contains anything other
                than word characters (only when assertions are enabled).
        """
        super(MLWCompiler, self).__init__()
        # Raw string: '\w' and '\Z' are invalid escapes in a plain literal.
        assert re.match(r'\w+\Z', grammar_name)

    def on_Artikel(self, node):
        """Root handler: return the article node unchanged."""
        return node

    # --- lemma position (stubs) -------------------------------------------

    def on_LemmaPosition(self, node):
        pass

    def on_Lemma(self, node):
        pass

    def on__tll(self, node):
        pass

    def on_LemmaVarianten(self, node):
        pass

    def on_LVariante(self, node):
        pass

    def on_LVZusatz(self, node):
        pass

    # --- grammar position (stubs) -----------------------------------------

    def on_GrammatikPosition(self, node):
        pass

    def on__wortart(self, node):
        pass

    def on_GrammatikVarianten(self, node):
        pass

    def on_GVariante(self, node):
        pass

    def on_Flexionen(self, node):
        pass

    def on_Flexion(self, node):
        pass

    def on__genus(self, node):
        pass

    # --- article head (stubs) ---------------------------------------------

    def on_ArtikelKopf(self, node):
        pass

    def on_SchreibweisenPosition(self, node):
        pass

    def on_SWTyp(self, node):
        pass

    def on_SWVariante(self, node):
        pass

    def on_Schreibweise(self, node):
        pass

    def on_Beleg(self, node):
        pass

    def on_Verweis(self, node):
        pass

    def on_VerweisZiel(self, node):
        pass

    # --- meaning position (stubs) -----------------------------------------

    def on_BedeutungsPosition(self, node):
        pass

    def on_Bedeutung(self, node):
        pass

    def on_Bedeutungskategorie(self, node):
        pass

    def on_Interpretamente(self, node):
        pass

    def on_LateinischeBedeutung(self, node):
        pass

    def on_DeutscheBedeutung(self, node):
        pass

    def on_Belege(self, node):
        pass

    def on_EinBeleg(self, node):
        pass

    def on_Zusatz(self, node):
        pass

    # --- author (stubs) ---------------------------------------------------

    def on_Autorinfo(self, node):
        pass

    def on_Name(self, node):
        pass

    # --- atomic expressions (stubs) ---------------------------------------

    def on_NAMENS_ABKÜRZUNG(self, node):
        pass

    def on_WORT(self, node):
        pass

    def on_WORT_GROSS(self, node):
        pass

    def on_WORT_KLEIN(self, node):
        pass

    def on_LAT_WORT(self, node):
        pass

    def on_GROSSSCHRIFT(self, node):
        pass

    def on_TRENNER(self, node):
        pass

    def on_ZSPRUNG(self, node):
        pass

    def on_LEER(self, node):
        pass

    def on_DATEI_ENDE(self, node):
        pass

    def on_NIEMALS(self, node):
        pass
di68kap's avatar
di68kap committed
432 433 434 435


#######################################################################
#
di68kap's avatar
di68kap committed
436
# END OF DHPARSER-SECTIONS
di68kap's avatar
di68kap committed
437 438 439
#
#######################################################################

di68kap's avatar
di68kap committed
440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457

def compile_MLW(source):
    """Run the complete MLW tool chain on ``source``.

    The source is passed to ``compile_source`` together with the MLW
    scanner, a fresh ``MLWGrammar``, the AST transformation and a fresh
    ``MLWCompiler``.

    Returns:
        The ``(result, errors, ast)`` triple produced by ``compile_source``.
    """
    grammar = MLWGrammar()
    compiler = MLWCompiler()
    return compile_source(source, MLWScanner, grammar, MLWTransform, compiler)

# Command-line entry point: compile the source given as first argument and
# print either the compilation result or the list of errors.
if __name__ == "__main__":
    if len(sys.argv) > 1:
        result, errors, ast = compile_MLW(sys.argv[1])
        if errors:
            # Report every error before exiting; exiting inside the loop
            # would hide all errors after the first one.
            for error in errors:
                print(error)
            sys.exit(1)
        else:
            print(result)
    else:
        print("Usage: MLW_compiler.py [FILENAME]")