ebnf.py 41.6 KB
Newer Older
1
"""ebnf.py - EBNF -> Python-Parser compilation for DHParser
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences an Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20
from collections import OrderedDict
21
from functools import partial
22

23
24
25
26
try:
    import regex as re
except ImportError:
    import re
27
try:
Eckhart Arnold's avatar
Eckhart Arnold committed
28
    from typing import Callable, Dict, List, Set, Tuple, Union
29
except ImportError:
Eckhart Arnold's avatar
Eckhart Arnold committed
30
    from .typing34 import Callable, Dict, List, Set, Tuple, Union
31

32
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
33
34
from DHParser.parser import Grammar, mixin_comment, nil_preprocessor, Forward, RegExp, RE, \
    NegativeLookahead, Alternative, Series, Option, OneOrMore, ZeroOrMore, Token, \
35
    Required, Compiler, PreprocessorFunc
36
37
from DHParser.syntaxtree import Node, TransformationFunc, WHITESPACE_PTYPE, TOKEN_PTYPE
from DHParser.error import Error
38
from DHParser.transform import traverse, remove_brackets, \
39
    reduce_single_child, replace_by_single_child, remove_expendables, \
40
    remove_tokens, flatten, forbid, assert_content, remove_infix_operator
41
from DHParser.versionnumber import __version__
42

43
# Public API of this module: the factory functions, the EBNF grammar /
# transformer / compiler classes, and the factory-function type aliases.
__all__ = ('get_ebnf_preprocessor',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
           'EBNFTransform',
           'EBNFCompilerError',
           'EBNFCompiler',
           'grammar_changed',
           'PreprocessorFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc')
56
57


Eckhart Arnold's avatar
Eckhart Arnold committed
58
59
60
61
62
63
64
########################################################################
#
# EBNF scanning
#
########################################################################


65
66
def get_ebnf_preprocessor() -> PreprocessorFunc:
    """Returns the preprocessor for EBNF sources. This is the
    `nil_preprocessor` (which, judging by its name, presumably leaves
    the source text unchanged — EBNF needs no preprocessing)."""
    return nil_preprocessor
Eckhart Arnold's avatar
Eckhart Arnold committed
67
68
69
70
71
72
73
74


########################################################################
#
# EBNF parsing
#
########################################################################

75

76
# class EBNFGrammar(Grammar):
di68kap's avatar
di68kap committed
77
#     r"""Parser for an EBNF_variant source file, with this grammar:
78
79
80
81
82
83
84
85
#
#     # EBNF-Grammar in EBNF
#
#     @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
#     @ whitespace =  /\s*/                            # whitespace includes linefeed
#     @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly
#
#     syntax     =  [~//] { definition | directive } §EOF
di68kap's avatar
di68kap committed
86
87
#     definition =  symbol §"=" §expression
#     directive  =  "@" §symbol §"=" §( regexp | literal | list_ )
88
89
#
#     expression =  term { "|" term }
di68kap's avatar
di68kap committed
90
#     term       =  { factor }+
91
92
93
94
95
96
97
98
#     factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
#                 | [flowmarker] literal
#                 | [flowmarker] regexp
#                 | [flowmarker] group
#                 | [flowmarker] oneormore
#                 | repetition
#                 | option
#
di68kap's avatar
di68kap committed
99
#     flowmarker =  "!"  | "&"  | "§"                  # '!' negative lookahead, '&' positive lookahead, '§' required
100
#                 | "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
101
102
103
104
105
106
107
108
109
110
#     retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve
#
#     group      =  "(" expression §")"
#     oneormore  =  "{" expression "}+"
#     repetition =  "{" expression §"}"
#     option     =  "[" expression §"]"
#
#     symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
#     literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
#                 | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
111
#     regexp     =  /~?\/(?:\\\/|[^\/])*?\/~?/~        # e.g. /\w+/, ~/#.*(?:\n|$)/~
112
113
114
115
116
117
118
#                                                      # '~' is a whitespace-marker, if present leading or trailing
#                                                      # whitespace of a regular expression will be ignored tacitly.
#     list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
#                                                      # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an exmaple
#     EOF =  !/./
#     """
#     expression = Forward()
di68kap's avatar
di68kap committed
119
#     source_hash__ = "4735db10f0b79d44209d1de0184b2ca0"
120
121
#     parser_initialization__ = "upon instantiation"
#     COMMENT__ = r'#.*(?:\n|$)'
122
123
#     WHITESPACE__ = r'\s*'
#     WSP__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
124
125
#     wspL__ = ''
#     wspR__ = WSP__
126
#     EOF = NegativeLookahead(RegExp('.'))
di68kap's avatar
di68kap committed
127
#     list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
128
#     regexp = RE('~?/(?:\\\\/|[^/])*?/~?')
129
130
#     literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
#     symbol = RE('(?!\\d)\\w+')
di68kap's avatar
di68kap committed
131
132
133
134
#     option = Series(Token("["), expression, Required(Token("]")))
#     repetition = Series(Token("{"), expression, Required(Token("}")))
#     oneormore = Series(Token("{"), expression, Token("}+"))
#     group = Series(Token("("), expression, Required(Token(")")))
135
#     retrieveop = Alternative(Token("::"), Token(":"))
di68kap's avatar
di68kap committed
136
137
138
139
140
141
142
143
144
145
#     flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
#     factor = Alternative(Series(Option(flowmarker), Option(retrieveop), symbol, NegativeLookahead(Token("="))),
#                          Series(Option(flowmarker), literal), Series(Option(flowmarker), regexp),
#                          Series(Option(flowmarker), group), Series(Option(flowmarker), oneormore), repetition, option)
#     term = OneOrMore(factor)
#     expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
#     directive = Series(Token("@"), Required(symbol), Required(Token("=")),
#                        Required(Alternative(regexp, literal, list_)))
#     definition = Series(symbol, Required(Token("=")), Required(expression))
#     syntax = Series(Option(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
146
147
#     root__ = syntax

148

di68kap's avatar
di68kap committed
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol "=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { ["§"] factor }+                       # "§" means all following factors mandatory
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"                         # '!' negative lookahead, '&' positive lookahead
                | "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:\\\/|[^\/])*?\/~?/~        # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
    EOF =  !/./
    """
    expression = Forward()
    source_hash__ = "a131abc5259738631000cda90d2fc65b"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'#.*(?:\n|$)'
    WHITESPACE__ = r'\s*'
    WSP__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
    wspL__ = ''
    wspR__ = WSP__
    EOF = NegativeLookahead(RegExp('.'))
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
    regexp = RE('~?/(?:\\\\/|[^/])*?/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Series(Token("["), expression, Token("]"), mandatory=1)
    repetition = Series(Token("{"), expression, Token("}"), mandatory=1)
    oneormore = Series(Token("{"), expression, Token("}+"))
    group = Series(Token("("), expression, Token(")"), mandatory=1)
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("-!"), Token("-&"))
    factor = Alternative(Series(Option(flowmarker), Option(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Series(Option(flowmarker), literal), Series(Option(flowmarker), regexp),
                         Series(Option(flowmarker), group), Series(Option(flowmarker), oneormore), repetition, option)
    term = OneOrMore(Series(Option(Token("§")), factor))
    expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
    directive = Series(Token("@"), symbol, Token("="), Alternative(regexp, literal, list_), mandatory=1)
    definition = Series(symbol, Token("="), expression, mandatory=1)
    syntax = Series(Option(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), EOF, mandatory=2)
    root__ = syntax


221
def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
        # raw strings: '\w' and '\(' are not valid Python string escapes
        m = re.search(r'class \w*\(Grammar\)', pycode)
        if m:
            # compare the stored hash of the generated class against the
            # hash of the current grammar source
            m = re.search(r'    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            return True
    else:
        return chksum != grammar_class.source_hash__


252
def get_ebnf_grammar() -> EBNFGrammar:
    """Returns a thread-local singleton instance of `EBNFGrammar`,
    creating it on first use in the current thread."""
    global thread_local_ebnf_grammar_singleton
    try:
        grammar = thread_local_ebnf_grammar_singleton
        return grammar
    except NameError:
        # first call in this thread: create and memoize the instance
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
        return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


269
EBNF_AST_transformation_table = {
    # AST Transformations for EBNF-grammar
    "+":
        remove_expendables,
    "syntax":
        [],  # otherwise '"*": replace_by_single_child' would be applied
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],  # remove_infix_operator],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_brackets, replace_by_single_child],
    "oneormore, repetition, option":
        [reduce_single_child, remove_brackets,
         forbid('repetition', 'option', 'oneormore'), assert_content(r'(?!§)')],
    "symbol, literal, regexp":
        reduce_single_child,
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        reduce_single_child,
    "list_":
        [flatten, remove_infix_operator],
    "*":
        replace_by_single_child
}

298

Eckhart Arnold's avatar
Eckhart Arnold committed
299
def EBNFTransform() -> TransformationFunc:
    """Returns a transformation function for EBNF-ASTs, bound to a fresh
    copy of the transformation table (so callers may mutate it safely)."""
    return partial(traverse, processing_table=EBNF_AST_transformation_table.copy())
di68kap's avatar
di68kap committed
301

302
def get_ebnf_transformer() -> TransformationFunc:
    """Returns a thread-local singleton EBNF-AST transformation function,
    creating it on first use in the current thread."""
    global thread_local_EBNF_transformer_singleton
    try:
        transformer = thread_local_EBNF_transformer_singleton
    except NameError:
        # first call in this thread: create and memoize the transformer
        thread_local_EBNF_transformer_singleton = EBNFTransform()
        transformer = thread_local_EBNF_transformer_singleton
    return transformer
Eckhart Arnold's avatar
Eckhart Arnold committed
310
311
312
313
314
315
316
317


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

318

319
# Type aliases for the four factory functions that a generated compiler
# suite exposes.
PreprocessorFactoryFunc = Callable[[], PreprocessorFunc]
ParserFactoryFunc = Callable[[], Grammar]
TransformerFactoryFunc = Callable[[], TransformationFunc]
CompilerFactoryFunc = Callable[[], Compiler]


# Code templates for the factory functions emitted into generated
# compiler suites. '{NAME}' is replaced by the grammar's name.

PREPROCESSOR_FACTORY = '''
def get_preprocessor() -> PreprocessorFunc:
    return {NAME}Preprocessor
'''


GRAMMAR_FACTORY = '''
def get_grammar() -> {NAME}Grammar:
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        grammar = thread_local_{NAME}_grammar_singleton
    return grammar
'''


TRANSFORMER_FACTORY = '''
def {NAME}Transform() -> TransformationDict:
    return partial(traverse, processing_table={NAME}_AST_transformation_table.copy())

def get_transformer() -> TransformationFunc:
    global thread_local_{NAME}_transformer_singleton
    try:
        transformer = thread_local_{NAME}_transformer_singleton
    except NameError:
        thread_local_{NAME}_transformer_singleton = {NAME}Transform()
        transformer = thread_local_{NAME}_transformer_singleton
    return transformer
'''


COMPILER_FACTORY = '''
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        compiler = thread_local_{NAME}_compiler_singleton
    return compiler
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
370

371
372
class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
    in the strict sense, see `CompilationError` in module ``dsl.py``)"""
    pass


377
class EBNFCompiler(Compiler):
    """
    Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.

    Instances of this class must be called with the root-node of the
    abstract syntax tree from an EBNF-specification of a formal language.
    The returned value is the Python-source-code of a Grammar class for
    this language that can be used to parse texts in this language.
    See classes `parser.Compiler` and `parser.Grammar` for more information.

    Additionally, class EBNFCompiler provides helper methods to generate
    code-skeletons for a preprocessor, AST-transformation and full
    compilation of the formal language. These method's names start with
    the prefix `gen_`.

    Attributes:
        current_symbols:  During compilation, a list containing the root
                node of the currently compiled definition as first element
                and then the nodes of the symbols that are referred to in
                the currently compiled definition.

        rules:  Dictionary that maps rule names to a list of Nodes that
                contain symbol-references in the definition of the rule.
                The first item in the list is the node of the rule-
                definition itself. Example:

                           `alternative = a | b`

                Now `[str(node) for node in self.rules['alternative']]`
                yields `['alternative = a | b', 'a', 'b']`

        symbols:  A mapping of symbol names to their first usage (not
                their definition!) in the EBNF source.

        variables:  A set of symbol names that are used with the
                Pop or Retrieve operator. Because the values of these
                symbols need to be captured they are called variables.
                See `test_parser.TestPopRetrieve` for an example.

        recursive:  A set of symbols that are used recursively and
                therefore require a `Forward`-operator.

        definitions:  A dictionary of definitions. Other than `rules`
                this maps the symbols to their compiled definienda.

        deferred_tasks:  A list of callables that is filled during
                compilation, but that will be executed only after
                compilation has finished. Typically, it contains
                semantic checks that require information that
                is only available upon completion of compilation.

        root:   The name of the root symbol.

        directives:  A dictionary of all directives and their default
                values.

        re_flags:  A set of regular expression flags to be added to all
                regular expressions found in the current parsing process
    """
    # names of the special symbols injected into every generated Grammar class
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    RAW_WS_KEYWORD = "WHITESPACE__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, RAW_WS_KEYWORD, COMMENT_KEYWORD}
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneous AST transformation."
    # maps EBNF prefix operators to the names of the parser classes
    # that implement them
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    # predefined whitespace regexes selectable via the @whitespace directive
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}
450

451

452
    def __init__(self, grammar_name="", grammar_source=""):
Eckhart Arnold's avatar
Eckhart Arnold committed
453
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
454
455
        self._reset()

456

457
    def _reset(self):
        """Resets all compilation state to its defaults, so the compiler
        instance can be reused for another compilation run."""
        super(EBNFCompiler, self)._reset()
        self._result = ''           # type: str
        self.re_flags = set()       # type: Set[str]
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
        self.variables = set()      # type: Set[str]
        self.recursive = set()      # type: Set[str]
        self.definitions = {}       # type: Dict[str, str]
        self.deferred_tasks = []    # type: List[Callable]
        self.root_symbol = ""       # type: str
        # directive defaults; overridden by '@'-directives in the EBNF source
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
                           'comment': '',
                           'literalws': ['right'],
                           'tokens': set(),  # alt. 'preprocessor_tokens'
                           'filter': dict(),  # alt. 'filter'
                           'ignorecase': False}
475

Eckhart Arnold's avatar
Eckhart Arnold committed
476
    @property
    def result(self) -> str:
        """The Python source code generated by the last compilation run
        (the empty string if no compilation has taken place yet)."""
        return self._result

480
    # methods for generating skeleton code for preprocessor, transformer, and compiler
481

482
    def gen_preprocessor_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for a preprocessor-function for
        the previously compiled formal language.
        """
        name = self.grammar_name + "Preprocessor"
        # identity preprocessor stub plus the factory function for it
        return "def %s(text):\n    return text\n" % name \
               + PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)
490

491

492
    def gen_transformer_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for the AST-transformation for the
        previously compiled formal language.

        Raises:
            EBNFCompilerError: if no grammar has been compiled yet.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
        tt_name = self.grammar_name + '_AST_transformation_table'
        transtable = [tt_name + ' = {',
                      '    # AST Transformations for the ' + self.grammar_name + '-grammar']
        transtable.append('    "+": remove_empty,')
        for name in self.rules:
            # suggest a default transformation depending on the shape of
            # the rule's compiled definition
            tf = '[]'
            rule = self.definitions[name]
            if rule.startswith('Alternative'):
                tf = '[replace_or_reduce]'
            elif rule.startswith('Synonym'):
                tf = '[replace_by_single_child]'
            transtable.append('    "' + name + '": %s,' % tf)
        transtable.append('    ":Token, :RE": reduce_single_child,')
        transtable += ['    "*": replace_by_single_child', '}', '']
        transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(transtable)

517

518
    def gen_compiler_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for a Compiler-class for the
        previously compiled formal language.

        Raises:
            EBNFCompilerError: if no grammar has been compiled yet.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    # raw string: yields the same bytes but avoids the
                    # invalid '\w'/'\Z' escape-sequence warning
                    r"        assert re.match('\w+\Z', grammar_name)", '']
        for name in self.rules:
            # one stub method per rule; the root rule returns its node
            method_name = Compiler.method_name(name)
            if name == self.root_symbol:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        return node', '']
            else:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        pass', '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)
545

546

547
548
549
550
551
    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """
        Creates the Python code for the parser after compilation of
        the EBNF-Grammar
        """

        # execute deferred tasks, for example semantic checks that cannot
        # be done before the symbol table is complete

        for task in self.deferred_tasks:
            task()

        # provide for capturing of symbols that are variables, i.e. the
        # value of will be retrieved at some point during the parsing process

        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])

        # add special fields for Grammar class

        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in self.directives['literalws'] else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in self.directives['literalws'] else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace=" + self.RAW_WS_KEYWORD +
                             ", comment=" + self.COMMENT_KEYWORD + ")")))
        definitions.append((self.RAW_WS_KEYWORD, "r'{whitespace}'".format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations += [line for line in self.grammar_source.split('\n')]
            # strip trailing blank lines from the embedded grammar source
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order
        # (the root rule comes first in `definitions`)

        self.root_symbol = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
                # root_node.error_flag = True

        # check for unconnected rules

        defined_symbols.difference_update(self.RESERVED_SYMBOLS)

        def remove_connections(symbol):
            # depth-first walk from the root; symbols still left in
            # `defined_symbols` afterwards are unreachable
            if symbol in defined_symbols:
                defined_symbols.remove(symbol)
                for related in self.rules[symbol][1:]:
                    remove_connections(str(related))

        remove_connections(self.root_symbol)
        for leftover in defined_symbols:
            self.rules[leftover][0].add_error(('Rule "%s" is not connected to '
                'parser root "%s" !') % (leftover, self.root_symbol), Error.WARNING)

        # set root_symbol parser and assemble python grammar definition

        if self.root_symbol and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root_symbol)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result
641

642
643
644

    ## compilation methods

645
    def on_syntax(self, node: Node) -> str:
        """
        Compiles the top level 'syntax'-node: compiles all definitions and
        directives it contains and hands the collected definitions over to
        ``assemble_parser`` to produce the parser-class source code.
        """
        compiled_defs = []  # type: List[Tuple[str, str]]

        # unwrap a single anonymous child that merely wraps the sequence
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]

        # definitions are collected; directives take effect as a side effect
        # of compiling them
        for child in node.children:
            if child.parser.name == "definition":
                compiled_defs.append(self.compile(child))
            else:
                assert child.parser.name == "directive", child.as_sxpr()
                self.compile(child)
            node.error_flag = max(node.error_flag, child.error_flag)
        self.definitions.update(compiled_defs)

        return self.assemble_parser(compiled_defs, node)
663

664

665
    def on_definition(self, node: Node) -> Tuple[str, str]:
        """
        Compiles a single grammar rule ``symbol = expression`` and returns
        the pair (rule name, python parser expression as source string).

        Emits errors on the node for redefined rules, reserved or
        ill-formed symbol names, clashes with preprocessor tokens and
        Python keywords; compilation is attempted regardless (best effort).
        """
        rule = str(node.children[0])
        if rule in self.rules:
            # mark the first definition as well, but only once
            first = self.rules[rule][0]
            if not first._errors:
                first.add_error('First definition of rule "%s" '
                                'followed by illegal redefinitions.' % rule)
            node.add_error('A rule with name "%s" has already been defined earlier.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            # fixed message: "doube" -> "double", doubled space removed
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           'end with a double underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            node.add_error('Symbol "%s" has already been defined as '
                           'a preprocessor token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the future.)')
        try:
            # register the rule before compiling its body, so that
            # recursive references can be detected in on_symbol()
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
            defn = self.compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
        except TypeError as error:
            # most likely a malformed AST; report it with a traceback hint
            from traceback import extract_tb
            trace = str(extract_tb(error.__traceback__)[-1])
            errmsg = "%s (TypeError: %s; %s)\n%s" \
                     % (EBNFCompiler.AST_ERROR, str(error), trace, node.as_sxpr())
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
        return rule, defn
702

703

704
    def _check_rx(self, node: Node, rx: str) -> str:
        """
        Checks whether the string `rx` represents a valid regular
        expression. Makes sure that multiline regular expressions are
        prepended by the multiline-flag. Returns the regular expression string.
        """
        flags = self.re_flags | {'x'} if rx.find('\n') >= 0 else self.re_flags
        if flags:
            # sort the flags: set iteration order is unstable, and the flag
            # string ends up verbatim in the generated (and hashed) parser code
            rx = "(?%s)%s" % ("".join(sorted(flags)), rx)
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

719

720
    def on_directive(self, node: Node) -> str:
        """
        Compiles a directive (e.g. ``@ whitespace = horizontal``) by
        updating ``self.directives`` (and related compiler state)
        accordingly. Directives yield no parser code of their own, so the
        empty string is always returned.
        """
        key = str(node.children[0]).lower()
        assert key not in self.directives['tokens']

        if key in {'comment', 'whitespace'}:
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
                    node.add_error('Directive "%s" must have one, but not %i values.' %
                                   (key, len(node.children[1].result)))
                value = self.compile(node.children[1]).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = str(node.children[1]).strip("~")
                if value != str(node.children[1]):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                # strings are escaped literally, /.../-delimited values are
                # validated as regular expressions
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
            self.directives[key] = value

        elif key == 'ignorecase':
            value = str(node.children[1]).lower() not in {"off", "false", "no"}
            # bug fix: this line used to read `... == value`, a no-op
            # comparison, so the directive's value was never recorded
            self.directives['ignorecase'] = value
            if value:
                self.re_flags.add('i')

        elif key == 'literalws':
            value = {item.lower() for item in self.compile(node.children[1])}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            # `set()` instead of `{}` (which is an empty dict, not a set)
            ws = {'left', 'right'} if 'both' in value \
                else set() if 'none' in value else value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'preprocessor_tokens'}:
            self.directives['tokens'] |= self.compile(node.children[1])

        elif key.endswith('_filter'):
            filter_set = self.compile(node.children[1])
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly one symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key, ', '.join(list(self.directives.keys()))))
        return ""

784

785
    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str] = None) -> str:
        """
        Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.

        The children of `node` are compiled to the positional arguments of
        the parser-class call; `custom_args` (e.g. "mandatory=1") are
        appended verbatim.
        """
        # `None` default instead of a mutable `[]` default argument
        arguments = [self.compile(r) for r in node.children] + (custom_args or [])
        # propagate child errors; the [node.error_flag] seed also makes this
        # safe for a node without children (plain max() would raise)
        node.error_flag = max([node.error_flag] + [t.error_flag for t in node.children])
        return parser_class + '(' + ', '.join(arguments) + ')'

794

795
    def on_expression(self, node) -> str:
        """Compiles an expression-node (a choice of terms) into a call of
        the Alternative-parser."""
        compiled = self.non_terminal(node, 'Alternative')
        return compiled

798

799
    def on_term(self, node) -> str:
        """Compiles a term-node (a sequence of factors) into a call of the
        Series-parser, honoring the mandatory-operator "§"."""
        # Basically, the following code does only this:
        #       return self.non_terminal(node, 'Series')
        # What makes it (look) more complicated is the handling of the
        # mandatory §-operator
        mandatory_marker = []   # positions of "§" relative to the filtered children
        filtered_children = []  # the children with all "§"-tokens removed
        i = 0                   # index of the next non-marker child
        for nd in node.children:
            if nd.parser.ptype == TOKEN_PTYPE and str(nd) == "§":
                mandatory_marker.append(i)
                if i == 0:
                    nd.add_error('First item of a series should not be mandatory.',
                                 Error.WARNING)
                elif len(mandatory_marker) > 1:
                    nd.add_error('One mandatory marker (§) sufficient to declare the '
                                 'rest of the series as mandatory.', Error.WARNING)
            else:
                filtered_children.append(nd)
                i += 1
        # temporarily replace node's children by the filtered list, compile,
        # then restore the original result
        saved_result = node.result
        node.result = tuple(filtered_children)
        # only the position of the first "§" matters: everything from there
        # on is mandatory
        custom_args =  ['mandatory=%i' % mandatory_marker[0]] if mandatory_marker else []
        compiled = self.non_terminal(node, 'Series', custom_args)
        node.result = saved_result
        return compiled
825

826

827
    def on_factor(self, node: Node) -> str:
        """
        Compiles a factor-node, i.e. a term with a prefix operator, by
        dispatching on the prefix via ``self.PREFIX_TABLE``. The Pop-/
        Retrieve-operators ("::", ":") and lookbehind-prefixes (starting
        with "-") receive special treatment.
        """
        assert node.children
        assert len(node.children) >= 2, node.as_sxpr()
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            # Pop-/Retrieve-operators take exactly one symbol as argument
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
            self.variables.add(str(arg))  # cast(str, arg.result)

        elif len(node.children) > 2:
            # fold all children after the prefix into a single node, so the
            # prefix operator applies to the whole remainder of the factor
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        # drop the prefix-token itself before compiling the argument(s)
        node.result = node.children[1:]
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            result = self.non_terminal(node, parser_class, custom_args)
            if prefix[:1] == '-':
                # lookbehind-prefix: defer a sanity check verifying that the
                # argument ultimately resolves to a plain /.../-regexp
                def check(node):
                    nd = node
                    if len(nd.children) >= 1:
                        nd = nd.children[0]
                    while nd.parser.name == "symbol":
                        # follow symbol references to their defining expression
                        symlist = self.rules.get(str(nd), [])
                        if len(symlist) == 2:
                            nd = symlist[1]
                        else:
                            if len(symlist) == 1:
                                nd = symlist[0].children[1]
                            break
                    if (nd.parser.name != "regexp" or str(nd)[:1] != '/'
                        or str(nd)[-1:] != '/'):
                        node.add_error("Lookbehind-parser can only be used with plain RegExp-"
                                       "parsers, not with: " + nd.parser.name + nd.parser.ptype)

                if not result.startswith('RegExp('):
                    # run the check only after all rules have been compiled
                    self.deferred_tasks.append(lambda: check(node))
            return result
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
        return ""
880

881

882
    def on_option(self, node) -> str:
        """Compiles an option-node ("[...]") into a call of the
        Option-parser."""
        compiled = self.non_terminal(node, 'Option')
        return compiled
884

885

886
    def on_repetition(self, node) -> str:
        """Compiles a repetition-node ("{...}") into a call of the
        ZeroOrMore-parser."""
        compiled = self.non_terminal(node, 'ZeroOrMore')
        return compiled

889

890
    def on_oneormore(self, node) -> str:
        """Compiles a oneormore-node ("{...}+") into a call of the
        OneOrMore-parser."""
        compiled = self.non_terminal(node, 'OneOrMore')
        return compiled

893

894
    def on_regexchain(self, node) -> str:
        """Compilation of regex chains is not (yet) supported by the
        EBNF-compiler; always raises EBNFCompilerError."""
        raise EBNFCompilerError("Not yet implemented!")

897

898
    def on_group(self, node) -> str:
        """Group-nodes must already have been flattened away by the
        AST-transformation; reaching this method indicates a broken
        pipeline, so EBNFCompilerError is raised unconditionally."""
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")

902

903
904
905
    def on_symbol(self, node: Node) -> str:
        """
        Compiles a symbol occurring on the right hand side of a definition.
        Preprocessor tokens, reserved symbols and plain rule references each
        yield different parser expressions; symbol usage is also recorded
        for the later recursion- and connectivity-checks.
        """
        symbol = str(node)
        if symbol in self.directives['tokens']:
            return 'PreprocessorToken("%s")' % symbol
        # track usage for later checks
        self.current_symbols.append(node)
        self.symbols.setdefault(symbol, node)  # remember first use of symbol
        if symbol in self.rules:
            # referenced after its own definition started -> recursive rule
            self.recursive.add(symbol)
        if symbol in EBNFCompiler.RESERVED_SYMBOLS:
            return "RegExp(%s)" % symbol
        return symbol
916

917

918
    def on_literal(self, node) -> str:
        """Compiles a string literal into a call of the Token-parser.
        Backslashes are doubled so that the result can be pasted verbatim
        into the generated Python source."""
        escaped = str(node).replace('\\', '\\\\')
        return 'Token(%s)' % escaped
920

921

922
    def on_regexp(self, node: Node) -> str:
923
        rx = str(node)
924
        name = []   # type: List[str]
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
        if rx[0] == '/' and rx[-1] == '/':
            parser = 'RegExp('
        else:
            parser = 'RE('
            if rx[:2] == '~/':
                if not 'left' in self.directives['literalws']:
                    name = ['wL=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[1:]
            elif 'left' in self.directives['literalws']:
                name = ["wL=''"] + name
            if rx[-2:] == '/~':
                if 'right' not in self.directives['literalws']:
                    name = ['wR=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[:-1]
            elif 'right' in self.directives['literalws']:
                name = ["wR=''"] + name
941
942
943
        try:
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))