"""ebnf.py - EBNF -> Python-Parser compilation for DHParser

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20
from collections import OrderedDict
21
from functools import partial
22

23
24
25
26
try:
    import regex as re
except ImportError:
    import re
27
try:
Eckhart Arnold's avatar
Eckhart Arnold committed
28
    from typing import Callable, Dict, List, Set, Tuple, Union
29
except ImportError:
Eckhart Arnold's avatar
Eckhart Arnold committed
30
    from .typing34 import Callable, Dict, List, Set, Tuple, Union
31

32
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
33
34
35
from DHParser.parser import Grammar, mixin_comment, nil_preprocessor, Forward, RegExp, RE, \
    NegativeLookahead, Alternative, Series, Option, OneOrMore, ZeroOrMore, Token, \
    Compiler, PreprocessorFunc
36
37
from DHParser.syntaxtree import Node, TransformationFunc, WHITESPACE_PTYPE, TOKEN_PTYPE
from DHParser.error import Error
38
from DHParser.transform import traverse, remove_brackets, \
39
    reduce_single_child, replace_by_single_child, remove_expendables, \
40
    remove_tokens, flatten, forbid, assert_content, remove_infix_operator
41
from DHParser.versionnumber import __version__
42

43
# Public interface of this module: the factory functions for the EBNF
# preprocessor/grammar/transformer/compiler, the classes they produce,
# and the type aliases that generated parser modules are expected to use.
__all__ = ('get_ebnf_preprocessor',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
           'EBNFTransform',
           'EBNFCompilerError',
           'EBNFCompiler',
           'grammar_changed',
           'PreprocessorFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc')
56
57


Eckhart Arnold's avatar
Eckhart Arnold committed
58
59
60
61
62
63
64
########################################################################
#
# EBNF scanning
#
########################################################################


65
66
def get_ebnf_preprocessor() -> PreprocessorFunc:
    """Return the preprocessor for EBNF sources.

    EBNF sources need no preprocessing, so this is simply the no-op
    ``nil_preprocessor``.
    """
    preprocessor = nil_preprocessor
    return preprocessor
Eckhart Arnold's avatar
Eckhart Arnold committed
67
68
69
70
71
72
73
74


########################################################################
#
# EBNF parsing
#
########################################################################

75

76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# class EBNFGrammar(Grammar):
#     r"""Parser for an EBNF source file, with this grammar:
#
#     # EBNF-Grammar in EBNF
#
#     @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
#     @ whitespace =  /\s*/                            # whitespace includes linefeed
#     @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly
#
#     syntax     =  [~//] { definition | directive } §EOF
#     definition =  symbol §"=" expression
#     directive  =  "@" §symbol §"=" ( regexp | literal | list_ )
#
#     expression =  term { "|" term }
#     term       =  { factor }+
#     factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
#                 | [flowmarker] literal
#                 | [flowmarker] regexp
#                 | [flowmarker] group
#                 | [flowmarker] oneormore
#                 | repetition
#                 | option
#
#     flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
#                   "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
#     retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve
#
#     group      =  "(" expression §")"
#     oneormore  =  "{" expression "}+"
#     repetition =  "{" expression §"}"
#     option     =  "[" expression §"]"
#
#     symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
#     literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
#                 | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
#     regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
#                                                      # '~' is a whitespace-marker, if present leading or trailing
#                                                      # whitespace of a regular expression will be ignored tacitly.
#     list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
#                                                      # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an exmaple
#     EOF =  !/./
#     """
#     expression = Forward()
#     source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
#     parser_initialization__ = "upon instantiation"
#     COMMENT__ = r'#.*(?:\n|$)'
#     WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
#     wspL__ = ''
#     wspR__ = WSP__
#     EOF = NegativeLookahead(RE('.', wR=''))
#     list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
#     regexp = RE(r'~?/(?:\\/|[^/])*?/~?')  # RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
#     literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
#     symbol = RE('(?!\\d)\\w+')
#     option = Series(Token("["), expression, Required(Token("]")))
#     repetition = Series(Token("{"), expression, Required(Token("}")))
#     oneormore = Series(Token("{"), expression, Token("}+"))
#     group = Series(Token("("), expression, Required(Token(")")))
#     retrieveop = Alternative(Token("::"), Token(":"))
#     flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
#     factor = Alternative(Series(Option(flowmarker), Option(retrieveop), symbol, NegativeLookahead(Token("="))),
#                          Series(Option(flowmarker), literal), Series(Option(flowmarker), regexp),
#                          Series(Option(flowmarker), group), Series(Option(flowmarker), oneormore),
#                          repetition, option)
#     term = OneOrMore(factor)
#     expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
#     directive = Series(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
#     definition = Series(symbol, Required(Token("=")), expression)
#     syntax = Series(Option(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
#     root__ = syntax

147
class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol "=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { ["§"] factor }+                       # "§" means all following factors mandatory
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"                         # '!' negative lookahead, '&' positive lookahead
                | "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:\\\/|[^\/])*?\/~?/~        # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
    EOF =  !/./
    """
    # NOTE(review): the definitions below appear to be auto-generated from
    # the grammar in the docstring (see source_hash__) — prefer regenerating
    # over editing by hand.  TODO confirm with the project's build process.
    expression = Forward()
    source_hash__ = "a131abc5259738631000cda90d2fc65b"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'#.*(?:\n|$)'
    WHITESPACE__ = r'\s*'
    WSP__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
    wspL__ = ''
    wspR__ = WSP__
    EOF = NegativeLookahead(RegExp('.'))
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'), mandatory=1000)),
                   mandatory=1000)
    regexp = RE('~?/(?:\\\\/|[^/])*?/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Series(Token("["), expression, Token("]"), mandatory=2)
    repetition = Series(Token("{"), expression, Token("}"), mandatory=2)
    oneormore = Series(Token("{"), expression, Token("}+"), mandatory=1000)
    group = Series(Token("("), expression, Token(")"), mandatory=2)
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("-!"), Token("-&"))
    factor = Alternative(
        Series(Option(flowmarker), Option(retrieveop), symbol, NegativeLookahead(Token("=")),
               mandatory=1000), Series(Option(flowmarker), literal, mandatory=1000),
        Series(Option(flowmarker), regexp, mandatory=1000),
        Series(Option(flowmarker), group, mandatory=1000),
        Series(Option(flowmarker), oneormore, mandatory=1000), repetition, option)
    term = OneOrMore(Series(Option(Token("§")), factor, mandatory=1000))
    expression.set(
        Series(term, ZeroOrMore(Series(Token("|"), term, mandatory=1000)), mandatory=1000))
    directive = Series(Token("@"), symbol, Token("="), Alternative(regexp, literal, list_),
                       mandatory=1)
    definition = Series(symbol, Token("="), expression, mandatory=1)
    syntax = Series(Option(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)),
                    EOF, mandatory=2)
    root__ = syntax


226
def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class is a file name: scrape the stored hash out of the
        # generated compiler suite instead of importing it.
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
        # raw strings: '\w' / '\(' are invalid escape sequences in a
        # plain string literal (DeprecationWarning since Python 3.6)
        m = re.search(r'class \w*\(Grammar\)', pycode)
        if m:
            m = re.search(r'    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            # no Grammar subclass found at all: treat as outdated
            return True
    else:
        return chksum != grammar_class.source_hash__


257
def get_ebnf_grammar() -> EBNFGrammar:
    """Return the thread-local singleton instance of ``EBNFGrammar``,
    creating it on first use."""
    global thread_local_ebnf_grammar_singleton
    try:
        return thread_local_ebnf_grammar_singleton
    except NameError:
        # first call in this thread: create and memoize the instance
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
        return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


274
# Transformation table that turns the concrete syntax tree produced by
# EBNFGrammar into an abstract syntax tree suitable for EBNFCompiler.
# Keys are node (tag) names — comma-separated names share one rule and
# "*" / "+" are catch-alls — values are the transformation(s) to apply.
EBNF_AST_transformation_table = {
    # AST Transformations for EBNF-grammar
    "+":
        remove_expendables,
    "syntax":
        [],  # otherwise '"*": replace_by_single_child' would be applied
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],  # remove_infix_operator],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_brackets, replace_by_single_child],
    "oneormore, repetition, option":
        [reduce_single_child, remove_brackets,
         forbid('repetition', 'option', 'oneormore'), assert_content(r'(?!§)')],
    "symbol, literal, regexp":
        reduce_single_child,
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        reduce_single_child,
    "list_":
        [flatten, remove_infix_operator],
    "*":
        replace_by_single_child
}

303

Eckhart Arnold's avatar
Eckhart Arnold committed
304
def EBNFTransform() -> TransformationFunc:
    """Create a fresh AST-transformation function for EBNF syntax trees."""
    # copy the table so later mutations of the module-level default
    # do not affect this transformer
    table = EBNF_AST_transformation_table.copy()
    return partial(traverse, processing_table=table)
di68kap's avatar
di68kap committed
306

307
def get_ebnf_transformer() -> TransformationFunc:
    """Return the thread-local singleton EBNF AST-transformer,
    creating it on first use."""
    global thread_local_EBNF_transformer_singleton
    try:
        return thread_local_EBNF_transformer_singleton
    except NameError:
        # first call in this thread: build and memoize the transformer
        thread_local_EBNF_transformer_singleton = EBNFTransform()
        return thread_local_EBNF_transformer_singleton
Eckhart Arnold's avatar
Eckhart Arnold committed
315
316
317
318
319
320
321
322


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

323

324
PreprocessorFactoryFunc = Callable[[], PreprocessorFunc]
325
ParserFactoryFunc = Callable[[], Grammar]
326
TransformerFactoryFunc = Callable[[], TransformationFunc]
327
328
CompilerFactoryFunc = Callable[[], Compiler]

329
330
331
PREPROCESSOR_FACTORY = '''
def get_preprocessor() -> PreprocessorFunc:
    return {NAME}Preprocessor
332
333
334
335
'''


GRAMMAR_FACTORY = '''
336
def get_grammar() -> {NAME}Grammar:
337
338
339
340
341
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
342
343
        grammar = thread_local_{NAME}_grammar_singleton
    return grammar
344
345
346
347
'''


TRANSFORMER_FACTORY = '''
348
349
350
def {NAME}Transform() -> TransformationDict:
    return partial(traverse, processing_table={NAME}_AST_transformation_table.copy())

351
def get_transformer() -> TransformationFunc:
352
353
354
355
356
357
358
    global thread_local_{NAME}_transformer_singleton
    try:
        transformer = thread_local_{NAME}_transformer_singleton
    except NameError:
        thread_local_{NAME}_transformer_singleton = {NAME}Transform()
        transformer = thread_local_{NAME}_transformer_singleton
    return transformer
359
360
361
362
'''


COMPILER_FACTORY = '''
363
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
364
365
366
367
368
369
370
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
371
372
        compiler = thread_local_{NAME}_compiler_singleton
    return compiler
373
374
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
375

376
377
class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
    in the strict sense, see `CompilationError` in module ``dsl.py``)"""


382
class EBNFCompiler(Compiler):
    """
    Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.

    Instances of this class must be called with the root-node of the
    abstract syntax tree from an EBNF-specification of a formal language.
    The returned value is the Python-source-code of a Grammar class for
    this language that can be used to parse texts in this language.
    See classes `parser.Compiler` and `parser.Grammar` for more information.

    Additionally, class EBNFCompiler provides helper methods to generate
    code-skeletons for a preprocessor, AST-transformation and full
    compilation of the formal language. These method's names start with
    the prefix `gen_`.

    Attributes:
        current_symbols:  During compilation, a list containing the root
                node of the currently compiled definition as first element
                and then the nodes of the symbols that are referred to in
                the currently compiled definition.

        rules:  Dictionary that maps rule names to a list of Nodes that
                contain symbol-references in the definition of the rule.
                The first item in the list is the node of the rule-
                definition itself. Example:

                           `alternative = a | b`

                Now `[str(node) for node in self.rules['alternative']]`
                yields `['alternative = a | b', 'a', 'b']`

        symbols:  A mapping of symbol names to their first usage (not
                their definition!) in the EBNF source.

        variables:  A set of symbols names that are used with the
                Pop or Retrieve operator. Because the values of these
                symbols need to be captured they are called variables.
                See `test_parser.TestPopRetrieve` for an example.

        recursive:  A set of symbols that are used recursively and
                therefore require a `Forward`-operator.

        definitions:  A dictionary of definitions. Other than `rules`
                this maps the symbols to their compiled definienda.

        deferred_tasks:  A list of callables that is filled during
                compilation, but that will be executed only after
                compilation has finished. Typically, it contains
                semantic checks that require information that
                is only available upon completion of compilation.

        root_symbol:  The name of the root symbol.

        directives:  A dictionary of all directives and their default
                values.

        re_flags:  A set of regular expression flags to be added to all
                regular expressions found in the current parsing process
    """

    # Names of the whitespace/comment definitions that the generated
    # Grammar class relies on; these may not be redefined by a grammar.
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    RAW_WS_KEYWORD = "WHITESPACE__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, RAW_WS_KEYWORD, COMMENT_KEYWORD}
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneous AST transformation."
    # Maps EBNF prefix operators to the names of the parser classes
    # that implement them in the generated parser code.
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    # Predefined whitespace regexes selectable via the @whitespace directive.
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}
455

456

457
    def __init__(self, grammar_name="", grammar_source=""):
        """Initialize the compiler for the grammar named *grammar_name*
        (with EBNF source *grammar_source*) and reset all compilation state."""
        super().__init__(grammar_name, grammar_source)
        self._reset()

461

462
    def _reset(self):
        """Reset all mutable compilation state so the compiler instance
        can be reused for a fresh compilation run."""
        super(EBNFCompiler, self)._reset()
        self._result = ''           # type: str
        self.re_flags = set()       # type: Set[str]
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
        self.variables = set()      # type: Set[str]
        self.recursive = set()      # type: Set[str]
        self.definitions = {}       # type: Dict[str, str]
        self.deferred_tasks = []    # type: List[Callable]
        self.root_symbol = ""  # type: str
        # default directive values; overridden by "@"-directives in the grammar
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
                           'comment': '',
                           'literalws': ['right'],
                           'tokens': set(),  # alt. 'preprocessor_tokens'
                           'filter': dict(),  # alt. 'filter'
                           'ignorecase': False}
480

Eckhart Arnold's avatar
Eckhart Arnold committed
481
    @property
    def result(self) -> str:
        """The Python source code produced by the last compilation run
        (set by ``assemble_parser``; empty string before the first run)."""
        return self._result

485
    # methods for generating skeleton code for preprocessor, transformer, and compiler
486

487
    def gen_preprocessor_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for a preprocessor-function for
        the previously compiled formal language.
        """
        func_name = self.grammar_name + "Preprocessor"
        stub = "def %s(text):\n    return text\n" % func_name
        return stub + PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)
495

496

497
    def gen_transformer_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for the AST-transformation for the
        previously compiled formal language.

        Raises EBNFCompilerError if no grammar has been compiled yet.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
        tt_name = self.grammar_name + '_AST_transformation_table'
        lines = [tt_name + ' = {',
                 '    # AST Transformations for the ' + self.grammar_name + '-grammar',
                 '    "+": remove_empty,']
        # pick a default transformation per rule based on the shape of
        # its compiled definiens
        for name in self.rules:
            rule = self.definitions[name]
            if rule.startswith('Alternative'):
                tf = '[replace_or_reduce]'
            elif rule.startswith('Synonym'):
                tf = '[replace_by_single_child]'
            else:
                tf = '[]'
            lines.append('    "' + name + '": %s,' % tf)
        lines.append('    ":Token, :RE": reduce_single_child,')
        lines.extend(['    "*": replace_by_single_child', '}', '',
                      TRANSFORMER_FACTORY.format(NAME=self.grammar_name)])
        return '\n'.join(lines)

522

523
    def gen_compiler_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for a Compiler-class for the
        previously compiled formal language.

        Raises EBNFCompilerError if no grammar has been compiled yet.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    # emit a raw-string regex: the former "'\w+\Z'" produced
                    # skeleton code with invalid escape sequences
                    "        assert re.match(r'\\w+\\Z', grammar_name)", '']
        # one compile method per rule; the root rule's method returns the
        # node so a freshly generated compiler is at least a pass-through
        for name in self.rules:
            method_name = Compiler.method_name(name)
            if name == self.root_symbol:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        return node', '']
            else:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        pass', '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)
550

551

552
553
554
555
556
    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """
        Creates the Python code for the parser after compilation of
        the EBNF-Grammar

        Parameters:
            definitions:  (symbol, python-code) pairs, one per compiled
                rule, in definition order (first entry = root rule).
            root_node:  the root node of the compiled syntax tree; used
                as anchor for error messages.

        Returns the Python source code of the Grammar class (also stored
        in ``self._result``).
        """

        # execute deferred tasks, for example semantic checks that cannot
        # be done before the symbol table is complete

        for task in self.deferred_tasks:
            task()

        # provide for capturing of symbols that are variables, i.e. the
        # value of will be retrieved at some point during the parsing process

        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])

        # add special fields for Grammar class
        # (appended here, emitted first after the later reverse())

        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in self.directives['literalws'] else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in self.directives['literalws'] else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace=" + self.RAW_WS_KEYWORD +
                             ", comment=" + self.COMMENT_KEYWORD + ")")))
        definitions.append((self.RAW_WS_KEYWORD, "r'{whitespace}'".format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations += [line for line in self.grammar_source.split('\n')]
            # strip trailing blank lines from the embedded grammar source
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order
        # (reversed so that referenced symbols are defined before use)

        self.root_symbol = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
                # root_node.error_flag = True

        # check for unconnected rules (defined but unreachable from the root)

        defined_symbols.difference_update(self.RESERVED_SYMBOLS)

        def remove_connections(symbol):
            # depth-first removal of every rule reachable from `symbol`;
            # whatever remains afterwards is unreachable
            if symbol in defined_symbols:
                defined_symbols.remove(symbol)
                for related in self.rules[symbol][1:]:
                    remove_connections(str(related))

        remove_connections(self.root_symbol)
        for leftover in defined_symbols:
            self.rules[leftover][0].add_error(('Rule "%s" is not connected to '
                'parser root "%s" !') % (leftover, self.root_symbol), Error.WARNING)

        # set root_symbol parser and assemble python grammar definition

        if self.root_symbol and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root_symbol)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result
646

647
648
649

    ## compilation methods

650
    def on_syntax(self, node: Node) -> str:
651
        definitions = []  # type: List[Tuple[str, str]]
652
653

        # drop the wrapping sequence node
654
655
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]
656
657

        # compile definitions and directives and collect definitions
658
        for nd in node.children:
659
            if nd.parser.name == "definition":
660
                definitions.append(self.compile(nd))
661
            else:
662
                assert nd.parser.name == "directive", nd.as_sxpr()
663
                self.compile(nd)
664
            node.error_flag = max(node.error_flag, nd.error_flag)
665
        self.definitions.update(definitions)
666

667
        return self.assemble_parser(definitions, node)
668

669

670
    def on_definition(self, node: Node) -> Tuple[str, str]:
671
        rule = str(node.children[0])
672
        if rule in self.rules:
Eckhart Arnold's avatar
Eckhart Arnold committed
673
674
675
676
677
            first = self.rules[rule][0]
            if not first._errors:
                first.add_error('First definition of rule "%s" '
                                'followed by illegal redefinitions.' % rule)
            node.add_error('A rule with name "%s" has already been defined earlier.' % rule)
678
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
679
680
681
682
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           ' end with a doube underscore "__".' % rule)
683
        elif rule in self.directives['tokens']:
684
            node.add_error('Symbol "%s" has already been defined as '
685
                           'a preprocessor token.' % rule)
686
687
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
688
                           % rule + '(This may change in the future.)')
689
        try:
690
691
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
692
            defn = self.compile(node.children[1])
693
            if rule in self.variables:
694
                defn = 'Capture(%s)' % defn
695
                self.variables.remove(rule)
696
697
698
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
699
        except TypeError as error:
700
701
702
703
            from traceback import extract_tb
            trace = str(extract_tb(error.__traceback__)[-1])
            errmsg = "%s (TypeError: %s; %s)\n%s" \
                     % (EBNFCompiler.AST_ERROR, str(error), trace, node.as_sxpr())
704
705
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
Eckhart Arnold's avatar
Eckhart Arnold committed
706
        return rule, defn
707

708

709
    def _check_rx(self, node: Node, rx: str) -> str:
710
711
        """
        Checks whether the string `rx` represents a valid regular
712
713
714
        expression. Makes sure that multiline regular expressions are
        prepended by the multiline-flag. Returns the regular expression string.
        """
715
        flags = self.re_flags | {'x'} if rx.find('\n') >= 0 else self.re_flags
716
        if flags:  rx = "(?%s)%s" % ("".join(flags), rx)
717
718
719
720
721
722
723
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

724

725
    def on_directive(self, node: Node) -> str:
726
        key = str(node.children[0]).lower()
727
        assert key not in self.directives['tokens']
728

729
        if key in {'comment', 'whitespace'}:
730
731
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
Eckhart Arnold's avatar
Eckhart Arnold committed
732
                    node.add_error('Directive "%s" must have one, but not %i values.' %
733
                                   (key, len(node.children[1].result)))
734
                value = self.compile(node.children[1]).pop()
735
736
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
737
                else:
738
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
739
            else:
740
741
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
742
743
744
745
746
747
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
748
749
750
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
751
            self.directives[key] = value
752

753
754
755
756
757
758
        elif key == 'ignorecase':
            value = str(node.children[1]).lower() not in {"off", "false", "no"}
            self.directives['ignorecase'] == value
            if value:
                self.re_flags.add('i')

Eckhart Arnold's avatar
Eckhart Arnold committed
759
760
761
        # elif key == 'testing':
        #     value = str(node.children[1])
        #     self.directives['testing'] = value.lower() not in {"off", "false", "no"}
762

763
        elif key == 'literalws':
764
            value = {item.lower() for item in self.compile(node.children[1])}
765
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
Eckhart Arnold's avatar
Eckhart Arnold committed
766
                    or ('none' in value and len(value) > 1)):
767
768
769
770
771
772
773
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            ws = {'left', 'right'} if 'both' in value \
                else {} if 'none' in value else value
            self.directives[key] = list(ws)

774
        elif key in {'tokens', 'preprocessor_tokens'}:
775
            self.directives['tokens'] |= self.compile(node.children[1])
776

777
        elif key.endswith('_filter'):
778
            filter_set = self.compile(node.children[1])
779
780
781
782
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly on symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()
783

784
785
        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
786
                           (key, ', '.join(list(self.directives.keys()))))
787
788
        return ""

789

790
    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str]=[]) -> str:
791
792
        """
        Compiles any non-terminal, where `parser_class` indicates the Parser class
793
794
        name for the particular non-terminal.
        """
795
        arguments = [self.compile(r) for r in node.children] + custom_args
796
        node.error_flag = max(node.error_flag, max(t.error_flag for t in node.children))
797
798
        return parser_class + '(' + ', '.join(arguments) + ')'

799

800
    def on_expression(self, node) -> str:
801
802
        return self.non_terminal(node, 'Alternative')

803

804
    def on_term(self, node) -> str:
805
806
807
808
809
810
811
812
        mandatory_marker = []
        filtered_children = []
        i = 0
        for nd in node.children:
            if nd.parser.ptype == TOKEN_PTYPE and str(nd) == "§":
                mandatory_marker.append(i)
                if i == 0:
                    nd.add_error('First item of a series should not be mandatory.',
813
                                 Error.WARNING)
814
815
                elif len(mandatory_marker) > 1:
                    nd.add_error('One mandatory marker (§) sufficient to declare the '
816
                                 'rest of the series as mandatory.', Error.WARNING)
817
818
819
820
821
822
823
824
825
            else:
                filtered_children.append(nd)
                i += 1
        saved_result = node.result
        node.result = tuple(filtered_children)
        mandatory_marker.append(Series.NOPE)
        compiled = self.non_terminal(node, 'Series', ['mandatory=%i' % mandatory_marker[0]])
        node.result = saved_result
        return compiled
826

827

828
    def on_factor(self, node: Node) -> str:
829
        assert node.children
830
        assert len(node.children) >= 2, node.as_sxpr()
831
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
832
        custom_args = []  # type: List[str]
833
834

        if prefix in {'::', ':'}:
835
836
            assert len(node.children) == 2
            arg = node.children[-1]
837
            if arg.parser.name != 'symbol':
Eckhart Arnold's avatar
Eckhart Arnold committed
838
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
839
840
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
841
            if str(arg) in self.directives['filter']:
842
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
843
            self.variables.add(str(arg))  # cast(str, arg.result)
844

845
        elif len(node.children) > 2:
846
847
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
848
849
850
851
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])
852

853
        node.result = node.children[1:]
854
855
        try:
            parser_class = self.PREFIX_TABLE[prefix]
856
857
858
859
            result = self.non_terminal(node, parser_class, custom_args)
            if prefix[:1] == '-':
                def check(node):
                    nd = node
860
861
862
863
864
865
866
867
868
869
                    if len(nd.children) >= 1:
                        nd = nd.children[0]
                    while nd.parser.name == "symbol":
                        symlist = self.rules.get(str(nd), [])
                        if len(symlist) == 2:
                            nd = symlist[1]
                        else:
                            if len(symlist) == 1:
                                nd = symlist[0].children[1]
                            break
870
871
872
                    if (nd.parser.name != "regexp" or str(nd)[:1] != '/'
                        or str(nd)[-1:] != '/'):
                        node.add_error("Lookbehind-parser can only be used with plain RegExp-"
873
                                       "parsers, not with: " + nd.parser.name + nd.parser.ptype)
874
875
876
877

                if not result.startswith('RegExp('):
                    self.deferred_tasks.append(lambda: check(node))
            return result
878
879
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
880
        return ""
881

882

883
    def on_option(self, node) -> str:
884
        return self.non_terminal(node, 'Option')
885

886

887
    def on_repetition(self, node) -> str:
888
889
        return self.non_terminal(node, 'ZeroOrMore')

890

891
    def on_oneormore(self, node) -> str:
892
893
        return self.non_terminal(node, 'OneOrMore')

894

895
    def on_regexchain(self, node) -> str:
896
897
        raise EBNFCompilerError("Not yet implemented!")

898

899
    def on_group(self, node) -> str:
900
901
902
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")

903

904
905
906
    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
907
            return 'PreprocessorToken("' + symbol + '")'
908
        else:
909
910
            self.current_symbols.append(node)
            if symbol not in self.symbols:
911
                self.symbols[symbol] = node  # remember first use of symbol
912
913
            if symbol in self.rules:
                self.recursive.add(symbol)
Eckhart Arnold's avatar
Eckhart Arnold committed
914
            if symbol in EBNFCompiler.RESERVED_SYMBOLS:  # (EBNFCompiler.WHITESPACE_KEYWORD, EBNFCompiler.COMMENT_KEYWORD):
Eckhart Arnold's avatar
Eckhart Arnold committed
915
                return "RegExp(%s)" % symbol
916
            return symbol
917

918

919
    def on_literal(self, node) -> str:
920
        return 'Token(' + str(node).replace('\\', r'\\') + ')'  # return 'Token(' + ', '.merge_children([node.result]) + ')' ?
921

922

923
    def on_regexp(self, node: Node) -> str:
924
        rx = str(node)
925
        name = []   # type: List[str]
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
        if rx[0] == '/' and rx[-1] == '/':
            parser = 'RegExp('
        else:
            parser = 'RE('
            if rx[:2] == '~/':
                if not 'left' in self.directives['literalws']:
                    name = ['wL=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[1:]
            elif 'left' in self.directives['literalws']:
                name = ["wL=''"] + name
            if rx[-2:] == '/~':
                if 'right' not in self.directives['literalws']:
                    name = ['wR=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[:-1]
            elif 'right' in self.directives['literalws']:
                name = ["wR=''"] + name
942
943
944
        try:
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
945
946
947
948
            from traceback import extract_tb
            trace = str(extract_tb(error.__traceback__)[-1])
            errmsg = "%s (AttributeError: %s; %s)\n%s" \
                     % (EBNFCompiler.AST_ERROR, str(error), trace, node.as_sxpr())
949
950
            node.add_error(errmsg)
            return '"' + errmsg + '"'
951
        return parser + ', '.join([arg] + name) + ')'
952