ebnf.py 41.9 KB
Newer Older
1
"""ebnf.py - EBNF -> Python-Parser compilation for DHParser
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences an Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20
from collections import OrderedDict
21
from functools import partial
22

23
24
25
26
try:
    import regex as re
except ImportError:
    import re
27
try:
Eckhart Arnold's avatar
Eckhart Arnold committed
28
    from typing import Callable, Dict, List, Set, Tuple, Union
29
except ImportError:
Eckhart Arnold's avatar
Eckhart Arnold committed
30
    from .typing34 import Callable, Dict, List, Set, Tuple, Union
31

32
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
33
34
from DHParser.parser import Grammar, mixin_comment, nil_preprocessor, Forward, RegExp, RE, \
    NegativeLookahead, Alternative, Series, Option, OneOrMore, ZeroOrMore, Token, \
35
    Required, Compiler, PreprocessorFunc
36
37
from DHParser.syntaxtree import Node, TransformationFunc, WHITESPACE_PTYPE, TOKEN_PTYPE
from DHParser.error import Error
38
from DHParser.transform import traverse, remove_brackets, \
39
    reduce_single_child, replace_by_single_child, remove_expendables, \
40
    remove_tokens, flatten, forbid, assert_content, remove_infix_operator
41
from DHParser.versionnumber import __version__
42

43
# Public API of this module: factory functions, the EBNF grammar/transformer/
# compiler classes, and the factory-function type aliases defined below.
__all__ = ('get_ebnf_preprocessor',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
           'EBNFTransform',
           'EBNFCompilerError',
           'EBNFCompiler',
           'grammar_changed',
           'PreprocessorFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc')
56
57


Eckhart Arnold's avatar
Eckhart Arnold committed
58
59
60
61
62
63
64
########################################################################
#
# EBNF scanning
#
########################################################################


65
66
def get_ebnf_preprocessor() -> PreprocessorFunc:
    """Returns the preprocessor function for EBNF sources. This is the
    `nil_preprocessor` from DHParser.parser (presumably a no-op, since
    EBNF does not require preprocessing -- defined outside this file)."""
    return nil_preprocessor
Eckhart Arnold's avatar
Eckhart Arnold committed
67
68
69
70
71
72
73
74


########################################################################
#
# EBNF parsing
#
########################################################################

75

76
# class EBNFGrammar(Grammar):
di68kap's avatar
di68kap committed
77
#     r"""Parser for an EBNF_variant source file, with this grammar:
78
79
80
81
82
83
84
85
#
#     # EBNF-Grammar in EBNF
#
#     @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
#     @ whitespace =  /\s*/                            # whitespace includes linefeed
#     @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly
#
#     syntax     =  [~//] { definition | directive } §EOF
di68kap's avatar
di68kap committed
86
87
#     definition =  symbol §"=" §expression
#     directive  =  "@" §symbol §"=" §( regexp | literal | list_ )
88
89
#
#     expression =  term { "|" term }
di68kap's avatar
di68kap committed
90
#     term       =  { factor }+
91
92
93
94
95
96
97
98
#     factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
#                 | [flowmarker] literal
#                 | [flowmarker] regexp
#                 | [flowmarker] group
#                 | [flowmarker] oneormore
#                 | repetition
#                 | option
#
di68kap's avatar
di68kap committed
99
#     flowmarker =  "!"  | "&"  | "§"                  # '!' negative lookahead, '&' positive lookahead, '§' required
100
#                 | "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
101
102
103
104
105
106
107
108
109
110
#     retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve
#
#     group      =  "(" expression §")"
#     oneormore  =  "{" expression "}+"
#     repetition =  "{" expression §"}"
#     option     =  "[" expression §"]"
#
#     symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
#     literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
#                 | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
111
#     regexp     =  /~?\/(?:\\\/|[^\/])*?\/~?/~        # e.g. /\w+/, ~/#.*(?:\n|$)/~
112
113
114
115
116
117
118
#                                                      # '~' is a whitespace-marker, if present leading or trailing
#                                                      # whitespace of a regular expression will be ignored tacitly.
#     list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
#                                                      # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
#     EOF =  !/./
#     """
#     expression = Forward()
di68kap's avatar
di68kap committed
119
#     source_hash__ = "4735db10f0b79d44209d1de0184b2ca0"
120
121
#     parser_initialization__ = "upon instantiation"
#     COMMENT__ = r'#.*(?:\n|$)'
122
123
#     WHITESPACE__ = r'\s*'
#     WSP__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
124
125
#     wspL__ = ''
#     wspR__ = WSP__
126
#     EOF = NegativeLookahead(RegExp('.'))
di68kap's avatar
di68kap committed
127
#     list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
128
#     regexp = RE('~?/(?:\\\\/|[^/])*?/~?')
129
130
#     literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
#     symbol = RE('(?!\\d)\\w+')
di68kap's avatar
di68kap committed
131
132
133
134
#     option = Series(Token("["), expression, Required(Token("]")))
#     repetition = Series(Token("{"), expression, Required(Token("}")))
#     oneormore = Series(Token("{"), expression, Token("}+"))
#     group = Series(Token("("), expression, Required(Token(")")))
135
#     retrieveop = Alternative(Token("::"), Token(":"))
di68kap's avatar
di68kap committed
136
137
138
139
140
141
142
143
144
145
#     flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
#     factor = Alternative(Series(Option(flowmarker), Option(retrieveop), symbol, NegativeLookahead(Token("="))),
#                          Series(Option(flowmarker), literal), Series(Option(flowmarker), regexp),
#                          Series(Option(flowmarker), group), Series(Option(flowmarker), oneormore), repetition, option)
#     term = OneOrMore(factor)
#     expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
#     directive = Series(Token("@"), Required(symbol), Required(Token("=")),
#                        Required(Alternative(regexp, literal, list_)))
#     definition = Series(symbol, Required(Token("=")), Required(expression))
#     syntax = Series(Option(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
146
147
#     root__ = syntax

148

di68kap's avatar
di68kap committed
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol "=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { ["§"] factor }+                       # "§" means all following factors mandatory
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"                         # '!' negative lookahead, '&' positive lookahead
                | "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:\\\/|[^\/])*?\/~?/~        # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
    EOF =  !/./
    """
    expression = Forward()  # forward-declared: 'expression' is used before its definition below
    source_hash__ = "a131abc5259738631000cda90d2fc65b"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'#.*(?:\n|$)'
    WHITESPACE__ = r'\s*'
    WSP__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
    wspL__ = ''
    wspR__ = WSP__
    # NOTE(review): mandatory=1000 appears to be a sentinel meaning "no
    # mandatory element" (larger than any series length) -- confirm against
    # the Series parser's handling of the 'mandatory' parameter.
    EOF = NegativeLookahead(RegExp('.'))
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'), mandatory=1000)),
                   mandatory=1000)
    regexp = RE('~?/(?:\\\\/|[^/])*?/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Series(Token("["), expression, Token("]"), mandatory=2)
    repetition = Series(Token("{"), expression, Token("}"), mandatory=2)
    oneormore = Series(Token("{"), expression, Token("}+"), mandatory=1000)
    group = Series(Token("("), expression, Token(")"), mandatory=2)
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("-!"), Token("-&"))
    factor = Alternative(
        Series(Option(flowmarker), Option(retrieveop), symbol, NegativeLookahead(Token("=")),
               mandatory=1000), Series(Option(flowmarker), literal, mandatory=1000),
        Series(Option(flowmarker), regexp, mandatory=1000),
        Series(Option(flowmarker), group, mandatory=1000),
        Series(Option(flowmarker), oneormore, mandatory=1000), repetition, option)
    term = OneOrMore(Series(Option(Token("§")), factor, mandatory=1000))
    expression.set(
        Series(term, ZeroOrMore(Series(Token("|"), term, mandatory=1000)), mandatory=1000))
    directive = Series(Token("@"), symbol, Token("="), Alternative(regexp, literal, list_),
                       mandatory=1)
    definition = Series(symbol, Token("="), expression, mandatory=1)
    syntax = Series(Option(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)),
                    EOF, mandatory=2)
    root__ = syntax  # start symbol of the grammar


228
def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class is a file name: scan the compiler suite's source
        # for the grammar class and its recorded source hash.
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
        # raw strings: '\w' and '\(' are invalid escape sequences in
        # ordinary string literals (DeprecationWarning, and a SyntaxError
        # in future Python versions)
        m = re.search(r'class \w*\(Grammar\)', pycode)
        if m:
            m = re.search(r'    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            # no grammar class found at all -> treat as changed
            return True
    else:
        return chksum != grammar_class.source_hash__


259
def get_ebnf_grammar() -> EBNFGrammar:
    """Returns a thread-local singleton instance of ``EBNFGrammar``,
    creating it on first use."""
    global thread_local_ebnf_grammar_singleton
    try:
        # fast path: singleton already exists in this thread
        return thread_local_ebnf_grammar_singleton
    except NameError:
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
        return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


276
# Transformation table mapping node (tag) names to the transformation
# functions that `traverse` applies in order to turn the concrete syntax
# tree of an EBNF source into an abstract syntax tree.
EBNF_AST_transformation_table = {
    # AST Transformations for EBNF-grammar
    "+":
        remove_expendables,
    "syntax":
        [],  # otherwise '"*": replace_by_single_child' would be applied
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],  # remove_infix_operator],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_brackets, replace_by_single_child],
    "oneormore, repetition, option":
        [reduce_single_child, remove_brackets,
         forbid('repetition', 'option', 'oneormore'), assert_content(r'(?!§)')],
    "symbol, literal, regexp":
        reduce_single_child,
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        reduce_single_child,
    "list_":
        [flatten, remove_infix_operator],
    "*":
        replace_by_single_child
}

305

Eckhart Arnold's avatar
Eckhart Arnold committed
306
def EBNFTransform() -> TransformationFunc:
    """Creates a transformation function for EBNF concrete syntax trees,
    bound to a private copy of the EBNF AST-transformation table."""
    return partial(traverse, processing_table=EBNF_AST_transformation_table.copy())
di68kap's avatar
di68kap committed
308

309
def get_ebnf_transformer() -> TransformationFunc:
    """Returns a thread-local singleton EBNF-transformer, creating it
    on first use."""
    global thread_local_EBNF_transformer_singleton
    try:
        # fast path: singleton already exists in this thread
        return thread_local_EBNF_transformer_singleton
    except NameError:
        thread_local_EBNF_transformer_singleton = EBNFTransform()
        return thread_local_EBNF_transformer_singleton
Eckhart Arnold's avatar
Eckhart Arnold committed
317
318
319
320
321
322
323
324


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

325

326
PreprocessorFactoryFunc = Callable[[], PreprocessorFunc]
327
ParserFactoryFunc = Callable[[], Grammar]
328
TransformerFactoryFunc = Callable[[], TransformationFunc]
329
330
CompilerFactoryFunc = Callable[[], Compiler]

331
332
333
PREPROCESSOR_FACTORY = '''
def get_preprocessor() -> PreprocessorFunc:
    return {NAME}Preprocessor
334
335
336
337
'''


GRAMMAR_FACTORY = '''
338
def get_grammar() -> {NAME}Grammar:
339
340
341
342
343
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
344
345
        grammar = thread_local_{NAME}_grammar_singleton
    return grammar
346
347
348
349
'''


TRANSFORMER_FACTORY = '''
350
351
352
def {NAME}Transform() -> TransformationDict:
    return partial(traverse, processing_table={NAME}_AST_transformation_table.copy())

353
def get_transformer() -> TransformationFunc:
354
355
356
357
358
359
360
    global thread_local_{NAME}_transformer_singleton
    try:
        transformer = thread_local_{NAME}_transformer_singleton
    except NameError:
        thread_local_{NAME}_transformer_singleton = {NAME}Transform()
        transformer = thread_local_{NAME}_transformer_singleton
    return transformer
361
362
363
364
'''


COMPILER_FACTORY = '''
365
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
366
367
368
369
370
371
372
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
373
374
        compiler = thread_local_{NAME}_compiler_singleton
    return compiler
375
376
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
377

378
379
class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
    in the strict sense, see `CompilationError` in module ``dsl.py``)"""


384
class EBNFCompiler(Compiler):
385
386
    """
    Generates a Parser from an abstract syntax tree of a grammar specified
387
    in EBNF-Notation.
388
389
390
391
392
393
394
395
396
397
398
399
400

    Instances of this class must be called with the root-node of the
    abstract syntax tree from an EBNF-specification of a formal language.
    The returned value is the Python-source-code of a Grammar class for
    this language that can be used to parse texts in this language.
    See classes `parser.Compiler` and `parser.Grammar` for more information.

    Additionally, class EBNFCompiler provides helper methods to generate
    code-skeletons for a preprocessor, AST-transformation and full
    compilation of the formal language. These method's names start with
    the prefix `gen_`.

    Attributes:
401
        current_symbols:  During compilation, a list containing the root
402
403
404
405
                node of the currently compiled definition as first element
                and then the nodes of the symbols that are referred to in
                the currently compiled definition.

406
        rules:  Dictionary that maps rule names to a list of Nodes that
407
408
409
410
411
412
413
414
415
                contain symbol-references in the definition of the rule.
                The first item in the list is the node of the rule-
                definition itself. Example:

                           `alternative = a | b`

                Now `[str(node) for node in self.rules['alternative']]`
                yields `['alternative = a | b', 'a', 'b']`

416
        symbols:  A mapping of symbol names to their first usage (not
417
418
                their definition!) in the EBNF source.

419
        variables:  A set of symbols names that are used with the
420
421
422
423
                Pop or Retrieve operator. Because the values of these
                symbols need to be captured they are called variables.
                See `test_parser.TestPopRetrieve` for an example.

424
        recursive:  A set of symbols that are used recursively and
425
426
                therefore require a `Forward`-operator.

427
        definitions:  A dictionary of definitions. Other than `rules`
428
429
                this maps the symbols to their compiled definienda.

430
        deferred_tasks:  A list of callables that is filled during
431
432
433
434
435
                compilation, but that will be executed only after
                compilation has finished. Typically, it contains
                semantic checks that require information that
                is only available upon completion of compilation.

436
        root:   The name of the root symbol.
437

438
        directives:  A dictionary of all directives and their default
439
                values.
440
441
442

        re_flags:  A set of regular expression flags to be added to all
                regular expressions found in the current parsing process
443
444
    """
    # Names of the reserved symbols that every generated grammar class defines:
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    RAW_WS_KEYWORD = "WHITESPACE__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, RAW_WS_KEYWORD, COMMENT_KEYWORD}
    # Error message for syntax trees the compiler cannot handle:
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneous AST transformation."
    # Maps EBNF prefix operators to the class names of the parsers
    # that the compiler emits for them:
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    # Predefined whitespace regexes, selectable via the @whitespace directive:
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}
457

458

459
    def __init__(self, grammar_name="", grammar_source=""):
        """Initializes the compiler for the grammar named `grammar_name`,
        optionally keeping the EBNF source text `grammar_source`."""
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
        self._reset()

463

464
    def _reset(self):
        """Clears all state accumulated during a previous compilation run,
        restoring the attributes documented in the class docstring to
        their initial values."""
        super(EBNFCompiler, self)._reset()
        self._result = ''           # type: str
        self.re_flags = set()       # type: Set[str]
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
        self.variables = set()      # type: Set[str]
        self.recursive = set()      # type: Set[str]
        self.definitions = {}       # type: Dict[str, str]
        self.deferred_tasks = []    # type: List[Callable]
        self.root_symbol = ""  # type: str
        # directive defaults; overridden by @-directives in the EBNF source
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
                           'comment': '',
                           'literalws': ['right'],
                           'tokens': set(),  # alt. 'preprocessor_tokens'
                           'filter': dict(),  # alt. 'filter'
                           'ignorecase': False}
482

Eckhart Arnold's avatar
Eckhart Arnold committed
483
    @property
    def result(self) -> str:
        """The Python source code generated by the last compilation run
        (empty string before the first run)."""
        return self._result

487
    # methods for generating skeleton code for preprocessor, transformer, and compiler
488

489
    def gen_preprocessor_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for a preprocessor-function for
        the previously compiled formal language.
        """
        # identity-preprocessor stub followed by its factory function
        stub = "def {name}Preprocessor(text):\n    return text\n".format(
            name=self.grammar_name)
        factory = PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)
        return stub + factory
497

498

499
    def gen_transformer_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for the AST-transformation for the
        previously compiled formal language.

        Raises:
            EBNFCompilerError: if called before a compilation run has
                filled ``self.rules``.
        """
        if not self.rules:
            # fixed error message: the method is 'gen_transformer_skeleton',
            # not 'gen_transformer_Skeleton'
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_skeleton()"!')
        tt_name = self.grammar_name + '_AST_transformation_table'
        transtable = [tt_name + ' = {',
                      '    # AST Transformations for the ' + self.grammar_name + '-grammar']
        transtable.append('    "+": remove_empty,')
        for name in self.rules:
            # pick a plausible default transformation per rule, judging by
            # the top-level parser class of its compiled definiendum
            tf = '[]'
            rule = self.definitions[name]
            if rule.startswith('Alternative'):
                tf = '[replace_or_reduce]'
            elif rule.startswith('Synonym'):
                tf = '[replace_by_single_child]'
            transtable.append('    "' + name + '": %s,' % tf)
        transtable.append('    ":Token, :RE": reduce_single_child,')
        transtable += ['    "*": replace_by_single_child', '}', '']
        transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(transtable)

524

525
    def gen_compiler_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for a Compiler-class for the
        previously compiled formal language.

        Raises:
            EBNFCompilerError: if called before a compilation run has
                filled ``self.rules``.
        """
        if not self.rules:
            # fixed error message: the method is 'gen_compiler_skeleton',
            # not 'gen_Compiler_Skeleton'
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_compiler_skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    # doubled backslashes: the original non-raw literal
                    # contained invalid escape sequences '\w' and '\Z'
                    # (DeprecationWarning, SyntaxError in future Python);
                    # the emitted text is unchanged
                    "        assert re.match('\\w+\\Z', grammar_name)", '']
        for name in self.rules:
            # one visitor method per grammar rule; the root symbol's method
            # returns the node so the skeleton compiles something by default
            method_name = Compiler.method_name(name)
            if name == self.root_symbol:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        return node', '']
            else:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        pass', '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)
552

553

554
555
556
557
558
    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """
        Creates the Python code for the parser after compilation of
        the EBNF-Grammar.

        Parameters:
            definitions:  list of (symbol, Python-expression) pairs, one per
                compiled grammar rule, in definition order; mutated in place
                (Capture-wrapping, reserved symbols appended, reversed).
            root_node:  the root node of the compiled syntax tree; currently
                unused in this method apart from the commented-out
                error-flag line below.

        Returns:
            The generated Python source code of the grammar class
            (also stored in ``self._result``).
        """

        # execute deferred tasks, for example semantic checks that cannot
        # be done before the symbol table is complete

        for task in self.deferred_tasks:
            task()

        # provide for capturing of symbols that are variables, i.e. the
        # value of will be retrieved at some point during the parsing process

        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])

        # add special fields for Grammar class

        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in self.directives['literalws'] else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in self.directives['literalws'] else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace=" + self.RAW_WS_KEYWORD +
                             ", comment=" + self.COMMENT_KEYWORD + ")")))
        definitions.append((self.RAW_WS_KEYWORD, "r'{whitespace}'".format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations += [line for line in self.grammar_source.split('\n')]
            # strip trailing blank lines from the embedded grammar source
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order

        self.root_symbol = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
                # root_node.error_flag = True

        # check for unconnected rules

        defined_symbols.difference_update(self.RESERVED_SYMBOLS)

        def remove_connections(symbol):
            # recursively removes all symbols reachable from `symbol`,
            # leaving only the rules unconnected to the root
            if symbol in defined_symbols:
                defined_symbols.remove(symbol)
                for related in self.rules[symbol][1:]:
                    remove_connections(str(related))

        remove_connections(self.root_symbol)
        for leftover in defined_symbols:
            self.rules[leftover][0].add_error(('Rule "%s" is not connected to '
                'parser root "%s" !') % (leftover, self.root_symbol), Error.WARNING)

        # set root_symbol parser and assemble python grammar definition

        if self.root_symbol and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root_symbol)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result
648

649
650
651

    ## compilation methods

652
    def on_syntax(self, node: Node) -> str:
        """
        Compiles the root node of an EBNF-AST: gathers the compiled rule
        definitions, processes all directives, and delegates final assembly
        of the grammar class to `assemble_parser`.
        """
        definitions = []  # type: List[Tuple[str, str]]

        # unwrap an anonymous single-child wrapper node, if present
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]

        # compile each child: definitions are collected, directives are
        # executed for their side effects on self.directives
        for child in node.children:
            if child.parser.name == "definition":
                definitions.append(self.compile(child))
            else:
                assert child.parser.name == "directive", child.as_sxpr()
                self.compile(child)
            node.error_flag = max(node.error_flag, child.error_flag)
        self.definitions.update(definitions)

        return self.assemble_parser(definitions, node)
670

671

672
    def on_definition(self, node: Node) -> Tuple[str, str]:
        """
        Compiles a single rule definition and returns a (symbol, parser-code)
        pair.

        The rule name is sanity-checked first (redefinition, reserved symbol,
        illegal name, preprocessor-token clash, Python keyword). Note that an
        ill-named rule is still compiled and registered, so later references
        to it do not trigger spurious "missing definition" errors.
        """
        rule = str(node.children[0])
        if rule in self.rules:
            # also flag the first definition, but only once
            first = self.rules[rule][0]
            if not first._errors:
                first.add_error('First definition of rule "%s" '
                                'followed by illegal redefinitions.' % rule)
            node.add_error('A rule with name "%s" has already been defined earlier.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            # fixed typo in error message ("doube" -> "double") and stray space
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           'end with a double underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            node.add_error('Symbol "%s" has already been defined as '
                           'a preprocessor token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the future.)')
        try:
            # register the rule before compiling its body, so that direct or
            # indirect recursion can be detected in on_symbol()
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
            defn = self.compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
        except TypeError as error:
            # a TypeError here indicates a malformed AST; report it with a
            # traceback snippet so the faulty transformation can be located
            from traceback import extract_tb
            trace = str(extract_tb(error.__traceback__)[-1])
            errmsg = "%s (TypeError: %s; %s)\n%s" \
                     % (EBNFCompiler.AST_ERROR, str(error), trace, node.as_sxpr())
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
        return rule, defn
709

710

711
    def _check_rx(self, node: Node, rx: str) -> str:
        """
        Checks whether the string `rx` represents a valid regular
        expression. Multi-line regular expressions additionally receive the
        verbose flag ('x'). Returns the (possibly flag-prefixed) regular
        expression string; errors are attached to `node`.
        """
        if rx.find('\n') >= 0:
            flags = self.re_flags | {'x'}
        else:
            flags = self.re_flags
        if flags:
            rx = "(?%s)%s" % ("".join(flags), rx)
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

726

727
    def on_directive(self, node: Node) -> str:
        """
        Compiles a directive node (e.g. ``@ whitespace = horizontal``) by
        recording its value in ``self.directives``. Directives do not emit
        parser code themselves, so the empty string is returned.
        """
        key = str(node.children[0]).lower()
        assert key not in self.directives['tokens']

        if key in {'comment', 'whitespace'}:
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
                    node.add_error('Directive "%s" must have one, but not %i values.' %
                                   (key, len(node.children[1].result)))
                value = self.compile(node.children[1]).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = str(node.children[1]).strip("~")
                if value != str(node.children[1]):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                # unquote string literals, unwrap /.../ regular expressions
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
            self.directives[key] = value

        elif key == 'ignorecase':
            value = str(node.children[1]).lower() not in {"off", "false", "no"}
            # BUG FIX: this was previously a comparison (`==`) instead of an
            # assignment, so the directive value was silently discarded.
            self.directives['ignorecase'] = value
            if value:
                self.re_flags.add('i')

        elif key == 'literalws':
            value = {item.lower() for item in self.compile(node.children[1])}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            # 'both' expands to {'left', 'right'}; 'none' yields the empty set
            # (was `{}`, i.e. an empty dict, which only worked by accident)
            ws = {'left', 'right'} if 'both' in value \
                else set() if 'none' in value else value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'preprocessor_tokens'}:
            self.directives['tokens'] |= self.compile(node.children[1])

        elif key.endswith('_filter'):
            filter_set = self.compile(node.children[1])
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                # fixed typo in error message ("on symbol" -> "one symbol")
                node.add_error('Directive "%s" accepts exactly one symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key, ', '.join(list(self.directives.keys()))))
        return ""

791

792
    def non_terminal(self, node: Node, parser_class: str,
                     custom_args: Union[List[str], None] = None) -> str:
        """
        Compiles any non-terminal, where `parser_class` indicates the Parser
        class name for the particular non-terminal.

        `custom_args` are extra argument strings (e.g. "mandatory=1") that
        are appended after the compiled children. The default is now `None`
        instead of a shared mutable `[]` (anti-pattern), which is
        backward-compatible because the argument was never mutated.
        """
        extra = custom_args if custom_args is not None else []
        arguments = [self.compile(r) for r in node.children] + extra
        # propagate the children's error flags; the list form keeps this
        # robust for a (theoretical) childless node, where the previous
        # bare max() over a generator would have raised ValueError
        node.error_flag = max([node.error_flag] + [t.error_flag for t in node.children])
        return parser_class + '(' + ', '.join(arguments) + ')'

801

802
    def on_expression(self, node) -> str:
        """An expression (alternatives separated by '|') compiles to an
        `Alternative` parser over the compiled children."""
        compiled = self.non_terminal(node, 'Alternative')
        return compiled

805

806
    def on_term(self, node) -> str:
        """
        Compiles a sequence of factors into a `Series` parser.

        Conceptually this is just ``self.non_terminal(node, 'Series')``;
        the additional work handles the mandatory §-operator: marker tokens
        are stripped from the children and the position of the first marker
        is passed on as a ``mandatory=...`` argument.
        """
        marker_positions = []
        content_children = []
        position = 0
        for child in node.children:
            is_marker = child.parser.ptype == TOKEN_PTYPE and str(child) == "§"
            if not is_marker:
                content_children.append(child)
                position += 1
                continue
            marker_positions.append(position)
            # warn about markers that have no effect
            if position == 0:
                child.add_error('First item of a series should not be mandatory.',
                                Error.WARNING)
            elif len(marker_positions) > 1:
                child.add_error('One mandatory marker (§) sufficient to declare the '
                                'rest of the series as mandatory.', Error.WARNING)
        # temporarily swap in the marker-free children for compilation,
        # then restore the original result
        original_result = node.result
        node.result = tuple(content_children)
        extra_args = ['mandatory=%i' % marker_positions[0]] if marker_positions else []
        compiled = self.non_terminal(node, 'Series', extra_args)
        node.result = original_result
        return compiled
832

833

834
    def on_factor(self, node: Node) -> str:
        """
        Compiles a prefixed factor (e.g. repetition, lookahead, lookbehind,
        retrieve) into the parser class that `self.PREFIX_TABLE` maps the
        prefix to. Mutates `node.result` in place to drop the prefix token
        before delegating to `non_terminal`.
        """
        assert node.children
        assert len(node.children) >= 2, node.as_sxpr()
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            # retrieve/pop operators: argument must be a single symbol
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
            # mark the symbol as a variable so its definition gets wrapped
            # in Capture() later (see on_definition)
            self.variables.add(str(arg))  # cast(str, arg.result)

        elif len(node.children) > 2:
            # more than one operand: fold operands 1..n into a single child
            # so that non_terminal() sees exactly one compiled argument
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        # drop the prefix token itself before compiling the operands
        node.result = node.children[1:]
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            result = self.non_terminal(node, parser_class, custom_args)
            if prefix[:1] == '-':
                # lookbehind operators: the operand must ultimately resolve
                # to a plain regular expression. Symbols may be defined later
                # in the grammar, so the check is deferred (see below).
                def check(node):
                    nd = node
                    if len(nd.children) >= 1:
                        nd = nd.children[0]
                    # follow symbol references to the defining expression
                    while nd.parser.name == "symbol":
                        symlist = self.rules.get(str(nd), [])
                        if len(symlist) == 2:
                            nd = symlist[1]
                        else:
                            if len(symlist) == 1:
                                nd = symlist[0].children[1]
                            break
                    if (nd.parser.name != "regexp" or str(nd)[:1] != '/'
                        or str(nd)[-1:] != '/'):
                        node.add_error("Lookbehind-parser can only be used with plain RegExp-"
                                       "parsers, not with: " + nd.parser.name + nd.parser.ptype)

                if not result.startswith('RegExp('):
                    # run the check after all rules have been compiled
                    self.deferred_tasks.append(lambda: check(node))
            return result
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
        return ""
887

888

889
    def on_option(self, node) -> str:
        """An optional factor ('[...]') compiles to an `Option` parser."""
        compiled = self.non_terminal(node, 'Option')
        return compiled
891

892

893
    def on_repetition(self, node) -> str:
        """A repetition ('{...}') compiles to a `ZeroOrMore` parser."""
        compiled = self.non_terminal(node, 'ZeroOrMore')
        return compiled

896

897
    def on_oneormore(self, node) -> str:
        """A one-or-more repetition ('{...}+') compiles to a `OneOrMore`
        parser."""
        compiled = self.non_terminal(node, 'OneOrMore')
        return compiled

900

901
    def on_regexchain(self, node) -> str:
        # Regex chains are not supported by this compiler (yet); reaching
        # this handler always raises.
        raise EBNFCompilerError("Not yet implemented!")

904

905
    def on_group(self, node) -> str:
        # Parenthesized groups are flattened away by the AST transformation
        # before compilation; encountering one here indicates a bug in the
        # transformation table, so fail loudly.
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")

909

910
911
912
    def on_symbol(self, node: Node) -> str:
        """
        Compiles a symbol occurring on the right hand side of a definition.
        Preprocessor tokens compile to a `PreprocessorToken` parser, reserved
        symbols to a `RegExp` parser; any other symbol is emitted verbatim
        (and bookkeeping for recursion detection is updated).
        """
        symbol = str(node)
        if symbol in self.directives['tokens']:
            return 'PreprocessorToken("' + symbol + '")'
        self.current_symbols.append(node)
        # remember the node of the first use of each symbol
        self.symbols.setdefault(symbol, node)
        # a reference to an already-defined rule means (potential) recursion
        if symbol in self.rules:
            self.recursive.add(symbol)
        if symbol in EBNFCompiler.RESERVED_SYMBOLS:
            return "RegExp(%s)" % symbol
        return symbol
923

924

925
    def on_literal(self, node) -> str:
        """Compiles a string literal into a `Token` parser. Backslashes are
        doubled so that the emitted Python source remains well-formed."""
        escaped = str(node).replace('\\', '\\\\')
        return 'Token(%s)' % escaped
927

928

929
    def on_regexp(self, node: Node) -> str:
        """
        Compiles a regular-expression node into either a plain `RegExp`
        parser (for bare /.../ expressions) or an `RE` parser that also
        consumes adjacent whitespace (for ~/...-style expressions or when
        the `literalws` directive is in effect).
        """
        rx = str(node)
        name = []   # type: List[str]
        if rx[0] == '/' and rx[-1] == '/':
            parser = 'RegExp('
        else:
            parser = 'RE('
            # a leading '~' toggles whitespace handling on the left side
            # (fixed non-idiomatic `not 'left' in ...` for consistency with
            # the 'right' branch below)
            if rx[:2] == '~/':
                if 'left' not in self.directives['literalws']:
                    name = ['wL=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[1:]
            elif 'left' in self.directives['literalws']:
                name = ["wL=''"] + name
            # a trailing '~' toggles whitespace handling on the right side
            if rx[-2:] == '/~':
                if 'right' not in self.directives['literalws']:
                    name = ['wR=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[:-1]
            elif 'right' in self.directives['literalws']:
                name = ["wR=''"] + name
        try:
            # strip the surrounding slashes and unescape escaped slashes
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
            # a malformed AST node (missing attribute) is reported with a
            # traceback snippet rather than crashing the compiler
            from traceback import extract_tb
            trace = str(extract_tb(error.__traceback__)[-1])
            errmsg = "%s (AttributeError: %s; %s)\n%s" \
                     % (EBNFCompiler.AST_ERROR, str(error), trace, node.as_sxpr())
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return parser + ', '.join([arg] + name) + ')'
958

959

960
    def on_list_(self, node) -> Set[str]: