"""ebnf.py - EBNF -> Python-Parser compilation for DHParser

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

import keyword
from collections import OrderedDict
from functools import partial

from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name, re, typing
from DHParser.parser import Grammar, mixin_comment, nil_preprocessor, Forward, RegExp, RE, \
    NegativeLookahead, Alternative, Series, Option, OneOrMore, ZeroOrMore, Token, \
    Required, Compiler, PreprocessorFunc
from DHParser.syntaxtree import Node, TransformationFunc, WHITESPACE_PTYPE, TOKEN_PTYPE
from DHParser.error import Error
from DHParser.transform import traverse, remove_brackets, \
    reduce_single_child, replace_by_single_child, remove_expendables, \
    remove_tokens, flatten, forbid, assert_content, remove_infix_operator
from DHParser.versionnumber import __version__

from typing import Callable, Dict, List, Set, Tuple, Union

__all__ = ('get_ebnf_preprocessor',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
           'EBNFTransform',
           'EBNFCompilerError',
           'EBNFCompiler',
           'grammar_changed',
           'PreprocessorFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc')


########################################################################
#
# EBNF scanning
#
########################################################################


def get_ebnf_preprocessor() -> PreprocessorFunc:
    return nil_preprocessor


########################################################################
#
# EBNF parsing
#
########################################################################



class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol "=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { ["§"] factor }+                       # "§" means all following factors mandatory
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"                         # '!' negative lookahead, '&' positive lookahead
                | "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:\\\/|[^\/])*?\/~?/~        # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
    EOF =  !/./
    """
    expression = Forward()
    source_hash__ = "a131abc5259738631000cda90d2fc65b"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'#.*(?:\n|$)'
    WHITESPACE__ = r'\s*'
    WSP__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
    wspL__ = ''
    wspR__ = WSP__
    EOF = NegativeLookahead(RegExp('.'))
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
    regexp = RE('~?/(?:\\\\/|[^/])*?/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Series(Token("["), expression, Token("]"), mandatory=1)
    repetition = Series(Token("{"), expression, Token("}"), mandatory=1)
    oneormore = Series(Token("{"), expression, Token("}+"))
    group = Series(Token("("), expression, Token(")"), mandatory=1)
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("-!"), Token("-&"))
    factor = Alternative(Series(Option(flowmarker), Option(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Series(Option(flowmarker), literal), Series(Option(flowmarker), regexp),
                         Series(Option(flowmarker), group), Series(Option(flowmarker), oneormore), repetition, option)
    term = OneOrMore(Series(Option(Token("§")), factor))
    expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
    directive = Series(Token("@"), symbol, Token("="), Alternative(regexp, literal, list_), mandatory=1)
    definition = Series(symbol, Token("="), expression, mandatory=1)
    syntax = Series(Option(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), EOF, mandatory=2)
    root__ = syntax


def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
        m = re.search(r'class \w*\(Grammar\)', pycode)
        if m:
            m = re.search('    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            return True
    else:
        return chksum != grammar_class.source_hash__
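# Illustrative sketch (the grammar class and file name are hypothetical):
# regenerate the parser only if the EBNF source has changed since the
# grammar class was last compiled.
#
#     if grammar_changed(ArithmeticGrammar, "Arithmetic.ebnf"):
#         ...  # re-run the EBNF compiler to regenerate ArithmeticGrammar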


def get_ebnf_grammar() -> EBNFGrammar:
    global thread_local_ebnf_grammar_singleton
    try:
        grammar = thread_local_ebnf_grammar_singleton
        return grammar
    except NameError:
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
        return thread_local_ebnf_grammar_singleton
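# The try/except NameError above implements a lazily created module-level
# singleton: the first call instantiates EBNFGrammar(), later calls reuse it.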


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


EBNF_AST_transformation_table = {
    # AST Transformations for EBNF-grammar
    "+":
        remove_expendables,
    "syntax":
        [],  # otherwise '"*": replace_by_single_child' would be applied
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],  # remove_infix_operator],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_brackets, replace_by_single_child],
    "oneormore, repetition, option":
        [reduce_single_child, remove_brackets,
         forbid('repetition', 'option', 'oneormore'), assert_content(r'(?!§)')],
    "symbol, literal, regexp":
        reduce_single_child,
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        reduce_single_child,
    "list_":
        [flatten, remove_infix_operator],
    "*":
        replace_by_single_child
}
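# The keys of this table name the kinds of nodes (parser names or parser
# ptypes) to which the listed transformations are applied when the table is
# handed to `traverse` (see EBNFTransform below).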


def EBNFTransform() -> TransformationFunc:
    return partial(traverse, processing_table=EBNF_AST_transformation_table.copy())

def get_ebnf_transformer() -> TransformationFunc:
    global thread_local_EBNF_transformer_singleton
    try:
        transformer = thread_local_EBNF_transformer_singleton
    except NameError:
        thread_local_EBNF_transformer_singleton = EBNFTransform()
        transformer = thread_local_EBNF_transformer_singleton
    return transformer


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################


PreprocessorFactoryFunc = Callable[[], PreprocessorFunc]
ParserFactoryFunc = Callable[[], Grammar]
TransformerFactoryFunc = Callable[[], TransformationFunc]
CompilerFactoryFunc = Callable[[], Compiler]
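# These aliases describe the factory functions that a parser module generated
# from an EBNF grammar exports: get_preprocessor(), get_grammar(),
# get_transformer() and get_compiler() (cf. the *_FACTORY templates below).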

PREPROCESSOR_FACTORY = '''
def get_preprocessor() -> PreprocessorFunc:
    return {NAME}Preprocessor
'''


GRAMMAR_FACTORY = '''
def get_grammar() -> {NAME}Grammar:
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        grammar = thread_local_{NAME}_grammar_singleton
    return grammar
'''


TRANSFORMER_FACTORY = '''
def {NAME}Transform() -> TransformationDict:
    return partial(traverse, processing_table={NAME}_AST_transformation_table.copy())

def get_transformer() -> TransformationFunc:
    global thread_local_{NAME}_transformer_singleton
    try:
        transformer = thread_local_{NAME}_transformer_singleton
    except NameError:
        thread_local_{NAME}_transformer_singleton = {NAME}Transform()
        transformer = thread_local_{NAME}_transformer_singleton
    return transformer
'''


COMPILER_FACTORY = '''
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        compiler = thread_local_{NAME}_compiler_singleton
    return compiler
'''
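# The {NAME} placeholders in the factory templates above are filled in via
# str.format(). A sketch with a hypothetical grammar name:
#
#     grammar_factory_code = GRAMMAR_FACTORY.format(NAME="Arithmetic")
#
# yields the source of a get_grammar() function that lazily creates and caches
# an ArithmeticGrammar instance.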


class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
    in the strict sense, see `CompilationError` in module ``dsl.py``)"""
    pass


class EBNFCompiler(Compiler):
    """
    Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.

    Instances of this class must be called with the root-node of the
    abstract syntax tree from an EBNF-specification of a formal language.
    The returned value is the Python-source-code of a Grammar class for
    this language that can be used to parse texts in this language.
    See classes `parser.Compiler` and `parser.Grammar` for more information.

    Additionally, class EBNFCompiler provides helper methods to generate
    code-skeletons for a preprocessor, AST-transformation and full
    compilation of the formal language. These methods' names start with
    the prefix `gen_`.

    Attributes:
        current_symbols:  During compilation, a list containing the root
                node of the currently compiled definition as first element
                and then the nodes of the symbols that are referred to in
                the currently compiled definition.

        rules:  Dictionary that maps rule names to a list of Nodes that
                contain symbol-references in the definition of the rule.
                The first item in the list is the node of the rule-
                definition itself. Example:

                           `alternative = a | b`

                Now `[str(node) for node in self.rules['alternative']]`
                yields `['alternative = a | b', 'a', 'b']`

        symbols:  A mapping of symbol names to their first usage (not
                their definition!) in the EBNF source.

        variables:  A set of symbol names that are used with the
                Pop or Retrieve operator. Because the values of these
                symbols need to be captured they are called variables.
                See `test_parser.TestPopRetrieve` for an example.

        recursive:  A set of symbols that are used recursively and
                therefore require a `Forward`-operator.

        definitions:  A dictionary of definitions. Other than `rules`
                this maps the symbols to their compiled definienda.

        deferred_tasks:  A list of callables that is filled during
                compilation, but that will be executed only after
                compilation has finished. Typically, it contains
                semantic checks that require information that
                is only available upon completion of compilation.

        root_symbol:  The name of the root symbol.

        directives:  A dictionary of all directives and their default
                values.

        re_flags:  A set of regular expression flags to be added to all
                regular expressions found in the current parsing process
    """
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    RAW_WS_KEYWORD = "WHITESPACE__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, RAW_WS_KEYWORD, COMMENT_KEYWORD}
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneous AST transformation."
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}


    def __init__(self, grammar_name="", grammar_source=""):
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
        self._reset()

    def _reset(self):
        super(EBNFCompiler, self)._reset()
        self._result = ''           # type: str
        self.re_flags = set()       # type: Set[str]
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
        self.variables = set()      # type: Set[str]
        self.recursive = set()      # type: Set[str]
        self.definitions = {}       # type: Dict[str, str]
        self.deferred_tasks = []    # type: List[Callable]
        self.root_symbol = ""  # type: str
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
                           'comment': '',
                           'literalws': ['right'],
                           'tokens': set(),  # alt. 'preprocessor_tokens'
                           'filter': dict(),  # alt. 'filter'
                           'ignorecase': False}

    @property
    def result(self) -> str:
        return self._result

    # methods for generating skeleton code for preprocessor, transformer, and compiler

    def gen_preprocessor_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for a preprocessor-function for
        the previously compiled formal language.
        """
        name = self.grammar_name + "Preprocessor"
        return "def %s(text):\n    return text\n" % name \
               + PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)

    def gen_transformer_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for the AST-transformation for the
        previously compiled formal language.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_skeleton()"!')
        tt_name = self.grammar_name + '_AST_transformation_table'
        transtable = [tt_name + ' = {',
                      '    # AST Transformations for the ' + self.grammar_name + '-grammar']
        transtable.append('    "+": remove_empty,')
        for name in self.rules:
            tf = '[]'
            rule = self.definitions[name]
            if rule.startswith('Alternative'):
                tf = '[replace_or_reduce]'
            elif rule.startswith('Synonym'):
                tf = '[replace_by_single_child]'
            transtable.append('    "' + name + '": %s,' % tf)
        transtable.append('    ":Token, :RE": reduce_single_child,')
        transtable += ['    "*": replace_by_single_child', '}', '']
        transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(transtable)

    def gen_compiler_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for a Compiler-class for the
        previously compiled formal language.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_compiler_skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    r"        assert re.match(r'\w+\Z', grammar_name)", '']
        for name in self.rules:
            method_name = Compiler.method_name(name)
            if name == self.root_symbol:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        return node', '']
            else:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        pass', '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)


    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """
        Creates the Python code for the parser after compilation of
        the EBNF-Grammar
        """

        # execute deferred tasks, for example semantic checks that cannot
        # be done before the symbol table is complete

        for task in self.deferred_tasks:
            task()

        # provide for capturing of symbols that are variables, i.e. the
        # value of which will be retrieved at some point during the parsing process

        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])

        # add special fields for Grammar class

        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in self.directives['literalws'] else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in self.directives['literalws'] else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace=" + self.RAW_WS_KEYWORD +
                             ", comment=" + self.COMMENT_KEYWORD + ")")))
        definitions.append((self.RAW_WS_KEYWORD, "r'{whitespace}'".format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations += [line for line in self.grammar_source.split('\n')]
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order

        self.root_symbol = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
                # root_node.error_flag = True

        # check for unconnected rules

        defined_symbols.difference_update(self.RESERVED_SYMBOLS)

        def remove_connections(symbol):
            if symbol in defined_symbols:
                defined_symbols.remove(symbol)
                for related in self.rules[symbol][1:]:
                    remove_connections(str(related))

        remove_connections(self.root_symbol)
        for leftover in defined_symbols:
            self.rules[leftover][0].add_error(('Rule "%s" is not connected to '
                'parser root "%s" !') % (leftover, self.root_symbol), Error.WARNING)

        # set root_symbol parser and assemble python grammar definition

        if self.root_symbol and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root_symbol)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result

    ## compilation methods

    def on_syntax(self, node: Node) -> str:
        definitions = []  # type: List[Tuple[str, str]]

        # drop the wrapping sequence node
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]

        # compile definitions and directives and collect definitions
        for nd in node.children:
            if nd.parser.name == "definition":
                definitions.append(self.compile(nd))
            else:
                assert nd.parser.name == "directive", nd.as_sxpr()
                self.compile(nd)
            node.error_flag = max(node.error_flag, nd.error_flag)
        self.definitions.update(definitions)

        return self.assemble_parser(definitions, node)

    def on_definition(self, node: Node) -> Tuple[str, str]:
        rule = str(node.children[0])
        if rule in self.rules:
            first = self.rules[rule][0]
            if not first._errors:
                first.add_error('First definition of rule "%s" '
                                'followed by illegal redefinitions.' % rule)
            node.add_error('A rule with name "%s" has already been defined earlier.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           'end with a double underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            node.add_error('Symbol "%s" has already been defined as '
                           'a preprocessor token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the future.)')
        try:
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
            defn = self.compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
        except TypeError as error:
            from traceback import extract_tb
            trace = str(extract_tb(error.__traceback__)[-1])
            errmsg = "%s (TypeError: %s; %s)\n%s" \
                     % (EBNFCompiler.AST_ERROR, str(error), trace, node.as_sxpr())
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
        return rule, defn


    def _check_rx(self, node: Node, rx: str) -> str:
        """
        Checks whether the string `rx` represents a valid regular
        expression. Makes sure that multiline regular expressions are
        prepended by the multiline-flag. Returns the regular expression string.
        """
        flags = self.re_flags | {'x'} if rx.find('\n') >= 0 else self.re_flags
        if flags:
            rx = "(?%s)%s" % ("".join(flags), rx)
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

    def on_directive(self, node: Node) -> str:
        key = str(node.children[0]).lower()
        assert key not in self.directives['tokens']

        if key in {'comment', 'whitespace'}:
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
                    node.add_error('Directive "%s" must have one, but not %i values.' %
                                   (key, len(node.children[1].result)))
                value = self.compile(node.children[1]).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
            self.directives[key] = value

        elif key == 'ignorecase':
            value = str(node.children[1]).lower() not in {"off", "false", "no"}
            self.directives['ignorecase'] = value
            if value:
                self.re_flags.add('i')

        # elif key == 'testing':
        #     value = str(node.children[1])
        #     self.directives['testing'] = value.lower() not in {"off", "false", "no"}

        elif key == 'literalws':
            value = {item.lower() for item in self.compile(node.children[1])}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            ws = {'left', 'right'} if 'both' in value \
                else {} if 'none' in value else value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'preprocessor_tokens'}:
            self.directives['tokens'] |= self.compile(node.children[1])

        elif key.endswith('_filter'):
            filter_set = self.compile(node.children[1])
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly one symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key, ', '.join(list(self.directives.keys()))))
        return ""


    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str]=[]) -> str:
        """
        Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.
        """
        arguments = [self.compile(r) for r in node.children] + custom_args
        node.error_flag = max(node.error_flag, max(t.error_flag for t in node.children))
        return parser_class + '(' + ', '.join(arguments) + ')'


    def on_expression(self, node) -> str:
        return self.non_terminal(node, 'Alternative')


    def on_term(self, node) -> str:
        # Basically, the following code does only this:
        #       return self.non_terminal(node, 'Series')
        # What makes it (look) more complicated is the handling of the
        # mandatory §-operator
        mandatory_marker = []
        filtered_children = []
        i = 0
        for nd in node.children:
            if nd.parser.ptype == TOKEN_PTYPE and str(nd) == "§":
                mandatory_marker.append(i)
                if i == 0:
                    nd.add_error('First item of a series should not be mandatory.',
                                 Error.WARNING)
                elif len(mandatory_marker) > 1:
                    nd.add_error('One mandatory marker (§) is sufficient to declare the '
                                 'rest of the series as mandatory.', Error.WARNING)
            else:
                filtered_children.append(nd)
                i += 1
        saved_result = node.result
        node.result = tuple(filtered_children)
        custom_args =  ['mandatory=%i' % mandatory_marker[0]] if mandatory_marker else []
        compiled = self.non_terminal(node, 'Series', custom_args)
        node.result = saved_result
        return compiled

    def on_factor(self, node: Node) -> str:
        assert node.children
        assert len(node.children) >= 2, node.as_sxpr()
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
            self.variables.add(str(arg))  # cast(str, arg.result)

        elif len(node.children) > 2:
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        node.result = node.children[1:]
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            result = self.non_terminal(node, parser_class, custom_args)
            if prefix[:1] == '-':
                def check(node):
                    nd = node
                    if len(nd.children) >= 1:
                        nd = nd.children[0]
                    while nd.parser.name == "symbol":
                        symlist = self.rules.get(str(nd), [])
                        if len(symlist) == 2:
                            nd = symlist[1]
                        else:
                            if len(symlist) == 1:
                                nd = symlist[0].children[1]
                            break
                    if (nd.parser.name != "regexp" or str(nd)[:1] != '/'
                            or str(nd)[-1:] != '/'):
                        node.add_error("Lookbehind-parser can only be used with plain RegExp-"
                                       "parsers, not with: " + nd.parser.name + nd.parser.ptype)

                if not result.startswith('RegExp('):
                    self.deferred_tasks.append(lambda: check(node))
            return result
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
        return ""

    def on_option(self, node) -> str:
        return self.non_terminal(node, 'Option')

    def on_repetition(self, node) -> str:
        return self.non_terminal(node, 'ZeroOrMore')

    def on_oneormore(self, node) -> str:
        return self.non_terminal(node, 'OneOrMore')

    def on_regexchain(self, node) -> str:
        raise EBNFCompilerError("Not yet implemented!")

    def on_group(self, node) -> str:
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")


    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
            return 'PreprocessorToken("' + symbol + '")'
        else:
            self.current_symbols.append(node)
            if symbol not in self.symbols:
                self.symbols[symbol] = node  # remember first use of symbol
            if symbol in self.rules:
                self.recursive.add(symbol)
            if symbol in EBNFCompiler.RESERVED_SYMBOLS:  # (EBNFCompiler.WHITESPACE_KEYWORD, EBNFCompiler.COMMENT_KEYWORD):
                return "RegExp(%s)" % symbol
            return symbol


    def on_literal(self, node) -> str:
        return 'Token(' + str(node).replace('\\', r'\\') + ')'  # return 'Token(' + ', '.merge_children([node.result]) + ')' ?


    def on_regexp(self, node: Node) -> str:
        rx = str(node)
        name = []   # type: List[str]
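        # `name` collects keyword arguments (e.g. "wL=..." / "wR=...") for the
        # generated RE parser, depending on leading/trailing '~' markers and on
        # the 'literalws' directive.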
        if rx[0] == '/' and rx[-1] == '/':
            parser = 'RegExp('
        else:
            parser = 'RE('
            if rx[:2] == '~/':
                if 'left' not in self.directives['literalws']:
                    name = ['wL=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[1:]
            elif 'left' in self.directives['literalws']:
                name = ["wL=''"] + name
            if rx[-2:] == '/~':
                if 'right' not in self.directives['literalws']:
                    name = ['wR=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[:-1]
            elif 'right' in self.directives['literalws']:
                name = ["wR=''"] + name
        try:
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
            from traceback import extract_tb
            trace = str(extract_tb(error.__traceback__)[-1])
            errmsg = "%s (AttributeError: %s; %s)\n%s" \
                     % (EBNFCompiler.AST_ERROR, str(error), trace, node.as_sxpr())
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return parser + ', '.join([arg] + name) + ')'