"""ebnf.py - EBNF -> Python-Parser compilation for DHParser

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

import keyword
from collections import OrderedDict
from functools import partial

try:
    import regex as re
except ImportError:
    import re
try:
    from typing import Callable, Dict, List, Set, Tuple, Union
except ImportError:
    from .typing34 import Callable, Dict, List, Set, Tuple, Union

from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
from DHParser.parser import Grammar, mixin_comment, nil_preprocessor, Forward, RegExp, RE, \
    NegativeLookahead, Alternative, Series, Option, OneOrMore, ZeroOrMore, Token, \
    Compiler, PreprocessorFunc
from DHParser.syntaxtree import Node, TransformationFunc, WHITESPACE_PTYPE, TOKEN_PTYPE
from DHParser.error import Error
from DHParser.transform import traverse, remove_brackets, \
    reduce_single_child, replace_by_single_child, remove_expendables, \
    remove_tokens, flatten, forbid, assert_content, remove_infix_operator
from DHParser.versionnumber import __version__

__all__ = ('get_ebnf_preprocessor',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
           'EBNFTransform',
           'EBNFCompilerError',
           'EBNFCompiler',
           'grammar_changed',
           'PreprocessorFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc')


########################################################################
#
# EBNF scanning
#
########################################################################


def get_ebnf_preprocessor() -> PreprocessorFunc:
    return nil_preprocessor


########################################################################
#
# EBNF parsing
#
########################################################################


# class EBNFGrammar(Grammar):
#     r"""Parser for an EBNF_variant source file, with this grammar:
#
#     # EBNF-Grammar in EBNF
#
#     @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
#     @ whitespace =  /\s*/                            # whitespace includes linefeed
#     @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly
#
#     syntax     =  [~//] { definition | directive } §EOF
#     definition =  symbol §"=" §expression
#     directive  =  "@" §symbol §"=" §( regexp | literal | list_ )
#
#     expression =  term { "|" term }
#     term       =  { factor }+
#     factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
#                 | [flowmarker] literal
#                 | [flowmarker] regexp
#                 | [flowmarker] group
#                 | [flowmarker] oneormore
#                 | repetition
#                 | option
#
#     flowmarker =  "!"  | "&"  | "§"                  # '!' negative lookahead, '&' positive lookahead, '§' required
#                 | "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
#     retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve
#
#     group      =  "(" expression §")"
#     oneormore  =  "{" expression "}+"
#     repetition =  "{" expression §"}"
#     option     =  "[" expression §"]"
#
#     symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
#     literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
#                 | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
#     regexp     =  /~?\/(?:\\\/|[^\/])*?\/~?/~        # e.g. /\w+/, ~/#.*(?:\n|$)/~
#                                                      # '~' is a whitespace-marker, if present leading or trailing
#                                                      # whitespace of a regular expression will be ignored tacitly.
#     list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
#                                                      # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
#     EOF =  !/./
#     """
#     expression = Forward()
#     source_hash__ = "4735db10f0b79d44209d1de0184b2ca0"
#     parser_initialization__ = "upon instantiation"
#     COMMENT__ = r'#.*(?:\n|$)'
#     WHITESPACE__ = r'\s*'
#     WSP__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
#     wspL__ = ''
#     wspR__ = WSP__
#     EOF = NegativeLookahead(RegExp('.'))
#     list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'), mandatory=1000)), mandatory=1000)
#     regexp = RE('~?/(?:\\\\/|[^/])*?/~?')
#     literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
#     symbol = RE('(?!\\d)\\w+')
#     option = Series(Token("["), expression, Token("]"), mandatory=2)
#     repetition = Series(Token("{"), expression, Token("}"), mandatory=2)
#     oneormore = Series(Token("{"), expression, Token("}+"), mandatory=1000)
#     group = Series(Token("("), expression, Token(")"), mandatory=2)
#     retrieveop = Alternative(Token("::"), Token(":"))
#     flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
#     factor = Alternative(
#         Series(Option(flowmarker), Option(retrieveop), symbol, NegativeLookahead(Token("=")), mandatory=1000),
#         Series(Option(flowmarker), literal, mandatory=1000), Series(Option(flowmarker), regexp, mandatory=1000),
#         Series(Option(flowmarker), group, mandatory=1000), Series(Option(flowmarker), oneormore, mandatory=1000),
#         repetition, option)
#     term = OneOrMore(factor)
#     expression.set(Series(term, ZeroOrMore(Series(Token("|"), term, mandatory=1000)), mandatory=1000))
#     directive = Series(Token("@"), symbol, Token("="), Alternative(regexp, literal, list_), mandatory=1)
#     definition = Series(symbol, Token("="), expression, mandatory=1)
#     syntax = Series(Option(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), EOF, mandatory=2)
#     root__ = syntax


class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol "=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { ["§"] factor }+                       # "§" means all following factors mandatory
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"                         # '!' negative lookahead, '&' positive lookahead
                | "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:\\\/|[^\/])*?\/~?/~        # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
    EOF =  !/./
    """
    expression = Forward()
    source_hash__ = "a131abc5259738631000cda90d2fc65b"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'#.*(?:\n|$)'
    WHITESPACE__ = r'\s*'
    WSP__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
    wspL__ = ''
    wspR__ = WSP__
    EOF = NegativeLookahead(RegExp('.'))
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'), mandatory=1000)),
                   mandatory=1000)
    regexp = RE('~?/(?:\\\\/|[^/])*?/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Series(Token("["), expression, Token("]"), mandatory=2)
    repetition = Series(Token("{"), expression, Token("}"), mandatory=2)
    oneormore = Series(Token("{"), expression, Token("}+"), mandatory=1000)
    group = Series(Token("("), expression, Token(")"), mandatory=2)
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("-!"), Token("-&"))
    factor = Alternative(
        Series(Option(flowmarker), Option(retrieveop), symbol, NegativeLookahead(Token("=")),
               mandatory=1000), Series(Option(flowmarker), literal, mandatory=1000),
        Series(Option(flowmarker), regexp, mandatory=1000),
        Series(Option(flowmarker), group, mandatory=1000),
        Series(Option(flowmarker), oneormore, mandatory=1000), repetition, option)
    term = OneOrMore(Series(Option(Token("§")), factor, mandatory=1000))
    expression.set(
        Series(term, ZeroOrMore(Series(Token("|"), term, mandatory=1000)), mandatory=1000))
    directive = Series(Token("@"), symbol, Token("="), Alternative(regexp, literal, list_),
                       mandatory=1)
    definition = Series(symbol, Token("="), expression, mandatory=1)
    syntax = Series(Option(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)),
                    EOF, mandatory=2)
    root__ = syntax


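# Illustrative usage sketch for the EBNFGrammar parser defined above. It assumes
# the usual DHParser convention that Grammar instances are callable on a source
# string and return the concrete syntax tree as a Node; the sample grammar is
# made up for demonstration purposes only.
#
#     ebnf_source = '''
#         @ literalws = right
#         greeting    = "Hello" name
#         name        = /\w+/~
#         '''
#     syntax_tree = EBNFGrammar()(ebnf_source)
#     if not syntax_tree.error_flag:
#         print(syntax_tree.as_sxpr())
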
def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
        m = re.search('class \w*\(Grammar\)', pycode)
        if m:
            m = re.search('    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            return True
    else:
        return chksum != grammar_class.source_hash__


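# Illustrative usage sketch for grammar_changed() above; the grammar class and
# the file name are hypothetical.
#
#     if grammar_changed(ArithmeticGrammar, "Arithmetic.ebnf"):
#         print("Arithmetic.ebnf has changed; the parser module should be regenerated.")
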
def get_ebnf_grammar() -> EBNFGrammar:
    global thread_local_ebnf_grammar_singleton
    try:
        grammar = thread_local_ebnf_grammar_singleton
        return grammar
    except NameError:
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
        return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


EBNF_AST_transformation_table = {
    # AST Transformations for EBNF-grammar
    "+":
        remove_expendables,
    "syntax":
        [],  # otherwise '"*": replace_by_single_child' would be applied
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],  # remove_infix_operator],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_brackets, replace_by_single_child],
    "oneormore, repetition, option":
        [reduce_single_child, remove_brackets,
         forbid('repetition', 'option', 'oneormore'), assert_content(r'(?!§)')],
    "symbol, literal, regexp":
        reduce_single_child,
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        reduce_single_child,
    "list_":
        [flatten, remove_infix_operator],
    "*":
        replace_by_single_child
}


def EBNFTransform() -> TransformationFunc:
    return partial(traverse, processing_table=EBNF_AST_transformation_table.copy())

def get_ebnf_transformer() -> TransformationFunc:
    global thread_local_EBNF_transformer_singleton
    try:
        transformer = thread_local_EBNF_transformer_singleton
    except NameError:
        thread_local_EBNF_transformer_singleton = EBNFTransform()
        transformer = thread_local_EBNF_transformer_singleton
    return transformer
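# Illustrative usage sketch: parsing EBNF source code and transforming the
# resulting concrete syntax tree into an AST in place. It assumes the DHParser
# convention that the transformer is applied directly to the tree returned by
# the grammar; `ebnf_source` stands for any EBNF text.
#
#     syntax_tree = get_ebnf_grammar()(ebnf_source)
#     get_ebnf_transformer()(syntax_tree)   # transformation happens in place
#     print(syntax_tree.as_sxpr())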


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################


PreprocessorFactoryFunc = Callable[[], PreprocessorFunc]
ParserFactoryFunc = Callable[[], Grammar]
TransformerFactoryFunc = Callable[[], TransformationFunc]
CompilerFactoryFunc = Callable[[], Compiler]

PREPROCESSOR_FACTORY = '''
def get_preprocessor() -> PreprocessorFunc:
    return {NAME}Preprocessor
'''


GRAMMAR_FACTORY = '''
def get_grammar() -> {NAME}Grammar:
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        grammar = thread_local_{NAME}_grammar_singleton
    return grammar
'''


TRANSFORMER_FACTORY = '''
def {NAME}Transform() -> TransformationDict:
    return partial(traverse, processing_table={NAME}_AST_transformation_table.copy())

def get_transformer() -> TransformationFunc:
    global thread_local_{NAME}_transformer_singleton
    try:
        transformer = thread_local_{NAME}_transformer_singleton
    except NameError:
        thread_local_{NAME}_transformer_singleton = {NAME}Transform()
        transformer = thread_local_{NAME}_transformer_singleton
    return transformer
'''


COMPILER_FACTORY = '''
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        compiler = thread_local_{NAME}_compiler_singleton
    return compiler
'''
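# The *_FACTORY templates above are string skeletons that are pasted into the
# generated parser module, with {NAME} filled in via str.format(). For a
# hypothetical grammar called "Arithmetic", GRAMMAR_FACTORY.format(NAME="Arithmetic")
# yields a get_grammar() function that lazily creates and then reuses a
# thread-local ArithmeticGrammar instance.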


class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
    in the strict sense, see `CompilationError` in module ``dsl.py``)"""
    pass


class EBNFCompiler(Compiler):
    """
    Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.

    Instances of this class must be called with the root-node of the
    abstract syntax tree from an EBNF-specification of a formal language.
    The returned value is the Python-source-code of a Grammar class for
    this language that can be used to parse texts in this language.
    See classes `parser.Compiler` and `parser.Grammar` for more information.

    Additionally, class EBNFCompiler provides helper methods to generate
    code-skeletons for a preprocessor, AST-transformation and full
    compilation of the formal language. These methods' names start with
    the prefix `gen_`.

    Attributes:
        current_symbols:  During compilation, a list containing the root
                node of the currently compiled definition as first element
                and then the nodes of the symbols that are referred to in
                the currently compiled definition.

        rules:  Dictionary that maps rule names to a list of Nodes that
                contain symbol-references in the definition of the rule.
                The first item in the list is the node of the rule-
                definition itself. Example:

                           `alternative = a | b`

                Now `[str(node) for node in self.rules['alternative']]`
                yields `['alternative = a | b', 'a', 'b']`

        symbols:  A mapping of symbol names to their first usage (not
                their definition!) in the EBNF source.

        variables:  A set of symbol names that are used with the
                Pop or Retrieve operator. Because the values of these
                symbols need to be captured they are called variables.
                See `test_parser.TestPopRetrieve` for an example.

        recursive:  A set of symbols that are used recursively and
                therefore require a `Forward`-operator.

        definitions:  A dictionary of definitions. Unlike `rules`,
                this maps the symbols to their compiled definienda.

        deferred_tasks:  A list of callables that is filled during
                compilation, but that will be executed only after
                compilation has finished. Typically, it contains
                semantic checks that require information that
                is only available upon completion of compilation.

        root_symbol:  The name of the root symbol.

        directives:  A dictionary of all directives and their default
                values.

        re_flags:  A set of regular expression flags to be added to all
                regular expressions found in the current parsing process
    """
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    RAW_WS_KEYWORD = "WHITESPACE__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, RAW_WS_KEYWORD, COMMENT_KEYWORD}
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneous AST transformation."
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}


    def __init__(self, grammar_name="", grammar_source=""):
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
        self._reset()


    def _reset(self):
        super(EBNFCompiler, self)._reset()
        self._result = ''           # type: str
        self.re_flags = set()       # type: Set[str]
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
        self.variables = set()      # type: Set[str]
        self.recursive = set()      # type: Set[str]
        self.definitions = {}       # type: Dict[str, str]
        self.deferred_tasks = []    # type: List[Callable]
        self.root_symbol = ""  # type: str
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
                           'comment': '',
                           'literalws': ['right'],
                           'tokens': set(),  # alt. 'preprocessor_tokens'
                           'filter': dict(),  # alt. 'filter'
                           'ignorecase': False}

    @property
    def result(self) -> str:
        return self._result

    # methods for generating skeleton code for preprocessor, transformer, and compiler

    def gen_preprocessor_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for a preprocessor-function for
        the previously compiled formal language.
        """
        name = self.grammar_name + "Preprocessor"
        return "def %s(text):\n    return text\n" % name \
               + PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)


    def gen_transformer_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for the AST-transformation for the
        previously compiled formal language.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_skeleton()"!')
        tt_name = self.grammar_name + '_AST_transformation_table'
        transtable = [tt_name + ' = {',
                      '    # AST Transformations for the ' + self.grammar_name + '-grammar']
        transtable.append('    "+": remove_empty,')
        for name in self.rules:
            tf = '[]'
            rule = self.definitions[name]
            if rule.startswith('Alternative'):
                tf = '[replace_or_reduce]'
            elif rule.startswith('Synonym'):
                tf = '[replace_by_single_child]'
            transtable.append('    "' + name + '": %s,' % tf)
        transtable.append('    ":Token, :RE": reduce_single_child,')
        transtable += ['    "*": replace_by_single_child', '}', '']
        transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(transtable)


    def gen_compiler_skeleton(self) -> str:
        """
        Returns Python-skeleton-code for a Compiler-class for the
        previously compiled formal language.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_compiler_skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    "        assert re.match('\w+\Z', grammar_name)", '']
        for name in self.rules:
            method_name = Compiler.method_name(name)
            if name == self.root_symbol:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        return node', '']
            else:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        pass', '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)


    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """
        Creates the Python code for the parser after compilation of
        the EBNF-Grammar
        """

        # execute deferred tasks, for example semantic checks that cannot
        # be done before the symbol table is complete

        for task in self.deferred_tasks:
            task()

        # provide for capturing of symbols that are variables, i.e. the
        # value of which will be retrieved at some point during the parsing process

        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])

        # add special fields for Grammar class

        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in self.directives['literalws'] else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in self.directives['literalws'] else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace=" + self.RAW_WS_KEYWORD +
                             ", comment=" + self.COMMENT_KEYWORD + ")")))
        definitions.append((self.RAW_WS_KEYWORD, "r'{whitespace}'".format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations += [line for line in self.grammar_source.split('\n')]
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order

        self.root_symbol = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
                # root_node.error_flag = True

        # check for unconnected rules

        defined_symbols.difference_update(self.RESERVED_SYMBOLS)

        def remove_connections(symbol):
            if symbol in defined_symbols:
                defined_symbols.remove(symbol)
                for related in self.rules[symbol][1:]:
                    remove_connections(str(related))

        remove_connections(self.root_symbol)
        for leftover in defined_symbols:
            self.rules[leftover][0].add_error(('Rule "%s" is not connected to '
                'parser root "%s" !') % (leftover, self.root_symbol), Error.WARNING)

        # set root_symbol parser and assemble python grammar definition

        if self.root_symbol and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root_symbol)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result


    ## compilation methods

    def on_syntax(self, node: Node) -> str:
        definitions = []  # type: List[Tuple[str, str]]

        # drop the wrapping sequence node
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]

        # compile definitions and directives and collect definitions
        for nd in node.children:
            if nd.parser.name == "definition":
                definitions.append(self.compile(nd))
            else:
                assert nd.parser.name == "directive", nd.as_sxpr()
                self.compile(nd)
            node.error_flag = max(node.error_flag, nd.error_flag)
        self.definitions.update(definitions)

        return self.assemble_parser(definitions, node)


    def on_definition(self, node: Node) -> Tuple[str, str]:
        rule = str(node.children[0])
        if rule in self.rules:
            first = self.rules[rule][0]
            if not first._errors:
                first.add_error('First definition of rule "%s" '
                                'followed by illegal redefinitions.' % rule)
            node.add_error('A rule with name "%s" has already been defined earlier.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           'end with a double underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            node.add_error('Symbol "%s" has already been defined as '
                           'a preprocessor token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the future.)')
        try:
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
            defn = self.compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
        except TypeError as error:
            from traceback import extract_tb
            trace = str(extract_tb(error.__traceback__)[-1])
            errmsg = "%s (TypeError: %s; %s)\n%s" \
                     % (EBNFCompiler.AST_ERROR, str(error), trace, node.as_sxpr())
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
        return rule, defn


    def _check_rx(self, node: Node, rx: str) -> str:
        """
        Checks whether the string `rx` represents a valid regular
        expression. Makes sure that multiline regular expressions are
        prepended by the multiline-flag. Returns the regular expression string.
        """
        flags = self.re_flags | {'x'} if rx.find('\n') >= 0 else self.re_flags
        if flags:  rx = "(?%s)%s" % ("".join(flags), rx)
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx
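    # Illustrative example: with self.re_flags == {'i'}, _check_rx(node, r'\w+')
    # returns '(?i)\w+'; a regular expression spanning several lines additionally
    # receives the verbose flag 'x'.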


    def on_directive(self, node: Node) -> str:
        key = str(node.children[0]).lower()
        assert key not in self.directives['tokens']

        if key in {'comment', 'whitespace'}:
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
                    node.add_error('Directive "%s" must have exactly one value, not %i.' %
                                   (key, len(node.children[1].result)))
                value = self.compile(node.children[1]).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
            self.directives[key] = value

        elif key == 'ignorecase':
            value = str(node.children[1]).lower() not in {"off", "false", "no"}
            self.directives['ignorecase'] = value
            if value:
                self.re_flags.add('i')

        # elif key == 'testing':
        #     value = str(node.children[1])
        #     self.directives['testing'] = value.lower() not in {"off", "false", "no"}

        elif key == 'literalws':
            value = {item.lower() for item in self.compile(node.children[1])}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            ws = {'left', 'right'} if 'both' in value \
                else {} if 'none' in value else value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'preprocessor_tokens'}:
            self.directives['tokens'] |= self.compile(node.children[1])

        elif key.endswith('_filter'):
            filter_set = self.compile(node.children[1])
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly one symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s! (Known directives are %s.)' %
                           (key, ', '.join(list(self.directives.keys()))))
        return ""


    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str]=[]) -> str:
        """
        Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.
        """
        arguments = [self.compile(r) for r in node.children] + custom_args
        node.error_flag = max(node.error_flag, max(t.error_flag for t in node.children))
        return parser_class + '(' + ', '.join(arguments) + ')'
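    # Illustrative example: for an `expression` node whose two children compile to
    # the symbols `a` and `b`, non_terminal(node, 'Alternative') yields the source
    # string "Alternative(a, b)".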


    def on_expression(self, node) -> str:
        return self.non_terminal(node, 'Alternative')


    def on_term(self, node) -> str:
        mandatory_marker = []
        filtered_children = []
        i = 0
        for nd in node.children:
            if nd.parser.ptype == TOKEN_PTYPE and str(nd) == "§":
                mandatory_marker.append(i)
                if i == 0:
                    nd.add_error('First item of a series should not be mandatory.',
                                 Error.WARNING)
                elif len(mandatory_marker) > 1:
                    nd.add_error('One mandatory marker (§) is sufficient to declare the '
                                 'rest of the series as mandatory.', Error.WARNING)
            else:
                filtered_children.append(nd)
                i += 1
        saved_result = node.result
        node.result = tuple(filtered_children)
        mandatory_marker.append(Series.NOPE)
        compiled = self.non_terminal(node, 'Series', ['mandatory=%i' % mandatory_marker[0]])
        node.result = saved_result
        return compiled
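    # Illustrative example: a term like `"(" §expression ")"` is compiled to
    # Series(Token("("), expression, Token(")"), mandatory=1), i.e. the index of the
    # first "§" marker is passed on as the `mandatory` argument; without a marker,
    # `mandatory` falls back to Series.NOPE.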


    def on_factor(self, node: Node) -> str:
        assert node.children
        assert len(node.children) >= 2, node.as_sxpr()
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
            self.variables.add(str(arg))  # cast(str, arg.result)

        elif len(node.children) > 2:
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        node.result = node.children[1:]
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            result = self.non_terminal(node, parser_class, custom_args)
            if prefix[:1] == '-':
                def check(node):
                    nd = node
                    if len(nd.children) >= 1:
                        nd = nd.children[0]
                    while nd.parser.name == "symbol":
                        symlist = self.rules.get(str(nd), [])
                        if len(symlist) == 2:
                            nd = symlist[1]
                        else:
                            if len(symlist) == 1:
                                nd = symlist[0].children[1]
                            break
                    if (nd.parser.name != "regexp" or str(nd)[:1] != '/'
                        or str(nd)[-1:] != '/'):
                        node.add_error("Lookbehind-parser can only be used with plain RegExp-"
                                       "parsers, not with: " + nd.parser.name + nd.parser.ptype)

                if not result.startswith('RegExp('):
                    self.deferred_tasks.append(lambda: check(node))
            return result
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
        return ""


    def on_option(self, node) -> str:
        return self.non_terminal(node, 'Option')


    def on_repetition(self, node) -> str:
        return self.non_terminal(node, 'ZeroOrMore')


    def on_oneormore(self, node) -> str:
        return self.non_terminal(node, 'OneOrMore')


    def on_regexchain(self, node) -> str:
        raise EBNFCompilerError("Not yet implemented!")


    def on_group(self, node) -> str:
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")


    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
            return 'PreprocessorToken("' + symbol + '")'
        else:
            self.current_symbols.append(node)
            if symbol not in self.symbols:
                self.symbols[symbol] = node  # remember first use of symbol
            if symbol in self.rules:
                self.recursive.add(symbol)
            if symbol in EBNFCompiler.RESERVED_SYMBOLS:  # (EBNFCompiler.WHITESPACE_KEYWORD, EBNFCompiler.COMMENT_KEYWORD):
                return "RegExp(%s)" % symbol
            return symbol


    def on_literal(self, node) -> str: