"""ebnf.py - EBNF -> Python-Parser compilation for DHParser

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

import keyword
from collections import OrderedDict

try:
    import regex as re
except ImportError:
    import re
try:
    from typing import Callable, Dict, List, Set, Tuple
except ImportError:
    from .typing34 import Callable, Dict, List, Set, Tuple

from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
from DHParser.parser import Grammar, mixin_comment, nil_preprocessor, Forward, RE, NegativeLookahead, \
    Alternative, Series, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
    PreprocessorFunc
from DHParser.syntaxtree import WHITESPACE_PTYPE, TOKEN_PTYPE, Node, TransformationFunc
from DHParser.transform import traverse, remove_brackets, \
    reduce_single_child, replace_by_single_child, remove_expendables, \
    remove_tokens, flatten, forbid, assert_content, key_tag_name
from DHParser.versionnumber import __version__

__all__ = ('get_ebnf_preprocessor',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
           'EBNFTransformer',
           'EBNFCompilerError',
           'EBNFCompiler',
           'grammar_changed',
           'PreprocessorFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc')


########################################################################
#
# EBNF scanning
#
########################################################################


def get_ebnf_preprocessor() -> PreprocessorFunc:
    return nil_preprocessor


########################################################################
#
# EBNF parsing
#
########################################################################


class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE; see CommonMark/markdown.py for an example
    EOF =  !/./
    """
    expression = Forward()
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
    wspL__ = ''
    wspR__ = WSP__
    EOF = NegativeLookahead(RE('.', wR=''))
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
    regexp = RE(r'~?/(?:\\/|[^/])*?/~?')  # RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Series(Token("["), expression, Required(Token("]")))
    repetition = Series(Token("{"), expression, Required(Token("}")))
    oneormore = Series(Token("{"), expression, Token("}+"))
    group = Series(Token("("), expression, Required(Token(")")))
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
    factor = Alternative(Series(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Series(Optional(flowmarker), literal), Series(Optional(flowmarker), regexp),
                         Series(Optional(flowmarker), group), Series(Optional(flowmarker), oneormore),
                         repetition, option)
    term = OneOrMore(factor)
    expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
    directive = Series(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
    definition = Series(symbol, Required(Token("=")), expression)
    syntax = Series(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
    root__ = syntax
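
# Illustrative usage sketch (added comment): assuming, as elsewhere in DHParser,
# that Grammar instances are callable on source text, the EBNFGrammar class above
# turns EBNF code into the concrete syntax tree that EBNFTransformer and
# EBNFCompiler below operate on:
#
#     grammar = EBNFGrammar()
#     syntax_tree = grammar('key = "value"\n')
#     print(syntax_tree.as_sxpr())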


def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
        m = re.search(r'class \w*\(Grammar\)', pycode)
        if m:
            m = re.search('    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            return True
    else:
        return chksum != grammar_class.source_hash__
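
# Illustrative usage sketch (added comment): grammar_changed() is meant to decide
# whether a generated parser is out of date with respect to its EBNF source.
# Both names below are hypothetical examples.
#
#     if grammar_changed(ArithmeticGrammar, 'Arithmetic.ebnf'):
#         pass  # regenerate the parser from 'Arithmetic.ebnf'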


def get_ebnf_grammar() -> EBNFGrammar:
    global thread_local_ebnf_grammar_singleton
    try:
        grammar = thread_local_ebnf_grammar_singleton
        return grammar
    except NameError:
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
        return thread_local_ebnf_grammar_singleton
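
# Illustrative note (added comment): get_ebnf_grammar() caches the grammar object
# in a module-level singleton, so repeated calls return the same instance:
#
#     g1 = get_ebnf_grammar()
#     g2 = get_ebnf_grammar()
#     assert g1 is g2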


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


EBNF_transformation_table = {
    # AST Transformations for EBNF-grammar
    "+":
        remove_expendables,
    "syntax":
        [],  # otherwise '"*": replace_by_single_child' would be applied
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_tokens('(', ')'), replace_by_single_child],
    "oneormore, repetition, option":
        [reduce_single_child, remove_brackets],
    "symbol, literal, regexp":
        reduce_single_child,
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        reduce_single_child,
    "list_":
        [flatten, remove_tokens(',')],
    "*":
        replace_by_single_child
}


EBNF_validation_table = {
    # Semantic validation on the AST. EXPERIMENTAL!
    "repetition, option, oneormore":
        [forbid('repetition', 'option', 'oneormore'),
         assert_content(r'(?!§)')]
}
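
# Illustrative note (added comment): with this table, traverse() can flag, for
# example, an option or repetition nested directly inside another repetition,
# as in "{ [ a ] }", as well as a mandatory marker "§" at the beginning of such
# a group.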


def EBNFTransformer(syntax_tree: Node):
    for processing_table, key_func in [(EBNF_transformation_table, key_tag_name),
                                       (EBNF_validation_table, key_tag_name)]:
        traverse(syntax_tree, processing_table, key_func)


def get_ebnf_transformer() -> TransformationFunc:
    return EBNFTransformer
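
# Illustrative usage sketch (added comment): the transformer prunes a freshly
# parsed concrete syntax tree in place into the more compact AST that the
# EBNFCompiler below expects:
#
#     syntax_tree = get_ebnf_grammar()('term = factor { "*" factor }\n')
#     get_ebnf_transformer()(syntax_tree)   # modifies syntax_tree in place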


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################


PreprocessorFactoryFunc = Callable[[], PreprocessorFunc]
ParserFactoryFunc = Callable[[], Grammar]
TransformerFactoryFunc = Callable[[], TransformationFunc]
CompilerFactoryFunc = Callable[[], Compiler]

PREPROCESSOR_FACTORY = '''
def get_preprocessor() -> PreprocessorFunc:
    return {NAME}Preprocessor
'''


GRAMMAR_FACTORY = '''
def get_grammar() -> {NAME}Grammar:
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


TRANSFORMER_FACTORY = '''
def get_transformer() -> TransformationFunc:
    return {NAME}Transform
'''


COMPILER_FACTORY = '''
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton 
'''
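
# Illustrative note (added comment): the *_FACTORY templates above are plain
# format strings; EBNFCompiler fills in the grammar name when assembling the
# generated module, e.g. (the name "Arithmetic" is only an example):
#
#     GRAMMAR_FACTORY.format(NAME="Arithmetic")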


class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
    in the strict sense, see `CompilationError` in module ``dsl.py``)"""
    pass


# TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrieval?!? Is this possible at compile time?

class EBNFCompiler(Compiler):
    """
    Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.
    """
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneuos AST transformation."
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}
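
    # Illustrative note (added comment): an EBNF directive such as
    #
    #     @ whitespace = linefeed
    #
    # selects the corresponding entry of the WHITESPACE table above as the
    # regular expression for implicit whitespace (see on_directive() below).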


    def __init__(self, grammar_name="", grammar_source=""):
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
        self._reset()


    def _reset(self):
        self._result = ''           # type: str
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
        self.variables = set()      # type: Set[str]
        self.definitions = []  # type: List[Tuple[str, str]]
        self.recursive = set()      # type: Set[str]
        self.deferred_tasks = []  # type: List[Callable]
        self.root = ""              # type: str
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
                           'comment': '',
                           'literalws': ['right'],
                           'tokens': set(),  # alt. 'preprocessor_tokens'
                           'filter': dict(),  # alt. 'filter'
                           'testing': False}

    @property
    def result(self) -> str:
        return self._result

    # methods for generating skeleton code for preprocessor, transformer, and compiler

    def gen_preprocessor_skeleton(self) -> str:
        name = self.grammar_name + "Preprocessor"
        return "def %s(text):\n    return text\n" % name \
               + PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)


    def gen_transformer_skeleton(self) -> str:
        if not self.rules:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
        tt_name = self.grammar_name + '_AST_transformation_table'
        tf_name = self.grammar_name + 'Transform'
        transtable = [tt_name + ' = {',
                      '    # AST Transformations for the ' +
                      self.grammar_name + '-grammar']
        transtable.append('    "+": remove_empty,')
        for name in self.rules:
            transtable.append('    "' + name + '": [],')
        transtable.append('    ":Token, :RE": reduce_single_child,')
        transtable += ['    "*": replace_by_single_child', '}', '', tf_name +
                       ' = partial(traverse, processing_table=%s)' % tt_name, '']
        transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(transtable)


    def gen_compiler_skeleton(self) -> str:
        if not self.rules:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    "        assert re.match('\w+\Z', grammar_name)", '']
        for name in self.rules:
            method_name = Compiler.method_name(name)
            if name == self.root:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        return node', '']
            else:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        pass', '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)


    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """
        Creates the Python code for the parser after compilation of
        the EBNF-Grammar
        """

        # execute deferred tasks, for example semantic checks that cannot
        # be done before the symbol table is complete

        for task in self.deferred_tasks:
            task()

        # provide for capturing of symbols that are variables, i.e. the
        # value of which will be retrieved at some point during the parsing process

        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])

        # add special fields for Grammar class

        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in self.directives['literalws'] else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in self.directives['literalws'] else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace="
                             "r'{whitespace}', comment=r'{comment}')").
                            format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations += [line for line in self.grammar_source.split('\n')]
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order

        self.root = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
                root_node.error_flag = True

        # check for unconnected rules

        if not self.directives['testing']:
            defined_symbols.difference_update(self.RESERVED_SYMBOLS)

            def remove_connections(symbol):
                if symbol in defined_symbols:
                    defined_symbols.remove(symbol)
                    for related in self.rules[symbol][1:]:
                        remove_connections(str(related))

            remove_connections(self.root)
            for leftover in defined_symbols:
                self.rules[leftover][0].add_error(('Rule "%s" is not connected to parser '
                    'root "%s" !') % (leftover, self.root) + ' (Use directive "@testing=True" '
                    'to supress this error message.)')

        # set root parser and assemble python grammar definition

        if self.root and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result


    ## compilation methods

    def on_syntax(self, node: Node) -> str:
        self._reset()
        definitions = []  # type: List[Tuple[str, str]]

        # drop the wrapping sequence node
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]

        # compile definitions and directives and collect definitions
        for nd in node.children:
            if nd.parser.name == "definition":
                definitions.append(self.compile(nd))
            else:
                assert nd.parser.name == "directive", nd.as_sxpr()
                self.compile(nd)
                node.error_flag = node.error_flag or nd.error_flag

        return self.assemble_parser(definitions, node)


    def on_definition(self, node: Node) -> Tuple[str, str]:
        rule = str(node.children[0])
        if rule in self.rules:
            node.add_error('A rule with name "%s" has already been defined.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           'end with a double underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            node.add_error('Symbol "%s" has already been defined as '
                           'a preprocessor token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the future.)')
        try:
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
            defn = self.compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
        except TypeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + node.as_sxpr()
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
        return rule, defn
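
    # Illustrative sketch (added comment): for a definition such as
    #
    #     sum = term { "+" term }
    #
    # on_definition() returns roughly the pair
    # ('sum', 'Series(term, ZeroOrMore(Series(Token("+"), term)))');
    # the exact rendering of the arguments is an assumption.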


    @staticmethod
    def _check_rx(node: Node, rx: str) -> str:
        """
        Checks whether the string `rx` represents a valid regular
        expression. Makes sure that multi-line regular expressions are
        prefixed with the verbose-flag '(?x)'. Returns the regular expression string.
        """
        rx = rx if rx.find('\n') < 0 or rx[0:4] == '(?x)' else '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx


    def on_directive(self, node: Node) -> str:
        key = str(node.children[0]).lower()
        assert key not in self.directives['tokens']

        if key in {'comment', 'whitespace'}:
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
                    node.add_error('Directive "%s" must have one, but not %i values.' %
                                   (key, len(node.children[1].result)))
                value = self.compile(node.children[1]).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
            self.directives[key] = value

        elif key == 'testing':
            value = str(node.children[1])
            self.directives['testing'] = value.lower() not in {"off", "false", "no"}

        elif key == 'literalws':
            value = {item.lower() for item in self.compile(node.children[1])}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            ws = {'left', 'right'} if 'both' in value \
                else {} if 'none' in value else value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'preprocessor_tokens'}:
            self.directives['tokens'] |= self.compile(node.children[1])

        elif key.endswith('_filter'):
            filter_set = self.compile(node.children[1])
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly on symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s! (Known directives are: %s.)' %
                           (key,
                            ', '.join(list(self.directives.keys()))))
        return ""


    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str]=[]) -> str:
        """
        Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.
        """
        arguments = [self.compile(r) for r in node.children] + custom_args
        return parser_class + '(' + ', '.join(arguments) + ')'


    def on_expression(self, node) -> str:
        return self.non_terminal(node, 'Alternative')


    def on_term(self, node) -> str:
        return self.non_terminal(node, 'Series')


    def on_factor(self, node: Node) -> str:
        assert node.children
        assert len(node.children) >= 2, node.as_sxpr()
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
            self.variables.add(str(arg))  # cast(str, arg.result)

        elif len(node.children) > 2:
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        node.result = node.children[1:]
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            result = self.non_terminal(node, parser_class, custom_args)
            if prefix[:1] == '-':
                def check(node):
                    nd = node
                    while len(nd.children) == 1 and nd.children[0].parser.name == "symbol":
                        nd = nd.children[0]
                    if (nd.parser.name != "regexp" or str(nd)[:1] != '/'
                        or str(nd)[-1:] != '/'):
                        node.add_error("Lookbehind-parser can only be used with plain RegExp-"
                                       "parsers, not with: " + str(nd))

                if not result.startswith('RegExp('):
                    self.deferred_tasks.append(lambda: check(node))
            return result
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
        return ""


    def on_option(self, node) -> str:
        return self.non_terminal(node, 'Optional')


    def on_repetition(self, node) -> str:
        return self.non_terminal(node, 'ZeroOrMore')


    def on_oneormore(self, node) -> str:
        return self.non_terminal(node, 'OneOrMore')


    def on_regexchain(self, node) -> str:
        raise EBNFCompilerError("Not yet implemented!")


    def on_group(self, node) -> str:
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")


    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
            return 'PreprocessorToken("' + symbol + '")'
        else:
            self.current_symbols.append(node)
            if symbol not in self.symbols:
                self.symbols[symbol] = node
            if symbol in self.rules:
                self.recursive.add(symbol)
            return symbol


    def on_literal(self, node) -> str:
        return 'Token(' + str(node).replace('\\', r'\\') + ')'  # return 'Token(' + ', '.join([node.result]) + ')' ?


    def on_regexp(self, node: Node) -> str:
        rx = str(node)
        name = []   # type: List[str]
        if rx[:2] == '~/':
            if 'left' not in self.directives['literalws']:
                name = ['wL=' + self.WHITESPACE_KEYWORD] + name
            rx = rx[1:]
        elif 'left' in self.directives['literalws']:
            name = ["wL=''"] + name
        if rx[-2:] == '/~':
            if 'right' not in self.directives['literalws']:
                name = ['wR=' + self.WHITESPACE_KEYWORD] + name
            rx = rx[:-1]
        elif 'right' in self.directives['literalws']:
            name = ["wR=''"] + name
        try:
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + \
                     node.as_sxpr()
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return 'RE(' + ', '.join([arg] + name) + ')'
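
    # Illustrative sketch (added comment): under the default "@ literalws = right",
    # a regular expression like  /\w+/~  compiles to roughly  RE('\\w+'), because
    # the trailing '~' matches the already active right-hand whitespace handling.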


    def on_list_(self, node) -> Set[str]:
        assert node.children
        return set(item.result.strip() for item in node.children)


def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
    global thread_local_ebnf_compiler_singleton
    try:
        compiler = thread_local_ebnf_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_ebnf_compiler_singleton = EBNFCompiler(grammar_name, grammar_source)
        return thread_local_ebnf_compiler_singleton
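

# Illustrative end-to-end sketch (added comment): compiling EBNF source to Python
# parser code with the factory functions of this module. It assumes that Grammar
# and Compiler instances are callable, as in DHParser's usual compilation
# workflow; "Arithmetic" is only an example name.
#
#     ebnf_src = 'document = /.*/\n'
#     syntax_tree = get_ebnf_grammar()(ebnf_src)
#     get_ebnf_transformer()(syntax_tree)
#     python_src = get_ebnf_compiler("Arithmetic", ebnf_src)(syntax_tree)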