"""ebnf.py - EBNF -> Python-Parser compilation for DHParser

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20
21
from collections import OrderedDict

22
23
24
25
try:
    import regex as re
except ImportError:
    import re
26
27
28
29
try:
    from typing import Callable, Dict, List, Set, Tuple
except ImportError:
    from .typing34 import Callable, Dict, List, Set, Tuple
30

31
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
32
from DHParser.parser import Grammar, mixin_comment, nil_preprocessor, Forward, RE, NegativeLookahead, \
33
    Alternative, Series, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
34
    PreprocessorFunc
35
36
37
38
from DHParser.syntaxtree import WHITESPACE_PTYPE, TOKEN_PTYPE, Node, TransformationFunc
from DHParser.transform import traverse, remove_brackets, \
    reduce_single_child, replace_by_single_child, remove_expendables, \
    remove_tokens, flatten, forbid, assert_content, key_tag_name
39
from DHParser.versionnumber import __version__
40

41
# Public names of this module: the factory functions, the EBNF grammar,
# transformer and compiler, the staleness check `grammar_changed`, and the
# type aliases for the factory-function signatures.
__all__ = ['get_ebnf_preprocessor',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
           'EBNFTransformer',
           'EBNFCompilerError',
           'EBNFCompiler',
           'grammar_changed',
           'PreprocessorFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc']
54
55


Eckhart Arnold's avatar
Eckhart Arnold committed
56
57
58
59
60
61
62
########################################################################
#
# EBNF scanning
#
########################################################################


63
64
def get_ebnf_preprocessor() -> PreprocessorFunc:
    """Return the preprocessor for EBNF sources.

    EBNF sources need no preprocessing, so this is always
    `nil_preprocessor`.
    """
    preprocessor = nil_preprocessor
    return preprocessor
Eckhart Arnold's avatar
Eckhart Arnold committed
65
66
67
68
69
70
71
72


########################################################################
#
# EBNF parsing
#
########################################################################

73

74
class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] regexchain
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-!' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
    EOF =  !/./
    """
    # `expression` is referenced by option/repetition/oneormore/group before
    # it can be defined, hence the Forward placeholder that is filled in via
    # `expression.set(...)` further down.
    expression = Forward()
    # Compared against a freshly computed checksum by `grammar_changed()`.
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
    # Matches `@ literalws = right`: no implicit whitespace on the left,
    # WSP__ on the right.
    wspL__ = ''
    wspR__ = WSP__
    # The parsers below are defined bottom-up: each one only refers to names
    # assigned above it (or to the Forward `expression`), so the statement
    # order must not be changed.
    EOF = NegativeLookahead(RE('.', wR=''))
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
    # NOTE(review): this regex differs from the one in the docstring; the
    # docstring variant is kept in the trailing comment.
    regexp = RE(r'~?/(?:\\/|[^/])*?/~?')  # RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Series(Token("["), expression, Required(Token("]")))
    repetition = Series(Token("{"), expression, Required(Token("}")))
    oneormore = Series(Token("{"), expression, Token("}+"))
    group = Series(Token("("), expression, Required(Token(")")))
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
    # NOTE(review): the docstring's `factor` rule lists a `regexchain`
    # alternative that has no counterpart here (and no `regexchain` rule is
    # defined in the grammar) -- confirm which of the two is intended.
    factor = Alternative(Series(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Series(Optional(flowmarker), literal), Series(Optional(flowmarker), regexp),
                         Series(Optional(flowmarker), group), Series(Optional(flowmarker), oneormore),
                         repetition, option)
    term = OneOrMore(factor)
    expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
    directive = Series(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
    definition = Series(symbol, Required(Token("=")), expression)
    syntax = Series(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
    root__ = syntax


147
def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    The check compares the md5 checksum of the grammar source (salted with
    the DHParser version number) against the ``source_hash__`` stored with
    the grammar class.

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated. If
        ``grammar_class`` is a file name, True is also returned when the
        file contains no ``class ...(Grammar)`` statement or no
        ``source_hash__`` assignment after it.
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)

    if not isinstance(grammar_class, str):
        return chksum != grammar_class.source_hash__

    # grammar_class is the file name of a compiler suite: read the stored
    # hash from its source text rather than importing the module.
    with open(grammar_class, 'r', encoding='utf8') as f:
        pycode = f.read()
    # Raw strings: '\w' and '\(' are invalid escape sequences in a plain
    # string literal.
    class_match = re.search(r'class \w*\(Grammar\)', pycode)
    if class_match is None:
        return True
    # Only look for the hash after the class statement.
    hash_match = re.search(r'    source_hash__ *= *"([a-z0-9]*)"',
                           pycode[class_match.end():])
    return hash_match is None or hash_match.group(1) != chksum


178
def get_ebnf_grammar() -> EBNFGrammar:
    """Return the shared `EBNFGrammar` instance, creating it on first use.

    The instance is cached in the module-level global
    ``thread_local_ebnf_grammar_singleton``.
    """
    global thread_local_ebnf_grammar_singleton
    try:
        return thread_local_ebnf_grammar_singleton
    except NameError:
        # first call: the global has not been bound yet
        instance = EBNFGrammar()
        thread_local_ebnf_grammar_singleton = instance
        return instance


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


195
# Processing table that `EBNFTransformer` passes to `traverse()` in order to
# turn the concrete syntax tree of an EBNF source into an abstract syntax
# tree. Each key names one or more node tags; the value is a transformation
# function or a list of them.
# NOTE(review): presumably a comma-separated key applies its value to each
# listed tag, "+" is applied to every node and "*" is the fallback for tags
# without an entry of their own -- confirm against `traverse()`.
EBNF_transformation_table = {
    # AST Transformations for EBNF-grammar
    "+":
        remove_expendables,
    "syntax":
        [],  # otherwise '"*": replace_by_single_child' would be applied
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_tokens('(', ')'), replace_by_single_child],
    "oneormore, repetition, option":
        [reduce_single_child, remove_brackets],
    "symbol, literal, regexp":
        reduce_single_child,
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        reduce_single_child,
    "list_":
        [flatten, remove_tokens(',')],
    "*":
        replace_by_single_child
}

223

224
# Second processing table applied by `EBNFTransformer`, after the
# transformation table. It does not reshape the tree but attaches checks to
# repetition, option and oneormore nodes.
# NOTE(review): presumably `forbid` rejects the listed tags as direct
# children and `assert_content` checks the node's content against the
# regex -- confirm in DHParser.transform.
EBNF_validation_table = {
    # Semantic validation on the AST. EXPERIMENTAL!
    "repetition, option, oneormore":
        [forbid('repetition', 'option', 'oneormore'),
         assert_content(r'(?!§)')]
}
230

231

232
def EBNFTransformer(syntax_tree: Node):
    """Transform the concrete syntax tree of an EBNF source in place.

    Runs two traversals over `syntax_tree`: first with
    `EBNF_transformation_table`, then with `EBNF_validation_table`. Both
    use `key_tag_name` as key function.
    """
    passes = (EBNF_transformation_table, EBNF_validation_table)
    for table in passes:
        traverse(syntax_tree, table, key_tag_name)
di68kap's avatar
di68kap committed
236
237


238
def get_ebnf_transformer() -> TransformationFunc:
    """Return the AST transformation function for EBNF syntax trees."""
    transformer = EBNFTransformer
    return transformer
Eckhart Arnold's avatar
Eckhart Arnold committed
240
241
242
243
244
245
246
247


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

248

249
PreprocessorFactoryFunc = Callable[[], PreprocessorFunc]
250
ParserFactoryFunc = Callable[[], Grammar]
251
TransformerFactoryFunc = Callable[[], TransformationFunc]
252
253
CompilerFactoryFunc = Callable[[], Compiler]

254
255
256
PREPROCESSOR_FACTORY = '''
def get_preprocessor() -> PreprocessorFunc:
    return {NAME}Preprocessor
257
258
259
260
'''


GRAMMAR_FACTORY = '''
261
def get_grammar() -> {NAME}Grammar:
262
263
264
265
266
267
268
269
270
271
272
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


TRANSFORMER_FACTORY = '''
273
def get_transformer() -> TransformationFunc:
274
275
276
277
278
    return {NAME}Transform
'''


COMPILER_FACTORY = '''
279
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
280
281
282
283
284
285
286
287
288
289
290
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton 
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
291

292
293
class EBNFCompilerError(Exception):
    """Raised by `EBNFCompiler` when it is used incorrectly or hits an
    unsupported construct.

    These are not compilation errors in the strict sense; for those see
    `CompilationError` in module ``dsl.py``.
    """


298
299
# TODO: Add Capture and Retrieve validation: a variable mustn't be captured twice before retrieval?!? Is this possible at compile time?

300
class EBNFCompiler(Compiler):
    """
    Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.

    The `on_...` methods each compile one kind of AST node to a snippet of
    Python source code; `on_syntax` is the entry point for a whole grammar
    and returns the source of a complete grammar class. After a grammar has
    been compiled, the `gen_..._skeleton` methods produce skeleton code
    for the matching preprocessor, transformer and compiler.

    Attributes (all re-initialized by `_reset()`):
        rules:  maps each defined rule name to a list whose first item is
            the definition node and whose further items are the symbol
            nodes used on the right hand side of the definition
        current_symbols:  the `rules` entry of the definition that is
            currently being compiled
        symbols:  maps every symbol used on a right hand side to the node
            of its first use
        variables:  names of symbols that have been used with a retrieve
            operator (':' or '::') and still need wrapping in `Capture`
        recursive:  names of symbols that are used after (or within) their
            own definition and therefore need a `Forward` declaration
        root:  name of the root parser
        directives:  the current values of the grammar directives
    """
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneuos AST transformation."
    # maps flow markers and retrieve operators to parser class names
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    # named whitespace regexes that may be used with the whitespace directive
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}

    def __init__(self, grammar_name="", grammar_source=""):
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
        self._reset()

    def _reset(self):
        """Resets all compilation state and directives to their defaults."""
        self._result = ''           # type: str
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
        self.variables = set()      # type: Set[str]
        # self.definition_names = []  # type: List[str]
        self.recursive = set()      # type: Set[str]
        self.root = ""              # type: str
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
                           'comment': '',
                           'literalws': ['right'],
                           'tokens': set(),  # alt. 'preprocessor_tokens'
                           'filter': dict(),  # alt. 'filter'
                           'testing': False}

    @property
    def result(self) -> str:
        """The Python source code produced by the last `assemble_parser()`."""
        return self._result

    # methods for generating skeleton code for preprocessor, transformer, and compiler

    def gen_preprocessor_skeleton(self) -> str:
        """Returns the source of a pass-through preprocessor function named
        `<grammar_name>Preprocessor`, followed by its factory function.
        """
        name = self.grammar_name + "Preprocessor"
        return "def %s(text):\n    return text\n" % name \
               + PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)

    def gen_transformer_skeleton(self) -> str:
        """Returns the source of an AST transformation table with an empty
        entry for every rule of the compiled grammar, followed by the
        transformer function and its factory function.

        Raises:
            EBNFCompilerError:  if no grammar has been compiled yet
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
        tt_name = self.grammar_name + '_AST_transformation_table'
        tf_name = self.grammar_name + 'Transform'
        transtable = [tt_name + ' = {',
                      '    # AST Transformations for the ' +
                      self.grammar_name + '-grammar',
                      '    "+": remove_empty,']
        transtable.extend('    "' + name + '": [],' for name in self.rules)
        transtable.append('    ":Token, :RE": reduce_single_child,')
        transtable += ['    "*": replace_by_single_child', '}', '', tf_name +
                       ' = partial(traverse, processing_table=%s)' % tt_name, '']
        transtable.append(TRANSFORMER_FACTORY.format(NAME=self.grammar_name))
        return '\n'.join(transtable)

    def gen_compiler_skeleton(self) -> str:
        """Returns the source of a compiler class with one stub method for
        every rule of the compiled grammar, followed by its factory
        function. The stub for the root rule returns its node, all other
        stubs merely `pass`.

        Raises:
            EBNFCompilerError:  if no grammar has been compiled yet
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    # raw strings, here and in the generated code: '\w' and
                    # '\Z' are invalid escape sequences in plain literals
                    r"        assert re.match(r'\w+\Z', grammar_name)", '']
        for name in self.rules:
            method_name = Compiler.method_name(name)
            stub_body = '        return node' if name == self.root else '        pass'
            compiler += ['    def ' + method_name + '(self, node):', stub_body, '']
        compiler.append(COMPILER_FACTORY.format(NAME=self.grammar_name))
        return '\n'.join(compiler)

    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """
        Creates the Python code for the parser after compilation of
        the EBNF-Grammar

        Parameters:
            definitions:  (symbol, parser expression) pairs in the order of
                their definition in the grammar; the list is extended and
                reversed in place
            root_node:  the node whose error flag is raised if a symbol
                is used but never defined

        Returns:
            the source code of the grammar class followed by the grammar
            factory function; the same string is stored in `self.result`
        """
        # Variables that were retrieved only after their definition had
        # already been compiled could not be wrapped by on_definition().
        if self.variables:
            for i, (symbol, statement) in enumerate(definitions):
                if symbol in self.variables:
                    definitions[i] = (symbol, 'Capture(%s)' % statement)

        literalws = self.directives['literalws']
        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in literalws else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in literalws else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace="
                             "r'{whitespace}', comment=r'{comment}')").
                            format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations.extend(self.grammar_source.split('\n'))
            # drop trailing blank lines of the grammar source
            while declarations[-1].strip() == '':
                declarations.pop()
        declarations.append('"""')

        # turn definitions into declarations in reverse order

        self.root = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(self.recursive)]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations.append(symbol + '.set(' + statement + ')')
            else:
                declarations.append(symbol + ' = ' + statement)

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
                root_node.error_flag = True

        # check for unconnected rules

        if not self.directives['testing']:
            defined_symbols.difference_update(self.RESERVED_SYMBOLS)

            def remove_connections(symbol):
                # removes `symbol` and everything reachable from it
                if symbol in defined_symbols:
                    defined_symbols.remove(symbol)
                    for related in self.rules[symbol][1:]:
                        remove_connections(str(related))

            remove_connections(self.root)
            for leftover in defined_symbols:
                self.rules[leftover][0].add_error(('Rule "%s" is not connected to parser '
                    'root "%s" !') % (leftover, self.root) + ' (Use directive "@testing=True" '
                    'to supress this error message.)')

        # set root parser and assemble python grammar definition

        if self.root and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result

    ## compilation methods

    def on_syntax(self, node: Node) -> str:
        """Compiles a complete grammar: resets the compiler state, compiles
        all definitions and directives and returns the assembled parser
        source code.
        """
        self._reset()
        definitions = []

        # drop the wrapping sequence node
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]

        # compile definitions and directives and collect definitions
        for nd in node.children:
            if nd.parser.name == "definition":
                definitions.append(self.compile(nd))
            else:
                assert nd.parser.name == "directive", nd.as_sxpr()
                self.compile(nd)
                node.error_flag = node.error_flag or nd.error_flag

        return self.assemble_parser(definitions, node)

    def on_definition(self, node: Node) -> Tuple[str, str]:
        """Compiles a single rule definition.

        An error is added to the node if the rule name has already been
        defined, is reserved, is not a sane parser name, has been declared
        as preprocessor token or is a Python keyword; compilation of the
        right hand side proceeds nonetheless.

        Returns:
            the pair (rule name, parser expression). If a `TypeError`
            occurs during compilation, the pair
            (rule name + ':error', quoted error message) is returned.
        """
        rule = str(node.children[0])
        if rule in self.rules:
            node.add_error('A rule with name "%s" has already been defined.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           ' end with a doube underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            node.add_error('Symbol "%s" has already been defined as '
                           'a preprocessor token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the future.)')
        try:
            # on_symbol() appends the symbols of the right hand side here
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
            defn = self.compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
            elif "(" not in defn:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
        except TypeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + node.as_sxpr()
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
        return rule, defn

    @staticmethod
    def _check_rx(node: Node, rx: str) -> str:
        """
        Checks whether the string `rx` represents a valid regular
        expression. Makes sure that multiline regular expressions are
        prepended by the verbose-flag `(?x)`. If `rx` does not compile, an
        error is added to `node`. Returns the (possibly flag-prefixed)
        regular expression string in either case.
        """
        if '\n' in rx and rx[0:4] != '(?x)':
            rx = '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

    def on_directive(self, node: Node) -> str:
        """Evaluates a directive and stores its value in `self.directives`.

        Known directives are `comment`, `whitespace`, `testing`,
        `literalws`, `tokens` (alias `preprocessor_tokens`) and
        `<symbol>_filter`. Invalid values and unknown directives are
        reported as errors on `node`. Always returns the empty string,
        because directives do not produce parser code of their own.
        """
        key = str(node.children[0]).lower()
        assert key not in self.directives['tokens']

        if key in {'comment', 'whitespace'}:
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
                    node.add_error('Directive "%s" must have one, but not %i values.' %
                                   (key, len(node.children[1].result)))
                value = self.compile(node.children[1]).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace':
                    try:
                        matches_empty = re.match(value, '') is not None
                    except Exception:
                        # Malformed pattern: _check_rx() has already added
                        # an error, so do not crash or report a second one.
                        matches_empty = True
                    if not matches_empty:
                        node.add_error("Implicit whitespace should always match the empty string, "
                                       "/%s/ does not." % value)
            self.directives[key] = value

        elif key == 'testing':
            value = str(node.children[1])
            self.directives['testing'] = value.lower() not in {"off", "false", "no"}

        elif key == 'literalws':
            value = {item.lower() for item in self.compile(node.children[1])}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            if 'both' in value:
                ws = {'left', 'right'}
            elif 'none' in value:
                ws = set()
            else:
                ws = value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'preprocessor_tokens'}:
            self.directives['tokens'] |= self.compile(node.children[1])

        elif key.endswith('_filter'):
            filter_set = self.compile(node.children[1])
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly on symbol, not %s'
                               % (key, str(filter_set)))
            else:
                # only a valid one-element set may be popped; an invalid
                # value has been reported above and is not registered
                self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key,
                            ', '.join(list(self.directives.keys()))))
        return ""

    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str] = None) -> str:
        """
        Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.

        Parameters:
            node:  the node whose children become the parser's arguments
            parser_class:  name of the parser class, e.g. 'Series'
            custom_args:  additional argument strings (e.g. 'filter=...')
                that are appended after the compiled children

        Returns:
            the constructor call `parser_class(arg, ...)` as source code
        """
        arguments = [self.compile(r) for r in node.children]
        if custom_args:
            arguments.extend(custom_args)
        return parser_class + '(' + ', '.join(arguments) + ')'

    def on_expression(self, node) -> str:
        return self.non_terminal(node, 'Alternative')

    def on_term(self, node) -> str:
        return self.non_terminal(node, 'Series')

    def on_factor(self, node: Node) -> str:
        """Compiles a factor that carries a prefix (flow marker or retrieve
        operator) to a call of the parser class that `PREFIX_TABLE`
        assigns to the prefix.

        For the retrieve operators ':' and '::' the operand must be a
        symbol; it is registered in `self.variables`, and a filter
        declared for it with a `<symbol>_filter` directive is passed on as
        `filter=` argument. An unknown prefix is reported as an error and
        yields the empty string.
        """
        assert node.children
        assert len(node.children) >= 2, node.as_sxpr()
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
            self.variables.add(str(arg))  # cast(str, arg.result)

        elif len(node.children) > 2:
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
            # Fold all children after the prefix into the second child, so
            # that the prefix is left with exactly one operand.
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        # strip the prefix: only the operand is compiled as argument
        node.result = node.children[1:]
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            return self.non_terminal(node, parser_class, custom_args)
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
        return ""

    def on_option(self, node) -> str:
        return self.non_terminal(node, 'Optional')

    def on_repetition(self, node) -> str:
        return self.non_terminal(node, 'ZeroOrMore')

    def on_oneormore(self, node) -> str:
        return self.non_terminal(node, 'OneOrMore')

    def on_regexchain(self, node) -> str:
        raise EBNFCompilerError("Not yet implemented!")

    def on_group(self, node) -> str:
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")

    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        """Compiles a symbol on the right hand side of a definition.

        Preprocessor tokens become `PreprocessorToken("...")`. Any other
        symbol is recorded as being used by the current definition and is
        returned as is; if it has already been defined, it is marked as
        recursive.
        """
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
            return 'PreprocessorToken("' + symbol + '")'
        self.current_symbols.append(node)
        if symbol not in self.symbols:
            self.symbols[symbol] = node
        if symbol in self.rules:
            self.recursive.add(symbol)
        return symbol

    def on_literal(self, node) -> str:
        """Compiles a literal to `Token(<literal>)`; the literal keeps its
        quotation marks and its backslashes are doubled."""
        return 'Token(' + str(node).replace('\\', r'\\') + ')'  # return 'Token(' + ', '.join([node.result]) + ')' ?

    def on_regexp(self, node: Node) -> str:
        """Compiles a regular expression to `RE(<pattern>[, wR=...][, wL=...])`.

        The `wL` and `wR` arguments are only emitted where the whitespace
        markers '~' around the regular expression deviate from the
        `literalws` directive. If an `AttributeError` occurs, an error is
        added to the node and the quoted error message is returned instead.
        """
        rx = str(node)
        name = []   # type: List[str]
        literalws = self.directives['literalws']
        if rx[:2] == '~/':
            if 'left' not in literalws:
                name = ['wL=' + self.WHITESPACE_KEYWORD] + name
            rx = rx[1:]
        elif 'left' in literalws:
            name = ["wL=''"] + name
        if rx[-2:] == '/~':
            if 'right' not in literalws:
                name = ['wR=' + self.WHITESPACE_KEYWORD] + name
            rx = rx[:-1]
        elif 'right' in literalws:
            name = ["wR=''"] + name
        try:
            # strip the enclosing slashes and unescape slashes inside
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + \
                     node.as_sxpr()
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return 'RE(' + ', '.join([arg] + name) + ')'

    def on_list_(self, node) -> Set[str]:
        """Returns the stripped items of a comma separated list as a set."""
        assert node.children
        return {item.result.strip() for item in node.children}
729
730


731
def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
    """Return the shared `EBNFCompiler` instance, creating it on first use.

    The instance is cached in the module-level global
    ``thread_local_ebnf_compiler_singleton``. An already existing instance
    is re-targeted with `set_grammar_name(grammar_name, grammar_source)`
    before it is returned.
    """
    global thread_local_ebnf_compiler_singleton
    try:
        existing = thread_local_ebnf_compiler_singleton
        existing.set_grammar_name(grammar_name, grammar_source)
        return existing
    except NameError:
        # first call: the global has not been bound yet
        fresh = EBNFCompiler(grammar_name, grammar_source)
        thread_local_ebnf_compiler_singleton = fresh
        return fresh