ebnf.py 30.2 KB
Newer Older
1
"""ebnf.py - EBNF -> Python-Parser compilation for DHParser
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences an Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20
21
from collections import OrderedDict

22
23
24
25
try:
    import regex as re
except ImportError:
    import re
26
27
28
29
try:
    from typing import Callable, Dict, List, Set, Tuple
except ImportError:
    from .typing34 import Callable, Dict, List, Set, Tuple
30

31
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
32
from DHParser.parsers import Grammar, mixin_comment, nil_scanner, Forward, RE, NegativeLookahead, \
33
    Alternative, Series, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
34
    ScannerFunc
35
from DHParser.syntaxtree import Node, traverse, remove_first, remove_last, reduce_single_child, \
36
    replace_by_single_child, TOKEN_PTYPE, remove_expendables, remove_tokens, flatten, \
37
    forbid, assert_content, WHITESPACE_PTYPE, key_tag_name, TransformationFunc
38
from DHParser.versionnumber import __version__
39
40


41
42
43
44
45
__all__ = ['get_ebnf_scanner',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
46
           'EBNFTransformer',
Eckhart Arnold's avatar
Eckhart Arnold committed
47
           'EBNFCompilerError',
48
           'EBNFCompiler',
49
50
51
52
53
           'grammar_changed',
           'ScannerFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc']
54
55


Eckhart Arnold's avatar
Eckhart Arnold committed
56
57
58
59
60
61
62
########################################################################
#
# EBNF scanning
#
########################################################################


63
def get_ebnf_scanner() -> ScannerFunc:
Eckhart Arnold's avatar
Eckhart Arnold committed
64
65
66
67
68
69
70
71
72
    return nil_scanner


########################################################################
#
# EBNF parsing
#
########################################################################

73
74
# TODO: Introduce dummy/rename-parser, for simple assignments (e.g. jahr = JAHRESZAHL) or substition!
# TODO: Raise Error for unconnected parsers!
75
class EBNFGrammar(Grammar):
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
94
                | [flowmarker] regexchain
95
96
97
98
99
100
101
102
103
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
104
    regexchain =  ">" expression §"<"                # compiles "expression" into a singular regular expression
105
106
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
107
108
    option     =  "[" expression §"]"

109
110
111
112
113
114
    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
115
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
116
117
118
119
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an exmaple
    EOF =  !/./
    """
    expression = Forward()
120
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
121
    parser_initialization__ = "upon instantiation"
122
123
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
124
    wspL__ = ''
125
    wspR__ = WSP__
126
    EOF = NegativeLookahead(RE('.', wR=''))
127
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
128
129
130
    regexp = RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
131
132
133
134
135
    option = Series(Token("["), expression, Required(Token("]")))
    repetition = Series(Token("{"), expression, Required(Token("}")))
    oneormore = Series(Token("{"), expression, Token("}+"))
    regexchain = Series(Token("<"), expression, Required(Token(">")))
    group = Series(Token("("), expression, Required(Token(")")))
136
137
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
138
139
140
141
    factor = Alternative(Series(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Series(Optional(flowmarker), literal), Series(Optional(flowmarker), regexp),
                         Series(Optional(flowmarker), group), Series(Optional(flowmarker), regexchain),
                         Series(Optional(flowmarker), oneormore), repetition, option)
142
    term = OneOrMore(factor)
143
144
145
146
    expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
    directive = Series(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
    definition = Series(symbol, Required(Token("=")), expression)
    syntax = Series(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
147
148
149
    root__ = syntax


150
def grammar_changed(grammar_class, grammar_source: str) -> bool:
Eckhart Arnold's avatar
Eckhart Arnold committed
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
170
        m = re.search('class \w*\(Grammar\)', pycode)
Eckhart Arnold's avatar
Eckhart Arnold committed
171
172
173
174
175
176
177
178
179
180
        if m:
            m = re.search('    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            return True
    else:
        return chksum != grammar_class.source_hash__


181
def get_ebnf_grammar() -> EBNFGrammar:
Eckhart Arnold's avatar
Eckhart Arnold committed
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
    global thread_local_ebnf_grammar_singleton
    try:
        grammar = thread_local_ebnf_grammar_singleton
        return grammar
    except NameError:
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
        return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


198
199
#TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrival?!?

200
EBNF_transformation_table = {
201
    # AST Transformations for EBNF-grammar
202
    "+":
203
        remove_expendables,
204
    "syntax":
205
        [],  # otherwise '"*": replace_by_single_child' would be applied
206
    "directive, definition":
207
        remove_tokens('@', '='),
Eckhart Arnold's avatar
Eckhart Arnold committed
208
    "expression":
209
        [replace_by_single_child, flatten, remove_tokens('|')],
210
211
212
213
214
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
215
216
217
        [remove_tokens('(', ')'), replace_by_single_child],
    "oneormore, repetition, option":
        [reduce_single_child, remove_first, remove_last],
218
    "symbol, literal, regexp":
219
        reduce_single_child,
220
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
221
        reduce_single_child,
222
    "list_":
223
        [flatten, remove_tokens(',')],
224
    "*":
225
        replace_by_single_child
226
227
}

228

229
EBNF_validation_table = {
230
    # Semantic validation on the AST. EXPERIMENTAL!
231
    "repetition, option, oneormore":
232
233
        [forbid('repetition', 'option', 'oneormore'),
         assert_content(r'(?!§)')]
234
}
235

236

237
def EBNFTransformer(syntax_tree: Node):
238
    for processing_table, key_func in [(EBNF_transformation_table, key_tag_name),
239
                                       (EBNF_validation_table, key_tag_name)]:
240
        traverse(syntax_tree, processing_table, key_func)
di68kap's avatar
di68kap committed
241
242


243
def get_ebnf_transformer() -> TransformationFunc:
244
    return EBNFTransformer
Eckhart Arnold's avatar
Eckhart Arnold committed
245
246
247
248
249
250
251
252


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

253
254
255

ScannerFactoryFunc = Callable[[], ScannerFunc]
ParserFactoryFunc = Callable[[], Grammar]
256
TransformerFactoryFunc = Callable[[], TransformationFunc]
257
258
259
CompilerFactoryFunc = Callable[[], Compiler]


260
SCANNER_FACTORY = '''
261
def get_scanner() -> ScannerFunc:
262
263
264
265
266
    return {NAME}Scanner
'''


GRAMMAR_FACTORY = '''
267
def get_grammar() -> {NAME}Grammar:
268
269
270
271
272
273
274
275
276
277
278
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


TRANSFORMER_FACTORY = '''
279
def get_transformer() -> TransformationFunc:
280
281
282
283
284
    return {NAME}Transform
'''


COMPILER_FACTORY = '''
285
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
286
287
288
289
290
291
292
293
294
295
296
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton 
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
297

298
299
class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
300
    in the strict sense, see `CompilationError` in module ``dsl.py``)"""
301
302
303
    pass


304
class EBNFCompiler(Compiler):
305
306
307
308
    """Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.
    """
    COMMENT_KEYWORD = "COMMENT__"
309
310
    WHITESPACE_KEYWORD = "WSP__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}
311
312
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneuos AST transformation."
313
314
315
316
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
317
318
319
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}
320

321
    def __init__(self, grammar_name="", grammar_source=""):
Eckhart Arnold's avatar
Eckhart Arnold committed
322
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
323
324
325
        self._reset()

    def _reset(self):
326
        self._result = ''           # type: str
327
328
329
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
330
        self.variables = set()      # type: Set[str]
331
        # self.definition_names = []  # type: List[str]
332
333
        self.recursive = set()      # type: Set[str]
        self.root = ""              # type: str
334
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
335
                           'comment': '',
336
                           'literalws': ['right'],
337
338
339
                           'tokens': set(),     # alt. 'scanner_tokens'
                           'filter': dict(),    # alt. 'filter'
                           'testing': False }
340

Eckhart Arnold's avatar
Eckhart Arnold committed
341
    @property
342
    def result(self) -> str:
Eckhart Arnold's avatar
Eckhart Arnold committed
343
344
        return self._result

345
    def gen_scanner_skeleton(self) -> str:
346
        name = self.grammar_name + "Scanner"
347
348
        return "def %s(text):\n    return text\n" % name \
               + SCANNER_FACTORY.format(NAME=self.grammar_name)
349

350
    def gen_transformer_skeleton(self) -> str:
351
        if not self.rules:
Eckhart Arnold's avatar
Eckhart Arnold committed
352
353
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
354
355
        tt_name = self.grammar_name + '_AST_transformation_table'
        tf_name = self.grammar_name + 'Transform'
di68kap's avatar
di68kap committed
356
        transtable = [tt_name + ' = {',
357
358
                      '    # AST Transformations for the ' +
                      self.grammar_name + '-grammar']
Eckhart Arnold's avatar
Eckhart Arnold committed
359
        transtable.append('    "+": remove_empty,')
360
        for name in self.rules:
361
362
            transtable.append('    "' + name + '": [],')
        transtable.append('    ":Token, :RE": reduce_single_child,')
Eckhart Arnold's avatar
Eckhart Arnold committed
363
        transtable += ['    "*": replace_by_single_child', '}', '', tf_name +
364
                       ' = partial(traverse, processing_table=%s)' % tt_name, '']
365
        transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
366
367
        return '\n'.join(transtable)

368
    def gen_compiler_skeleton(self) -> str:
369
        if not self.rules:
370
371
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
372
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
373
374
375
376
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
Eckhart Arnold's avatar
Eckhart Arnold committed
377
                    self.grammar_name + '", grammar_source=""):',
378
                    '        super(' + self.grammar_name +
Eckhart Arnold's avatar
Eckhart Arnold committed
379
                    'Compiler, self).__init__(grammar_name, grammar_source)',
380
                    "        assert re.match('\w+\Z', grammar_name)", '']
381
        for name in self.rules:
382
            method_name = Compiler.derive_method_name(name)
383
            if name == self.root:
384
                compiler += ['    def ' + method_name + '(self, node):',
385
386
                             '        return node', '']
            else:
387
                compiler += ['    def ' + method_name + '(self, node):',
388
                             '        pass', '']
389
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
390
        return '\n'.join(compiler)
391

392
    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
393
        # fix capture of variables that have been defined before usage [sic!]
394

395
396
397
        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
398
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])
399

400
        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
Eckhart Arnold's avatar
Eckhart Arnold committed
401
                            if 'right' in self.directives['literalws'] else "''"))
402
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
Eckhart Arnold's avatar
Eckhart Arnold committed
403
                            if 'left' in self.directives['literalws'] else "''"))
404
        definitions.append((self.WHITESPACE_KEYWORD,
405
406
407
408
409
410
411
                            ("mixin_comment(whitespace="
                             "r'{whitespace}', comment=r'{comment}')").
                            format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class
412

413
        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
414
        declarations = ['class ' + self.grammar_name +
415
                        'Grammar(Grammar):',
416
417
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
418
                        (', with this grammar:' if self.grammar_source else '.')]
419
        definitions.append(('parser_initialization__', '"upon instantiation"'))
420
        if self.grammar_source:
421
            definitions.append(('source_hash__',
422
                                '"%s"' % md5(self.grammar_source, __version__)))
423
            declarations.append('')
424
            declarations += [line for line in self.grammar_source.split('\n')]
425
426
427
428
429
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order
430

431
432
433
434
435
436
437
438
439
        self.root = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]
440
441
442
443
444
445
446

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
447
                root_node.error_flag = True
448
449
450

        # check for unconnected rules

451
        if not self.directives['testing']:
452
453
454
455
456
457
458
459
460
461
462
            defined_symbols.difference_update(self.RESERVED_SYMBOLS)

            def remove_connections(symbol):
                if symbol in defined_symbols:
                    defined_symbols.remove(symbol)
                    for related in self.rules[symbol][1:]:
                        remove_connections(str(related))

            remove_connections(self.root)
            for leftover in defined_symbols:
                self.rules[leftover][0].add_error(('Rule "%s" is not connected to parser '
463
464
                    'root "%s" !') % (leftover, self.root) + ' (Use directive "@testing=True" '
                    'to supress this error message.)')
465
466
467

        # set root parser and assemble python grammar definition

468
        if self.root and 'root__' not in self.rules:
469
470
            declarations.append('root__ = ' + self.root)
        declarations.append('')
Eckhart Arnold's avatar
Eckhart Arnold committed
471
472
473
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result
474

475
    def on_syntax(self, node: Node) -> str:
476
477
478
479
        self._reset()
        definitions = []

        # drop the wrapping sequence node
480
481
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]
482
483

        # compile definitions and directives and collect definitions
484
        for nd in node.children:
485
            if nd.parser.name == "definition":
486
                definitions.append(self._compile(nd))
487
488
            else:
                assert nd.parser.name == "directive", nd.as_sexpr()
489
                self._compile(nd)
490
                node.error_flag = node.error_flag or nd.error_flag
491

492
        return self.assemble_parser(definitions, node)
493

494
    def on_definition(self, node: Node) -> Tuple[str, str]:
495
        rule = str(node.children[0])
496
497
498
        if rule in self.rules:
            node.add_error('A rule with name "%s" has already been defined.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
499
500
501
502
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           ' end with a doube underscore "__".' % rule)
503
        elif rule in self.directives['tokens']:
504
505
506
507
            node.add_error('Symbol "%s" has already been defined as '
                           'a scanner token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
508
                           % rule + '(This may change in the future.)')
509
        try:
510
511
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
512
            defn = self._compile(node.children[1])
513
            if rule in self.variables:
514
                defn = 'Capture(%s)' % defn
515
                self.variables.remove(rule)
516
517
518
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
519
520
521
522
        except TypeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + node.as_sexpr()
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
Eckhart Arnold's avatar
Eckhart Arnold committed
523
        return rule, defn
524
525

    @staticmethod
526
    def _check_rx(node: Node, rx: str) -> str:
527
528
529
530
531
532
533
534
535
536
537
538
        """Checks whether the string `rx` represents a valid regular
        expression. Makes sure that multiline regular expressions are
        prepended by the multiline-flag. Returns the regular expression string.
        """
        rx = rx if rx.find('\n') < 0 or rx[0:4] == '(?x)' else '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

539
    def on_directive(self, node: Node) -> str:
540
        key = str(node.children[0]).lower()
541
        assert key not in self.directives['tokens']
542

543
        if key in {'comment', 'whitespace'}:
544
545
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
Eckhart Arnold's avatar
Eckhart Arnold committed
546
                    node.add_error('Directive "%s" must have one, but not %i values.' %
547
548
                                   (key, len(node.children[1].result)))
                value = self._compile(node.children[1]).pop()
549
550
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
551
                else:
552
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
553
            else:
554
555
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
556
557
558
559
560
561
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
562
563
564
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
565
            self.directives[key] = value
566

567
568
569
570
        elif key == 'testing':
            value = str(node.children[1])
            self.directives['testing'] = value.lower() not in {"off", "false", "no"}

571
        elif key == 'literalws':
572
            value = {item.lower() for item in self._compile(node.children[1])}
573
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
Eckhart Arnold's avatar
Eckhart Arnold committed
574
                    or ('none' in value and len(value) > 1)):
575
576
577
578
579
580
581
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            ws = {'left', 'right'} if 'both' in value \
                else {} if 'none' in value else value
            self.directives[key] = list(ws)

582
        elif key in {'tokens', 'scanner_tokens'}:
583
            self.directives['tokens'] |= self._compile(node.children[1])
584

585
        elif key.endswith('_filter'):
586
            filter_set = self._compile(node.children[1])
587
588
589
590
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly on symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()
591

592
593
594
        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key,
595
                            ', '.join(list(self.directives.keys()))))
596
597
        return ""

598
    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str]=[]) -> str:
599
600
601
        """Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.
        """
602
        arguments = [self._compile(r) for r in node.children] + custom_args
603
604
        return parser_class + '(' + ', '.join(arguments) + ')'

605
    def on_expression(self, node) -> str:
606
607
        return self.non_terminal(node, 'Alternative')

608
    def on_term(self, node) -> str:
609
        return self.non_terminal(node, 'Series')
610

611
    def on_factor(self, node: Node) -> str:
612
        assert node.children
613
        assert len(node.children) >= 2, node.as_sexpr()
614
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
615
        custom_args = []  # type: List[str]
616
617

        if prefix in {'::', ':'}:
618
619
            assert len(node.children) == 2
            arg = node.children[-1]
620
            if arg.parser.name != 'symbol':
Eckhart Arnold's avatar
Eckhart Arnold committed
621
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
622
623
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
624
            if str(arg) in self.directives['filter']:
625
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
626
            self.variables.add(str(arg))  # cast(str, arg.result)
627

628
        elif len(node.children) > 2:
629
630
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
631
632
633
634
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])
635

636
        node.result = node.children[1:]
637
638
639
640
641
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            return self.non_terminal(node, parser_class, custom_args)
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
642
        return ""
643

644
    def on_option(self, node) -> str:
645
646
        return self.non_terminal(node, 'Optional')

647
    def on_repetition(self, node) -> str:
648
649
        return self.non_terminal(node, 'ZeroOrMore')

650
    def on_oneormore(self, node) -> str:
651
652
        return self.non_terminal(node, 'OneOrMore')

653
    def on_regexchain(self, node) -> str:
654
655
        raise EBNFCompilerError("Not yet implemented!")

656
    def on_group(self, node) -> str:
657
658
659
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")

660
661
662
663
    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
            return 'ScannerToken("' + symbol + '")'
664
        else:
665
666
667
668
669
670
            self.current_symbols.append(node)
            if symbol not in self.symbols:
                self.symbols[symbol] = node
            if symbol in self.rules:
                self.recursive.add(symbol)
            return symbol
671

672
    def on_literal(self, node) -> str:
673
        return 'Token(' + str(node).replace('\\', r'\\') + ')'  # return 'Token(' + ', '.join([node.result]) + ')' ?
674

675
    def on_regexp(self, node: Node) -> str:
676
        rx = str(node)
677
        name = []   # type: List[str]
678
679
        if rx[:2] == '~/':
            if not 'left' in self.directives['literalws']:
680
                name = ['wL=' + self.WHITESPACE_KEYWORD] + name
681
682
683
684
            rx = rx[1:]
        elif 'left' in self.directives['literalws']:
            name = ["wL=''"] + name
        if rx[-2:] == '/~':
Eckhart Arnold's avatar
Eckhart Arnold committed
685
            if 'right' not in self.directives['literalws']:
686
                name = ['wR=' + self.WHITESPACE_KEYWORD] + name
687
688
689
690
691
692
693
694
695
696
697
698
            rx = rx[:-1]
        elif 'right' in self.directives['literalws']:
            name = ["wR=''"] + name
        try:
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + \
                     node.as_sexpr()
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return 'RE(' + ', '.join([arg] + name) + ')'

699
    def on_list_(self, node) -> Set[str]:
700
        assert node.children
701
        return set(item.result.strip() for item in node.children)
702
703


704
def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
Eckhart Arnold's avatar
Eckhart Arnold committed
705
706
707
708
709
710
711
712
    global thread_local_ebnf_compiler_singleton
    try:
        compiler = thread_local_ebnf_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_ebnf_compiler_singleton = EBNFCompiler(grammar_name, grammar_source)
        return thread_local_ebnf_compiler_singleton