ebnf.py 28.8 KB
Newer Older
1
"""ebnf.py - EBNF -> Python-Parser compilation for DHParser
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20
from functools import partial
21
22
23
24
try:
    import regex as re
except ImportError:
    import re
25
from typing import Callable, cast, List, Set, Tuple
26

27
28
29
30
31
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
from DHParser.parsers import Grammar, mixin_comment, nil_scanner, Forward, RE, NegativeLookahead, \
    Alternative, Sequence, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
    ScannerFunc
from DHParser.syntaxtree import Node, traverse, remove_enclosing_delimiters, reduce_single_child, \
32
    replace_by_single_child, TOKEN_PTYPE, remove_expendables, remove_tokens, flatten, \
33
34
    forbid, assert_content, WHITESPACE_PTYPE, key_tag_name, TransformerFunc
from DHParser.versionnumber import __version__
35
36


37
38
39
40
41
# Names exported by `from ... import *`: the four `get_ebnf_*` factory
# functions, the grammar class, the transformer function, the compiler class
# with its error type, `grammar_changed` and the factory type aliases.
__all__ = ['get_ebnf_scanner',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
           'EBNFTransformer',
           'EBNFCompilerError',
           'EBNFCompiler',
           'grammar_changed',
           'ScannerFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc']
50
51


Eckhart Arnold's avatar
Eckhart Arnold committed
52
53
54
55
56
57
58
########################################################################
#
# EBNF scanning
#
########################################################################


59
def get_ebnf_scanner() -> ScannerFunc:
    """Returns the scanner function for EBNF sources.

    EBNF sources are handed to the parser as they are, so this is simply
    `nil_scanner`.
    """
    scanner = nil_scanner
    return scanner


########################################################################
#
# EBNF parsing
#
########################################################################


70
class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] regexchain
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-!' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    regexchain =  ">" expression §"<"                # compiles "expression" into a singular regular expression
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
    EOF =  !/./
    """
    # `expression` is referenced by option, repetition, oneormore, regexchain
    # and group before it can be defined; the placeholder is filled in by
    # `expression.set(...)` further down.
    expression = Forward()
    # Checksum that `grammar_changed()` compares with md5(grammar, __version__).
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
    # Corresponds to `@ literalws = right`: nothing on the left, WSP__ on the right.
    wspL__ = ''
    wspR__ = WSP__
    # The parsers are listed in reverse order of the grammar above, so that
    # every parser is defined before it is used by another one.
    EOF = NegativeLookahead(RE('.', wR=''))
    list_ = Sequence(RE('\\w+'), ZeroOrMore(Sequence(Token(","), RE('\\w+'))))
    regexp = RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Sequence(Token("["), expression, Required(Token("]")))
    repetition = Sequence(Token("{"), expression, Required(Token("}")))
    oneormore = Sequence(Token("{"), expression, Token("}+"))
    # NOTE(review): the grammar in the docstring delimits a regexchain with
    # ">" ... "<", whereas this parser expects "<" ... ">" -- confirm which
    # of the two is intended.
    regexchain = Sequence(Token("<"), expression, Required(Token(">")))
    group = Sequence(Token("("), expression, Required(Token(")")))
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
    factor = Alternative(Sequence(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Sequence(Optional(flowmarker), literal), Sequence(Optional(flowmarker), regexp),
                         Sequence(Optional(flowmarker), group), Sequence(Optional(flowmarker), regexchain),
                         Sequence(Optional(flowmarker), oneormore), repetition, option)
    term = OneOrMore(factor)
    expression.set(Sequence(term, ZeroOrMore(Sequence(Token("|"), term))))
    directive = Sequence(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
    definition = Sequence(symbol, Required(Token("=")), expression)
    syntax = Sequence(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
    # Entry point of the grammar.
    root__ = syntax


145
def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated. If a file name
        is passed as ``grammar_class`` and the file contains no grammar
        class or no ``source_hash__`` entry, ``True`` is returned as well.
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if not isinstance(grammar_class, str):
        return chksum != grammar_class.source_hash__

    # `grammar_class` is a file name: read the hash from the Python source
    # text instead of importing the module.
    with open(grammar_class, 'r', encoding='utf8') as f:
        pycode = f.read()
    # Raw strings: '\w' and '\(' are not valid escapes in a plain string literal.
    class_match = re.search(r'class \w*\(Grammar\)', pycode)
    if class_match is None:
        return True
    # Only look for the hash after the header of the grammar class.
    hash_match = re.search(r'    source_hash__ *= *"([a-z0-9]*)"',
                           pycode[class_match.end():])
    return hash_match is None or hash_match.group(1) != chksum


176
def get_ebnf_grammar() -> EBNFGrammar:
    """Returns the shared `EBNFGrammar` instance, creating it on first use.

    The instance is kept in the module-level global
    `thread_local_ebnf_grammar_singleton`; as long as that name is not
    bound yet, looking it up raises a `NameError`, which triggers the
    creation of the instance.
    """
    global thread_local_ebnf_grammar_singleton
    try:
        shared = thread_local_ebnf_grammar_singleton
    except NameError:
        shared = EBNFGrammar()
        thread_local_ebnf_grammar_singleton = shared
    return shared


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


193
194
# TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrieval?!?

195
# Processing table for the first `traverse` pass of `EBNFTransformer`.
# Entries are selected with `key_tag_name`; a value is either a single
# transformation function or a list of them.
# NOTE(review): comma-separated keys presumably apply to each of the listed
# names and "*" presumably is the fallback entry -- confirm against `traverse`.
EBNF_transformation_table = {
    # AST Transformations for EBNF-grammar
    "syntax":
        remove_expendables,
    "directive, definition":
        partial(remove_tokens, tokens={'@', '='}),
    "expression":
        [replace_by_single_child, flatten,
         partial(remove_tokens, tokens={'|'})],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_enclosing_delimiters, replace_by_single_child],
    "oneormore, repetition, option, regexchain":
        [reduce_single_child, remove_enclosing_delimiters],
    "symbol, literal, regexp":
        [remove_expendables, reduce_single_child],
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        [remove_expendables, reduce_single_child],
    "list_":
        [flatten, partial(remove_tokens, tokens={','})],
    "*":
        [remove_expendables, replace_by_single_child]
}

222

223
# Processing table for the second `traverse` pass of `EBNFTransformer`,
# which runs after the transformation table has been applied.
EBNF_validation_table = {
    # Semantic validation on the AST
    "repetition, option, oneormore":
        # `forbid` is given the tags of these three constructs as child tags;
        # `assert_content` is given a regex with a negative lookahead for '§'.
        [partial(forbid, child_tags=['repetition', 'option', 'oneormore']),
         partial(assert_content, regex=r'(?!§)')],
}
229

230

231
def EBNFTransformer(syntax_tree: Node):
    """Transforms the syntax tree of an EBNF source in place.

    Two passes are made over `syntax_tree`: the first one applies
    `EBNF_transformation_table`, the second one `EBNF_validation_table`.
    In both passes the table entries are selected with `key_tag_name`.
    """
    passes = (EBNF_transformation_table, EBNF_validation_table)
    for table in passes:
        traverse(syntax_tree, table, key_tag_name)
di68kap's avatar
di68kap committed
235
236


237
def get_ebnf_transformer() -> TransformerFunc:
    """Returns the transformation function for EBNF syntax trees,
    i.e. `EBNFTransformer`.
    """
    transformer = EBNFTransformer
    return transformer
Eckhart Arnold's avatar
Eckhart Arnold committed
239
240
241
242
243
244
245
246


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

247
248
249
250
251
252
253

# Type aliases for argument-less factory functions that return a scanner
# function, a grammar object, a transformer function or a compiler object.
ScannerFactoryFunc = Callable[[], ScannerFunc]
ParserFactoryFunc = Callable[[], Grammar]
TransformerFactoryFunc = Callable[[], TransformerFunc]
CompilerFactoryFunc = Callable[[], Compiler]


254
SCANNER_FACTORY = '''
255
def get_scanner():
256
257
258
259
260
    return {NAME}Scanner
'''


GRAMMAR_FACTORY = '''
261
def get_grammar():
262
263
264
265
266
267
268
269
270
271
272
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


TRANSFORMER_FACTORY = '''
273
def get_transformer():
274
275
276
277
278
    return {NAME}Transform
'''


COMPILER_FACTORY = '''
279
def get_compiler(grammar_name="{NAME}", grammar_source=""):
280
281
282
283
284
285
286
287
288
289
290
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton 
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
291

292
293
294
295
296
297
class EBNFCompilerError(Exception):
    """Exception raised by the `EBNFCompiler` class itself, e.g. when a
    skeleton is requested before the compiler has been run or when a
    construct is not supported. Errors in the compiled EBNF source are not
    raised as exceptions, but attached to the nodes of the syntax tree.
    """


298
class EBNFCompiler(Compiler):
    """Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.
    """
    # Names under which the comment regex and the combined
    # whitespace-and-comment expression are written into the generated
    # grammar class (see `assemble_parser`).
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    # Symbols that must not be defined as rules (checked in `on_definition`),
    # but that count as known symbols in `assemble_parser`.
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}
    # Prefix of the error message used when compiling a node fails with a
    # TypeError (`on_definition`) or an AttributeError (`on_regexp`).
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneuos AST transformation."
    # Maps the prefix operators of a factor to the name of the parser class
    # that is written into the generated code (see `on_factor`).
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    # Named whitespace regexes that can be chosen by name in the
    # `@ whitespace` directive (see `on_directive`).
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}
314

315
    def __init__(self, grammar_name="", grammar_source=""):
        """Passes grammar name and grammar source on to the base class and
        puts the compiler into its initial state.
        """
        super().__init__(grammar_name, grammar_source)
        self._reset()

    def _reset(self):
        """Discards the results of a previous compilation and restores the
        default values of all directives.
        """
        default_directives = {
            'whitespace': self.WHITESPACE['horizontal'],
            'comment': '',
            'literalws': ['right'],
            'tokens': set(),     # alt. 'scanner_tokens'
            'filter': dict(),    # alt. 'retrieve_filter'
        }
        self.directives = default_directives
        self._result = ''           # type: str
        self.root = ""              # type: str
        self.rules = set()          # type: Set[str]
        self.variables = set()      # type: Set[str]
        self.recursive = set()      # type: Set[str]
        self.symbol_nodes = []      # type: List[Node]
        self.definition_names = []  # type: List[str]
332

Eckhart Arnold's avatar
Eckhart Arnold committed
333
    @property
    def result(self) -> str:
        """The code assembled by the last call of `assemble_parser`, or the
        empty string if nothing has been compiled since the last reset.
        """
        generated = self._result
        return generated

337
    def gen_scanner_skeleton(self) -> str:
        """Returns the source code of a scanner function that returns its
        argument unchanged, followed by the `get_scanner()` factory.
        """
        scanner_name = self.grammar_name + "Scanner"
        skeleton = "def %s(text):\n    return text\n" % scanner_name
        factory = SCANNER_FACTORY.format(NAME=self.grammar_name)
        return skeleton + factory
341

342
    def gen_transformer_skeleton(self) -> str:
        """Returns the source code of an AST transformation table with a
        `no_operation` entry for every rule of the compiled grammar,
        followed by the transformer function and its factory.

        Raises:
            EBNFCompilerError: if no definitions are known, i.e. if the
                compiler has not been run before.
        """
        if not self.definition_names:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
        grammar = self.grammar_name
        table_name = grammar + '_AST_transformation_table'
        func_name = grammar + 'Transform'
        lines = [table_name + ' = {',
                 '    # AST Transformations for the ' + grammar + '-grammar']
        lines.extend('    "' + name + '": no_operation,'
                     for name in self.definition_names)
        lines.extend(['    "*": no_operation',
                      '}',
                      '',
                      func_name + ' = partial(traverse, processing_table=%s)' % table_name,
                      '',
                      TRANSFORMER_FACTORY.format(NAME=grammar)])
        return '\n'.join(lines)

358
    def gen_compiler_skeleton(self) -> str:
        """Returns the source code of a compiler class skeleton with one
        method stub for every rule of the compiled grammar, followed by the
        `get_compiler()` factory.

        The stub for the root rule returns the node it receives, all other
        stubs consist of a `pass` statement.

        Raises:
            EBNFCompilerError: if no definitions are known, i.e. if the
                compiler has not been run before.
        """
        if not self.definition_names:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    # The pattern is emitted as a raw string: '\w' and '\Z'
                    # are not valid escape sequences in a plain string
                    # literal, neither here nor in the generated module.
                    "        assert re.match(r'\\w+\\Z', grammar_name)", '']
        for name in self.definition_names:
            method_name = Compiler.derive_method_name(name)
            body = '        return node' if name == self.root else '        pass'
            compiler += ['    def ' + method_name + '(self, node: Node) -> str:',
                         body, '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)
381

382
    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """Assembles the source code of the grammar class from the compiled
        definitions and stores it in `self._result`.

        Parameters:
            definitions:  pairs of (symbol, parser expression), the first
                of which is taken as the root parser. The list is extended
                and reversed in place.
            root_node:  the node whose `error_flag` is set, if a symbol is
                used for which no rule has been defined.

        Returns (str):
            the source code of the grammar class, followed by the
            `get_grammar()` factory function.
        """
        # fix capture of variables that have been defined before usage [sic!]
        if self.variables:
            for i, (symbol, statement) in enumerate(definitions):
                if symbol in self.variables:
                    definitions[i] = (symbol, 'Capture(%s)' % statement)

        self.definition_names = [defn[0] for defn in definitions]
        # The first rule is the root parser. It has to be determined before
        # the whitespace and comment definitions are appended: afterwards
        # `definitions` is never empty, and for a grammar without any rules
        # its first entry would be 'wspR__'.
        self.root = self.definition_names[0] if self.definition_names else ""

        literalws = self.directives['literalws']
        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in literalws else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in literalws else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace="
                             "r'{whitespace}', comment=r'{comment}')").
                            format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        # Spelled as in the `EBNFGrammar` class of this module.
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations.extend(self.grammar_source.split('\n'))
            # drop trailing empty lines of the grammar source
            while declarations[-1].strip() == '':
                declarations.pop()
        declarations.append('"""')

        # add default functions for retrieve_filter filters of pop or retrieve operators

        # for symbol, fun in self.directives['filter']:
        #     declarations.append(symbol + '_filter = lambda value: value.replace("(", ")")'
        #                         '.replace("[", "]").replace("{", "}").replace(">", "<")')

        # turn definitions into declarations in reverse order

        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(self.recursive)]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations.append(symbol + '.set(' + statement + ')')
            else:
                declarations.append(symbol + ' = ' + statement)

        # report symbols that are used, but never defined
        known_symbols = self.rules | self.RESERVED_SYMBOLS
        for nd in self.symbol_nodes:
            if nd.result not in known_symbols:
                nd.add_error("Missing production for symbol '%s'" % nd.result)
                root_node.error_flag = True

        if self.root and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root)
        declarations.append('')
        # the class body is indented by joining the lines with four blanks
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result
448

449
    def on_syntax(self, node: Node) -> str:
        """Compiles all definitions and directives of the grammar and
        returns the assembled source code of the grammar class.
        """
        self._reset()

        # drop the wrapping sequence node
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]

        # compile definitions and directives and collect definitions
        compiled_definitions = []
        for child in node.children:
            if child.parser.name != "definition":
                assert child.parser.name == "directive", child.as_sexpr()
                self._compile(child)
                node.error_flag = node.error_flag or child.error_flag
                continue
            compiled_definitions.append(self._compile(child))

        return self.assemble_parser(compiled_definitions, node)
467

468
469
    def on_definition(self, node: Node) -> Tuple[str, str]:
        """Compiles a single rule and returns the pair (rule name, parser
        expression).

        An unsuitable rule name is reported as an error on the node, but the
        rule is compiled nonetheless. If compiling the right hand side fails
        with a TypeError, ':error' is appended to the rule name and the
        error message is returned as a quoted string in place of the parser
        expression.
        """
        rule = cast(str, node.children[0].result)

        # at most one complaint about the rule name is reported
        if rule in self.rules:
            complaint = 'A rule with name "%s" has already been defined.' % rule
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            complaint = 'Symbol "%s" is a reserved symbol.' % rule
        elif not sane_parser_name(rule):
            complaint = ('Illegal symbol "%s". Symbols must not start or '
                         ' end with a doube underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            complaint = ('Symbol "%s" has already been defined as '
                         'a scanner token.' % rule)
        elif keyword.iskeyword(rule):
            complaint = ('Python keyword "%s" may not be used as a symbol. '
                         % rule + '(This may change in the furute.)')
        else:
            complaint = ''
        if complaint:
            node.add_error(complaint)

        try:
            self.rules.add(rule)
            defn = self._compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
        except TypeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + node.as_sexpr()
            node.add_error(errmsg)
            rule = rule + ':error'
            defn = '"' + errmsg + '"'
        return rule, defn
494
495

    @staticmethod
    def _check_rx(node: Node, rx: str) -> str:
        """Returns the regular expression `rx`, prefixed with the verbose
        flag '(?x)' if it contains a linefeed and does not start with that
        flag already. If the regular expression cannot be compiled, an
        error is added to `node`; the regular expression is returned in
        this case, too.
        """
        if '\n' in rx and not rx.startswith('(?x)'):
            rx = '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

509
510
    def on_directive(self, node: Node) -> str:
        """Evaluates a directive and stores its value in `self.directives`.

        Known directives are 'comment' and 'whitespace' (a regular
        expression, a literal or, for 'whitespace', one of the names in
        `WHITESPACE`), 'literalws', 'tokens' (alias 'scanner_tokens') and
        '<symbol>_filter'. Invalid values and unknown directives are
        reported as errors on the node.

        Returns (str):
            always the empty string, because directives do not contribute
            any code of their own.
        """
        key = cast(str, node.children[0].result).lower()
        assert key not in self.directives['tokens']
        value_node = node.children[1]

        if key in {'comment', 'whitespace'}:
            if value_node.parser.name == "list_":
                if len(value_node.result) != 1:
                    node.add_error('Directive "%s" must have one, but not %i values.' %
                                   (key, len(value_node.result)))
                value = self._compile(value_node).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = cast(str, value_node.result).strip("~")
                if value != cast(str, value_node.result):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace':
                    try:
                        matches_empty = re.match(value, '') is not None
                    except re.error:
                        # A malformed regular expression has already been
                        # reported by `_check_rx`; it must not abort the
                        # compilation with an exception here.
                        matches_empty = True
                    if not matches_empty:
                        node.add_error("Implicit whitespace should always match the empty "
                                       "string, /%s/ does not." % value)
            self.directives[key] = value

        elif key == 'literalws':
            value = {item.lower() for item in self._compile(value_node)}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            if 'both' in value:
                ws = {'left', 'right'}
            elif 'none' in value:
                ws = set()
            else:
                ws = value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'scanner_tokens'}:
            self.directives['tokens'] |= self._compile(value_node)

        elif key.endswith('_filter'):
            filter_set = self._compile(value_node)
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly one symbol, not %s'
                               % (key, str(filter_set)))
            else:
                # Only a valid value is stored: calling `pop()` on anything
                # but a non-empty set would raise an exception.
                self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key,
                            ', '.join(list(self.directives.keys()))))
        return ""

563
    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str] = None) -> str:
        """Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.

        Parameters:
            node:  the node, the children of which are compiled to the
                arguments of the parser.
            parser_class:  name of the parser class in the generated code.
            custom_args:  further arguments (as source code) that are
                appended after the compiled children; `None` stands for
                no further arguments.

        Returns (str):
            the call of the parser class as source code, e.g.
            ``Sequence(a, b)``.
        """
        # `None` instead of a list as default value avoids a mutable default
        # that would be shared between calls.
        extra_args = [] if custom_args is None else custom_args
        arguments = [self._compile(r) for r in node.children] + extra_args
        return parser_class + '(' + ', '.join(arguments) + ')'

570
    def on_expression(self, node) -> str:
        """Compiles an expression to an `Alternative` of its children."""
        code = self.non_terminal(node, 'Alternative')
        return code

573
    def on_term(self, node) -> str:
        """Compiles a term to a `Sequence` of its children."""
        code = self.non_terminal(node, 'Sequence')
        return code

576
    def on_factor(self, node: Node) -> str:
        """Compiles a factor that carries a prefix (flow marker or retrieve
        operator) to a call of the parser class that `PREFIX_TABLE` assigns
        to the prefix.

        Returns (str):
            the parser call as source code. If a retrieve operator is not
            followed by a symbol, the error is reported on the node and the
            result of the operand is returned as string; if the prefix is
            unknown, the error is reported and the empty string is returned.
        """
        assert node.children
        assert len(node.children) >= 2, node.as_sexpr()
        prefix = cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                custom_args = ['retrieve_filter=%s' % self.directives['filter'][str(arg)]]
            self.variables.add(cast(str, arg.result))

        elif len(node.children) > 2:
            # More than one operand follows the prefix: the second child is
            # turned into a node with the parser of this factor that contains
            # a copy of itself and all following children, so that exactly
            # one operand is left.
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        # strip the prefix, so that only the operand gets compiled
        node.result = node.children[1:]
        # Only the table lookup is guarded: a KeyError that is raised while
        # the operand is compiled must not be reported as an unknown prefix.
        try:
            parser_class = self.PREFIX_TABLE[prefix]
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
            return ""
        return self.non_terminal(node, parser_class, custom_args)
608

609
    def on_option(self, node) -> str:
        """Compiles an option to an `Optional` parser."""
        code = self.non_terminal(node, 'Optional')
        return code

612
    def on_repetition(self, node) -> str:
        """Compiles a repetition to a `ZeroOrMore` parser."""
        code = self.non_terminal(node, 'ZeroOrMore')
        return code

615
    def on_oneormore(self, node) -> str:
        """Compiles a one-or-more repetition to a `OneOrMore` parser."""
        code = self.non_terminal(node, 'OneOrMore')
        return code

618
    def on_regexchain(self, node) -> str:
        """Regex chains cannot be compiled yet.

        Raises:
            EBNFCompilerError: always.
        """
        message = "Not yet implemented!"
        raise EBNFCompilerError(message)

621
    def on_group(self, node) -> str:
        """Group nodes are not expected in the tree that is compiled.

        Raises:
            EBNFCompilerError: always.
        """
        message = ("Group nodes should have been eliminated by "
                   "AST transformation!")
        raise EBNFCompilerError(message)

625
626
627
628
    def on_symbol(self, node: Node) -> str:
        """Compiles a symbol: symbols that have been declared as scanner
        tokens yield a `ScannerToken` call, all others are returned as they
        are. Nodes of the latter kind are recorded in `self.symbol_nodes`,
        and the symbol is marked as recursive if a rule of that name is
        already known.
        """
        symbol = cast(str, node.result)
        if symbol in self.directives['tokens']:
            return 'ScannerToken("' + symbol + '")'
        self.symbol_nodes.append(node)
        if symbol in self.rules:
            self.recursive.add(symbol)
        return symbol
634

635
636
    def on_literal(self, node) -> str:
        """Compiles a literal to a `Token` call. The literal is inserted
        including its quotation marks, with every backslash doubled.
        """
        literal = cast(str, node.result)
        escaped = literal.replace('\\', r'\\')
        return 'Token(' + escaped + ')'  # return 'Token(' + ', '.join([node.result]) + ')' ?
637

638
639
640
    def on_regexp(self, node: Node) -> str:
        """Compiles a regular expression to an `RE` call.

        The whitespace markers '~' before and after the regular expression
        are removed. The arguments `wL` and `wR` are only added where the
        marker deviates from the `literalws` directive.
        """
        rx = cast(str, node.result)
        literalws = self.directives['literalws']
        ws_args = []   # type: List[str]

        if rx.startswith('~/'):
            if 'left' not in literalws:
                ws_args.insert(0, 'wL=' + self.WHITESPACE_KEYWORD)
            rx = rx[1:]
        elif 'left' in literalws:
            ws_args.insert(0, "wL=''")

        if rx.endswith('/~'):
            if 'right' not in literalws:
                ws_args.insert(0, 'wR=' + self.WHITESPACE_KEYWORD)
            rx = rx[:-1]
        elif 'right' in literalws:
            ws_args.insert(0, "wR=''")

        try:
            # strip the enclosing slashes and unescape slashes in between
            pattern = rx[1:-1].replace(r'\/', '/')
            arg = repr(self._check_rx(node, pattern))
        except AttributeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + \
                     node.as_sexpr()
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return 'RE(' + ', '.join([arg] + ws_args) + ')'

662
    def on_list_(self, node) -> Set[str]:
        """Returns the items of a list as a set of strings without
        surrounding whitespace.
        """
        assert node.children
        return {item.result.strip() for item in node.children}
665
666


667
def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
    """Returns the shared `EBNFCompiler` instance, creating it on first use.

    The instance is kept in the module-level global
    `thread_local_ebnf_compiler_singleton`. An instance that exists already
    is assigned `grammar_name` and `grammar_source` via `set_grammar_name`;
    a new instance receives both as constructor arguments.
    """
    global thread_local_ebnf_compiler_singleton
    try:
        shared = thread_local_ebnf_compiler_singleton
        shared.set_grammar_name(grammar_name, grammar_source)
    except NameError:
        shared = EBNFCompiler(grammar_name, grammar_source)
        thread_local_ebnf_compiler_singleton = shared
    return shared