"""ebnf.py - EBNF -> Python-Parser compilation for DHParser

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

from collections import OrderedDict
import keyword

try:
    import regex as re
except ImportError:
    import re
from typing import Callable, Dict, List, Set, Tuple

from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
from DHParser.parsers import Grammar, mixin_comment, nil_scanner, Forward, RE, NegativeLookahead, \
    Alternative, Sequence, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
    ScannerFunc
from DHParser.syntaxtree import Node, traverse, remove_enclosing_delimiters, reduce_single_child, \
    replace_by_single_child, TOKEN_PTYPE, remove_expendables, remove_tokens, flatten, \
    forbid, assert_content, WHITESPACE_PTYPE, key_tag_name, TransformationFunc
from DHParser.versionnumber import __version__


__all__ = ['get_ebnf_scanner',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
           'EBNFTransformer',
           'EBNFCompilerError',
           'EBNFCompiler',
           'grammar_changed',
           'ScannerFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc']


########################################################################
#
# EBNF scanning
#
########################################################################


def get_ebnf_scanner() -> ScannerFunc:
    return nil_scanner


########################################################################
#
# EBNF parsing
#
########################################################################

# TODO: Introduce dummy/rename-parser for simple assignments (e.g. jahr = JAHRESZAHL) or substitutions!
# TODO: Raise Error for unconnected parsers!
class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] regexchain
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    regexchain =  ">" expression §"<"                # compiles "expression" into a singular regular expression
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
    EOF =  !/./
    """
    expression = Forward()
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
    wspL__ = ''
    wspR__ = WSP__
    EOF = NegativeLookahead(RE('.', wR=''))
    list_ = Sequence(RE('\\w+'), ZeroOrMore(Sequence(Token(","), RE('\\w+'))))
    regexp = RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Sequence(Token("["), expression, Required(Token("]")))
    repetition = Sequence(Token("{"), expression, Required(Token("}")))
    oneormore = Sequence(Token("{"), expression, Token("}+"))
    regexchain = Sequence(Token("<"), expression, Required(Token(">")))
    group = Sequence(Token("("), expression, Required(Token(")")))
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
    factor = Alternative(Sequence(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Sequence(Optional(flowmarker), literal), Sequence(Optional(flowmarker), regexp),
                         Sequence(Optional(flowmarker), group), Sequence(Optional(flowmarker), regexchain),
                         Sequence(Optional(flowmarker), oneormore), repetition, option)
    term = OneOrMore(factor)
    expression.set(Sequence(term, ZeroOrMore(Sequence(Token("|"), term))))
    directive = Sequence(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
    definition = Sequence(symbol, Required(Token("=")), expression)
    syntax = Sequence(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
    root__ = syntax
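
# Illustrative usage sketch (added for clarity, not part of the original
# module): Grammar instances are callable with the source text and return the
# concrete syntax tree as a Node. The tiny two-line grammar is made up for
# this example.
#
#     ebnf_grammar = get_ebnf_grammar()
#     syntax_tree = ebnf_grammar('word = /\\w+/~\nsentence = { word }+\n')
#     assert not syntax_tree.error_flag, syntax_tree.as_sexpr()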


def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
        m = re.search(r'class \w*\(Grammar\)', pycode)
        if m:
            m = re.search('    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            return True
    else:
        return chksum != grammar_class.source_hash__
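
# Illustrative sketch (assumption, not from the original source): a typical
# use of grammar_changed() is to check whether a previously generated parser
# module is still up to date with its EBNF source. The file names below are
# hypothetical.
#
#     if grammar_changed('ArithmeticCompiler.py', 'Arithmetic.ebnf'):
#         pass  # re-run the EBNF compiler to regenerate ArithmeticCompiler.py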


def get_ebnf_grammar() -> EBNFGrammar:
    global thread_local_ebnf_grammar_singleton
    try:
        grammar = thread_local_ebnf_grammar_singleton
        return grammar
    except NameError:
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
        return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


# TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrieval?!?

EBNF_transformation_table = {
    # AST Transformations for EBNF-grammar
    "syntax":
        remove_expendables,
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_enclosing_delimiters, replace_by_single_child],
    "oneormore, repetition, option, regexchain":
        [reduce_single_child, remove_enclosing_delimiters],
    "symbol, literal, regexp":
        [remove_expendables, reduce_single_child],
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        [remove_expendables, reduce_single_child],
    "list_":
        [flatten, remove_tokens(',')],
    "*":
        [remove_expendables, replace_by_single_child]
}


EBNF_validation_table = {
    # Semantic validation on the AST. EXPERIMENTAL!
    "repetition, option, oneormore":
        [forbid('repetition', 'option', 'oneormore'),
         assert_content(r'(?!§)')]
}


def EBNFTransformer(syntax_tree: Node):
    for processing_table, key_func in [(EBNF_transformation_table, key_tag_name),
                                       (EBNF_validation_table, key_tag_name)]:
        traverse(syntax_tree, processing_table, key_func)


def get_ebnf_transformer() -> TransformationFunc:
    return EBNFTransformer
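
# Illustrative sketch (added for clarity): the transformer works in place on
# the concrete syntax tree delivered by EBNFGrammar and leaves the condensed
# AST behind.
#
#     syntax_tree = get_ebnf_grammar()('word = /\\w+/~\n')
#     get_ebnf_transformer()(syntax_tree)   # removes tokens/whitespace, flattens nested nodes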


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################


ScannerFactoryFunc = Callable[[], ScannerFunc]
ParserFactoryFunc = Callable[[], Grammar]
TransformerFactoryFunc = Callable[[], TransformationFunc]
CompilerFactoryFunc = Callable[[], Compiler]


SCANNER_FACTORY = '''
def get_scanner() -> ScannerFunc:
    return {NAME}Scanner
'''


GRAMMAR_FACTORY = '''
def get_grammar() -> {NAME}Grammar:
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


TRANSFORMER_FACTORY = '''
def get_transformer() -> TransformationFunc:
    return {NAME}Transform
'''


COMPILER_FACTORY = '''
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton 
'''
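
# Note (added for clarity): the *_FACTORY strings above are templates that
# EBNFCompiler appends to the generated source via str.format(). For a
# hypothetical grammar name, GRAMMAR_FACTORY.format(NAME="Arithmetic") yields
# a get_grammar() function that lazily creates an ArithmeticGrammar instance
# and caches it in a global singleton variable.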


class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
    in the strict sense, see `CompilationError` in module ``dsl.py``)"""
    pass


class EBNFCompiler(Compiler):
    """Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.
    """
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneous AST transformation."
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}

    def __init__(self, grammar_name="", grammar_source=""):
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
        self._reset()

    def _reset(self):
        self._result = ''           # type: str
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
        self.variables = set()      # type: Set[str]
        # self.definition_names = []  # type: List[str]
        self.recursive = set()      # type: Set[str]
        self.root = ""              # type: str
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
                           'comment': '',
                           'literalws': ['right'],
                           'tokens': set(),  # alt. 'scanner_tokens'
                           'filter': dict()}  # alt. 'filter'

    @property
    def result(self) -> str:
        return self._result

    def gen_scanner_skeleton(self) -> str:
        name = self.grammar_name + "Scanner"
        return "def %s(text):\n    return text\n" % name \
               + SCANNER_FACTORY.format(NAME=self.grammar_name)

    def gen_transformer_skeleton(self) -> str:
        if not self.rules:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_skeleton()"!')
        tt_name = self.grammar_name + '_AST_transformation_table'
        tf_name = self.grammar_name + 'Transform'
        transtable = [tt_name + ' = {',
                      '    # AST Transformations for the ' +
                      self.grammar_name + '-grammar']
        for name in self.rules:
            transtable.append('    "' + name + '": no_transformation,')
        transtable += ['    "*": no_transformation', '}', '', tf_name +
                       ' = partial(traverse, processing_table=%s)' % tt_name, '']
        transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(transtable)

    def gen_compiler_skeleton(self) -> str:
        if not self.rules:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_compiler_skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    r"        assert re.match('\w+\Z', grammar_name)", '']
        for name in self.rules:
            method_name = Compiler.derive_method_name(name)
            if name == self.root:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        return node', '']
            else:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        pass', '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)
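
    # Illustrative sketch (added for clarity): the skeleton generators may only
    # be called after the compiler has run, because they iterate over
    # self.rules. The grammar name below is hypothetical, and the call
    # `compiler(ast)` assumes that Compiler instances are invoked directly on
    # the transformed syntax tree.
    #
    #     compiler = get_ebnf_compiler("Arithmetic", ebnf_source)
    #     parser_py = compiler(ast)
    #     transformer_stub = compiler.gen_transformer_skeleton()
    #     compiler_stub = compiler.gen_compiler_skeleton()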

    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        # fix capture of variables that have been defined before usage [sic!]

        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])

        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in self.directives['literalws'] else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in self.directives['literalws'] else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace="
                             "r'{whitespace}', comment=r'{comment}')").
                            format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations += [line for line in self.grammar_source.split('\n')]
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order

        self.root = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
                root_node.error_flag = True

        # check for unconnected rules

        defined_symbols.difference_update(self.RESERVED_SYMBOLS)

        def remove_connections(symbol):
            if symbol in defined_symbols:
                defined_symbols.remove(symbol)
                for related in self.rules[symbol][1:]:
                    remove_connections(str(related))
        remove_connections(self.root)
        for leftover in defined_symbols:
            self.rules[leftover][0].add_error(('Rule "%s" is not connected to parser '
                                               'root "%s"') % (leftover, self.root))

        # set root parser and assemble python grammar definition

        if self.root and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result

    def on_syntax(self, node: Node) -> str:
        self._reset()
        definitions = []

        # drop the wrapping sequence node
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]

        # compile definitions and directives and collect definitions
        for nd in node.children:
            if nd.parser.name == "definition":
                definitions.append(self._compile(nd))
            else:
                assert nd.parser.name == "directive", nd.as_sexpr()
                self._compile(nd)
                node.error_flag = node.error_flag or nd.error_flag

        return self.assemble_parser(definitions, node)

    def on_definition(self, node: Node) -> Tuple[str, str]:
        rule = str(node.children[0])
        if rule in self.rules:
            node.add_error('A rule with name "%s" has already been defined.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           'end with a double underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            node.add_error('Symbol "%s" has already been defined as '
                           'a scanner token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the future.)')
        try:
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
            defn = self._compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
        except TypeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + node.as_sexpr()
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
        return rule, defn
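
    # Example (added for illustration): with the default directives, a
    # definition like `word = /\w+/~` compiles to roughly ('word', "RE('\\w+')"),
    # while a plain renaming such as `jahr = JAHRESZAHL` (see the TODO note
    # near the top of this module) becomes ('jahr', 'Synonym(JAHRESZAHL)').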

    @staticmethod
    def _check_rx(node: Node, rx: str) -> str:
        """Checks whether the string `rx` represents a valid regular
        expression. Makes sure that regular expressions spanning several
        lines are prefixed with the verbose-flag '(?x)'. Returns the regular
        expression string.
        """
        rx = rx if rx.find('\n') < 0 or rx[0:4] == '(?x)' else '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

    def on_directive(self, node: Node) -> str:
        key = str(node.children[0]).lower()  # cast(str, node.children[0].result).lower()
        assert key not in self.directives['tokens']
        if key in {'comment', 'whitespace'}:
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
                    node.add_error('Directive "%s" must have exactly one value, not %i.' %
                                   (key, len(node.children[1].result)))
                value = self._compile(node.children[1]).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
            self.directives[key] = value

        elif key == 'literalws':
            value = {item.lower() for item in self._compile(node.children[1])}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            ws = {'left', 'right'} if 'both' in value \
                else {} if 'none' in value else value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'scanner_tokens'}:
            self.directives['tokens'] |= self._compile(node.children[1])

        elif key.endswith('_filter'):
            filter_set = self._compile(node.children[1])
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly one symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s! (Known ones are %s.)' %
                           (key,
                            ', '.join(list(self.directives.keys()))))
        return ""

    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str]=[]) -> str:
        """Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.
        """
        arguments = [self._compile(r) for r in node.children] + custom_args
        return parser_class + '(' + ', '.join(arguments) + ')'

    def on_expression(self, node) -> str:
        return self.non_terminal(node, 'Alternative')

    def on_term(self, node) -> str:
        return self.non_terminal(node, 'Sequence')

    def on_factor(self, node: Node) -> str:
        assert node.children
        assert len(node.children) >= 2, node.as_sexpr()
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
            self.variables.add(str(arg))  # cast(str, arg.result)

        elif len(node.children) > 2:
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        node.result = node.children[1:]
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            return self.non_terminal(node, parser_class, custom_args)
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
        return ""

    def on_option(self, node) -> str:
        return self.non_terminal(node, 'Optional')

    def on_repetition(self, node) -> str:
        return self.non_terminal(node, 'ZeroOrMore')

    def on_oneormore(self, node) -> str:
        return self.non_terminal(node, 'OneOrMore')

    def on_regexchain(self, node) -> str:
        raise EBNFCompilerError("Not yet implemented!")

    def on_group(self, node) -> str:
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")

    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
            return 'ScannerToken("' + symbol + '")'
        else:
            self.current_symbols.append(node)
            if symbol not in self.symbols:
                self.symbols[symbol] = node
            if symbol in self.rules:
                self.recursive.add(symbol)
            return symbol

    def on_literal(self, node) -> str:
        return 'Token(' + str(node).replace('\\', r'\\') + ')'  # return 'Token(' + ', '.join([node.result]) + ')' ?

    def on_regexp(self, node: Node) -> str:
        rx = str(node)
        name = []   # type: List[str]
        if rx[:2] == '~/':
            if 'left' not in self.directives['literalws']:
                name = ['wL=' + self.WHITESPACE_KEYWORD] + name
            rx = rx[1:]
        elif 'left' in self.directives['literalws']:
            name = ["wL=''"] + name
        if rx[-2:] == '/~':
            if 'right' not in self.directives['literalws']:
                name = ['wR=' + self.WHITESPACE_KEYWORD] + name
            rx = rx[:-1]
        elif 'right' in self.directives['literalws']:
            name = ["wR=''"] + name
        try:
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + \
                     node.as_sexpr()
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return 'RE(' + ', '.join([arg] + name) + ')'
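
    # Example (added for illustration): with the default literalws setting
    # ('right'), `/\d+/~` compiles to RE('\\d+') and relies on the grammar-wide
    # wspR__ default, `/\d+/` compiles to RE('\\d+', wR='') to suppress that
    # default, and `~/\d+/` compiles to RE('\\d+', wR='', wL=WSP__).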

    def on_list_(self, node) -> Set[str]:
        assert node.children
        return set(item.result.strip() for item in node.children)


def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
    global thread_local_ebnf_compiler_singleton
    try:
        compiler = thread_local_ebnf_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_ebnf_compiler_singleton = EBNFCompiler(grammar_name, grammar_source)
        return thread_local_ebnf_compiler_singleton
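

# Illustrative end-to-end sketch (assumption, not from the original source):
# how the factory functions of this module fit together to turn an EBNF
# grammar into Python parser source code. `ebnf_source` stands for a string
# containing an EBNF grammar, and calling the compiler directly on the tree
# assumes that Compiler instances are callable.
#
#     source = get_ebnf_scanner()(ebnf_source)     # scanning (a no-op here)
#     syntax_tree = get_ebnf_grammar()(source)     # parsing
#     get_ebnf_transformer()(syntax_tree)          # CST -> AST, in place
#     compiler = get_ebnf_compiler("MyDSL", ebnf_source)
#     parser_py = compiler(syntax_tree)            # Python source of the new parser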