"""ebnf.py - EBNF -> Python-Parser compilation for DHParser
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences an Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20
21
from collections import OrderedDict

22
23
24
25
try:
    import regex as re
except ImportError:
    import re
26
27
28
29
try:
    from typing import Callable, Dict, List, Set, Tuple
except ImportError:
    from .typing34 import Callable, Dict, List, Set, Tuple
30

31
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
32
from DHParser.parsers import Grammar, mixin_comment, nil_scanner, Forward, RE, NegativeLookahead, \
33
    Alternative, Series, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
34
    ScannerFunc
35
36
37
38
from DHParser.syntaxtree import Node, traverse, remove_brackets, \
    reduce_single_child, replace_by_single_child, TOKEN_PTYPE, remove_expendables, \
    remove_tokens, flatten, forbid, assert_content, WHITESPACE_PTYPE, key_tag_name, \
    TransformationFunc
39
from DHParser.versionnumber import __version__
40
41


42
43
44
45
46
# Public API of this module: the four factory functions, the EBNF parser,
# transformer and compiler classes, the grammar-staleness check, and the
# type aliases for factory functions of generated compiler suites.
__all__ = ['get_ebnf_scanner',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
           'EBNFTransformer',
           'EBNFCompilerError',
           'EBNFCompiler',
           'grammar_changed',
           'ScannerFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc']
55
56


Eckhart Arnold's avatar
Eckhart Arnold committed
57
58
59
60
61
62
63
########################################################################
#
# EBNF scanning
#
########################################################################


64
def get_ebnf_scanner() -> ScannerFunc:
    """Returns the scanner (i.e. preprocessor) for EBNF source files.

    EBNF sources are parsed as they are, so this is simply the neutral
    ``nil_scanner`` from ``DHParser.parsers``.
    """
    return nil_scanner


########################################################################
#
# EBNF parsing
#
########################################################################

74
75
# TODO: Introduce dummy/rename-parser, for simple assignments (e.g. jahr = JAHRESZAHL) or substition!
# TODO: Raise Error for unconnected parsers!
76
class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] regexchain
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an exmaple
    EOF =  !/./
    """
    # NOTE(review): the class body below appears to be auto-generated from the
    # grammar in the docstring (cf. source_hash__ and grammar_changed()) --
    # prefer regenerating it over hand-editing.

    # forward declaration for the recursive 'expression' rule
    expression = Forward()
    # md5 checksum of the EBNF source this class was generated from
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
    parser_initialization__ = "upon instantiation"
    # implicit comment and whitespace regexes ('@ comment' / '@ whitespace')
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
    # '@ literalws = right': literals eat only trailing whitespace
    wspL__ = ''
    wspR__ = WSP__
    # one parser object per grammar rule, in bottom-up (reverse) order
    EOF = NegativeLookahead(RE('.', wR=''))
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
    regexp = RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Series(Token("["), expression, Required(Token("]")))
    repetition = Series(Token("{"), expression, Required(Token("}")))
    oneormore = Series(Token("{"), expression, Token("}+"))
    group = Series(Token("("), expression, Required(Token(")")))
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
    factor = Alternative(Series(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Series(Optional(flowmarker), literal), Series(Optional(flowmarker), regexp),
                         Series(Optional(flowmarker), group), Series(Optional(flowmarker), oneormore),
                         repetition, option)
    term = OneOrMore(factor)
    expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
    directive = Series(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
    definition = Series(symbol, Required(Token("=")), expression)
    syntax = Series(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
    root__ = syntax


149
def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
        # raw strings: '\w' and '\(' are invalid escape sequences in plain
        # string literals (SyntaxWarning on modern Python versions)
        m = re.search(r'class \w*\(Grammar\)', pycode)
        if m:
            # search only *after* the class header for its source hash
            m = re.search(r'    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            # no Grammar class found at all: treat as changed
            return True
    else:
        return chksum != grammar_class.source_hash__


180
def get_ebnf_grammar() -> EBNFGrammar:
    """Returns the lazily created module-level singleton instance of
    ``EBNFGrammar``."""
    global thread_local_ebnf_grammar_singleton
    if 'thread_local_ebnf_grammar_singleton' not in globals():
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
    return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


197
198
#TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrival?!?

199
# AST transformation table for EBNF syntax trees. Keys are comma separated
# node (tag) names; "+" presumably runs on every node and "*" is the
# fallback entry -- confirm against DHParser.syntaxtree.traverse().
EBNF_transformation_table = {
    # AST Transformations for EBNF-grammar
    "+":
        remove_expendables,
    "syntax":
        [],  # otherwise '"*": replace_by_single_child' would be applied
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_tokens('(', ')'), replace_by_single_child],
    "oneormore, repetition, option":
        [reduce_single_child, remove_brackets],
    "symbol, literal, regexp":
        reduce_single_child,
    # anonymous token and whitespace nodes
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        reduce_single_child,
    "list_":
        [flatten, remove_tokens(',')],
    "*":
        replace_by_single_child
}

227

228
# Semantic checks applied as a second traversal pass (see EBNFTransformer):
# nested repetitions/options are forbidden, and the '§' (Required) marker
# must not appear inside them.
EBNF_validation_table = {
    # Semantic validation on the AST. EXPERIMENTAL!
    "repetition, option, oneormore":
        [forbid('repetition', 'option', 'oneormore'),
         assert_content(r'(?!§)')]
}
234

235

236
def EBNFTransformer(syntax_tree: Node):
    """Transforms and validates the syntax tree of an EBNF source in place:
    first the AST transformations, then the semantic validation pass."""
    for table in (EBNF_transformation_table, EBNF_validation_table):
        traverse(syntax_tree, table, key_tag_name)


242
def get_ebnf_transformer() -> TransformationFunc:
    """Factory: returns the transformation function for EBNF syntax trees."""
    return EBNFTransformer
Eckhart Arnold's avatar
Eckhart Arnold committed
244
245
246
247
248
249
250
251


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

252
253
254

ScannerFactoryFunc = Callable[[], ScannerFunc]
ParserFactoryFunc = Callable[[], Grammar]
255
TransformerFactoryFunc = Callable[[], TransformationFunc]
256
257
258
CompilerFactoryFunc = Callable[[], Compiler]


259
SCANNER_FACTORY = '''
260
def get_scanner() -> ScannerFunc:
261
262
263
264
265
    return {NAME}Scanner
'''


GRAMMAR_FACTORY = '''
266
def get_grammar() -> {NAME}Grammar:
267
268
269
270
271
272
273
274
275
276
277
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


TRANSFORMER_FACTORY = '''
278
def get_transformer() -> TransformationFunc:
279
280
281
282
283
    return {NAME}Transform
'''


COMPILER_FACTORY = '''
284
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
285
286
287
288
289
290
291
292
293
294
295
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton 
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
296

297
298
class EBNFCompilerError(Exception):
    """Exception raised by the `EBNFCompiler` class itself. (These are not
    compilation errors in the strict sense; for those see `CompilationError`
    in module ``dsl.py``.)"""


303
class EBNFCompiler(Compiler):
    """Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.

    The compiler walks the AST produced by `EBNFGrammar` (after AST
    transformation) and returns the Python source code of a Grammar class
    (see `assemble_parser`).
    """

    # variable names of the comment- and whitespace-regexes that are emitted
    # into the generated parser class
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    # symbols that user-defined grammar rules must not redefine
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneous AST transformation."
    # maps EBNF prefix operators onto DHParser parser-class names
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    # named whitespace regexes selectable via the '@ whitespace' directive
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}

    def __init__(self, grammar_name="", grammar_source=""):
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
        self._reset()

    def _reset(self):
        """Clears all state accumulated during a previous compilation run."""
        self._result = ''           # type: str
        # rule name -> [definition node, referenced symbol nodes...]
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        # symbol name -> first node where it was used
        self.symbols = {}           # type: Dict[str, Node]
        # symbols captured via the '::'/':' retrieve operators
        self.variables = set()      # type: Set[str]
        # self.definition_names = []  # type: List[str]
        # symbols referenced before their definition (need Forward())
        self.recursive = set()      # type: Set[str]
        self.root = ""              # type: str
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
                           'comment': '',
                           'literalws': ['right'],
                           'tokens': set(),     # alt. 'scanner_tokens'
                           'filter': dict(),    # alt. 'filter'
                           'testing': False}

    @property
    def result(self) -> str:
        """The Python source code produced by the last compilation run."""
        return self._result

    def gen_scanner_skeleton(self) -> str:
        """Returns Python source for a do-nothing scanner function plus the
        matching scanner factory, to be filled out by hand if needed."""
        name = self.grammar_name + "Scanner"
        return "def %s(text):\n    return text\n" % name \
               + SCANNER_FACTORY.format(NAME=self.grammar_name)

    def gen_transformer_skeleton(self) -> str:
        """Returns Python source for a skeleton AST-transformation-table
        (one empty entry per grammar rule) plus its factory function.

        Raises:
            EBNFCompilerError: if called before a grammar has been compiled.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
        tt_name = self.grammar_name + '_AST_transformation_table'
        tf_name = self.grammar_name + 'Transform'
        transtable = [tt_name + ' = {',
                      '    # AST Transformations for the ' +
                      self.grammar_name + '-grammar']
        transtable.append('    "+": remove_empty,')
        for name in self.rules:
            transtable.append('    "' + name + '": [],')
        transtable.append('    ":Token, :RE": reduce_single_child,')
        transtable += ['    "*": replace_by_single_child', '}', '', tf_name +
                       ' = partial(traverse, processing_table=%s)' % tt_name, '']
        transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(transtable)

    def gen_compiler_skeleton(self) -> str:
        """Returns Python source for a skeleton Compiler class with one
        (empty) ``on_...`` method per grammar rule, plus its factory.

        Raises:
            EBNFCompilerError: if called before a grammar has been compiled.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    # raw string: '\w' and '\Z' are invalid escape sequences
                    # in a plain literal; the emitted code is unchanged
                    r"        assert re.match('\w+\Z', grammar_name)", '']
        for name in self.rules:
            method_name = Compiler.method_name(name)
            if name == self.root:
                # the root rule's compiler method returns the node unchanged
                compiler += ['    def ' + method_name + '(self, node):',
                             '        return node', '']
            else:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        pass', '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)

    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """Assembles the Python source of a Grammar class from the compiled
        `definitions` (list of (symbol, python-expression) pairs), reporting
        missing-definition and unconnected-rule errors on `root_node`.
        """
        # fix capture of variables that have been defined before usage [sic!]
        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])

        # add the implicit whitespace and comment definitions
        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in self.directives['literalws'] else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in self.directives['literalws'] else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace="
                             "r'{whitespace}', comment=r'{comment}')").
                            format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class
        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations += [line for line in self.grammar_source.split('\n')]
            # strip trailing blank lines of the embedded grammar source
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order
        self.root = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]

        # check for symbols used but never defined
        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
                root_node.error_flag = True

        # check for unconnected rules
        if not self.directives['testing']:
            defined_symbols.difference_update(self.RESERVED_SYMBOLS)

            def remove_connections(symbol):
                # recursively unmark every rule reachable from `symbol`
                if symbol in defined_symbols:
                    defined_symbols.remove(symbol)
                    for related in self.rules[symbol][1:]:
                        remove_connections(str(related))

            remove_connections(self.root)
            for leftover in defined_symbols:
                self.rules[leftover][0].add_error(('Rule "%s" is not connected to parser '
                    'root "%s" !') % (leftover, self.root) + ' (Use directive "@testing=True" '
                    'to suppress this error message.)')

        # set root parser and assemble python grammar definition
        if self.root and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result

    def on_syntax(self, node: Node) -> str:
        """Compiles the root node of an EBNF-AST into Grammar-class source."""
        self._reset()
        definitions = []

        # drop the wrapping sequence node
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]

        # compile definitions and directives and collect definitions
        for nd in node.children:
            if nd.parser.name == "definition":
                definitions.append(self._compile(nd))
            else:
                assert nd.parser.name == "directive", nd.as_sxpr()
                self._compile(nd)
                node.error_flag = node.error_flag or nd.error_flag

        return self.assemble_parser(definitions, node)

    def on_definition(self, node: Node) -> Tuple[str, str]:
        """Compiles a single rule definition, registering it in self.rules.

        Returns a (rule-name, python-expression) pair. Note that compilation
        still proceeds when one of the sanity checks below flags an error.
        """
        rule = str(node.children[0])
        if rule in self.rules:
            node.add_error('A rule with name "%s" has already been defined.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           'end with a double underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            node.add_error('Symbol "%s" has already been defined as '
                           'a scanner token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the future.)')
        try:
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
            defn = self._compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
        except TypeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + node.as_sxpr()
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
        return rule, defn

    @staticmethod
    def _check_rx(node: Node, rx: str) -> str:
        """Checks whether the string `rx` represents a valid regular
        expression. Makes sure that multiline regular expressions are
        prepended by the multiline-flag. Returns the regular expression string.
        """
        rx = rx if rx.find('\n') < 0 or rx[0:4] == '(?x)' else '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

    def on_directive(self, node: Node) -> str:
        """Processes an '@ key = value' directive, updating self.directives.
        Always returns the empty string: directives produce no parser code."""
        key = str(node.children[0]).lower()
        assert key not in self.directives['tokens']

        if key in {'comment', 'whitespace'}:
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
                    node.add_error('Directive "%s" must have one, but not %i values.' %
                                   (key, len(node.children[1].result)))
                value = self._compile(node.children[1]).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
            self.directives[key] = value

        elif key == 'testing':
            value = str(node.children[1])
            self.directives['testing'] = value.lower() not in {"off", "false", "no"}

        elif key == 'literalws':
            value = {item.lower() for item in self._compile(node.children[1])}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            # was '{}' (an empty dict, not a set); behavior is unchanged
            # because only list(ws) is taken below, but set() is what's meant
            ws = {'left', 'right'} if 'both' in value \
                else set() if 'none' in value else value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'scanner_tokens'}:
            self.directives['tokens'] |= self._compile(node.children[1])

        elif key.endswith('_filter'):
            filter_set = self._compile(node.children[1])
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly one symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key,
                            ', '.join(list(self.directives.keys()))))
        return ""

    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str] = None) -> str:
        """Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.

        `custom_args` are additional argument strings appended to the
        generated constructor call. (The default was changed from a mutable
        `[]` to `None` to avoid the shared-mutable-default pitfall.)
        """
        custom_args = [] if custom_args is None else custom_args
        arguments = [self._compile(r) for r in node.children] + custom_args
        return parser_class + '(' + ', '.join(arguments) + ')'

    def on_expression(self, node) -> str:
        """Compiles an alternative ('|') expression."""
        return self.non_terminal(node, 'Alternative')

    def on_term(self, node) -> str:
        """Compiles a sequence of factors."""
        return self.non_terminal(node, 'Series')

    def on_factor(self, node: Node) -> str:
        """Compiles a prefixed factor ('§', '&', '!', '-&', '-!', '::', ':')
        into the corresponding parser-class call (see PREFIX_TABLE)."""
        assert node.children
        assert len(node.children) >= 2, node.as_sxpr()
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            # Pop/Retrieve operators require a plain symbol as their argument
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
            self.variables.add(str(arg))  # cast(str, arg.result)

        elif len(node.children) > 2:
            # more than one argument: fold the tail into a single child node
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        # strip the prefix token before compiling the argument(s)
        node.result = node.children[1:]
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            return self.non_terminal(node, parser_class, custom_args)
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
        return ""

    def on_option(self, node) -> str:
        """Compiles an optional expression '[ ... ]'."""
        return self.non_terminal(node, 'Optional')

    def on_repetition(self, node) -> str:
        """Compiles a zero-or-more repetition '{ ... }'."""
        return self.non_terminal(node, 'ZeroOrMore')

    def on_oneormore(self, node) -> str:
        """Compiles a one-or-more repetition '{ ... }+'."""
        return self.non_terminal(node, 'OneOrMore')

    def on_regexchain(self, node) -> str:
        """Placeholder: regex chains are not supported yet."""
        raise EBNFCompilerError("Not yet implemented!")

    def on_group(self, node) -> str:
        """Groups never reach the compiler: AST transformation removes them."""
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")

    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        """Compiles a symbol reference and records it for the
        missing-definition and recursion bookkeeping in `assemble_parser`."""
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
            return 'ScannerToken("' + symbol + '")'
        else:
            self.current_symbols.append(node)
            if symbol not in self.symbols:
                self.symbols[symbol] = node
            if symbol in self.rules:
                self.recursive.add(symbol)
            return symbol

    def on_literal(self, node) -> str:
        """Compiles a string literal into a Token parser call."""
        return 'Token(' + str(node).replace('\\', r'\\') + ')'  # return 'Token(' + ', '.join([node.result]) + ')' ?

    def on_regexp(self, node: Node) -> str:
        """Compiles a regular expression into an RE parser call, translating
        leading/trailing '~' whitespace markers into wL/wR arguments."""
        rx = str(node)
        name = []   # type: List[str]
        if rx[:2] == '~/':
            if not 'left' in self.directives['literalws']:
                name = ['wL=' + self.WHITESPACE_KEYWORD] + name
            rx = rx[1:]
        elif 'left' in self.directives['literalws']:
            name = ["wL=''"] + name
        if rx[-2:] == '/~':
            if 'right' not in self.directives['literalws']:
                name = ['wR=' + self.WHITESPACE_KEYWORD] + name
            rx = rx[:-1]
        elif 'right' in self.directives['literalws']:
            name = ["wR=''"] + name
        try:
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + \
                     node.as_sxpr()
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return 'RE(' + ', '.join([arg] + name) + ')'

    def on_list_(self, node) -> Set[str]:
        """Compiles a comma separated list into the set of its item strings."""
        assert node.children
        return set(item.result.strip() for item in node.children)
701
702


703
def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
    """Returns the lazily created module-level singleton ``EBNFCompiler``,
    re-targeted to the given grammar name and source on every call."""
    global thread_local_ebnf_compiler_singleton
    if 'thread_local_ebnf_compiler_singleton' in globals():
        compiler = thread_local_ebnf_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    thread_local_ebnf_compiler_singleton = EBNFCompiler(grammar_name, grammar_source)
    return thread_local_ebnf_compiler_singleton