ebnf.py 30.1 KB
Newer Older
1
"""ebnf.py - EBNF -> Python-Parser compilation for DHParser
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20
21
from collections import OrderedDict

22
23
24
25
try:
    import regex as re
except ImportError:
    import re
26
27
28
29
try:
    from typing import Callable, Dict, List, Set, Tuple
except ImportError:
    from .typing34 import Callable, Dict, List, Set, Tuple
30

31
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
32
from DHParser.parsers import Grammar, mixin_comment, nil_scanner, Forward, RE, NegativeLookahead, \
33
    Alternative, Series, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
34
    ScannerFunc
35
36
37
38
from DHParser.syntaxtree import Node, traverse, remove_brackets, \
    reduce_single_child, replace_by_single_child, TOKEN_PTYPE, remove_expendables, \
    remove_tokens, flatten, forbid, assert_content, WHITESPACE_PTYPE, key_tag_name, \
    TransformationFunc
39
from DHParser.versionnumber import __version__
40
41


42
43
44
45
46
__all__ = ['get_ebnf_scanner',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
47
           'EBNFTransformer',
Eckhart Arnold's avatar
Eckhart Arnold committed
48
           'EBNFCompilerError',
49
           'EBNFCompiler',
50
51
52
53
54
           'grammar_changed',
           'ScannerFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc']
55
56


Eckhart Arnold's avatar
Eckhart Arnold committed
57
58
59
60
61
62
63
########################################################################
#
# EBNF scanning
#
########################################################################


64
def get_ebnf_scanner() -> ScannerFunc:
    """Return the scanner function used for EBNF sources.

    This is always ``nil_scanner`` as imported from ``DHParser.parsers``.
    """
    scanner = nil_scanner
    return scanner


########################################################################
#
# EBNF parsing
#
########################################################################

74

75
class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] regexchain
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an exmaple
    EOF =  !/./
    """
    # NOTE(review): the grammar text above lists a `regexchain` alternative
    # for `factor` that has no counterpart in the `factor` parser below, and
    # its `regexp` pattern differs from the one used in code. The text is
    # kept verbatim because it presumably mirrors the grammar source this
    # class was generated from -- confirm before editing it.

    # `expression` is used by the bracketed constructs before it can be
    # defined, hence the forward declaration that is filled in further down.
    expression = Forward()

    # bookkeeping attributes
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
    parser_initialization__ = "upon instantiation"

    # comment and whitespace handling
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
    wspL__ = ''
    wspR__ = WSP__

    # terminals
    EOF = NegativeLookahead(RE('.', wR=''))
    list_ = Series(RE('\\w+'),
                   ZeroOrMore(Series(Token(","), RE('\\w+'))))
    regexp = RE(r'~?/(?:\\/|[^/])*?/~?')  # RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'),
                          RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')

    # bracketed constructs
    option = Series(Token("["), expression, Required(Token("]")))
    repetition = Series(Token("{"), expression, Required(Token("}")))
    oneormore = Series(Token("{"), expression, Token("}+"))
    group = Series(Token("("), expression, Required(Token(")")))

    # prefixes
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"),
                             Token("-!"), Token("-&"))

    # composite rules
    factor = Alternative(
        Series(Optional(flowmarker), Optional(retrieveop), symbol,
               NegativeLookahead(Token("="))),
        Series(Optional(flowmarker), literal),
        Series(Optional(flowmarker), regexp),
        Series(Optional(flowmarker), group),
        Series(Optional(flowmarker), oneormore),
        repetition,
        option)
    term = OneOrMore(factor)
    expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
    directive = Series(Token("@"), Required(symbol), Required(Token("=")),
                       Alternative(regexp, literal, list_))
    definition = Series(symbol, Required(Token("=")), expression)
    syntax = Series(Optional(RE('', wR='', wL=WSP__)),
                    ZeroOrMore(Alternative(definition, directive)),
                    Required(EOF))

    root__ = syntax


148
def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)

    if not isinstance(grammar_class, str):
        # a class object: compare against the hash stored on the class
        return chksum != grammar_class.source_hash__

    # a file name: look for the hash in the Python source text
    with open(grammar_class, 'r', encoding='utf8') as f:
        pycode = f.read()

    # Raw strings: '\w' and '\(' are not valid escape sequences in plain
    # string literals.
    class_header = re.search(r'class \w*\(Grammar\)', pycode)
    if class_header is None:
        # no grammar class in the file at all
        return True

    # only the text following the class header is searched for the hash
    stored_hash = re.search(r'    source_hash__ *= *"([a-z0-9]*)"',
                            pycode[class_header.end():])
    return not (stored_hash and stored_hash.group(1) == chksum)


179
def get_ebnf_grammar() -> EBNFGrammar:
    """Return the shared ``EBNFGrammar`` instance, creating it on first use.

    The instance is kept in the module-level global
    ``thread_local_ebnf_grammar_singleton``; a ``NameError`` on reading
    that global signals that it has not been created yet.
    """
    global thread_local_ebnf_grammar_singleton
    try:
        return thread_local_ebnf_grammar_singleton
    except NameError:
        pass
    thread_local_ebnf_grammar_singleton = EBNFGrammar()
    return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


196
# Maps tag-name keys to the transformation(s) that `EBNFTransformer` passes
# on to `traverse()`.
EBNF_transformation_table = {
    # AST Transformations for EBNF-grammar
    "+": remove_expendables,
    "syntax": [],  # otherwise '"*": replace_by_single_child' would be applied
    "directive, definition": remove_tokens('@', '='),
    "expression": [replace_by_single_child, flatten, remove_tokens('|')],
    "term": [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop": replace_by_single_child,
    "group": [remove_tokens('(', ')'), replace_by_single_child],
    "oneormore, repetition, option": [reduce_single_child, remove_brackets],
    "symbol, literal, regexp": reduce_single_child,
    (TOKEN_PTYPE, WHITESPACE_PTYPE): reduce_single_child,
    "list_": [flatten, remove_tokens(',')],
    "*": replace_by_single_child,
}

224

225
# Second processing table applied by `EBNFTransformer`, after the
# transformation table.
EBNF_validation_table = {
    # Semantic validation on the AST. EXPERIMENTAL!
    "repetition, option, oneormore": [
        forbid('repetition', 'option', 'oneormore'),
        assert_content(r'(?!§)'),
    ],
}
231

232

233
def EBNFTransformer(syntax_tree: Node):
    """Run the EBNF processing tables over ``syntax_tree``.

    ``traverse()`` is called first with ``EBNF_transformation_table`` and
    then with ``EBNF_validation_table``, both keyed by ``key_tag_name``.
    Nothing is returned.
    """
    traverse(syntax_tree, EBNF_transformation_table, key_tag_name)
    traverse(syntax_tree, EBNF_validation_table, key_tag_name)
di68kap's avatar
di68kap committed
237
238


239
def get_ebnf_transformer() -> TransformationFunc:
    """Return the transformation function for EBNF syntax trees,
    i.e. ``EBNFTransformer``."""
    transformer = EBNFTransformer
    return transformer
Eckhart Arnold's avatar
Eckhart Arnold committed
241
242
243
244
245
246
247
248


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

249
250
251

# Signatures of the argument-less factory functions, one alias per product
# type (compiler, transformer, parser/grammar, scanner).
CompilerFactoryFunc = Callable[[], Compiler]
TransformerFactoryFunc = Callable[[], TransformationFunc]
ParserFactoryFunc = Callable[[], Grammar]
ScannerFactoryFunc = Callable[[], ScannerFunc]


256
# Source-code templates for the factory functions that are appended to
# generated code. `{NAME}` is replaced by the grammar name via `str.format()`.

# Template for `get_scanner()`, which returns `{NAME}Scanner`.
SCANNER_FACTORY = '''
def get_scanner() -> ScannerFunc:
    return {NAME}Scanner
'''


# Template for `get_grammar()`, which creates the `{NAME}Grammar` object on
# first use and returns the same object afterwards.
GRAMMAR_FACTORY = '''
def get_grammar() -> {NAME}Grammar:
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


# Template for `get_transformer()`, which returns `{NAME}Transform`.
TRANSFORMER_FACTORY = '''
def get_transformer() -> TransformationFunc:
    return {NAME}Transform
'''


# Template for `get_compiler()`, which creates the `{NAME}Compiler` object on
# first use and otherwise re-targets the existing one with
# `set_grammar_name()`. The doubled backslash yields a single line
# continuation backslash in the generated code. No line of the template
# carries trailing whitespace.
COMPILER_FACTORY = '''
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
293

294
295
class EBNFCompilerError(Exception):
    """Exception raised by the `EBNFCompiler` class itself.

    These are not compilation errors in the strict sense; for those see
    `CompilationError` in module ``dsl.py``.
    """


300
301
# TODO: Add Capture and Retrieve validation: a variable must not be captured twice before retrieval?! Is this possible at compile time?

302
class EBNFCompiler(Compiler):
    """
    Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.
    """
    # names under which comment and whitespace definitions are emitted
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    # symbols that grammars may use but must not define themselves
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}

    # common prefix of the error messages for a malformed syntax tree
    AST_ERROR = ("Badly structured syntax tree. "
                 "Potentially due to erroneuos AST transformation.")

    # EBNF prefix (flow marker or retrieve operator) -> parser class name
    PREFIX_TABLE = {
        '§': 'Required',
        '&': 'Lookahead',
        '!': 'NegativeLookahead',
        '-&': 'Lookbehind',
        '-!': 'NegativeLookbehind',
        '::': 'Pop',
        ':': 'Retrieve',
    }

    # named whitespace kinds accepted by the "@whitespace" directive
    WHITESPACE = {
        'horizontal': r'[\t ]*',  # default: horizontal
        'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
        'vertical': r'\s*',
    }
319

320

321
    def __init__(self, grammar_name="", grammar_source=""):
        """Initialize the base ``Compiler`` with the grammar's name and
        source, then set up the compilation state via ``_reset()``."""
        super().__init__(grammar_name, grammar_source)
        self._reset()

325

326
    def _reset(self):
        """(Re-)initialize all state that is collected during a compilation
        run, including the directives with their default values."""
        default_directives = {
            'whitespace': self.WHITESPACE['horizontal'],
            'comment': '',
            'literalws': ['right'],
            'tokens': set(),     # alt. 'scanner_tokens'
            'filter': {},        # alt. 'filter'
            'testing': False,
        }
        self._result = ''           # type: str
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
        self.variables = set()      # type: Set[str]
        # self.definition_names = []  # type: List[str]
        self.recursive = set()      # type: Set[str]
        self.root = ""              # type: str
        self.directives = default_directives
341

Eckhart Arnold's avatar
Eckhart Arnold committed
342
    @property
    def result(self) -> str:
        """The code stored by the last ``assemble_parser()`` call, or the
        empty string if there has been none since the last reset."""
        return self._result

346
347
348

    # methods for generating skeleton code for scanner, transformer, and compiler

349
    def gen_scanner_skeleton(self) -> str:
        """Return Python source for a pass-through scanner function named
        ``<grammar_name>Scanner``, followed by the ``get_scanner()``
        factory."""
        scanner_def = "def %s(text):\n    return text\n" % (self.grammar_name + "Scanner")
        factory = SCANNER_FACTORY.format(NAME=self.grammar_name)
        return scanner_def + factory
353

354

355
    def gen_transformer_skeleton(self) -> str:
        """Return Python source for an AST-transformation table skeleton.

        The table gets one empty entry per compiled rule and is followed by
        the ``<grammar_name>Transform`` definition and the
        ``get_transformer()`` factory.

        Raises:
            EBNFCompilerError: if no rules have been compiled yet.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
        grammar = self.grammar_name
        table_name = grammar + '_AST_transformation_table'
        func_name = grammar + 'Transform'

        lines = [
            table_name + ' = {',
            '    # AST Transformations for the ' + grammar + '-grammar',
            '    "+": remove_empty,',
        ]
        lines.extend('    "' + rule + '": [],' for rule in self.rules)
        lines.extend([
            '    ":Token, :RE": reduce_single_child,',
            '    "*": replace_by_single_child',
            '}',
            '',
            func_name + ' = partial(traverse, processing_table=%s)' % table_name,
            '',
            TRANSFORMER_FACTORY.format(NAME=grammar),
        ])
        return '\n'.join(lines)

373

374
    def gen_compiler_skeleton(self) -> str:
        """Return Python source for a ``<grammar_name>Compiler`` class
        skeleton.

        The class gets one stub method per compiled rule: the stub for the
        root rule returns the node, all others just ``pass``. The
        ``get_compiler()`` factory is appended.

        Raises:
            EBNFCompilerError: if no rules have been compiled yet.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
        grammar = self.grammar_name
        compiler = [
            'class ' + grammar + 'Compiler(Compiler):',
            '    """Compiler for the abstract-syntax-tree of a ' + grammar + ' source file.',
            '    """',
            '',
            '    def __init__(self, grammar_name="' + grammar + '", grammar_source=""):',
            '        super(' + grammar + 'Compiler, self).__init__(grammar_name, grammar_source)',
            # Raw strings on both levels: '\w' and '\Z' are invalid escape
            # sequences in a plain string literal, here as well as in the
            # generated module.
            r"        assert re.match(r'\w+\Z', grammar_name)",
            '',
        ]
        for name in self.rules:
            method_name = Compiler.method_name(name)
            stub_body = '        return node' if name == self.root else '        pass'
            compiler += ['    def ' + method_name + '(self, node):', stub_body, '']
        compiler.append(COMPILER_FACTORY.format(NAME=grammar))
        return '\n'.join(compiler)
397

398

399
400
401
402
403
    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """
        Creates the Python code for the parser after compilation of
        the EBNF-Grammar

        Parameters:
            definitions:  (symbol, parser code) pairs in the order of their
                definition; the list is extended and reversed in place
            root_node:  node whose ``error_flag`` is set if a symbol is
                used but never defined

        Returns:
            the source code of the grammar class plus the ``get_grammar()``
            factory; it is also stored in ``self._result``
        """
        directives = self.directives

        # wrap the definitions of those variables that are still registered
        # in self.variables
        if self.variables:
            for idx, entry in enumerate(definitions):
                if entry[0] in self.variables:
                    definitions[idx] = (entry[0], 'Capture(%s)' % entry[1])

        # add the whitespace and comment definitions
        literalws = directives['literalws']
        definitions.append(
            ('wspR__', self.WHITESPACE_KEYWORD if 'right' in literalws else "''"))
        definitions.append(
            ('wspL__', self.WHITESPACE_KEYWORD if 'left' in literalws else "''"))
        definitions.append(
            (self.WHITESPACE_KEYWORD,
             "mixin_comment(whitespace=r'{whitespace}', comment=r'{comment}')"
             .format(**directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        docstring_tail = ', with this grammar:' if self.grammar_source else '.'
        declarations = [
            'class ' + self.grammar_name + 'Grammar(Grammar):',
            'r"""Parser for ' + article + self.grammar_name + ' source file' + docstring_tail,
        ]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations.extend(self.grammar_source.split('\n'))
            # strip trailing blank lines of the grammar source
            while declarations[-1].strip() == '':
                declarations.pop()
        declarations.append('"""')

        # turn definitions into declarations in reverse order

        self.root = definitions[0][0] if definitions else ""
        definitions.reverse()
        for symbol in sorted(self.recursive):
            declarations.append(symbol + ' = Forward()')
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations.append(symbol + '.set(' + statement + ')')
            else:
                declarations.append(symbol + ' = ' + statement)

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol, first_use in self.symbols.items():
            if symbol not in defined_symbols:
                first_use.add_error("Missing definition for symbol '%s'" % symbol)
                root_node.error_flag = True

        # check for unconnected rules

        if not directives['testing']:
            defined_symbols.difference_update(self.RESERVED_SYMBOLS)

            # drop every rule that can be reached from the root rule;
            # whatever remains afterwards is unconnected
            pending = [self.root]
            while pending:
                symbol = pending.pop()
                if symbol in defined_symbols:
                    defined_symbols.remove(symbol)
                    pending.extend(str(related) for related in self.rules[symbol][1:])

            for leftover in defined_symbols:
                self.rules[leftover][0].add_error(
                    ('Rule "%s" is not connected to parser '
                     'root "%s" !') % (leftover, self.root)
                    + ' (Use directive "@testing=True" '
                      'to supress this error message.)')

        # set root parser and assemble python grammar definition

        if self.root and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root)
        declarations.append('')
        class_code = '\n    '.join(declarations)
        self._result = class_code + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result
483

484
485
486

    ## compilation methods

487
    def on_syntax(self, node: Node) -> str:
        """Compile the root node of an EBNF syntax tree.

        Resets the compiler state, compiles all definitions and directives
        and returns the parser code produced by ``assemble_parser()``.
        """
        self._reset()

        # drop the wrapping sequence node
        children = node.children
        if len(children) == 1 and not children[0].parser.name:
            node = children[0]

        # compile definitions and directives and collect definitions
        definitions = []
        for child in node.children:
            if child.parser.name == "definition":
                definitions.append(self.compile(child))
                continue
            assert child.parser.name == "directive", child.as_sxpr()
            self.compile(child)
            # NOTE(review): the error flag is propagated for directives
            # only, not for definitions -- confirm that this is intended.
            node.error_flag = node.error_flag or child.error_flag

        return self.assemble_parser(definitions, node)
505

506

507
    def on_definition(self, node: Node) -> Tuple[str, str]:
        """Compile a rule definition.

        Returns:
            a (rule name, parser code) pair. If a ``TypeError`` occurs
            during compilation, the rule name carries the suffix
            ``':error'`` and the code is the quoted error message.
        """
        rule = str(node.children[0])

        # validate the rule name; at most one complaint is reported and
        # compilation goes on regardless
        if rule in self.rules:
            complaint = 'A rule with name "%s" has already been defined.' % rule
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            complaint = 'Symbol "%s" is a reserved symbol.' % rule
        elif not sane_parser_name(rule):
            complaint = ('Illegal symbol "%s". Symbols must not start or '
                         ' end with a doube underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            complaint = ('Symbol "%s" has already been defined as '
                         'a scanner token.' % rule)
        elif keyword.iskeyword(rule):
            complaint = ('Python keyword "%s" may not be used as a symbol. '
                         % rule + '(This may change in the future.)')
        else:
            complaint = ''
        if complaint:
            node.add_error(complaint)

        try:
            # the list starts with the defining node; on_symbol() appends
            # the symbol nodes that occur in the definition
            self.rules[rule] = self.current_symbols = [node]
            defn = self.compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
        except TypeError as error:
            errmsg = ''.join([EBNFCompiler.AST_ERROR, " (", str(error), ")\n",
                              node.as_sxpr()])
            node.add_error(errmsg)
            rule = rule + ':error'
            defn = '"' + errmsg + '"'
        return rule, defn
537

538

539
    @staticmethod
    def _check_rx(node: Node, rx: str) -> str:
        """
        Checks whether the string `rx` represents a valid regular
        expression and reports an error on `node` if it does not.
        A regular expression that contains a line break is prefixed with
        the verbose flag ``(?x)`` unless it already starts with that flag.
        Returns the (possibly prefixed) regular expression string.
        """
        if '\n' in rx and not rx.startswith('(?x)'):
            rx = '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            message = "malformed regular expression %s: %s" % (repr(rx), str(re_error))
            node.add_error(message)
        return rx

554

555
    def on_directive(self, node: Node) -> str:
        """Evaluate a directive and store its value in ``self.directives``.

        Handled keys are ``comment``, ``whitespace``, ``testing``,
        ``literalws``, ``tokens`` (alias ``scanner_tokens``) and any key
        ending in ``_filter``. Invalid values and unknown directives are
        reported as errors on ``node``.

        Returns:
            always the empty string
        """
        key = str(node.children[0]).lower()
        assert key not in self.directives['tokens']

        if key in {'comment', 'whitespace'}:
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
                    node.add_error('Directive "%s" must have one, but not %i values.' %
                                   (key, len(node.children[1].result)))
                value = self.compile(node.children[1]).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = str(node.children[1]).strip("~")
                if value != str(node.children[1]):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace':
                    try:
                        matches_empty = re.match(value, '') is not None
                    except Exception:
                        # The pattern cannot be compiled. _check_rx() has
                        # already reported that, so do not crash here and do
                        # not add a second, misleading error.
                        matches_empty = True
                    if not matches_empty:
                        node.add_error("Implicit whitespace should always match the empty string, "
                                       "/%s/ does not." % value)
            self.directives[key] = value

        elif key == 'testing':
            value = str(node.children[1])
            self.directives['testing'] = value.lower() not in {"off", "false", "no"}

        elif key == 'literalws':
            value = {item.lower() for item in self.compile(node.children[1])}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            if 'both' in value:
                ws = {'left', 'right'}
            elif 'none' in value:
                ws = set()
            else:
                ws = value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'scanner_tokens'}:
            self.directives['tokens'] |= self.compile(node.children[1])

        elif key.endswith('_filter'):
            filter_set = self.compile(node.children[1])
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                # Report the problem and ignore the directive. Calling
                # .pop() here would raise AttributeError on a non-set and
                # KeyError on an empty set.
                node.add_error('Directive "%s" accepts exactly on symbol, not %s'
                               % (key, str(filter_set)))
            else:
                # key[:-7] strips the '_filter' suffix
                self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key,
                            ', '.join(list(self.directives.keys()))))
        return ""

614

615
    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str]=[]) -> str:
        """
        Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.

        The compiled children of `node`, followed by `custom_args`, become
        the arguments of the generated constructor call. (The default list
        of `custom_args` is only read, never mutated.)
        """
        call_args = [self.compile(child) for child in node.children]
        call_args.extend(custom_args)
        return ''.join([parser_class, '(', ', '.join(call_args), ')'])

623

624
    def on_expression(self, node) -> str:
        """Compile an expression node to an ``Alternative(...)`` call."""
        code = self.non_terminal(node, 'Alternative')
        return code

627

628
    def on_term(self, node) -> str:
        """Compile a term node to a ``Series(...)`` call."""
        code = self.non_terminal(node, 'Series')
        return code
630

631

632
    def on_factor(self, node: Node) -> str:
        """Compile a factor whose first child is a prefix (flow marker or
        retrieve operator).

        The prefix selects the parser class via ``PREFIX_TABLE``. For the
        retrieve operators the operand must be a symbol, which is then
        registered as a variable. An unknown prefix is reported as an
        error and yields the empty string.
        """
        assert node.children
        assert len(node.children) >= 2, node.as_sxpr()
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in ('::', ':'):
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            variable = str(arg)  # cast(str, arg.result)
            filters = self.directives['filter']
            if variable in filters:
                custom_args = ['filter=%s' % filters[variable]]
            self.variables.add(variable)

        elif len(node.children) > 2:
            # More than one element follows the prefix: the second child
            # takes over the factor's parser and receives, as its new
            # result, a copy of its former self plus all later children.
            # After that the factor consists of just the prefix and the
            # second child.
            operand = node.children[1]
            operand.result = (Node(operand.parser, operand.result),) + node.children[2:]
            operand.parser = node.parser
            node.result = (node.children[0], operand)

        # strip the prefix, so that only the operand(s) get compiled
        node.result = node.children[1:]
        try:
            return self.non_terminal(node, self.PREFIX_TABLE[prefix], custom_args)
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
            return ""
664

665

666
    def on_option(self, node) -> str:
        """Compile an option node to an ``Optional(...)`` call."""
        code = self.non_terminal(node, 'Optional')
        return code

669

670
    def on_repetition(self, node) -> str:
        """Compile a repetition node to a ``ZeroOrMore(...)`` call."""
        code = self.non_terminal(node, 'ZeroOrMore')
        return code

673

674
    def on_oneormore(self, node) -> str:
        """Compile a oneormore node to a ``OneOrMore(...)`` call."""
        code = self.non_terminal(node, 'OneOrMore')
        return code

677

678
    def on_regexchain(self, node) -> str:
        """Placeholder for regexchain nodes: always raises
        ``EBNFCompilerError``."""
        message = "Not yet implemented!"
        raise EBNFCompilerError(message)

681

682
    def on_group(self, node) -> str:
        """Always raises ``EBNFCompilerError``: group nodes are expected to
        be gone from the tree by the time it is compiled."""
        message = ("Group nodes should have been eliminated by "
                   "AST transformation!")
        raise EBNFCompilerError(message)

686

687
688
689
690
    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        """Compile a symbol reference.

        Scanner tokens yield a ``ScannerToken("...")`` call. Any other
        symbol is recorded as used by the current rule, remembered at its
        first occurrence, marked as recursive if its rule is already
        registered, and returned as is.
        """
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
            return 'ScannerToken("' + symbol + '")'

        self.current_symbols.append(node)
        self.symbols.setdefault(symbol, node)
        if symbol in self.rules:
            self.recursive.add(symbol)
        return symbol
698

699

700
    def on_literal(self, node) -> str:
        """Compile a literal to a ``Token(...)`` call; every backslash in
        the literal is doubled."""
        escaped = str(node).replace('\\', r'\\')
        return 'Token(' + escaped + ')'  # return 'Token(' + ', '.join([node.result]) + ')' ?
702

703

704
    def on_regexp(self, node: Node) -> str:
        """Compile a regular expression to an ``RE(...)`` call.

        A leading or trailing ``~`` marker is stripped off. The ``wL`` and
        ``wR`` arguments are added only where the marker deviates from the
        ``literalws`` directive. Escaped slashes (``\\/``) are unescaped
        before the expression is checked.
        """
        rx = str(node)
        literalws = self.directives['literalws']
        ws_args = []   # type: List[str]

        # left side
        if rx.startswith('~/'):
            if 'left' not in literalws:
                ws_args.insert(0, 'wL=' + self.WHITESPACE_KEYWORD)
            rx = rx[1:]
        elif 'left' in literalws:
            ws_args.insert(0, "wL=''")

        # right side; its argument is placed before the left one
        if rx.endswith('/~'):
            if 'right' not in literalws:
                ws_args.insert(0, 'wR=' + self.WHITESPACE_KEYWORD)
            rx = rx[:-1]
        elif 'right' in literalws:
            ws_args.insert(0, "wR=''")

        try:
            pattern = rx[1:-1].replace(r'\/', '/')
            arg = repr(self._check_rx(node, pattern))
        except AttributeError as error:
            errmsg = ''.join([EBNFCompiler.AST_ERROR, " (", str(error), ")\n",
                              node.as_sxpr()])
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return 'RE(' + ', '.join([arg] + ws_args) + ')'

728

729
    def on_list_(self, node) -> Set[str]:
        """Return the stripped results of the node's children as a set."""
        assert node.children
        return {item.result.strip() for item in node.children}
732
733


734
def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
    """Return the shared ``EBNFCompiler`` instance for the given grammar.

    The instance is kept in the module-level global
    ``thread_local_ebnf_compiler_singleton``. If it exists, it is
    re-targeted with ``set_grammar_name()``; otherwise it is created.
    """
    global thread_local_ebnf_compiler_singleton
    try:
        thread_local_ebnf_compiler_singleton.set_grammar_name(grammar_name, grammar_source)
        return thread_local_ebnf_compiler_singleton
    except NameError:
        # first call: the global has not been assigned yet
        thread_local_ebnf_compiler_singleton = EBNFCompiler(grammar_name, grammar_source)
    return thread_local_ebnf_compiler_singleton