ebnf.py 30.2 KB
Newer Older
1
"""ebnf.py - EBNF -> Python-Parser compilation for DHParser
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences an Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20 21
from collections import OrderedDict

22 23 24 25
try:
    import regex as re
except ImportError:
    import re
26 27 28 29
try:
    from typing import Callable, Dict, List, Set, Tuple
except ImportError:
    from .typing34 import Callable, Dict, List, Set, Tuple
30

31
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
32
from DHParser.parsers import Grammar, mixin_comment, nil_preprocessor, Forward, RE, NegativeLookahead, \
33
    Alternative, Series, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
34
    PreprocessorFunc
35 36 37 38
from DHParser.syntaxtree import Node, traverse, remove_brackets, \
    reduce_single_child, replace_by_single_child, TOKEN_PTYPE, remove_expendables, \
    remove_tokens, flatten, forbid, assert_content, WHITESPACE_PTYPE, key_tag_name, \
    TransformationFunc
39
from DHParser.versionnumber import __version__
40

41
__all__ = ['get_ebnf_preprocessor',
42 43 44 45
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
46
           'EBNFTransformer',
Eckhart Arnold's avatar
Eckhart Arnold committed
47
           'EBNFCompilerError',
48
           'EBNFCompiler',
49
           'grammar_changed',
50
           'PreprocessorFactoryFunc',
51 52 53
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc']
54 55


Eckhart Arnold's avatar
Eckhart Arnold committed
56 57 58 59 60 61 62
########################################################################
#
# EBNF scanning
#
########################################################################


63 64
def get_ebnf_preprocessor() -> PreprocessorFunc:
    """Return the preprocessor function used for EBNF sources.

    This is always ``nil_preprocessor``; no EBNF-specific preprocessor
    is defined in this module.
    """
    return nil_preprocessor
Eckhart Arnold's avatar
Eckhart Arnold committed
65 66 67 68 69 70 71 72


########################################################################
#
# EBNF parsing
#
########################################################################

73

74
class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] regexchain
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an exmaple
    EOF =  !/./
    """
    # NOTE(review): the grammar text above lists a ``regexchain`` alternative
    # for ``factor`` that has no counterpart in the ``factor`` parser below
    # and no ``regexchain`` definition -- confirm which side is current.

    # ``expression`` is referenced by option/repetition/oneormore/group before
    # it can be defined, so it starts out as a Forward and is bound via
    # ``expression.set(...)`` further down.
    expression = Forward()
    # Checksum compared by ``grammar_changed()`` against the md5 of the
    # grammar source.
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
    # literalws = right: nothing is skipped on the left, WSP__ on the right.
    wspL__ = ''
    wspR__ = WSP__
    EOF = NegativeLookahead(RE('.', wR=''))
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
    regexp = RE(r'~?/(?:\\/|[^/])*?/~?')  # RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Series(Token("["), expression, Required(Token("]")))
    repetition = Series(Token("{"), expression, Required(Token("}")))
    oneormore = Series(Token("{"), expression, Token("}+"))
    group = Series(Token("("), expression, Required(Token(")")))
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
    factor = Alternative(Series(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Series(Optional(flowmarker), literal), Series(Optional(flowmarker), regexp),
                         Series(Optional(flowmarker), group), Series(Optional(flowmarker), oneormore),
                         repetition, option)
    term = OneOrMore(factor)
    expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
    directive = Series(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
    definition = Series(symbol, Required(Token("=")), expression)
    syntax = Series(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
    root__ = syntax


147
def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if not isinstance(grammar_class, str):
        return chksum != grammar_class.source_hash__

    # ``grammar_class`` is a file name: read the recorded hash from the
    # source text of the compiler suite.
    with open(grammar_class, 'r', encoding='utf8') as f:
        pycode = f.read()
    # Raw strings: '\w' and '\(' are invalid escapes in plain literals.
    class_header = re.search(r'class \w*\(Grammar\)', pycode)
    if not class_header:
        return True
    # Only look for the hash after the Grammar class header.
    hash_match = re.search(r'    source_hash__ *= *"([a-z0-9]*)"',
                           pycode[class_header.end():])
    return not (hash_match and hash_match.group(1) == chksum)


178
def get_ebnf_grammar() -> EBNFGrammar:
    """Return the shared ``EBNFGrammar`` instance, creating it on first use.

    The instance is cached in the module-level global
    ``thread_local_ebnf_grammar_singleton``. Despite the name, this is a
    plain ``global``; nothing in this function makes it thread-local.
    """
    global thread_local_ebnf_grammar_singleton
    try:
        grammar = thread_local_ebnf_grammar_singleton
        return grammar
    except NameError:
        # First call: the global does not exist yet.
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
        return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


195
# Maps node names (comma-separated lists or a tuple of parser types) to the
# transformation(s) that ``EBNFTransformer`` passes to ``traverse``.
# NOTE(review): "+" and "*" look like special keys of ``traverse`` ("*"
# apparently being the fallback, see the comment on "syntax") -- confirm
# against DHParser.syntaxtree.
EBNF_transformation_table = {
    # AST Transformations for EBNF-grammar
    "+":
        remove_expendables,
    "syntax":
        [],  # otherwise '"*": replace_by_single_child' would be applied
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_tokens('(', ')'), replace_by_single_child],
    "oneormore, repetition, option":
        [reduce_single_child, remove_brackets],
    "symbol, literal, regexp":
        reduce_single_child,
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        reduce_single_child,
    "list_":
        [flatten, remove_tokens(',')],
    "*":
        replace_by_single_child
}

223

224
# Second table applied by ``EBNFTransformer`` after the transformation table.
EBNF_validation_table = {
    # Semantic validation on the AST. EXPERIMENTAL!
    "repetition, option, oneormore":
        [forbid('repetition', 'option', 'oneormore'),
         assert_content(r'(?!§)')]
}
230

231

232
def EBNFTransformer(syntax_tree: Node):
    """Transform and then validate the syntax tree of an EBNF source.

    Runs ``traverse`` over ``syntax_tree`` twice: first with
    ``EBNF_transformation_table``, then with ``EBNF_validation_table``,
    both keyed by ``key_tag_name``. Nothing is returned.
    """
    for processing_table, key_func in [(EBNF_transformation_table, key_tag_name),
                                       (EBNF_validation_table, key_tag_name)]:
        traverse(syntax_tree, processing_table, key_func)
di68kap's avatar
di68kap committed
236 237


238
def get_ebnf_transformer() -> TransformationFunc:
    """Return the AST transformation function for EBNF (``EBNFTransformer``)."""
    return EBNFTransformer
Eckhart Arnold's avatar
Eckhart Arnold committed
240 241 242 243 244 245 246 247


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

248

249
# Signatures of the zero-argument factory functions (``get_preprocessor``,
# ``get_grammar``, ``get_transformer``, ``get_compiler``) produced by the
# code templates of this module.
PreprocessorFactoryFunc = Callable[[], PreprocessorFunc]
ParserFactoryFunc = Callable[[], Grammar]
TransformerFactoryFunc = Callable[[], TransformationFunc]
CompilerFactoryFunc = Callable[[], Compiler]

254 255 256
# Source-code templates for the factory functions appended to generated
# code. ``{NAME}`` is filled in with the grammar name via ``str.format``.

# Appended by ``EBNFCompiler.gen_preprocessor_skeleton``.
PREPROCESSOR_FACTORY = '''
def get_preprocessor() -> PreprocessorFunc:
    return {NAME}Preprocessor
'''


# Appended by ``EBNFCompiler.assemble_parser``.
GRAMMAR_FACTORY = '''
def get_grammar() -> {NAME}Grammar:
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


# Appended by ``EBNFCompiler.gen_transformer_skeleton``.
TRANSFORMER_FACTORY = '''
def get_transformer() -> TransformationFunc:
    return {NAME}Transform
'''


# Appended by ``EBNFCompiler.gen_compiler_skeleton``. The doubled backslash
# yields a single line-continuation backslash in the generated code.
COMPILER_FACTORY = '''
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
291

292 293
class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
    in the strict sense, see `CompilationError` in module ``dsl.py``)"""


298 299
# TODO: Add capture and retrieve validation: a variable mustn't be captured twice before retrieval. Is this possible at compile time?

300
class EBNFCompiler(Compiler):
    """
    Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.
    """
    # Names of the generated class attributes holding the comment regex and
    # the combined whitespace/comment regex.
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    # Symbols a grammar may use but must not define itself.
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneuos AST transformation."
    # Maps EBNF flow markers / retrieve operators to parser class names.
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    # Named whitespace regexes accepted by the "@ whitespace" directive.
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}
317

318

319
    def __init__(self, grammar_name="", grammar_source=""):
        """Initialise the base ``Compiler`` and reset all compilation state.

        Both arguments are passed on unchanged to ``Compiler.__init__``.
        """
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
        self._reset()

323

324
    def _reset(self):
325
        self._result = ''           # type: str
326 327 328
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
329
        self.variables = set()      # type: Set[str]
330
        # self.definition_names = []  # type: List[str]
331 332
        self.recursive = set()      # type: Set[str]
        self.root = ""              # type: str
333
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
334
                           'comment': '',
335
                           'literalws': ['right'],
336 337 338
                           'tokens': set(),  # alt. 'preprocessor_tokens'
                           'filter': dict(),  # alt. 'filter'
                           'testing': False}
339

Eckhart Arnold's avatar
Eckhart Arnold committed
340
    @property
341
    def result(self) -> str:
Eckhart Arnold's avatar
Eckhart Arnold committed
342 343
        return self._result

344
    # methods for generating skeleton code for preprocessor, transformer, and compiler
345

346 347
    def gen_preprocessor_skeleton(self) -> str:
        """Return source code for a pass-through ``<grammar_name>Preprocessor``
        function followed by the ``get_preprocessor`` factory."""
        name = self.grammar_name + "Preprocessor"
        return "def %s(text):\n    return text\n" % name \
               + PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)
350

351

352
    def gen_transformer_skeleton(self) -> str:
        """Return source code for an AST transformation table skeleton.

        The table gets one empty entry (``[]``) per compiled rule, followed
        by a ``<grammar_name>Transform`` partial and the ``get_transformer``
        factory.

        Raises:
            EBNFCompilerError: if no rules have been compiled yet.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
        tt_name = self.grammar_name + '_AST_transformation_table'
        tf_name = self.grammar_name + 'Transform'
        transtable = [tt_name + ' = {',
                      '    # AST Transformations for the ' +
                      self.grammar_name + '-grammar']
        transtable.append('    "+": remove_empty,')
        for name in self.rules:
            transtable.append('    "' + name + '": [],')
        transtable.append('    ":Token, :RE": reduce_single_child,')
        transtable += ['    "*": replace_by_single_child', '}', '', tf_name +
                       ' = partial(traverse, processing_table=%s)' % tt_name, '']
        transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(transtable)

370

371
    def gen_compiler_skeleton(self) -> str:
        """Return source code for a ``<grammar_name>Compiler`` class skeleton.

        The class gets one method per compiled rule, named by
        ``Compiler.method_name``. The method for the root rule returns its
        node; all others are ``pass`` stubs. The ``get_compiler`` factory is
        appended at the end.

        Raises:
            EBNFCompilerError: if no rules have been compiled yet.
        """
        if not self.rules:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    # Raw literals both here and in the generated code:
                    # '\w' and '\Z' are invalid escapes in plain strings.
                    r"        assert re.match(r'\w+\Z', grammar_name)", '']
        for name in self.rules:
            method_name = Compiler.method_name(name)
            if name == self.root:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        return node', '']
            else:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        pass', '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)
394

395

396 397 398 399 400
    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """
        Creates the Python code for the parser after compilation of
        the EBNF-Grammar

        Parameters:
            definitions:  (symbol, parser expression) pairs in the order
                of their definition; the list is modified in place
            root_node:  node whose ``error_flag`` is set if a symbol is
                used but never defined

        Returns:
            the source of the ``<grammar_name>Grammar`` class followed by
            the ``get_grammar`` factory; also stored in ``self._result``
        """
        # wrap the definitions of all remaining captured variables
        for i, (symbol, statement) in enumerate(definitions):
            if symbol in self.variables:
                definitions[i] = (symbol, 'Capture(%s)' % statement)

        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in self.directives['literalws'] else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in self.directives['literalws'] else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace="
                             "r'{whitespace}', comment=r'{comment}')").
                            format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations += self.grammar_source.split('\n')
            # drop trailing blank lines of the grammar source
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order

        self.root = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
                root_node.error_flag = True

        # check for unconnected rules

        if not self.directives['testing']:
            defined_symbols.difference_update(self.RESERVED_SYMBOLS)

            def remove_connections(symbol):
                # remove ``symbol`` and, recursively, every symbol used on
                # the right-hand side of its rule
                if symbol in defined_symbols:
                    defined_symbols.remove(symbol)
                    for related in self.rules[symbol][1:]:
                        remove_connections(str(related))

            remove_connections(self.root)
            for leftover in defined_symbols:
                self.rules[leftover][0].add_error(('Rule "%s" is not connected to parser '
                    'root "%s" !') % (leftover, self.root) + ' (Use directive "@testing=True" '
                    'to supress this error message.)')

        # set root parser and assemble python grammar definition

        if self.root and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result
480

481 482 483

    ## compilation methods

484
    def on_syntax(self, node: Node) -> str:
        """Compile the root node of an EBNF syntax tree.

        Resets the compiler state, compiles every definition and directive
        child and returns the result of ``assemble_parser``.
        """
        self._reset()
        definitions = []

        # drop the wrapping sequence node
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]

        # compile definitions and directives and collect definitions
        for nd in node.children:
            if nd.parser.name == "definition":
                definitions.append(self.compile(nd))
            else:
                assert nd.parser.name == "directive", nd.as_sxpr()
                self.compile(nd)
                # NOTE(review): the error flag is propagated for directives
                # only, not for definitions -- confirm this is intended.
                node.error_flag = node.error_flag or nd.error_flag

        return self.assemble_parser(definitions, node)
502

503

504
    def on_definition(self, node: Node) -> Tuple[str, str]:
        """Compile a single rule definition.

        Reports an error on ``node`` if the rule name is already defined,
        reserved, not a sane parser name, a preprocessor token or a Python
        keyword. The rule is compiled regardless.

        Returns:
            ``(rule name, parser expression)``. On a ``TypeError`` during
            compilation the name gets the suffix ``':error'`` and the
            expression is the quoted error message.
        """
        rule = str(node.children[0])
        if rule in self.rules:
            node.add_error('A rule with name "%s" has already been defined.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           ' end with a doube underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            node.add_error('Symbol "%s" has already been defined as '
                           'a preprocessor token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the future.)')
        try:
            # ``on_symbol`` appends every symbol used on the right-hand side
            # to this list, which doubles as the rule's entry in self.rules.
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
            defn = self.compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
        except TypeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + node.as_sxpr()
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
        return rule, defn
534

535

536
    @staticmethod
537
    def _check_rx(node: Node, rx: str) -> str:
538 539
        """
        Checks whether the string `rx` represents a valid regular
540 541 542 543 544 545 546 547 548 549 550
        expression. Makes sure that multiline regular expressions are
        prepended by the multiline-flag. Returns the regular expression string.
        """
        rx = rx if rx.find('\n') < 0 or rx[0:4] == '(?x)' else '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

551

552
    def on_directive(self, node: Node) -> str:
553
        key = str(node.children[0]).lower()
554
        assert key not in self.directives['tokens']
555

556
        if key in {'comment', 'whitespace'}:
557 558
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
Eckhart Arnold's avatar
Eckhart Arnold committed
559
                    node.add_error('Directive "%s" must have one, but not %i values.' %
560
                                   (key, len(node.children[1].result)))
561
                value = self.compile(node.children[1]).pop()
562 563
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
564
                else:
565
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
566
            else:
567 568
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
569 570 571 572 573 574
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
575 576 577
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
578
            self.directives[key] = value
579

580 581 582 583
        elif key == 'testing':
            value = str(node.children[1])
            self.directives['testing'] = value.lower() not in {"off", "false", "no"}

584
        elif key == 'literalws':
585
            value = {item.lower() for item in self.compile(node.children[1])}
586
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
Eckhart Arnold's avatar
Eckhart Arnold committed
587
                    or ('none' in value and len(value) > 1)):
588 589 590 591 592 593 594
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            ws = {'left', 'right'} if 'both' in value \
                else {} if 'none' in value else value
            self.directives[key] = list(ws)

595
        elif key in {'tokens', 'preprocessor_tokens'}:
596
            self.directives['tokens'] |= self.compile(node.children[1])
597

598
        elif key.endswith('_filter'):
599
            filter_set = self.compile(node.children[1])
600 601 602 603
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly on symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()
604

605 606 607
        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key,
608
                            ', '.join(list(self.directives.keys()))))
609 610
        return ""

611

612
    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str]=[]) -> str:
613 614
        """
        Compiles any non-terminal, where `parser_class` indicates the Parser class
615 616
        name for the particular non-terminal.
        """
617
        arguments = [self.compile(r) for r in node.children] + custom_args
618 619
        return parser_class + '(' + ', '.join(arguments) + ')'

620

621
    def on_expression(self, node) -> str:
622 623
        return self.non_terminal(node, 'Alternative')

624

625
    def on_term(self, node) -> str:
626
        return self.non_terminal(node, 'Series')
627

628

629
    def on_factor(self, node: Node) -> str:
        """Compile a ``factor`` node that carries a prefix.

        The first child is the prefix (a flow marker or retrieve operator),
        which is looked up in ``PREFIX_TABLE`` to select the parser class
        wrapped around the remaining children. For retrieve operators the
        retrieved symbol is recorded in ``self.variables``.

        Returns:
            the parser expression; the raw result of the argument if a
            retrieve operator is applied to a non-symbol; an empty string
            if a ``KeyError`` occurs (reported as unknown prefix).
        """
        assert node.children
        assert len(node.children) >= 2, node.as_sxpr()
        prefix = str(node.children[0])
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
            self.variables.add(str(arg))

        elif len(node.children) > 2:
            # More than one child follows the prefix: fold children[1:]
            # into children[1], which takes over this node's parser, so the
            # prefix ends up wrapping exactly one child.
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        # strip the prefix; only the argument(s) remain to be compiled
        node.result = node.children[1:]
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            # NOTE(review): a KeyError raised while compiling the children
            # is also caught below and reported as an unknown prefix.
            return self.non_terminal(node, parser_class, custom_args)
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
        return ""
661

662

663
    def on_option(self, node) -> str:
664 665
        return self.non_terminal(node, 'Optional')

666

667
    def on_repetition(self, node) -> str:
668 669
        return self.non_terminal(node, 'ZeroOrMore')

670

671
    def on_oneormore(self, node) -> str:
672 673
        return self.non_terminal(node, 'OneOrMore')

674

675
    def on_regexchain(self, node) -> str:
        """Not implemented; always raises ``EBNFCompilerError``."""
        raise EBNFCompilerError("Not yet implemented!")

678

679
    def on_group(self, node) -> str:
        """Always raises ``EBNFCompilerError``: ``group`` nodes are expected
        to be removed by the AST transformation before compilation."""
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")

683

684 685 686
    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
687
            return 'PreprocessorToken("' + symbol + '")'
688
        else:
689 690 691 692 693 694
            self.current_symbols.append(node)
            if symbol not in self.symbols:
                self.symbols[symbol] = node
            if symbol in self.rules:
                self.recursive.add(symbol)
            return symbol
695

696

697
    def on_literal(self, node) -> str:
698
        return 'Token(' + str(node).replace('\\', r'\\') + ')'  # return 'Token(' + ', '.join([node.result]) + ')' ?
699

700

701
    def on_regexp(self, node: Node) -> str:
702
        rx = str(node)
703
        name = []   # type: List[str]
704 705
        if rx[:2] == '~/':
            if not 'left' in self.directives['literalws']:
706
                name = ['wL=' + self.WHITESPACE_KEYWORD] + name
707 708 709 710
            rx = rx[1:]
        elif 'left' in self.directives['literalws']:
            name = ["wL=''"] + name
        if rx[-2:] == '/~':
Eckhart Arnold's avatar
Eckhart Arnold committed
711
            if 'right' not in self.directives['literalws']:
712
                name = ['wR=' + self.WHITESPACE_KEYWORD] + name
713 714 715 716 717 718 719
            rx = rx[:-1]
        elif 'right' in self.directives['literalws']:
            name = ["wR=''"] + name
        try:
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + \
720
                     node.as_sxpr()
721 722 723 724
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return 'RE(' + ', '.join([arg] + name) + ')'

725

726
    def on_list_(self, node) -> Set[str]:
727
        assert node.children
728
        return set(item.result.strip() for item in node.children)
729 730


731
def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
    """Return the shared ``EBNFCompiler`` instance, creating it on first use.

    An existing instance is re-targeted via ``set_grammar_name`` with the
    given arguments. The instance is cached in the module-level global
    ``thread_local_ebnf_compiler_singleton``. Despite the name, this is a
    plain ``global``; nothing in this function makes it thread-local.
    """
    global thread_local_ebnf_compiler_singleton
    try:
        compiler = thread_local_ebnf_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        # First call: the global does not exist yet.
        thread_local_ebnf_compiler_singleton = EBNFCompiler(grammar_name, grammar_source)
        return thread_local_ebnf_compiler_singleton