ebnf.py 28.8 KB
Newer Older
1
"""ebnf.py - EBNF -> Python-Parser compilation for DHParser
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences an Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20
from functools import partial
21 22 23 24
try:
    import regex as re
except ImportError:
    import re
25
from typing import Callable, cast, List, Set, Tuple
26

27 28 29 30 31
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
from DHParser.parsers import Grammar, mixin_comment, nil_scanner, Forward, RE, NegativeLookahead, \
    Alternative, Sequence, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
    ScannerFunc
from DHParser.syntaxtree import Node, traverse, remove_enclosing_delimiters, reduce_single_child, \
32
    replace_by_single_child, TOKEN_PTYPE, remove_expendables, remove_tokens, flatten, \
33 34
    forbid, assert_content, WHITESPACE_PTYPE, key_tag_name, TransformerFunc
from DHParser.versionnumber import __version__
35 36


37 38 39 40 41
__all__ = ['get_ebnf_scanner',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
42
           'EBNFTransformer',
Eckhart Arnold's avatar
Eckhart Arnold committed
43
           'EBNFCompilerError',
44
           'EBNFCompiler',
45 46 47 48 49
           'grammar_changed',
           'ScannerFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc']
50 51


Eckhart Arnold's avatar
Eckhart Arnold committed
52 53 54 55 56 57 58
########################################################################
#
# EBNF scanning
#
########################################################################


59
def get_ebnf_scanner() -> ScannerFunc:
Eckhart Arnold's avatar
Eckhart Arnold committed
60 61 62 63 64 65 66 67 68 69
    return nil_scanner


########################################################################
#
# EBNF parsing
#
########################################################################


70
class EBNFGrammar(Grammar):
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
89
                | [flowmarker] regexchain
90 91 92 93 94 95 96 97 98
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
99
    regexchain =  ">" expression §"<"                # compiles "expression" into a singular regular expression
100 101
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
102 103
    option     =  "[" expression §"]"

104 105 106 107 108 109
    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
110
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
111 112 113 114
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an exmaple
    EOF =  !/./
    """
    expression = Forward()
115
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
116
    parser_initialization__ = "upon instantiation"
117 118
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
119
    wspL__ = ''
120
    wspR__ = WSP__
121
    EOF = NegativeLookahead(RE('.', wR=''))
122
    list_ = Sequence(RE('\\w+'), ZeroOrMore(Sequence(Token(","), RE('\\w+'))))
123 124 125
    regexp = RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
126
    option = Sequence(Token("["), expression, Required(Token("]")))
127 128
    repetition = Sequence(Token("{"), expression, Required(Token("}")))
    oneormore = Sequence(Token("{"), expression, Token("}+"))
129
    regexchain = Sequence(Token("<"), expression, Required(Token(">")))
130 131 132 133 134
    group = Sequence(Token("("), expression, Required(Token(")")))
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
    factor = Alternative(Sequence(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Sequence(Optional(flowmarker), literal), Sequence(Optional(flowmarker), regexp),
135 136
                         Sequence(Optional(flowmarker), group), Sequence(Optional(flowmarker), regexchain),
                         Sequence(Optional(flowmarker), oneormore), repetition, option)
137 138 139 140
    term = OneOrMore(factor)
    expression.set(Sequence(term, ZeroOrMore(Sequence(Token("|"), term))))
    directive = Sequence(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
    definition = Sequence(symbol, Required(Token("=")), expression)
141
    syntax = Sequence(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
142 143 144
    root__ = syntax


145
def grammar_changed(grammar_class, grammar_source: str) -> bool:
Eckhart Arnold's avatar
Eckhart Arnold committed
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
165
        m = re.search('class \w*\(Grammar\)', pycode)
Eckhart Arnold's avatar
Eckhart Arnold committed
166 167 168 169 170 171 172 173 174 175
        if m:
            m = re.search('    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            return True
    else:
        return chksum != grammar_class.source_hash__


176
def get_ebnf_grammar() -> EBNFGrammar:
Eckhart Arnold's avatar
Eckhart Arnold committed
177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
    global thread_local_ebnf_grammar_singleton
    try:
        grammar = thread_local_ebnf_grammar_singleton
        return grammar
    except NameError:
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
        return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


193 194
#TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrival?!?

195
EBNF_transformation_table = {
196 197 198 199 200
    # AST Transformations for EBNF-grammar
    "syntax":
        remove_expendables,
    "directive, definition":
        partial(remove_tokens, tokens={'@', '='}),
Eckhart Arnold's avatar
Eckhart Arnold committed
201
    "expression":
202
        [replace_by_single_child, flatten,
Eckhart Arnold's avatar
Eckhart Arnold committed
203
         partial(remove_tokens, tokens={'|'})],
204 205 206 207 208
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
209 210 211
        [remove_enclosing_delimiters, replace_by_single_child],
    "oneormore, repetition, option, regexchain":
        [reduce_single_child, remove_enclosing_delimiters],
212
    "symbol, literal, regexp":
213
        [remove_expendables, reduce_single_child],
214
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
215
        [remove_expendables, reduce_single_child],
216
    "list_":
217
        [flatten, partial(remove_tokens, tokens={','})],
218
    "*":
219 220 221
        [remove_expendables, replace_by_single_child]
}

222

223
EBNF_validation_table = {
224 225
    # Semantic validation on the AST
    "repetition, option, oneormore":
226
        [partial(forbid, child_tags=['repetition', 'option', 'oneormore']),
227 228
         partial(assert_content, regex=r'(?!§)')],
}
229

230

231
def EBNFTransformer(syntax_tree: Node):
232
    for processing_table, key_func in [(EBNF_transformation_table, key_tag_name),
233
                                       (EBNF_validation_table, key_tag_name)]:
234
        traverse(syntax_tree, processing_table, key_func)
di68kap's avatar
di68kap committed
235 236


237
def get_ebnf_transformer() -> TransformerFunc:
238
    return EBNFTransformer
Eckhart Arnold's avatar
Eckhart Arnold committed
239 240 241 242 243 244 245 246


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

247 248 249 250 251 252 253

ScannerFactoryFunc = Callable[[], ScannerFunc]
ParserFactoryFunc = Callable[[], Grammar]
TransformerFactoryFunc = Callable[[], TransformerFunc]
CompilerFactoryFunc = Callable[[], Compiler]


254
SCANNER_FACTORY = '''
255
def get_scanner():
256 257 258 259 260
    return {NAME}Scanner
'''


GRAMMAR_FACTORY = '''
261
def get_grammar():
262 263 264 265 266 267 268 269 270 271 272
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


TRANSFORMER_FACTORY = '''
273
def get_transformer():
274 275 276 277 278
    return {NAME}Transform
'''


COMPILER_FACTORY = '''
279
def get_compiler(grammar_name="{NAME}", grammar_source=""):
280 281 282 283 284 285 286 287 288 289 290
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton 
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
291

292 293 294 295 296 297
class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
    in the strict sense, see `CompilationError` below)"""
    pass


298
class EBNFCompiler(Compiler):
299 300 301 302
    """Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.
    """
    COMMENT_KEYWORD = "COMMENT__"
303 304
    WHITESPACE_KEYWORD = "WSP__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}
305 306
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneuos AST transformation."
307 308 309 310
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
311 312 313
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}
314

315
    def __init__(self, grammar_name="", grammar_source=""):
Eckhart Arnold's avatar
Eckhart Arnold committed
316
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
317 318 319
        self._reset()

    def _reset(self):
320 321 322 323 324 325 326
        self._result = ''           # type: str
        self.rules = set()          # type: Set[str]
        self.variables = set()      # type: Set[str]
        self.symbol_nodes = []      # type: List[Node]
        self.definition_names = []  # type: List[str]
        self.recursive = set()      # type: Set[str]
        self.root = ""              # type: str
327
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
328
                           'comment': '',
329
                           'literalws': ['right'],
330
                           'tokens': set(),     # alt. 'scanner_tokens'
331
                           'filter': dict()}     # alt. 'retrieve_filter'
332

Eckhart Arnold's avatar
Eckhart Arnold committed
333
    @property
334
    def result(self) -> str:
Eckhart Arnold's avatar
Eckhart Arnold committed
335 336
        return self._result

337
    def gen_scanner_skeleton(self) -> str:
338
        name = self.grammar_name + "Scanner"
339 340
        return "def %s(text):\n    return text\n" % name \
               + SCANNER_FACTORY.format(NAME=self.grammar_name)
341

342
    def gen_transformer_skeleton(self) -> str:
343
        if not self.definition_names:
Eckhart Arnold's avatar
Eckhart Arnold committed
344 345
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
346 347
        tt_name = self.grammar_name + '_AST_transformation_table'
        tf_name = self.grammar_name + 'Transform'
di68kap's avatar
di68kap committed
348
        transtable = [tt_name + ' = {',
349 350 351
                      '    # AST Transformations for the ' +
                      self.grammar_name + '-grammar']
        for name in self.definition_names:
di68kap's avatar
di68kap committed
352
            transtable.append('    "' + name + '": no_operation,')
353
        transtable += ['    "*": no_operation', '}', '', tf_name +
354
                       ' = partial(traverse, processing_table=%s)' % tt_name, '']
355
        transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
356 357
        return '\n'.join(transtable)

358
    def gen_compiler_skeleton(self) -> str:
359 360 361
        if not self.definition_names:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
362
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
363 364 365 366
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
Eckhart Arnold's avatar
Eckhart Arnold committed
367
                    self.grammar_name + '", grammar_source=""):',
368
                    '        super(' + self.grammar_name +
Eckhart Arnold's avatar
Eckhart Arnold committed
369
                    'Compiler, self).__init__(grammar_name, grammar_source)',
370 371
                    "        assert re.match('\w+\Z', grammar_name)", '']
        for name in self.definition_names:
372
            method_name = Compiler.derive_method_name(name)
373
            if name == self.root:
374
                compiler += ['    def ' + method_name + '(self, node: Node) -> str:',
375 376
                             '        return node', '']
            else:
377
                compiler += ['    def ' + method_name + '(self, node: Node) -> str:',
378
                             '        pass', '']
379
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
380
        return '\n'.join(compiler)
381

382
    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
383
        # fix capture of variables that have been defined before usage [sic!]
384

385 386 387
        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
388
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])
389 390

        self.definition_names = [defn[0] for defn in definitions]
391
        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
Eckhart Arnold's avatar
Eckhart Arnold committed
392
                            if 'right' in self.directives['literalws'] else "''"))
393
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
Eckhart Arnold's avatar
Eckhart Arnold committed
394
                            if 'left' in self.directives['literalws'] else "''"))
395
        definitions.append((self.WHITESPACE_KEYWORD,
396 397 398 399 400 401 402
                            ("mixin_comment(whitespace="
                             "r'{whitespace}', comment=r'{comment}')").
                            format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class
403

404
        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
405
        declarations = ['class ' + self.grammar_name +
406
                        'Grammar(Grammar):',
407 408
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
409
                        (', with this grammar:' if self.grammar_source else '.')]
410
        definitions.append(('parser_initialization__', '"upon instatiation"'))
411
        if self.grammar_source:
412
            definitions.append(('source_hash__',
413
                                '"%s"' % md5(self.grammar_source, __version__)))
414
            declarations.append('')
415
            declarations += [line for line in self.grammar_source.split('\n')]
416 417 418 419
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

420
        # add default functions for retrieve_filter filters of pop or retrieve operators
421

422 423 424
        # for symbol, fun in self.directives['filter']:
        #     declarations.append(symbol + '_filter = lambda value: value.replace("(", ")")'
        #                         '.replace("[", "]").replace("{", "}").replace(">", "<")')
425

426
        # turn definitions into declarations in reverse order
427

428 429 430 431 432 433 434 435 436
        self.root = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]
437
        known_symbols = self.rules | self.RESERVED_SYMBOLS
438
        for nd in self.symbol_nodes:
439
            if nd.result not in known_symbols:
440
                nd.add_error("Missing production for symbol '%s'" % nd.result)
441
                root_node.error_flag = True
442
        if self.root and 'root__' not in self.rules:
443 444
            declarations.append('root__ = ' + self.root)
        declarations.append('')
Eckhart Arnold's avatar
Eckhart Arnold committed
445 446 447
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result
448

449
    def on_syntax(self, node: Node) -> str:
450 451 452 453
        self._reset()
        definitions = []

        # drop the wrapping sequence node
454 455
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]
456 457

        # compile definitions and directives and collect definitions
458
        for nd in node.children:
459
            if nd.parser.name == "definition":
460
                definitions.append(self._compile(nd))
461 462
            else:
                assert nd.parser.name == "directive", nd.as_sexpr()
463
                self._compile(nd)
464
                node.error_flag = node.error_flag or nd.error_flag
465

466
        return self.assemble_parser(definitions, node)
467

468 469
    def on_definition(self, node: Node) -> Tuple[str, str]:
        rule = cast(str, node.children[0].result)
470 471 472
        if rule in self.rules:
            node.add_error('A rule with name "%s" has already been defined.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
473 474 475 476
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           ' end with a doube underscore "__".' % rule)
477
        elif rule in self.directives['tokens']:
478 479 480 481 482 483 484
            node.add_error('Symbol "%s" has already been defined as '
                           'a scanner token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the furute.)')
        try:
            self.rules.add(rule)
485
            defn = self._compile(node.children[1])
486
            if rule in self.variables:
487
                defn = 'Capture(%s)' % defn
488 489 490 491 492
                self.variables.remove(rule)
        except TypeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + node.as_sexpr()
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
Eckhart Arnold's avatar
Eckhart Arnold committed
493
        return rule, defn
494 495

    @staticmethod
496
    def _check_rx(node: Node, rx: str) -> str:
497 498 499 500 501 502 503 504 505 506 507 508
        """Checks whether the string `rx` represents a valid regular
        expression. Makes sure that multiline regular expressions are
        prepended by the multiline-flag. Returns the regular expression string.
        """
        rx = rx if rx.find('\n') < 0 or rx[0:4] == '(?x)' else '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

509 510
    def on_directive(self, node: Node) -> str:
        key = cast(str, node.children[0].result).lower()
511
        assert key not in self.directives['tokens']
512
        if key in {'comment', 'whitespace'}:
513 514
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
Eckhart Arnold's avatar
Eckhart Arnold committed
515
                    node.add_error('Directive "%s" must have one, but not %i values.' %
516 517
                                   (key, len(node.children[1].result)))
                value = self._compile(node.children[1]).pop()
518 519
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
520
                else:
521
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
522
            else:
523 524
                value = cast(str, node.children[1].result).strip("~")
                if value != cast(str, node.children[1].result):
525 526 527 528 529 530
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
531 532 533
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
534
            self.directives[key] = value
535

536
        elif key == 'literalws':
537
            value = {item.lower() for item in self._compile(node.children[1])}
538
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
Eckhart Arnold's avatar
Eckhart Arnold committed
539
                    or ('none' in value and len(value) > 1)):
540 541 542 543 544 545 546
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            ws = {'left', 'right'} if 'both' in value \
                else {} if 'none' in value else value
            self.directives[key] = list(ws)

547
        elif key in {'tokens', 'scanner_tokens'}:
548
            self.directives['tokens'] |= self._compile(node.children[1])
549

550
        elif key.endswith('_filter'):
551
            filter_set = self._compile(node.children[1])
552 553 554 555
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly on symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()
556

557 558 559
        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key,
560
                            ', '.join(list(self.directives.keys()))))
561 562
        return ""

563
    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str]=[]) -> str:
564 565 566
        """Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.
        """
567
        arguments = [self._compile(r) for r in node.children] + custom_args
568 569
        return parser_class + '(' + ', '.join(arguments) + ')'

570
    def on_expression(self, node) -> str:
571 572
        return self.non_terminal(node, 'Alternative')

573
    def on_term(self, node) -> str:
574 575
        return self.non_terminal(node, 'Sequence')

576
    def on_factor(self, node: Node) -> str:
577
        assert node.children
578 579 580
        assert len(node.children) >= 2, node.as_sexpr()
        prefix = cast(str, node.children[0].result)
        custom_args = []  # type: List[str]
581 582

        if prefix in {'::', ':'}:
583 584
            assert len(node.children) == 2
            arg = node.children[-1]
585
            if arg.parser.name != 'symbol':
Eckhart Arnold's avatar
Eckhart Arnold committed
586
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
587 588
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
589 590
            if str(arg) in self.directives['filter']:
                custom_args = ['retrieve_filter=%s' % self.directives['filter'][str(arg)]]
591
            self.variables.add(cast(str, arg.result))
592

593
        elif len(node.children) > 2:
594 595
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
596 597 598 599
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])
600

601
        node.result = node.children[1:]
602 603 604 605 606
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            return self.non_terminal(node, parser_class, custom_args)
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
607
        return ""
608

609
    def on_option(self, node) -> str:
610 611
        return self.non_terminal(node, 'Optional')

612
    def on_repetition(self, node) -> str:
613 614
        return self.non_terminal(node, 'ZeroOrMore')

615
    def on_oneormore(self, node) -> str:
616 617
        return self.non_terminal(node, 'OneOrMore')

618
    def on_regexchain(self, node) -> str:
619 620
        raise EBNFCompilerError("Not yet implemented!")

621
    def on_group(self, node) -> str:
622 623 624
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")

625 626 627 628
    def on_symbol(self, node: Node) -> str:
        result = cast(str, node.result)
        if result in self.directives['tokens']:
            return 'ScannerToken("' + result + '")'
629
        else:
630
            self.symbol_nodes.append(node)
631 632 633
            if result in self.rules:
                self.recursive.add(result)
            return result
634

635 636
    def on_literal(self, node) -> str:
        return 'Token(' + cast(str, node.result).replace('\\', r'\\') + ')'  # return 'Token(' + ', '.join([node.result]) + ')' ?
637

638 639 640
    def on_regexp(self, node: Node) -> str:
        rx = cast(str, node.result)
        name = []   # type: List[str]
641 642
        if rx[:2] == '~/':
            if not 'left' in self.directives['literalws']:
643
                name = ['wL=' + self.WHITESPACE_KEYWORD] + name
644 645 646 647
            rx = rx[1:]
        elif 'left' in self.directives['literalws']:
            name = ["wL=''"] + name
        if rx[-2:] == '/~':
Eckhart Arnold's avatar
Eckhart Arnold committed
648
            if 'right' not in self.directives['literalws']:
649
                name = ['wR=' + self.WHITESPACE_KEYWORD] + name
650 651 652 653 654 655 656 657 658 659 660 661
            rx = rx[:-1]
        elif 'right' in self.directives['literalws']:
            name = ["wR=''"] + name
        try:
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + \
                     node.as_sexpr()
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return 'RE(' + ', '.join([arg] + name) + ')'

662
    def on_list_(self, node) -> Set[str]:
663
        assert node.children
664
        return set(item.result.strip() for item in node.children)
665 666


667
def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
Eckhart Arnold's avatar
Eckhart Arnold committed
668 669 670 671 672 673 674 675
    global thread_local_ebnf_compiler_singleton
    try:
        compiler = thread_local_ebnf_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_ebnf_compiler_singleton = EBNFCompiler(grammar_name, grammar_source)
        return thread_local_ebnf_compiler_singleton