Starting from 2021-07-01, all LRZ GitLab users will be required to explicitly accept the GitLab Terms of Service. Please see the detailed information at https://doku.lrz.de/display/PUBLIC/GitLab and make sure that your projects conform to the requirements.

ebnf.py 31.6 KB
Newer Older
1
"""ebnf.py - EBNF -> Python-Parser compilation for DHParser
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences an Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20 21
from collections import OrderedDict

22 23 24 25
try:
    import regex as re
except ImportError:
    import re
26 27 28 29
try:
    from typing import Callable, Dict, List, Set, Tuple
except ImportError:
    from .typing34 import Callable, Dict, List, Set, Tuple
30

31
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
32
from DHParser.parser import Grammar, mixin_comment, nil_preprocessor, Forward, RE, NegativeLookahead, \
33
    Alternative, Series, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
34
    PreprocessorFunc
35 36 37
from DHParser.syntaxtree import WHITESPACE_PTYPE, TOKEN_PTYPE, Node, TransformationFunc
from DHParser.transform import traverse, remove_brackets, \
    reduce_single_child, replace_by_single_child, remove_expendables, \
38
    remove_tokens, flatten, forbid, assert_content, key_tag_name, remove_infix_operator
39
from DHParser.versionnumber import __version__
40

41
__all__ = ('get_ebnf_preprocessor',
42 43 44 45
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
46
           'EBNFTransformer',
Eckhart Arnold's avatar
Eckhart Arnold committed
47
           'EBNFCompilerError',
48
           'EBNFCompiler',
49
           'grammar_changed',
50
           'PreprocessorFactoryFunc',
51 52
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
53
           'CompilerFactoryFunc')
54 55


Eckhart Arnold's avatar
Eckhart Arnold committed
56 57 58 59 60 61 62
########################################################################
#
# EBNF scanning
#
########################################################################


63 64
def get_ebnf_preprocessor() -> PreprocessorFunc:
    """Return the preprocessor for EBNF sources: the no-op
    ``nil_preprocessor``, since EBNF needs no preprocessing."""
    return nil_preprocessor
Eckhart Arnold's avatar
Eckhart Arnold committed
65 66 67 68 69 70 71 72


########################################################################
#
# EBNF parsing
#
########################################################################

73

74
class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an exmaple
    EOF =  !/./
    """
    # NOTE(review): the attributes below appear to be machine-generated from
    # the grammar in the docstring (cf. source_hash__) — prefer regenerating
    # over hand-editing. TODO confirm against the project's build process.
    expression = Forward()
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
    wspL__ = ''
    wspR__ = WSP__
    EOF = NegativeLookahead(RE('.', wR=''))
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
    regexp = RE(r'~?/(?:\\/|[^/])*?/~?')  # RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Series(Token("["), expression, Required(Token("]")))
    repetition = Series(Token("{"), expression, Required(Token("}")))
    oneormore = Series(Token("{"), expression, Token("}+"))
    group = Series(Token("("), expression, Required(Token(")")))
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
    factor = Alternative(Series(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Series(Optional(flowmarker), literal), Series(Optional(flowmarker), regexp),
                         Series(Optional(flowmarker), group), Series(Optional(flowmarker), oneormore),
                         repetition, option)
    term = OneOrMore(factor)
    expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
    directive = Series(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
    definition = Series(symbol, Required(Token("=")), expression)
    syntax = Series(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
    root__ = syntax


146
def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class is a file name: read the compiler suite and compare
        # the embedded source_hash__ with the checksum of the grammar source.
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
        # raw strings: '\w' and '\(' are invalid escape sequences in
        # plain string literals (DeprecationWarning since Python 3.6)
        m = re.search(r'class \w*\(Grammar\)', pycode)
        if m:
            m = re.search(r'    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            return True
    else:
        return chksum != grammar_class.source_hash__


177
def get_ebnf_grammar() -> EBNFGrammar:
    """Return the module-wide ``EBNFGrammar`` instance, creating it
    lazily on the first call."""
    global thread_local_ebnf_grammar_singleton
    if 'thread_local_ebnf_grammar_singleton' not in globals():
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
    return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


194
# Maps node tag-names (plus the match-all keys "+" and "*") to the
# transformations applied in place while traversing an EBNF syntax tree.
EBNF_transformation_table = {
    # AST Transformations for EBNF-grammar
    "+":
        remove_expendables,
    "syntax":
        [],  # otherwise '"*": replace_by_single_child' would be applied
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],  # remove_infix_operator],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_brackets, replace_by_single_child],
    "oneormore, repetition, option":
        [reduce_single_child, remove_brackets,
         forbid('repetition', 'option', 'oneormore'), assert_content(r'(?!§)')],
    "symbol, literal, regexp":
        reduce_single_child,
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        reduce_single_child,
    "list_":
        [flatten, remove_infix_operator],
    "*":
        replace_by_single_child
}

223

224
def EBNFTransformer(syntax_tree: Node):
    """Transform the concrete syntax tree of an EBNF source in place,
    using the transformations in ``EBNF_transformation_table``."""
    traverse(syntax_tree, EBNF_transformation_table, key_tag_name)
di68kap's avatar
di68kap committed
226 227


228
def get_ebnf_transformer() -> TransformationFunc:
    """Return the AST-transformation function for EBNF syntax trees."""
    return EBNFTransformer
Eckhart Arnold's avatar
Eckhart Arnold committed
230 231 232 233 234 235 236 237


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

238

239
# Signatures of the four factory functions that a generated compiler
# suite exposes (preprocessor, grammar, transformer, compiler).
PreprocessorFactoryFunc = Callable[[], PreprocessorFunc]
ParserFactoryFunc = Callable[[], Grammar]
TransformerFactoryFunc = Callable[[], TransformationFunc]
CompilerFactoryFunc = Callable[[], Compiler]

244 245 246
PREPROCESSOR_FACTORY = '''
def get_preprocessor() -> PreprocessorFunc:
    return {NAME}Preprocessor
247 248 249 250
'''


GRAMMAR_FACTORY = '''
251
def get_grammar() -> {NAME}Grammar:
252 253 254 255 256 257 258 259 260 261 262
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


TRANSFORMER_FACTORY = '''
263
def get_transformer() -> TransformationFunc:
264 265 266 267 268
    return {NAME}Transform
'''


COMPILER_FACTORY = '''
269
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
270 271 272 273 274 275 276 277 278 279 280
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton 
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
281

282 283
class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
    in the strict sense, see `CompilationError` in module ``dsl.py``)"""


288 289
#TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrival?!? Is this possible at compile time?

290
class EBNFCompiler(Compiler):
    """
    Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.
    """
    # field names of the implicit comment- and whitespace-regexes
    # emitted into the generated Grammar class
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}
    # fix: "erroneuos" -> "erroneous" in the user-facing error message
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneous AST transformation."
    # maps EBNF prefix operators to the Parser-class names that implement them
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    # named whitespace regexes selectable via the @whitespace-directive
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}

308

309
    def __init__(self, grammar_name="", grammar_source=""):
        """Create a compiler for the grammar called ``grammar_name`` with
        EBNF source ``grammar_source`` and initialize all state."""
        super().__init__(grammar_name, grammar_source)
        self._reset()

313

314
    def _reset(self):
        """Clear all compilation state so the compiler can be reused for
        another grammar."""
        self._result = ''           # type: str
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
        self.variables = set()      # type: Set[str]
        # self.definitions = []     # type: List[Tuple[str, str]]
        self.recursive = set()      # type: Set[str]
        self.deferred_tasks = []    # type: List[Callable]
        self.root = ""              # type: str
        # directive values, pre-populated with their defaults
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
                           'comment': '',
                           'literalws': ['right'],
                           'tokens': set(),  # alt. 'preprocessor_tokens'
                           'filter': dict(),  # alt. 'filter'
                           'testing': False}
330

Eckhart Arnold's avatar
Eckhart Arnold committed
331
    @property
    def result(self) -> str:
        """The Python source code produced by the last compilation run
        (empty string before any grammar has been compiled)."""
        return self._result

335
    # methods for generating skeleton code for preprocessor, transformer, and compiler
336

337 338
    def gen_preprocessor_skeleton(self) -> str:
        """Return stub source code for a no-op preprocessor function plus
        its factory, both named after the grammar."""
        name = self.grammar_name + "Preprocessor"
        stub = "def %s(text):\n    return text\n" % name
        return stub + PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)
341

342

343
    def gen_transformer_skeleton(self) -> str:
        """Return stub source code for the AST-transformation table and
        transformer factory of the last compiled grammar.

        Raises:
            EBNFCompilerError: if no grammar has been compiled yet.
        """
        if not self.rules:
            # fix: message previously named a nonexistent
            # "gen_transformer_Skeleton()" method
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_skeleton()"!')
        tt_name = self.grammar_name + '_AST_transformation_table'
        tf_name = self.grammar_name + 'Transform'
        transtable = [tt_name + ' = {',
                      '    # AST Transformations for the ' +
                      self.grammar_name + '-grammar']
        transtable.append('    "+": remove_empty,')
        # one (initially empty) entry per grammar rule
        for name in self.rules:
            transtable.append('    "' + name + '": [],')
        transtable.append('    ":Token, :RE": reduce_single_child,')
        transtable += ['    # "*": replace_by_single_child', '}', '', tf_name +
                       ' = partial(traverse, processing_table=%s)' % tt_name, '']
        transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(transtable)

361

362
    def gen_compiler_skeleton(self) -> str:
        """Return stub source code for a Compiler class of the last
        compiled grammar, with one visitor method per grammar rule.

        Raises:
            EBNFCompilerError: if no grammar has been compiled yet.
        """
        if not self.rules:
            # fix: message previously named a nonexistent
            # "gen_Compiler_Skeleton()" method
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_compiler_skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    # raw string: '\w' and '\Z' are invalid escape sequences
                    # in a plain string literal (DeprecationWarning)
                    r"        assert re.match('\w+\Z', grammar_name)", '']
        for name in self.rules:
            method_name = Compiler.method_name(name)
            if name == self.root:
                # the root rule's visitor returns the node unchanged
                compiler += ['    def ' + method_name + '(self, node):',
                             '        return node', '']
            else:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        pass', '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)
385

386

387 388 389 390 391
    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """
        Creates the Python code for the parser after compilation of
        the EBNF-Grammar

        Parameters:
            definitions:  (symbol, python-expression) pairs, one per
                compiled grammar rule, in grammar order
            root_node:  root node of the compiled syntax tree; its error
                flag is set if undefined symbols are detected

        Returns:
            The Python source of the generated Grammar class together
            with its factory function.
        """

        # execute deferred tasks, for example semantic checks that cannot
        # be done before the symbol table is complete

        for task in self.deferred_tasks:
            task()

        # provide for capturing of symbols that are variables, i.e. the
        # value of will be retrieved at some point during the parsing process

        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])

        # add special fields for Grammar class

        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in self.directives['literalws'] else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in self.directives['literalws'] else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace="
                             "r'{whitespace}', comment=r'{comment}')").
                            format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations += [line for line in self.grammar_source.split('\n')]
            # strip trailing blank lines from the embedded grammar source
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order

        self.root = definitions[0][0] if definitions else ""
        definitions.reverse()
        # recursive symbols need a Forward()-declaration before use
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
                root_node.error_flag = True

        # check for unconnected rules

        if not self.directives['testing']:
            defined_symbols.difference_update(self.RESERVED_SYMBOLS)

            def remove_connections(symbol):
                # recursively un-mark every rule reachable from `symbol`;
                # what remains in `defined_symbols` is unconnected
                if symbol in defined_symbols:
                    defined_symbols.remove(symbol)
                    for related in self.rules[symbol][1:]:
                        remove_connections(str(related))

            remove_connections(self.root)
            for leftover in defined_symbols:
                self.rules[leftover][0].add_error(('Rule "%s" is not connected to parser '
                    'root "%s" !') % (leftover, self.root) + ' (Use directive "@testing=True" '
                    'to supress this error message.)')

        # set root parser and assemble python grammar definition

        if self.root and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result
483

484 485 486

    ## compilation methods

487
    def on_syntax(self, node: Node) -> str:
        """Compile the root ``syntax``-node: compile all contained
        definitions and directives, then assemble and return the parser
        source code."""
        self._reset()
        definitions = []  # type: List[Tuple[str, str]]

        # drop the wrapping sequence node
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]

        # compile definitions and directives and collect definitions
        for nd in node.children:
            if nd.parser.name == "definition":
                definitions.append(self.compile(nd))
            else:
                assert nd.parser.name == "directive", nd.as_sxpr()
                self.compile(nd)
                node.error_flag = node.error_flag or nd.error_flag

        return self.assemble_parser(definitions, node)
505

506

507
    def on_definition(self, node: Node) -> Tuple[str, str]:
        """Compile a single grammar-rule definition.

        Returns:
            A (rule-name, python-expression) pair. On a badly structured
            syntax tree the pair ("<name>:error", error-message) is
            returned instead.
        """
        rule = str(node.children[0])
        # validate the rule name before registering it
        if rule in self.rules:
            node.add_error('A rule with name "%s" has already been defined.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            # fix: typo "doube" -> "double" and stray double space
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           'end with a double underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            node.add_error('Symbol "%s" has already been defined as '
                           'a preprocessor token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the future.)')
        try:
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
            defn = self.compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
        except TypeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + node.as_sxpr()
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
        return rule, defn
537

538

539
    @staticmethod
    def _check_rx(node: Node, rx: str) -> str:
        """
        Checks whether the string `rx` represents a valid regular
        expression. Makes sure that multiline regular expressions are
        prepended by the multiline-flag. Returns the regular expression string.
        """
        if rx.find('\n') >= 0 and not rx.startswith('(?x)'):
            rx = '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

554

555
    def on_directive(self, node: Node) -> str:
        """Process a ``directive``-node (e.g. ``@whitespace = vertical``)
        by updating ``self.directives``. Returns an empty string, since
        directives themselves produce no parser code."""
        key = str(node.children[0]).lower()
        assert key not in self.directives['tokens']

        if key in {'comment', 'whitespace'}:
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
                    node.add_error('Directive "%s" must have one, but not %i values.' %
                                   (key, len(node.children[1].result)))
                value = self.compile(node.children[1]).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
            self.directives[key] = value

        elif key == 'testing':
            value = str(node.children[1])
            self.directives['testing'] = value.lower() not in {"off", "false", "no"}

        elif key == 'literalws':
            value = {item.lower() for item in self.compile(node.children[1])}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            ws = {'left', 'right'} if 'both' in value \
                else {} if 'none' in value else value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'preprocessor_tokens'}:
            self.directives['tokens'] |= self.compile(node.children[1])

        elif key.endswith('_filter'):
            filter_set = self.compile(node.children[1])
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                # fix: typo "exactly on symbol" -> "exactly one symbol"
                node.add_error('Directive "%s" accepts exactly one symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key,
                            ', '.join(list(self.directives.keys()))))
        return ""

614

615
    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str] = None) -> str:
        """
        Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.

        Parameters:
            node:  node whose children are compiled into the arguments of
                the parser-class call
            parser_class:  name of the Parser class to emit
            custom_args:  optional extra argument strings appended to the
                call (defaults to none)

        Returns:
            Python source code for the parser-class call.
        """
        # avoid a mutable default argument; None stands in for "no extras"
        if custom_args is None:
            custom_args = []
        arguments = [self.compile(r) for r in node.children] + custom_args
        return parser_class + '(' + ', '.join(arguments) + ')'

623

624
    def on_expression(self, node) -> str:
        """Compile an ``expression``-node into an ``Alternative``-parser call."""
        return self.non_terminal(node, 'Alternative')

627

628
    def on_term(self, node) -> str:
        """Compile a ``term``-node into a ``Series``-parser call."""
        return self.non_terminal(node, 'Series')
630

631

632
    def on_factor(self, node: Node) -> str:
        """Compile a ``factor``-node, i.e. an expression with a prefix
        operator such as ``!symbol`` or ``::symbol``, into a call of the
        Parser class given by ``PREFIX_TABLE``."""
        assert node.children
        assert len(node.children) >= 2, node.as_sxpr()
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            # pop-/retrieve-operators require a plain symbol as argument
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
            self.variables.add(str(arg))  # cast(str, arg.result)

        elif len(node.children) > 2:
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
            # fold the trailing children into a single child so that the
            # prefix operator receives exactly one argument
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        node.result = node.children[1:]
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            result = self.non_terminal(node, parser_class, custom_args)
            if prefix[:1] == '-':
                # lookbehind operators only work with plain regular
                # expressions; symbols may be defined later, so the check
                # is deferred until the symbol table is complete
                def check(node):
                    nd = node
                    if len(nd.children) >= 1:
                        nd = nd.children[0]
                    while nd.parser.name == "symbol":
                        symlist = self.rules.get(str(nd), [])
                        if len(symlist) == 2:
                            nd = symlist[1]
                        else:
                            if len(symlist) == 1:
                                nd = symlist[0].children[1]
                            break
                    if (nd.parser.name != "regexp" or str(nd)[:1] != '/'
                        or str(nd)[-1:] != '/'):
                        node.add_error("Lookbehind-parser can only be used with plain RegExp-"
                                       "parsers, not with: " + nd.parser.name + nd.parser.ptype)

                if not result.startswith('RegExp('):
                    self.deferred_tasks.append(lambda: check(node))
            return result
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
        return ""
685

686

687
    def on_option(self, node) -> str:
        """Compile an option node into an ``Optional`` parser call."""
        code = self.non_terminal(node, 'Optional')
        return code
    def on_repetition(self, node) -> str:
        """Compile a repetition node into a ``ZeroOrMore`` parser call."""
        code = self.non_terminal(node, 'ZeroOrMore')
        return code
    def on_oneormore(self, node) -> str:
        """Compile a one-or-more node into a ``OneOrMore`` parser call."""
        code = self.non_terminal(node, 'OneOrMore')
        return code
    def on_regexchain(self, node) -> str:
        """Compilation of regex chains is not supported (yet); always raises
        ``EBNFCompilerError``."""
        raise EBNFCompilerError("Not yet implemented!")
    def on_group(self, node) -> str:
        """Guard: group nodes must never reach the compiler, because the AST
        transformation is expected to have removed them; always raises
        ``EBNFCompilerError``."""
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")
    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        """Compile a symbol occurring on the right hand side of a definition.

        Symbols declared as preprocessor tokens compile to a
        ``PreprocessorToken`` parser; all other symbols compile to a plain
        reference and are tracked for later definedness/recursion analysis.
        """
        symbol = str(node)
        if symbol in self.directives['tokens']:
            return 'PreprocessorToken("' + symbol + '")'
        self.current_symbols.append(node)
        # Remember only the first use of each symbol.
        self.symbols.setdefault(symbol, node)
        if symbol in self.rules:
            # The symbol's rule is already (being) compiled -> recursive reference.
            self.recursive.add(symbol)
        return symbol
    def on_literal(self, node) -> str:
        """Compile a string literal into a ``Token`` parser call, doubling
        backslashes so the literal survives re-parsing as Python source."""
        escaped = str(node).replace('\\', r'\\')
        return 'Token({})'.format(escaped)
    def on_regexp(self, node: Node) -> str:
        """Compile a regular-expression literal into a ``RegExp`` or ``RE`` call.

        Plain expressions (``/.../``) become ``RegExp(...)``. Expressions with
        adjacent-whitespace markers (``~/...``, ``.../~``) become ``RE(...)``
        whose ``wL``/``wR`` keyword arguments are chosen in combination with
        the ``literalws`` directive.
        """
        rx = str(node)
        name = []   # type: List[str]  # collected wL=/wR= keyword arguments
        if rx[0] == '/' and rx[-1] == '/':
            parser = 'RegExp('
        else:
            parser = 'RE('
            if rx[:2] == '~/':
                # Leading '~': request left whitespace unless the
                # 'literalws' directive already implies it.
                if not 'left' in self.directives['literalws']:
                    name = ['wL=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[1:]
            elif 'left' in self.directives['literalws']:
                # literalws implies left whitespace, but no '~' was written:
                # explicitly switch it off for this expression.
                name = ["wL=''"] + name
            if rx[-2:] == '/~':
                if 'right' not in self.directives['literalws']:
                    name = ['wR=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[:-1]
            elif 'right' in self.directives['literalws']:
                name = ["wR=''"] + name
        try:
            # Strip the surrounding slashes and unescape '\/' before checking
            # that the regular expression actually compiles.
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
            # NOTE(review): the AttributeError branch presumably signals a
            # malformed AST node rather than a bad regex -- confirm against
            # the implementation of _check_rx.
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + \
                     node.as_sxpr()
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return parser + ', '.join([arg] + name) + ')'
    def on_list_(self, node) -> Set[str]:
        """Compile a list node into the set of its members' stripped strings."""
        assert node.children
        return {child.result.strip() for child in node.children}
def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
    """Return the cached EBNF compiler singleton, creating it on first use.

    On subsequent calls the cached compiler is re-initialized with the given
    grammar name and source via ``set_grammar_name``.

    NOTE(review): despite its name, the singleton is a plain module-level
    global rather than a ``threading.local`` attribute -- confirm whether
    per-thread isolation is actually intended.
    """
    global thread_local_ebnf_compiler_singleton
    if 'thread_local_ebnf_compiler_singleton' not in globals():
        # First call: create and cache the compiler instance.
        thread_local_ebnf_compiler_singleton = EBNFCompiler(grammar_name, grammar_source)
        return thread_local_ebnf_compiler_singleton
    singleton = thread_local_ebnf_compiler_singleton
    singleton.set_grammar_name(grammar_name, grammar_source)
    return singleton