The expiration time for new job artifacts in CI/CD pipelines is now 30 days (GitLab default). Previously generated artifacts in already completed jobs will not be affected by the change. The latest artifacts for all jobs in the latest successful pipelines will be kept. More information: https://gitlab.lrz.de/help/user/admin_area/settings/continuous_integration.html#default-artifacts-expiration

ebnf.py 31.5 KB
Newer Older
1
"""ebnf.py - EBNF -> Python-Parser compilation for DHParser
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences an Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20
21
from collections import OrderedDict

22
23
24
25
try:
    import regex as re
except ImportError:
    import re
26
27
28
29
try:
    from typing import Callable, Dict, List, Set, Tuple
except ImportError:
    from .typing34 import Callable, Dict, List, Set, Tuple
30

31
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
32
from DHParser.parser import Grammar, mixin_comment, nil_preprocessor, Forward, RE, NegativeLookahead, \
33
    Alternative, Series, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
34
    PreprocessorFunc
35
36
37
38
from DHParser.syntaxtree import WHITESPACE_PTYPE, TOKEN_PTYPE, Node, TransformationFunc
from DHParser.transform import traverse, remove_brackets, \
    reduce_single_child, replace_by_single_child, remove_expendables, \
    remove_tokens, flatten, forbid, assert_content, key_tag_name
39
from DHParser.versionnumber import __version__
40

41
__all__ = ('get_ebnf_preprocessor',
42
43
44
45
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
46
           'EBNFTransformer',
Eckhart Arnold's avatar
Eckhart Arnold committed
47
           'EBNFCompilerError',
48
           'EBNFCompiler',
49
           'grammar_changed',
50
           'PreprocessorFactoryFunc',
51
52
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
53
           'CompilerFactoryFunc')
54
55


Eckhart Arnold's avatar
Eckhart Arnold committed
56
57
58
59
60
61
62
########################################################################
#
# EBNF scanning
#
########################################################################


63
64
def get_ebnf_preprocessor() -> PreprocessorFunc:
    """Returns the preprocessor for EBNF source code. EBNF requires no
    preprocessing, so this is simply the no-op ``nil_preprocessor``."""
    return nil_preprocessor
Eckhart Arnold's avatar
Eckhart Arnold committed
65
66
67
68
69
70
71
72


########################################################################
#
# EBNF parsing
#
########################################################################

73

74
class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an example
    EOF =  !/./
    """
    # NOTE: the parser definitions below mirror the grammar in the docstring
    # above and appear to be auto-generated (see source_hash__); keep them in
    # sync with the docstring grammar rather than editing them ad hoc.
    expression = Forward()
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
    wspL__ = ''
    wspR__ = WSP__
    EOF = NegativeLookahead(RE('.', wR=''))
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
    regexp = RE(r'~?/(?:\\/|[^/])*?/~?')  # RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Series(Token("["), expression, Required(Token("]")))
    repetition = Series(Token("{"), expression, Required(Token("}")))
    oneormore = Series(Token("{"), expression, Token("}+"))
    group = Series(Token("("), expression, Required(Token(")")))
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
    factor = Alternative(Series(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Series(Optional(flowmarker), literal), Series(Optional(flowmarker), regexp),
                         Series(Optional(flowmarker), group), Series(Optional(flowmarker), oneormore),
                         repetition, option)
    term = OneOrMore(factor)
    expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
    directive = Series(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
    definition = Series(symbol, Required(Token("=")), expression)
    syntax = Series(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
    root__ = syntax


146
def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
        # raw strings: '\w' and '\(' in a plain literal are invalid escape
        # sequences and raise a DeprecationWarning on newer Python versions
        m = re.search(r'class \w*\(Grammar\)', pycode)
        if m:
            m = re.search(r'    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            # no hash found, or a stale hash, means the grammar has changed
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            # no Grammar-class in the file at all counts as "changed"
            return True
    else:
        return chksum != grammar_class.source_hash__


177
def get_ebnf_grammar() -> EBNFGrammar:
    """Returns a lazily created singleton instance of :class:`EBNFGrammar`.

    NOTE(review): despite the variable name, the singleton lives in a
    module-level global rather than thread-local storage — presumably
    single-threaded use is assumed; confirm before sharing across threads.
    """
    global thread_local_ebnf_grammar_singleton
    try:
        return thread_local_ebnf_grammar_singleton
    except NameError:
        # first call: create and memoize the grammar object
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
        return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


194
# Table mapping node tag names (or comma-separated groups of names) to the
# transformation(s) applied when turning the EBNF concrete syntax tree into
# an abstract syntax tree (consumed by `traverse` in EBNFTransformer).
EBNF_transformation_table = {
    # AST Transformations for EBNF-grammar
    "+":
        remove_expendables,
    "syntax":
        [],  # otherwise '"*": replace_by_single_child' would be applied
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_brackets, replace_by_single_child],
    "oneormore, repetition, option":
        [reduce_single_child, remove_brackets,
         forbid('repetition', 'option', 'oneormore'), assert_content(r'(?!§)')],
    "symbol, literal, regexp":
        reduce_single_child,
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        reduce_single_child,
    "list_":
        [flatten, remove_tokens(',')],
    "*":
        replace_by_single_child
}

223

224
def EBNFTransformer(syntax_tree: Node):
    """Transforms the concrete syntax tree of an EBNF source, in place,
    into an abstract syntax tree by traversing it with the rules from
    `EBNF_transformation_table`."""
    traverse(syntax_tree, EBNF_transformation_table, key_tag_name)
di68kap's avatar
di68kap committed
226
227


228
def get_ebnf_transformer() -> TransformationFunc:
    """Returns the transformation function for EBNF syntax trees."""
    return EBNFTransformer
Eckhart Arnold's avatar
Eckhart Arnold committed
230
231
232
233
234
235
236
237


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

238

239
PreprocessorFactoryFunc = Callable[[], PreprocessorFunc]
240
ParserFactoryFunc = Callable[[], Grammar]
241
TransformerFactoryFunc = Callable[[], TransformationFunc]
242
243
CompilerFactoryFunc = Callable[[], Compiler]

244
245
246
# Source-code templates for the factory functions emitted into generated
# compiler modules. `{NAME}` is substituted with the grammar's name via
# str.format(). Fix: removed trailing whitespace that was emitted into the
# generated code (lint error W291 in every generated module).

PREPROCESSOR_FACTORY = '''
def get_preprocessor() -> PreprocessorFunc:
    return {NAME}Preprocessor
'''


GRAMMAR_FACTORY = '''
def get_grammar() -> {NAME}Grammar:
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


TRANSFORMER_FACTORY = '''
def get_transformer() -> TransformationFunc:
    return {NAME}Transform
'''


COMPILER_FACTORY = '''
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
281

282
283
class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
    in the strict sense, see `CompilationError` in module ``dsl.py``)"""
    pass


288
289
#TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrieval?!? Is this possible at compile time?

290
class EBNFCompiler(Compiler):
    """
    Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.
    """
    # names of the whitespace/comment fields emitted into generated Grammar classes
    COMMENT_KEYWORD = "COMMENT__"
    WHITESPACE_KEYWORD = "WSP__"
    # symbols that user grammars may not redefine
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}
    # fix: corrected spelling of "erroneous" in the error message
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneous AST transformation."
    # maps EBNF prefix operators to the names of the Parser classes they compile to
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
    # named whitespace regimes selectable via the @whitespace directive
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}
307

308

309
    def __init__(self, grammar_name="", grammar_source=""):
        """Initializes the compiler with the grammar's name and source text
        and resets all per-compilation state via `_reset()`."""
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
        self._reset()

313

314
    def _reset(self):
        """Clears all state accumulated during a compilation run, so that the
        same compiler instance can be reused for another grammar."""
        self._result = ''           # type: str
        # symbol -> [definition node, referenced symbol nodes...]
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        # nodes belonging to the definition currently being compiled
        self.current_symbols = []   # type: List[Node]
        # first node where each (possibly undefined) symbol was referenced
        self.symbols = {}           # type: Dict[str, Node]
        # symbols used with the retrieve operators ':' / '::'
        self.variables = set()      # type: Set[str]
        self.definitions = []  # type: List[Tuple[str, str]]
        self.recursive = set()      # type: Set[str]
        # semantic checks postponed until the symbol table is complete
        self.deferred_tasks = []  # type: List[Callable]
        # name of the root symbol (first definition in the grammar)
        self.root = ""              # type: str
        # directive values with their defaults
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
                           'comment': '',
                           'literalws': ['right'],
                           'tokens': set(),  # alt. 'preprocessor_tokens'
                           'filter': dict(),  # alt. 'filter'
                           'testing': False}
330

Eckhart Arnold's avatar
Eckhart Arnold committed
331
    @property
    def result(self) -> str:
        """The Python source code generated by the last compilation run
        (empty string before any compilation)."""
        return self._result

335
    # methods for generating skeleton code for preprocessor, transformer, and compiler
336

337
338
    def gen_preprocessor_skeleton(self) -> str:
        """Returns Python skeleton code for a preprocessor of the compiled
        grammar: an identity preprocessor stub plus its factory function."""
        func_name = self.grammar_name + "Preprocessor"
        stub = "def %s(text):\n    return text\n" % func_name
        factory = PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)
        return stub + factory
341

342

343
    def gen_transformer_skeleton(self) -> str:
        """Returns Python skeleton code for an AST-transformation-table and
        transformer function of the compiled grammar. Requires that the
        compiler has been run first (so that `self.rules` is populated)."""
        if not self.rules:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
        tt_name = self.grammar_name + '_AST_transformation_table'
        tf_name = self.grammar_name + 'Transform'
        # assemble the generated module line by line
        lines = [tt_name + ' = {',
                 '    # AST Transformations for the ' +
                 self.grammar_name + '-grammar',
                 '    "+": remove_empty,']
        lines.extend('    "' + name + '": [],' for name in self.rules)
        lines.append('    ":Token, :RE": reduce_single_child,')
        lines.append('    "*": replace_by_single_child')
        lines.append('}')
        lines.append('')
        lines.append(tf_name + ' = partial(traverse, processing_table=%s)' % tt_name)
        lines.append('')
        lines.append(TRANSFORMER_FACTORY.format(NAME=self.grammar_name))
        return '\n'.join(lines)

361

362
    def gen_compiler_skeleton(self) -> str:
        """Returns Python skeleton code for a compiler class of the compiled
        grammar: one `on_XXX` method stub per grammar rule plus the compiler
        factory. Requires that the compiler has been run first."""
        if not self.rules:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
        # fix: raw string literal — '\w' and '\Z' are invalid escape sequences
        # in a plain literal (DeprecationWarning); the emitted text is unchanged
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    r"        assert re.match('\w+\Z', grammar_name)", '']
        for name in self.rules:
            method_name = Compiler.method_name(name)
            if name == self.root:
                # the root rule's stub passes the node through unchanged
                compiler += ['    def ' + method_name + '(self, node):',
                             '        return node', '']
            else:
                compiler += ['    def ' + method_name + '(self, node):',
                             '        pass', '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)
385

386

387
388
389
390
391
    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """
        Creates the Python code for the parser after compilation of
        the EBNF-Grammar

        Parameters:
            definitions: list of (symbol, python-parser-expression) pairs,
                in grammar order; mutated in place (special fields are
                appended, variable symbols get wrapped in Capture).
            root_node: the root of the compiled syntax tree; its error_flag
                is set if undefined symbols are detected.

        Returns the assembled Python source of the Grammar class and also
        stores it in ``self._result``.
        """

        # execute deferred tasks, for example semantic checks that cannot
        # be done before the symbol table is complete

        for task in self.deferred_tasks:
            task()

        # provide for capturing of symbols that are variables, i.e. the
        # value of will be retrieved at some point during the parsing process

        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])

        # add special fields for Grammar class

        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in self.directives['literalws'] else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in self.directives['literalws'] else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace="
                             "r'{whitespace}', comment=r'{comment}')").
                            format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class

        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        definitions.append(('parser_initialization__', '"upon instantiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations += [line for line in self.grammar_source.split('\n')]
            # strip trailing blank lines from the embedded grammar source
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order
        # (dependencies must be defined before use in the generated code)

        self.root = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
                root_node.error_flag = True

        # check for unconnected rules (not reachable from the root symbol)

        if not self.directives['testing']:
            defined_symbols.difference_update(self.RESERVED_SYMBOLS)

            def remove_connections(symbol):
                # recursively remove every symbol reachable from `symbol`
                if symbol in defined_symbols:
                    defined_symbols.remove(symbol)
                    for related in self.rules[symbol][1:]:
                        remove_connections(str(related))

            remove_connections(self.root)
            for leftover in defined_symbols:
                self.rules[leftover][0].add_error(('Rule "%s" is not connected to parser '
                    'root "%s" !') % (leftover, self.root) + ' (Use directive "@testing=True" '
                    'to supress this error message.)')

        # set root parser and assemble python grammar definition

        if self.root and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result
483

484
485
486

    ## compilation methods

487
    def on_syntax(self, node: Node) -> str:
        """Compiles the root node of the EBNF syntax tree: compiles all
        directives and definitions and hands the collected definitions to
        `assemble_parser`. Returns the generated parser source code."""
        self._reset()
        definitions = []  # type: List[Tuple[str, str]]

        # drop the wrapping sequence node
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]

        # compile definitions and directives and collect definitions
        for nd in node.children:
            if nd.parser.name == "definition":
                definitions.append(self.compile(nd))
            else:
                assert nd.parser.name == "directive", nd.as_sxpr()
                self.compile(nd)
                node.error_flag = node.error_flag or nd.error_flag

        return self.assemble_parser(definitions, node)
505

506

507
    def on_definition(self, node: Node) -> Tuple[str, str]:
        """Compiles a single grammar rule ``symbol = expression`` and returns
        the pair (rule name, python parser expression). Attaches errors to
        `node` for redefined, reserved, ill-formed or keyword-clashing names.
        """
        rule = str(node.children[0])
        if rule in self.rules:
            node.add_error('A rule with name "%s" has already been defined.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            # fix: "doube" -> "double" and stray double space in the message
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           'end with a double underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            node.add_error('Symbol "%s" has already been defined as '
                           'a preprocessor token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the future.)')
        try:
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
            defn = self.compile(node.children[1])
            if rule in self.variables:
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
        except TypeError as error:
            # a badly structured tree surfaces as a TypeError during compilation
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + node.as_sxpr()
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
        return rule, defn
537

538

539
    @staticmethod
    def _check_rx(node: Node, rx: str) -> str:
        """Checks whether the string `rx` represents a valid regular
        expression. Makes sure that multiline regular expressions are
        prepended by the multiline-flag. Returns the regular expression string.
        """
        # multiline patterns need the verbose flag; add it unless present
        if '\n' in rx and not rx.startswith('(?x)'):
            rx = '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

554

555
    def on_directive(self, node: Node) -> str:
        """Compiles a ``@key = value`` directive by updating
        ``self.directives``. Returns the empty string (directives emit no
        parser code). Unknown keys or ill-formed values produce node errors.
        """
        key = str(node.children[0]).lower()
        assert key not in self.directives['tokens']

        if key in {'comment', 'whitespace'}:
            if node.children[1].parser.name == "list_":
                # a list value must name exactly one predefined whitespace regime
                if len(node.children[1].result) != 1:
                    node.add_error('Directive "%s" must have one, but not %i values.' %
                                   (key, len(node.children[1].result)))
                value = self.compile(node.children[1]).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                # NOTE(review): assumes a non-empty value here (guaranteed by
                # the grammar's literal/regexp rules) — value[0]/value[-1]
                # would raise IndexError on an empty string
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
            self.directives[key] = value

        elif key == 'testing':
            value = str(node.children[1])
            self.directives['testing'] = value.lower() not in {"off", "false", "no"}

        elif key == 'literalws':
            value = {item.lower() for item in self.compile(node.children[1])}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            # fix: use an empty set (not `{}`, which is an empty dict) for 'none'
            ws = {'left', 'right'} if 'both' in value \
                else set() if 'none' in value else value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'preprocessor_tokens'}:
            self.directives['tokens'] |= self.compile(node.children[1])

        elif key.endswith('_filter'):
            filter_set = self.compile(node.children[1])
            # fix: "exactly on symbol" -> "exactly one symbol" in the message
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly one symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key,
                            ', '.join(list(self.directives.keys()))))
        return ""

614

615
    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str] = None) -> str:
        """
        Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.

        `custom_args` are extra argument strings appended to the generated
        constructor call; defaults to no extra arguments.
        """
        # fix: avoid a mutable default argument ([]); None is the sentinel.
        # (The annotation deliberately stays `List[str]` — the name `Optional`
        # is shadowed in this module by DHParser's Optional parser class.)
        if custom_args is None:
            custom_args = []
        arguments = [self.compile(r) for r in node.children] + custom_args
        return parser_class + '(' + ', '.join(arguments) + ')'

623

624
    def on_expression(self, node) -> str:
        """Compiles an alternative ("|"-separated expression) to an
        `Alternative(...)` parser call."""
        return self.non_terminal(node, 'Alternative')

627

628
    def on_term(self, node) -> str:
629
        return self.non_terminal(node, 'Series')
630

631

632
    def on_factor(self, node: Node) -> str:
        """Compiles a prefixed factor (flow operators '!', '&', '§', '-!',
        '-&' or retrieve operators ':', '::') to the corresponding parser
        call from `PREFIX_TABLE`. Lookbehind operators additionally schedule
        a deferred check that their argument is a plain regular expression.
        """
        assert node.children
        assert len(node.children) >= 2, node.as_sxpr()
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            # retrieve operators require a single symbol argument
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
            self.variables.add(str(arg))  # cast(str, arg.result)

        elif len(node.children) > 2:
            # fold the trailing children into the second child, so the prefix
            # operator wraps the whole remainder as a single argument
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        # drop the prefix token; the remaining children are the operands
        node.result = node.children[1:]
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            result = self.non_terminal(node, parser_class, custom_args)
            if prefix[:1] == '-':
                # lookbehind operators: defer validation until all rules are known
                def check(node):
                    nd = node
                    if len(nd.children) >= 1:
                        nd = nd.children[0]
                    # follow symbol references to the underlying definition
                    while nd.parser.name == "symbol":
                        symlist = self.rules.get(str(nd), [])
                        if len(symlist) == 2:
                            nd = symlist[1]
                        else:
                            if len(symlist) == 1:
                                nd = symlist[0].children[1]
                            break
                    if (nd.parser.name != "regexp" or str(nd)[:1] != '/'
                        or str(nd)[-1:] != '/'):
                        node.add_error("Lookbehind-parser can only be used with plain RegExp-"
                                       "parsers, not with: " + nd.parser.name + nd.parser.ptype)

                if not result.startswith('RegExp('):
                    self.deferred_tasks.append(lambda: check(node))
            return result
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
        return ""
685

686

687
    def on_option(self, node) -> str:
        """Compile an optional part ("[ ... ]") into the constructor call
        of an `Optional` parser."""
        return self.non_terminal(node, 'Optional')

690

691
    def on_repetition(self, node) -> str:
        """Compile a repetition ("{ ... }") into the constructor call of a
        `ZeroOrMore` parser."""
        return self.non_terminal(node, 'ZeroOrMore')

694

695
    def on_oneormore(self, node) -> str:
        """Compile a one-or-more repetition into the constructor call of a
        `OneOrMore` parser."""
        return self.non_terminal(node, 'OneOrMore')

698

699
    def on_regexchain(self, node) -> str:
        """Placeholder: compilation of regex chains is not supported."""
        raise EBNFCompilerError("Not yet implemented!")

702

703
    def on_group(self, node) -> str:
        """Reject group nodes: groups are flattened away during the AST
        transformation, so reaching this handler indicates a defect in the
        transformation pipeline."""
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")

707

708
709
710
    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        """Compile a symbol reference appearing on the right hand side of
        a definition.  Symbols declared via the `tokens` directive compile
        to a `PreprocessorToken` parser; all others compile to a plain
        reference by name."""
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
            return 'PreprocessorToken("' + symbol + '")'
        self.current_symbols.append(node)
        # remember only the very first use of each symbol
        self.symbols.setdefault(symbol, node)
        if symbol in self.rules:
            # the symbol refers to a rule already (being) compiled,
            # i.e. a recursive reference
            self.recursive.add(symbol)
        return symbol
719

720

721
    def on_literal(self, node) -> str:
        """Compile a string literal into a `Token` parser call, doubling
        backslashes so the literal survives re-embedding into generated
        Python source code."""
        escaped = str(node).replace('\\', '\\\\')
        return 'Token(' + escaped + ')'
723

724

725
    def on_regexp(self, node: Node) -> str:
        """
        Compiles a regular-expression node.

        A plain expression of the form /.../ becomes a `RegExp` parser.
        Expressions with a leading or trailing tilde (~/.../, /.../~)
        become `RE` parsers whose `wL`/`wR` keyword arguments are set so
        that an explicit tilde overrides the `literalws` directive and,
        conversely, `literalws` whitespace is switched off where no tilde
        is present.

        Returns:
            Python source code constructing the parser, or a quoted error
            message if the regular expression could not be compiled.
        """
        rx = str(node)
        name = []   # type: List[str]
        if rx[0] == '/' and rx[-1] == '/':
            # plain regular expression without any whitespace handling
            parser = 'RegExp('
        else:
            parser = 'RE('
            if rx[:2] == '~/':
                # idiomatic membership test ('x not in s' instead of
                # 'not x in s'), consistent with the 'right' branch below
                if 'left' not in self.directives['literalws']:
                    name = ['wL=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[1:]
            elif 'left' in self.directives['literalws']:
                # literalws says "left", but there is no tilde: disable
                name = ["wL=''"] + name
            if rx[-2:] == '/~':
                if 'right' not in self.directives['literalws']:
                    name = ['wR=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[:-1]
            elif 'right' in self.directives['literalws']:
                name = ["wR=''"] + name
        try:
            # strip the delimiting slashes and unescape '\/' before
            # validating the regular expression
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + \
                     node.as_sxpr()
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return parser + ', '.join([arg] + name) + ')'
752

753

754
    def on_list_(self, node) -> Set[str]:
        """Return the set of whitespace-stripped items of a list node."""
        assert node.children
        return {child.result.strip() for child in node.children}
757
758


759
def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
    """Return the module-wide `EBNFCompiler` singleton, creating it on
    first use.  On subsequent calls the existing compiler is re-targeted
    at the given grammar via `set_grammar_name`.

    NOTE(review): despite its name, `thread_local_ebnf_compiler_singleton`
    is a plain module-level global shared by all threads — confirm whether
    true thread-local storage (e.g. `threading.local`) is intended.
    """
    global thread_local_ebnf_compiler_singleton
    try:
        compiler = thread_local_ebnf_compiler_singleton
    except NameError:
        # first call: create the singleton with the given grammar
        compiler = thread_local_ebnf_compiler_singleton = \
            EBNFCompiler(grammar_name, grammar_source)
    else:
        # reuse the existing instance, pointing it at the new grammar
        compiler.set_grammar_name(grammar_name, grammar_source)
    return compiler