ebnf.py 34.2 KB
Newer Older
1
"""ebnf.py - EBNF -> Python-Parser compilation for DHParser
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences an Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20
21
from collections import OrderedDict

22
23
24
25
try:
    import regex as re
except ImportError:
    import re
26
27
28
29
try:
    from typing import Callable, Dict, List, Set, Tuple
except ImportError:
    from .typing34 import Callable, Dict, List, Set, Tuple
30

31
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
32
from DHParser.parser import Grammar, mixin_comment, nil_preprocessor, Forward, RE, NegativeLookahead, \
33
    Alternative, Series, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
34
    PreprocessorFunc
35
36
37
from DHParser.syntaxtree import WHITESPACE_PTYPE, TOKEN_PTYPE, Node, TransformationFunc
from DHParser.transform import traverse, remove_brackets, \
    reduce_single_child, replace_by_single_child, remove_expendables, \
38
    remove_tokens, flatten, forbid, assert_content, key_tag_name, remove_infix_operator
39
from DHParser.versionnumber import __version__
40

41
__all__ = ('get_ebnf_preprocessor',
42
43
44
45
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
46
           'EBNFTransformer',
Eckhart Arnold's avatar
Eckhart Arnold committed
47
           'EBNFCompilerError',
48
           'EBNFCompiler',
49
           'grammar_changed',
50
           'PreprocessorFactoryFunc',
51
52
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
53
           'CompilerFactoryFunc')
54
55


Eckhart Arnold's avatar
Eckhart Arnold committed
56
57
58
59
60
61
62
########################################################################
#
# EBNF scanning
#
########################################################################


63
64
def get_ebnf_preprocessor() -> PreprocessorFunc:
    return nil_preprocessor
Eckhart Arnold's avatar
Eckhart Arnold committed
65
66
67
68
69
70
71
72


########################################################################
#
# EBNF parsing
#
########################################################################

73

74
class EBNFGrammar(Grammar):
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
104
105
    option     =  "[" expression §"]"

106
107
108
109
110
111
    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
112
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
113
114
115
116
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an exmaple
    EOF =  !/./
    """
    expression = Forward()
117
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
118
    parser_initialization__ = "upon instantiation"
119
120
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
121
    wspL__ = ''
122
    wspR__ = WSP__
123
    EOF = NegativeLookahead(RE('.', wR=''))
124
    list_ = Series(RE('\\w+'), ZeroOrMore(Series(Token(","), RE('\\w+'))))
125
    regexp = RE(r'~?/(?:\\/|[^/])*?/~?')  # RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
126
127
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
128
129
130
131
    option = Series(Token("["), expression, Required(Token("]")))
    repetition = Series(Token("{"), expression, Required(Token("}")))
    oneormore = Series(Token("{"), expression, Token("}+"))
    group = Series(Token("("), expression, Required(Token(")")))
132
133
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
134
135
    factor = Alternative(Series(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Series(Optional(flowmarker), literal), Series(Optional(flowmarker), regexp),
136
137
                         Series(Optional(flowmarker), group), Series(Optional(flowmarker), oneormore),
                         repetition, option)
138
    term = OneOrMore(factor)
139
140
141
142
    expression.set(Series(term, ZeroOrMore(Series(Token("|"), term))))
    directive = Series(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
    definition = Series(symbol, Required(Token("=")), expression)
    syntax = Series(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
143
144
145
    root__ = syntax


146
def grammar_changed(grammar_class, grammar_source: str) -> bool:
Eckhart Arnold's avatar
Eckhart Arnold committed
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if isinstance(grammar_class, str):
        # grammar_class = load_compiler_suite(grammar_class)[1]
        with open(grammar_class, 'r', encoding='utf8') as f:
            pycode = f.read()
166
        m = re.search('class \w*\(Grammar\)', pycode)
Eckhart Arnold's avatar
Eckhart Arnold committed
167
168
169
170
171
172
173
174
175
176
        if m:
            m = re.search('    source_hash__ *= *"([a-z0-9]*)"',
                          pycode[m.span()[1]:])
            return not (m and m.groups() and m.groups()[-1] == chksum)
        else:
            return True
    else:
        return chksum != grammar_class.source_hash__


177
def get_ebnf_grammar() -> EBNFGrammar:
Eckhart Arnold's avatar
Eckhart Arnold committed
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
    global thread_local_ebnf_grammar_singleton
    try:
        grammar = thread_local_ebnf_grammar_singleton
        return grammar
    except NameError:
        thread_local_ebnf_grammar_singleton = EBNFGrammar()
        return thread_local_ebnf_grammar_singleton


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


194
EBNF_transformation_table = {
195
    # AST Transformations for EBNF-grammar
196
    "+":
197
        remove_expendables,
198
    "syntax":
199
        [],  # otherwise '"*": replace_by_single_child' would be applied
200
    "directive, definition":
201
        remove_tokens('@', '='),
Eckhart Arnold's avatar
Eckhart Arnold committed
202
    "expression":
203
        [replace_by_single_child, flatten, remove_tokens('|')],  # remove_infix_operator],
204
205
206
207
208
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
209
        [remove_brackets, replace_by_single_child],
210
    "oneormore, repetition, option":
211
212
        [reduce_single_child, remove_brackets,
         forbid('repetition', 'option', 'oneormore'), assert_content(r'(?!§)')],
213
    "symbol, literal, regexp":
214
        reduce_single_child,
215
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
216
        reduce_single_child,
217
    "list_":
218
        [flatten, remove_infix_operator],
219
    "*":
220
        replace_by_single_child
221
222
}

223

224
def EBNFTransformer(syntax_tree: Node):
225
    traverse(syntax_tree, EBNF_transformation_table, key_tag_name)
di68kap's avatar
di68kap committed
226
227


228
def get_ebnf_transformer() -> TransformationFunc:
229
    return EBNFTransformer
Eckhart Arnold's avatar
Eckhart Arnold committed
230
231
232
233
234
235
236
237


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

238

239
PreprocessorFactoryFunc = Callable[[], PreprocessorFunc]
240
ParserFactoryFunc = Callable[[], Grammar]
241
TransformerFactoryFunc = Callable[[], TransformationFunc]
242
243
CompilerFactoryFunc = Callable[[], Compiler]

244
245
246
PREPROCESSOR_FACTORY = '''
def get_preprocessor() -> PreprocessorFunc:
    return {NAME}Preprocessor
247
248
249
250
'''


GRAMMAR_FACTORY = '''
251
def get_grammar() -> {NAME}Grammar:
252
253
254
255
256
257
258
259
260
261
262
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


TRANSFORMER_FACTORY = '''
263
def get_transformer() -> TransformationFunc:
264
265
266
267
268
    return {NAME}Transform
'''


COMPILER_FACTORY = '''
269
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
270
271
272
273
274
275
276
277
278
279
280
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton 
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
281

282
283
class EBNFCompilerError(Exception):
    """Error raised by `EBNFCompiler` class. (Not compilation errors
284
    in the strict sense, see `CompilationError` in module ``dsl.py``)"""
285
286
287
    pass


288
289
#TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrival?!? Is this possible at compile time?

290
class EBNFCompiler(Compiler):
291
292
    """
    Generates a Parser from an abstract syntax tree of a grammar specified
293
    in EBNF-Notation.
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342

    Instances of this class must be called with the root-node of the
    abstract syntax tree from an EBNF-specification of a formal language.
    The returned value is the Python-source-code of a Grammar class for
    this language that can be used to parse texts in this language.
    See classes `parser.Compiler` and `parser.Grammar` for more information.

    Addionally, class EBNFCompiler provides helper methods to generate
    code-skeletons for a preprocessor, AST-transformation and full
    compilation of the formal language. These method's names start with
    the prefix `gen_`.

    Attributes:
        current_symbols - During compilation, a list containing the root
                node of the currently compiled definition as first element
                and then the nodes of the symbols that are referred to in
                the currently compiled definition.

        rules - Dictionary that maps rule names to a list of Nodes that
                contain symbol-references in the definition of the rule.
                The first item in the list is the node of the rule-
                definition itself. Example:

                           `alternative = a | b`

                Now `[str(node) for node in self.rules['alternative']]`
                yields `['alternative = a | b', 'a', 'b']`

        symbols - A mapping of symbol names to their first usage (not
                their definition!) in the EBNF source.

        variables - A set of symbols names that are used with the
                Pop or Retrieve operator. Because the values of these
                symbols need to be captured they are called variables.
                See `test_parser.TestPopRetrieve` for an example.

        recursive - A set of symbols that are used recursively and
                therefore require a `Forward`-operator.

        deferred_taks - A list of callables that is filled during
                compilatation, but that will be executed only after
                compilation has finished. Typically, it contains
                sementatic checks that require information that
                is only available upon completion of compilation.

        root -   The name of the root symbol.

        directives - A dictionary of all directives and their default
                values.
343
344
    """
    COMMENT_KEYWORD = "COMMENT__"
345
346
    WHITESPACE_KEYWORD = "WSP__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}
347
348
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneuos AST transformation."
349
350
351
352
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
353
354
355
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}
356

357

358
    def __init__(self, grammar_name="", grammar_source=""):
Eckhart Arnold's avatar
Eckhart Arnold committed
359
        super(EBNFCompiler, self).__init__(grammar_name, grammar_source)
360
361
        self._reset()

362

363
    def _reset(self):
364
        self._result = ''           # type: str
365
366
367
        self.rules = OrderedDict()  # type: OrderedDict[str, List[Node]]
        self.current_symbols = []   # type: List[Node]
        self.symbols = {}           # type: Dict[str, Node]
368
369
        self.variables = set()      # type: Set[str]
        self.recursive = set()      # type: Set[str]
370
        self.deferred_tasks = []    # type: List[Callable]
371
        self.root = ""              # type: str
372
        self.directives = {'whitespace': self.WHITESPACE['horizontal'],
373
                           'comment': '',
374
                           'literalws': ['right'],
375
376
377
                           'tokens': set(),  # alt. 'preprocessor_tokens'
                           'filter': dict(),  # alt. 'filter'
                           'testing': False}
378

Eckhart Arnold's avatar
Eckhart Arnold committed
379
    @property
380
    def result(self) -> str:
Eckhart Arnold's avatar
Eckhart Arnold committed
381
382
        return self._result

383
    # methods for generating skeleton code for preprocessor, transformer, and compiler
384

385
    def gen_preprocessor_skeleton(self) -> str:
386
387
388
389
        """
        Returns Python-skeleton-code for a preprocessor-function for
        the previously compiled formal language.
        """
390
        name = self.grammar_name + "Preprocessor"
391
        return "def %s(text):\n    return text\n" % name \
392
               + PREPROCESSOR_FACTORY.format(NAME=self.grammar_name)
393

394

395
    def gen_transformer_skeleton(self) -> str:
396
397
398
399
        """
        Returns Python-skeleton-code for the AST-transformation for the
        previously compiled formal language.
        """
400
        if not self.rules:
Eckhart Arnold's avatar
Eckhart Arnold committed
401
402
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
403
404
        tt_name = self.grammar_name + '_AST_transformation_table'
        tf_name = self.grammar_name + 'Transform'
di68kap's avatar
di68kap committed
405
        transtable = [tt_name + ' = {',
406
407
                      '    # AST Transformations for the ' +
                      self.grammar_name + '-grammar']
Eckhart Arnold's avatar
Eckhart Arnold committed
408
        transtable.append('    "+": remove_empty,')
409
        for name in self.rules:
410
411
            transtable.append('    "' + name + '": [],')
        transtable.append('    ":Token, :RE": reduce_single_child,')
412
        transtable += ['    # "*": replace_by_single_child', '}', '', tf_name +
413
                       ' = partial(traverse, processing_table=%s)' % tt_name, '']
414
        transtable += [TRANSFORMER_FACTORY.format(NAME=self.grammar_name)]
415
416
        return '\n'.join(transtable)

417

418
    def gen_compiler_skeleton(self) -> str:
419
420
421
422
        """
        Returns Python-skeleton-code for a Compiler-class for the
        previously compiled formal language.
        """
423
        if not self.rules:
424
425
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
426
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
427
428
429
430
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
Eckhart Arnold's avatar
Eckhart Arnold committed
431
                    self.grammar_name + '", grammar_source=""):',
432
                    '        super(' + self.grammar_name +
Eckhart Arnold's avatar
Eckhart Arnold committed
433
                    'Compiler, self).__init__(grammar_name, grammar_source)',
434
                    "        assert re.match('\w+\Z', grammar_name)", '']
435
        for name in self.rules:
436
            method_name = Compiler.method_name(name)
437
            if name == self.root:
438
                compiler += ['    def ' + method_name + '(self, node):',
439
440
                             '        return node', '']
            else:
441
                compiler += ['    def ' + method_name + '(self, node):',
442
                             '        pass', '']
443
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
444
        return '\n'.join(compiler)
445

446

447
448
449
450
451
    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """
        Creates the Python code for the parser after compilation of
        the EBNF-Grammar
        """
452
453
454
455
456
457
458
459
460
461

        # execute deferred tasks, for example semantic checks that cannot
        # be done before the symbol table is complete

        for task in self.deferred_tasks:
            task()

        # provide for capturing of symbols that are variables, i.e. the
        # value of will be retrieved at some point during the parsing process

462
463
464
        if self.variables:
            for i in range(len(definitions)):
                if definitions[i][0] in self.variables:
465
                    definitions[i] = (definitions[i][0], 'Capture(%s)' % definitions[i][1])
466

467
468
        # add special fields for Grammar class

469
        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
Eckhart Arnold's avatar
Eckhart Arnold committed
470
                            if 'right' in self.directives['literalws'] else "''"))
471
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
Eckhart Arnold's avatar
Eckhart Arnold committed
472
                            if 'left' in self.directives['literalws'] else "''"))
473
        definitions.append((self.WHITESPACE_KEYWORD,
474
475
476
477
478
479
480
                            ("mixin_comment(whitespace="
                             "r'{whitespace}', comment=r'{comment}')").
                            format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class
481

482
        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
483
        declarations = ['class ' + self.grammar_name +
484
                        'Grammar(Grammar):',
485
486
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
487
                        (', with this grammar:' if self.grammar_source else '.')]
488
        definitions.append(('parser_initialization__', '"upon instantiation"'))
489
        if self.grammar_source:
490
            definitions.append(('source_hash__',
491
                                '"%s"' % md5(self.grammar_source, __version__)))
492
            declarations.append('')
493
            declarations += [line for line in self.grammar_source.split('\n')]
494
495
496
497
498
            while declarations[-1].strip() == '':
                declarations = declarations[:-1]
        declarations.append('"""')

        # turn definitions into declarations in reverse order
499

500
501
502
503
504
505
506
507
508
        self.root = definitions[0][0] if definitions else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(list(self.recursive))]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations += [symbol + '.set(' + statement + ')']
            else:
                declarations += [symbol + ' = ' + statement]
509
510
511
512
513
514
515

        # check for symbols used but never defined

        defined_symbols = set(self.rules.keys()) | self.RESERVED_SYMBOLS
        for symbol in self.symbols:
            if symbol not in defined_symbols:
                self.symbols[symbol].add_error("Missing definition for symbol '%s'" % symbol)
516
                root_node.error_flag = True
517
518
519

        # check for unconnected rules

520
        if not self.directives['testing']:
521
522
523
524
525
526
527
528
529
530
531
            defined_symbols.difference_update(self.RESERVED_SYMBOLS)

            def remove_connections(symbol):
                if symbol in defined_symbols:
                    defined_symbols.remove(symbol)
                    for related in self.rules[symbol][1:]:
                        remove_connections(str(related))

            remove_connections(self.root)
            for leftover in defined_symbols:
                self.rules[leftover][0].add_error(('Rule "%s" is not connected to parser '
532
533
                    'root "%s" !') % (leftover, self.root) + ' (Use directive "@testing=True" '
                    'to supress this error message.)')
534
535
536

        # set root parser and assemble python grammar definition

537
        if self.root and 'root__' not in self.rules:
538
539
            declarations.append('root__ = ' + self.root)
        declarations.append('')
Eckhart Arnold's avatar
Eckhart Arnold committed
540
541
542
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result
543

544
545
546

    ## compilation methods

547
    def on_syntax(self, node: Node) -> str:
548
        self._reset()
549
        definitions = []  # type: List[Tuple[str, str]]
550
551

        # drop the wrapping sequence node
552
553
        if len(node.children) == 1 and not node.children[0].parser.name:
            node = node.children[0]
554
555

        # compile definitions and directives and collect definitions
556
        for nd in node.children:
557
            if nd.parser.name == "definition":
558
                definitions.append(self.compile(nd))
559
            else:
560
                assert nd.parser.name == "directive", nd.as_sxpr()
561
                self.compile(nd)
562
                node.error_flag = node.error_flag or nd.error_flag
563

564
        return self.assemble_parser(definitions, node)
565

566

567
    def on_definition(self, node: Node) -> Tuple[str, str]:
568
        rule = str(node.children[0])
569
570
571
        if rule in self.rules:
            node.add_error('A rule with name "%s" has already been defined.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
572
573
574
575
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           ' end with a doube underscore "__".' % rule)
576
        elif rule in self.directives['tokens']:
577
            node.add_error('Symbol "%s" has already been defined as '
578
                           'a preprocessor token.' % rule)
579
580
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
581
                           % rule + '(This may change in the future.)')
582
        try:
583
584
            self.current_symbols = [node]
            self.rules[rule] = self.current_symbols
585
            defn = self.compile(node.children[1])
586
            if rule in self.variables:
587
                defn = 'Capture(%s)' % defn
588
                self.variables.remove(rule)
589
590
591
            elif defn.find("(") < 0:
                # assume it's a synonym, like 'page = REGEX_PAGE_NR'
                defn = 'Synonym(%s)' % defn
592
        except TypeError as error:
593
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + node.as_sxpr()
594
595
            node.add_error(errmsg)
            rule, defn = rule + ':error', '"' + errmsg + '"'
Eckhart Arnold's avatar
Eckhart Arnold committed
596
        return rule, defn
597

598

599
    @staticmethod
600
    def _check_rx(node: Node, rx: str) -> str:
601
602
        """
        Checks whether the string `rx` represents a valid regular
603
604
605
606
607
608
609
610
611
612
613
        expression. Makes sure that multiline regular expressions are
        prepended by the multiline-flag. Returns the regular expression string.
        """
        rx = rx if rx.find('\n') < 0 or rx[0:4] == '(?x)' else '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            node.add_error("malformed regular expression %s: %s" %
                           (repr(rx), str(re_error)))
        return rx

614

615
    def on_directive(self, node: Node) -> str:
616
        key = str(node.children[0]).lower()
617
        assert key not in self.directives['tokens']
618

619
        if key in {'comment', 'whitespace'}:
620
621
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
Eckhart Arnold's avatar
Eckhart Arnold committed
622
                    node.add_error('Directive "%s" must have one, but not %i values.' %
623
                                   (key, len(node.children[1].result)))
624
                value = self.compile(node.children[1]).pop()
625
626
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
627
                else:
628
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
629
            else:
630
631
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
632
633
634
635
636
637
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                if value[0] + value[-1] in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif value[0] + value[-1] == '//':
                    value = self._check_rx(node, value[1:-1])
638
639
640
                if key == 'whitespace' and not re.match(value, ''):
                    node.add_error("Implicit whitespace should always match the empty string, "
                                   "/%s/ does not." % value)
641
            self.directives[key] = value
642

643
644
645
646
        elif key == 'testing':
            value = str(node.children[1])
            self.directives['testing'] = value.lower() not in {"off", "false", "no"}

647
        elif key == 'literalws':
648
            value = {item.lower() for item in self.compile(node.children[1])}
649
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
Eckhart Arnold's avatar
Eckhart Arnold committed
650
                    or ('none' in value and len(value) > 1)):
651
652
653
654
655
656
657
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            ws = {'left', 'right'} if 'both' in value \
                else {} if 'none' in value else value
            self.directives[key] = list(ws)

658
        elif key in {'tokens', 'preprocessor_tokens'}:
659
            self.directives['tokens'] |= self.compile(node.children[1])
660

661
        elif key.endswith('_filter'):
662
            filter_set = self.compile(node.children[1])
663
664
665
666
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly on symbol, not %s'
                               % (key, str(filter_set)))
            self.directives['filter'][key[:-7]] = filter_set.pop()
667

668
669
670
        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key,
671
                            ', '.join(list(self.directives.keys()))))
672
673
        return ""

674

675
    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str]=[]) -> str:
676
677
        """
        Compiles any non-terminal, where `parser_class` indicates the Parser class
678
679
        name for the particular non-terminal.
        """
680
        arguments = [self.compile(r) for r in node.children] + custom_args
681
682
        return parser_class + '(' + ', '.join(arguments) + ')'

683

684
    def on_expression(self, node) -> str:
685
686
        return self.non_terminal(node, 'Alternative')

687

688
    def on_term(self, node) -> str:
689
        return self.non_terminal(node, 'Series')
690

691

692
    def on_factor(self, node: Node) -> str:
693
        assert node.children
694
        assert len(node.children) >= 2, node.as_sxpr()
695
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
696
        custom_args = []  # type: List[str]
697
698

        if prefix in {'::', ':'}:
699
700
            assert len(node.children) == 2
            arg = node.children[-1]
701
            if arg.parser.name != 'symbol':
Eckhart Arnold's avatar
Eckhart Arnold committed
702
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
703
704
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
705
            if str(arg) in self.directives['filter']:
706
                custom_args = ['filter=%s' % self.directives['filter'][str(arg)]]
707
            self.variables.add(str(arg))  # cast(str, arg.result)
708

709
        elif len(node.children) > 2:
710
711
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
712
713
714
715
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])
716

717
        node.result = node.children[1:]
718
719
        try:
            parser_class = self.PREFIX_TABLE[prefix]
720
721
722
723
            result = self.non_terminal(node, parser_class, custom_args)
            if prefix[:1] == '-':
                def check(node):
                    nd = node
724
725
726
727
728
729
730
731
732
733
                    if len(nd.children) >= 1:
                        nd = nd.children[0]
                    while nd.parser.name == "symbol":
                        symlist = self.rules.get(str(nd), [])
                        if len(symlist) == 2:
                            nd = symlist[1]
                        else:
                            if len(symlist) == 1:
                                nd = symlist[0].children[1]
                            break
734
735
736
                    if (nd.parser.name != "regexp" or str(nd)[:1] != '/'
                        or str(nd)[-1:] != '/'):
                        node.add_error("Lookbehind-parser can only be used with plain RegExp-"
737
                                       "parsers, not with: " + nd.parser.name + nd.parser.ptype)
738
739
740
741

                if not result.startswith('RegExp('):
                    self.deferred_tasks.append(lambda: check(node))
            return result
742
743
        except KeyError:
            node.add_error('Unknown prefix "%s".' % prefix)
744
        return ""
745

746

747
    def on_option(self, node) -> str:
748
749
        return self.non_terminal(node, 'Optional')

750

751
    def on_repetition(self, node) -> str:
752
753
        return self.non_terminal(node, 'ZeroOrMore')

754

755
    def on_oneormore(self, node) -> str:
756
757
        return self.non_terminal(node, 'OneOrMore')

758

759
    def on_regexchain(self, node) -> str:
760
761
        raise EBNFCompilerError("Not yet implemented!")

762

763
    def on_group(self, node) -> str:
764
765
766
        raise EBNFCompilerError("Group nodes should have been eliminated by "
                                "AST transformation!")

767

768
769
770
    def on_symbol(self, node: Node) -> str:     # called only for symbols on the right hand side!
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
771
            return 'PreprocessorToken("' + symbol + '")'
772
        else:
773
774
            self.current_symbols.append(node)
            if symbol not in self.symbols:
775
                self.symbols[symbol] = node  # remember first use of symbol
776
777
778
            if symbol in self.rules:
                self.recursive.add(symbol)
            return symbol
779

780

781
    def on_literal(self, node) -> str:
782
        return 'Token(' + str(node).replace('\\', r'\\') + ')'  # return 'Token(' + ', '.merge_children([node.result]) + ')' ?
783

784

785
    def on_regexp(self, node: Node) -> str:
786
        rx = str(node)
787
        name = []   # type: List[str]
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
        if rx[0] == '/' and rx[-1] == '/':
            parser = 'RegExp('
        else:
            parser = 'RE('
            if rx[:2] == '~/':
                if not 'left' in self.directives['literalws']:
                    name = ['wL=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[1:]
            elif 'left' in self.directives['literalws']:
                name = ["wL=''"] + name
            if rx[-2:] == '/~':
                if 'right' not in self.directives['literalws']:
                    name = ['wR=' + self.WHITESPACE_KEYWORD] + name
                rx = rx[:-1]
            elif 'right' in self.directives['literalws']:
                name = ["wR=''"] + name
804
805
806
807
        try:
            arg = repr(self._check_rx(node, rx[1:-1].replace(r'\/', '/')))
        except AttributeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + \
808
                     node.as_sxpr()
809
810
            node.add_error(errmsg)
            return '"' + errmsg + '"'
811
        return parser + ', '.join([arg] + name) + ')'
812

813

814
    def on_list_(self, node) -> Set[str]:
815
        assert node.children
816
        return set(item.result.strip() for item in node.children)
817
818


819
def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
Eckhart Arnold's avatar
Eckhart Arnold committed
820
821
822
823
824
825
826
827
    global thread_local_ebnf_compiler_singleton
    try:
        compiler = thread_local_ebnf_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_ebnf_compiler_singleton = EBNFCompiler(grammar_name, grammar_source)
        return thread_local_ebnf_compiler_singleton