ebnf.py 28.9 KB
Newer Older
1
"""ebnf.py - EBNF -> Python-Parser compilation for DHParser
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

Copyright 2016  by Eckhart Arnold (arnold@badw.de)
                Bavarian Academy of Sciences and Humanities (badw.de)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.  See the License for the specific language governing
permissions and limitations under the License.
"""

19
import keyword
20

21
22
23
24
try:
    import regex as re
except ImportError:
    import re
25
from typing import Callable, List, Set, Tuple
26

27
28
29
30
31
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
from DHParser.parsers import Grammar, mixin_comment, nil_scanner, Forward, RE, NegativeLookahead, \
    Alternative, Sequence, Optional, Required, OneOrMore, ZeroOrMore, Token, Compiler, \
    ScannerFunc
from DHParser.syntaxtree import Node, traverse, remove_enclosing_delimiters, reduce_single_child, \
32
    replace_by_single_child, TOKEN_PTYPE, remove_expendables, remove_tokens, flatten, \
33
    forbid, assert_content, WHITESPACE_PTYPE, key_tag_name, TransformationFunc
34
from DHParser.versionnumber import __version__
35
36


37
38
39
40
41
__all__ = ['get_ebnf_scanner',
           'get_ebnf_grammar',
           'get_ebnf_transformer',
           'get_ebnf_compiler',
           'EBNFGrammar',
42
           'EBNFTransformer',
Eckhart Arnold's avatar
Eckhart Arnold committed
43
           'EBNFCompilerError',
44
           'EBNFCompiler',
45
46
47
48
49
           'grammar_changed',
           'ScannerFactoryFunc',
           'ParserFactoryFunc',
           'TransformerFactoryFunc',
           'CompilerFactoryFunc']
50
51


Eckhart Arnold's avatar
Eckhart Arnold committed
52
53
54
55
56
57
58
########################################################################
#
# EBNF scanning
#
########################################################################


59
def get_ebnf_scanner() -> ScannerFunc:
    """Return the scanner function used for EBNF sources.

    This is always ``nil_scanner`` (imported from ``DHParser.parsers``);
    presumably EBNF sources need no preprocessing before parsing.
    """
    scanner = nil_scanner  # type: ScannerFunc
    return scanner


########################################################################
#
# EBNF parsing
#
########################################################################


70
class EBNFGrammar(Grammar):
    r"""Parser for an EBNF source file, with this grammar:

    # EBNF-Grammar in EBNF

    @ comment    =  /#.*(?:\n|$)/                    # comments start with '#' and eat all chars up to and including '\n'
    @ whitespace =  /\s*/                            # whitespace includes linefeed
    @ literalws  =  right                            # trailing whitespace of literals will be ignored tacitly

    syntax     =  [~//] { definition | directive } §EOF
    definition =  symbol §"=" expression
    directive  =  "@" §symbol §"=" ( regexp | literal | list_ )

    expression =  term { "|" term }
    term       =  { factor }+
    factor     =  [flowmarker] [retrieveop] symbol !"="   # negative lookahead to be sure it's not a definition
                | [flowmarker] literal
                | [flowmarker] regexp
                | [flowmarker] group
                | [flowmarker] regexchain
                | [flowmarker] oneormore
                | repetition
                | option

    flowmarker =  "!"  | "&"  | "§" |                # '!' negative lookahead, '&' positive lookahead, '§' required
                  "-!" | "-&"                        # '-' negative lookbehind, '-&' positive lookbehind
    retrieveop =  "::" | ":"                         # '::' pop, ':' retrieve

    group      =  "(" expression §")"
    regexchain =  ">" expression §"<"                # compiles "expression" into a singular regular expression
    oneormore  =  "{" expression "}+"
    repetition =  "{" expression §"}"
    option     =  "[" expression §"]"

    symbol     =  /(?!\d)\w+/~                       # e.g. expression, factor, parameter_list
    literal    =  /"(?:[^"]|\\")*?"/~                # e.g. "(", '+', 'while'
                | /'(?:[^']|\\')*?'/~                # whitespace following literals will be ignored tacitly.
    regexp     =  /~?\/(?:[^\/]|(?<=\\)\/)*\/~?/~    # e.g. /\w+/, ~/#.*(?:\n|$)/~
                                                     # '~' is a whitespace-marker, if present leading or trailing
                                                     # whitespace of a regular expression will be ignored tacitly.
    list_      =  /\w+/~ { "," /\w+/~ }              # comma separated list of symbols, e.g. BEGIN_LIST, END_LIST,
                                                     # BEGIN_QUOTE, END_QUOTE ; see CommonMark/markdown.py for an exmaple
    EOF =  !/./
    """
    # The attributes below have the shape of the output of
    # `EBNFCompiler.assemble_parser`: recursive symbols are declared as
    # `Forward()` first, the rules follow in reverse order of the grammar
    # above, and the recursive symbol is bound with `.set(...)` once every
    # parser it refers to exists. Do not reorder these statements.
    expression = Forward()
    # Checksum that `grammar_changed()` compares against the md5 of the
    # grammar source to detect a stale parser class.
    source_hash__ = "a410e1727fb7575e98ff8451dbf8f3bd"
    parser_initialization__ = "upon instantiation"
    # Comment and whitespace regexes from the `@ comment` and
    # `@ whitespace` directives of the grammar above.
    COMMENT__ = r'#.*(?:\n|$)'
    WSP__ = mixin_comment(whitespace=r'\s*', comment=r'#.*(?:\n|$)')
    # `@ literalws = right`: nothing on the left, WSP__ on the right.
    wspL__ = ''
    wspR__ = WSP__
    EOF = NegativeLookahead(RE('.', wR=''))
    list_ = Sequence(RE('\\w+'), ZeroOrMore(Sequence(Token(","), RE('\\w+'))))
    regexp = RE('~?/(?:[^/]|(?<=\\\\)/)*/~?')
    literal = Alternative(RE('"(?:[^"]|\\\\")*?"'), RE("'(?:[^']|\\\\')*?'"))
    symbol = RE('(?!\\d)\\w+')
    option = Sequence(Token("["), expression, Required(Token("]")))
    repetition = Sequence(Token("{"), expression, Required(Token("}")))
    oneormore = Sequence(Token("{"), expression, Token("}+"))
    # NOTE(review): the grammar in the docstring defines regexchain as
    # '">" expression §"<"', whereas this parser matches "<" ... ">".
    # One of the two looks stale -- confirm which delimiter order is meant.
    regexchain = Sequence(Token("<"), expression, Required(Token(">")))
    group = Sequence(Token("("), expression, Required(Token(")")))
    retrieveop = Alternative(Token("::"), Token(":"))
    flowmarker = Alternative(Token("!"), Token("&"), Token("§"), Token("-!"), Token("-&"))
    factor = Alternative(Sequence(Optional(flowmarker), Optional(retrieveop), symbol, NegativeLookahead(Token("="))),
                         Sequence(Optional(flowmarker), literal), Sequence(Optional(flowmarker), regexp),
                         Sequence(Optional(flowmarker), group), Sequence(Optional(flowmarker), regexchain),
                         Sequence(Optional(flowmarker), oneormore), repetition, option)
    term = OneOrMore(factor)
    expression.set(Sequence(term, ZeroOrMore(Sequence(Token("|"), term))))
    directive = Sequence(Token("@"), Required(symbol), Required(Token("=")), Alternative(regexp, literal, list_))
    definition = Sequence(symbol, Required(Token("=")), expression)
    syntax = Sequence(Optional(RE('', wR='', wL=WSP__)), ZeroOrMore(Alternative(definition, directive)), Required(EOF))
    # Entry point of the grammar.
    root__ = syntax


145
def grammar_changed(grammar_class, grammar_source: str) -> bool:
    """Returns ``True`` if ``grammar_class`` does not reflect the latest
    changes of ``grammar_source``

    Parameters:
        grammar_class:  the parser class representing the grammar
            or the file name of a compiler suite containing the grammar
        grammar_source:  File name or string representation of the
            EBNF code of the grammar

    Returns (bool):
        True, if the source text of the grammar is different from the
        source from which the grammar class was generated
    """
    grammar = load_if_file(grammar_source)
    chksum = md5(grammar, __version__)
    if not isinstance(grammar_class, str):
        return chksum != grammar_class.source_hash__

    # `grammar_class` is a file name: scan the Python source for the first
    # Grammar subclass and the `source_hash__` assignment following it.
    with open(grammar_class, 'r', encoding='utf8') as f:
        pycode = f.read()
    # Raw string: `\w` and `\(` are regex escapes, not string escapes.
    m = re.search(r'class \w*\(Grammar\)', pycode)
    if m is None:
        return True  # no grammar class found at all -> treat as changed
    m = re.search('    source_hash__ *= *"([a-z0-9]*)"', pycode[m.end():])
    return not (m and m.group(1) == chksum)


176
def get_ebnf_grammar() -> EBNFGrammar:
    """Return the shared ``EBNFGrammar`` instance, creating it on first use.

    The instance is cached in the module-level global
    ``thread_local_ebnf_grammar_singleton``; a ``NameError`` on reading
    that global signals that no instance has been created yet.
    """
    global thread_local_ebnf_grammar_singleton
    try:
        return thread_local_ebnf_grammar_singleton
    except NameError:
        instance = EBNFGrammar()
        thread_local_ebnf_grammar_singleton = instance
        return instance


########################################################################
#
# EBNF concrete to abstract syntax tree transformation and validation
#
########################################################################


193
194
# TODO: Add Capture and Retrieve Validation: A variable mustn't be captured twice before retrieval?!?

195
# Processing table handed to `traverse()` by `EBNFTransformer`.
# A key is either a string naming one or more parsers (comma separated)
# or a tuple of parser-type constants; the value is a single
# transformation function or a list of them.
# The "*" entry is presumably the fallback for all parsers not listed
# explicitly -- verify against `DHParser.syntaxtree.traverse`.
EBNF_transformation_table = {
    # AST Transformations for EBNF-grammar
    "syntax":
        remove_expendables,
    "directive, definition":
        remove_tokens('@', '='),
    "expression":
        [replace_by_single_child, flatten, remove_tokens('|')],
    "term":
        [replace_by_single_child, flatten],  # supports both idioms:  "{ factor }+" and "factor { factor }"
    "factor, flowmarker, retrieveop":
        replace_by_single_child,
    "group":
        [remove_enclosing_delimiters, replace_by_single_child],
    "oneormore, repetition, option, regexchain":
        [reduce_single_child, remove_enclosing_delimiters],
    "symbol, literal, regexp":
        [remove_expendables, reduce_single_child],
    (TOKEN_PTYPE, WHITESPACE_PTYPE):
        [remove_expendables, reduce_single_child],
    "list_":
        [flatten, remove_tokens(',')],
    "*":
        [remove_expendables, replace_by_single_child]
}

221

222
# Second processing table that `EBNFTransformer` passes to `traverse()`,
# after the transformation table has been applied. It uses the same key
# format as the transformation table.
EBNF_validation_table = {
    # Semantic validation on the AST. EXPERIMENTAL!
    "repetition, option, oneormore":
        [forbid('repetition', 'option', 'oneormore'),
         assert_content(r'(?!§)')]
}
228

229

230
def EBNFTransformer(syntax_tree: Node):
    """Transform the concrete syntax tree of an EBNF source in place.

    Two passes of ``traverse`` are run over ``syntax_tree``, both keyed
    by ``key_tag_name``: first with ``EBNF_transformation_table``, then
    with ``EBNF_validation_table``.
    """
    traverse(syntax_tree, EBNF_transformation_table, key_tag_name)
    traverse(syntax_tree, EBNF_validation_table, key_tag_name)
di68kap's avatar
di68kap committed
234
235


236
def get_ebnf_transformer() -> TransformationFunc:
    """Return the AST transformation function for EBNF syntax trees,
    i.e. ``EBNFTransformer``."""
    transformer = EBNFTransformer  # type: TransformationFunc
    return transformer
Eckhart Arnold's avatar
Eckhart Arnold committed
238
239
240
241
242
243
244
245


########################################################################
#
# EBNF abstract syntax tree to Python parser compilation
#
########################################################################

246
247
248

ScannerFactoryFunc = Callable[[], ScannerFunc]
ParserFactoryFunc = Callable[[], Grammar]
249
TransformerFactoryFunc = Callable[[], TransformationFunc]
250
251
252
CompilerFactoryFunc = Callable[[], Compiler]


253
SCANNER_FACTORY = '''
254
def get_scanner() -> ScannerFunc:
255
256
257
258
259
    return {NAME}Scanner
'''


GRAMMAR_FACTORY = '''
260
def get_grammar() -> {NAME}Grammar:
261
262
263
264
265
266
267
268
269
270
271
    global thread_local_{NAME}_grammar_singleton
    try:
        grammar = thread_local_{NAME}_grammar_singleton
        return grammar
    except NameError:
        thread_local_{NAME}_grammar_singleton = {NAME}Grammar()
        return thread_local_{NAME}_grammar_singleton
'''


TRANSFORMER_FACTORY = '''
272
def get_transformer() -> TransformationFunc:
273
274
275
276
277
    return {NAME}Transform
'''


COMPILER_FACTORY = '''
278
def get_compiler(grammar_name="{NAME}", grammar_source="") -> {NAME}Compiler:
279
280
281
282
283
284
285
286
287
288
289
    global thread_local_{NAME}_compiler_singleton
    try:
        compiler = thread_local_{NAME}_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
        return compiler
    except NameError:
        thread_local_{NAME}_compiler_singleton = \\
            {NAME}Compiler(grammar_name, grammar_source)
        return thread_local_{NAME}_compiler_singleton 
'''

Eckhart Arnold's avatar
Eckhart Arnold committed
290

291
292
293
294
295
296
class EBNFCompilerError(Exception):
    """Raised by `EBNFCompiler` when it is used incorrectly or reaches an
    unsupported construct.

    This does not signal an error in the compiled EBNF source: the
    compiler methods report those by calling `add_error()` on the
    offending node instead.
    """


297
class EBNFCompiler(Compiler):
298
299
300
301
    """Generates a Parser from an abstract syntax tree of a grammar specified
    in EBNF-Notation.
    """
    COMMENT_KEYWORD = "COMMENT__"
302
303
    WHITESPACE_KEYWORD = "WSP__"
    RESERVED_SYMBOLS = {WHITESPACE_KEYWORD, COMMENT_KEYWORD}
304
305
    AST_ERROR = "Badly structured syntax tree. " \
                "Potentially due to erroneuos AST transformation."
306
307
308
309
    PREFIX_TABLE = {'§': 'Required',
                    '&': 'Lookahead', '!': 'NegativeLookahead',
                    '-&': 'Lookbehind', '-!': 'NegativeLookbehind',
                    '::': 'Pop', ':': 'Retrieve'}
310
311
312
    WHITESPACE = {'horizontal': r'[\t ]*',  # default: horizontal
                  'linefeed': r'[ \t]*\n?(?!\s*\n)[ \t]*',
                  'vertical': r'\s*'}
313

314
    def __init__(self, grammar_name="", grammar_source=""):
        """Set up the base `Compiler` with the given grammar name and
        source, then clear all per-compilation state via `_reset()`."""
        super().__init__(grammar_name, grammar_source)
        self._reset()

    def _reset(self):
        """Reset all state collected during a compilation run.

        Called from `__init__()` and at the start of `on_syntax()`.
        """
        self._result = ''           # type: str
        self.rules = set()          # type: Set[str]
        self.variables = set()      # type: Set[str]
        self.symbol_nodes = []      # type: List[Node]
        self.definition_names = []  # type: List[str]
        self.recursive = set()      # type: Set[str]
        self.root = ""              # type: str
        # Directive defaults; overwritten by `on_directive()`.
        self.directives = {
            'whitespace': self.WHITESPACE['horizontal'],
            'comment': '',
            'literalws': ['right'],
            'tokens': set(),    # alt. 'scanner_tokens'
            'filter': {},       # alt. 'retrieve_filter'
        }
331

Eckhart Arnold's avatar
Eckhart Arnold committed
332
    @property
    def result(self) -> str:
        """The Python source produced by the last `assemble_parser()`
        call; the empty string after `_reset()`."""
        generated = self._result
        return generated

336
    def gen_scanner_skeleton(self) -> str:
        """Return Python source for a pass-through scanner function named
        ``<grammar_name>Scanner``, followed by the matching
        ``get_scanner()`` factory."""
        scanner_def = "def %s(text):\n    return text\n" % (self.grammar_name + "Scanner")
        factory = SCANNER_FACTORY.format(NAME=self.grammar_name)
        return scanner_def + factory
340

341
    def gen_transformer_skeleton(self) -> str:
        """Return Python source for a skeleton AST transformation table.

        The table gets one ``no_transformation`` entry per compiled
        definition plus a ``"*"`` entry, followed by the
        ``<grammar_name>Transform`` partial and the ``get_transformer()``
        factory.

        Raises:
            EBNFCompilerError: if no definitions have been compiled yet.
        """
        if not self.definition_names:
            raise EBNFCompilerError('Compiler must be run before calling '
                                    '"gen_transformer_Skeleton()"!')
        table_name = self.grammar_name + '_AST_transformation_table'
        transform_name = self.grammar_name + 'Transform'

        lines = [table_name + ' = {',
                 '    # AST Transformations for the ' + self.grammar_name + '-grammar']
        lines.extend('    "' + name + '": no_transformation,'
                     for name in self.definition_names)
        lines.extend(['    "*": no_transformation',
                      '}',
                      '',
                      transform_name + (' = partial(traverse, processing_table=%s)' % table_name),
                      ''])
        lines.append(TRANSFORMER_FACTORY.format(NAME=self.grammar_name))
        return '\n'.join(lines)

357
    def gen_compiler_skeleton(self) -> str:
        """Return Python source for a skeleton ``<grammar_name>Compiler``
        class with one method stub per compiled definition, followed by
        the ``get_compiler()`` factory.

        The stub for the root definition returns the node; all other
        stubs consist of ``pass``.

        Raises:
            EBNFCompilerError: if no definitions have been compiled yet.
        """
        if not self.definition_names:
            raise EBNFCompilerError('Compiler has not been run before calling '
                                    '"gen_Compiler_Skeleton()"!')
        compiler = ['class ' + self.grammar_name + 'Compiler(Compiler):',
                    '    """Compiler for the abstract-syntax-tree of a ' +
                    self.grammar_name + ' source file.',
                    '    """', '',
                    '    def __init__(self, grammar_name="' +
                    self.grammar_name + '", grammar_source=""):',
                    '        super(' + self.grammar_name +
                    'Compiler, self).__init__(grammar_name, grammar_source)',
                    # Raw literal here AND an r-prefix in the generated
                    # code: `\w` and `\Z` are regex escapes, and are invalid
                    # string escapes in a plain literal.
                    r"        assert re.match(r'\w+\Z', grammar_name)", '']
        for name in self.definition_names:
            method_name = Compiler.derive_method_name(name)
            stub_body = '        return node' if name == self.root else '        pass'
            compiler += ['    def ' + method_name + '(self, node):', stub_body, '']
        compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
        return '\n'.join(compiler)
380

381
    def assemble_parser(self, definitions: List[Tuple[str, str]], root_node: Node) -> str:
        """Assemble the Python source of the grammar class.

        Parameters:
            definitions:  (symbol, parser expression) pairs in the order
                of the grammar source. The list is modified in place:
                helper definitions are appended and the list is reversed.
            root_node:  root of the syntax tree; its ``error_flag`` is set
                if a symbol without a production is found.

        Returns (str):
            The source of ``<grammar_name>Grammar`` followed by the
            ``get_grammar()`` factory. It is also stored in
            ``self._result``.
        """
        # fix capture of variables that have been defined before usage [sic!]
        if self.variables:
            for i, (symbol, statement) in enumerate(definitions):
                if symbol in self.variables:
                    definitions[i] = (symbol, 'Capture(%s)' % statement)

        self.definition_names = [defn[0] for defn in definitions]
        literalws = self.directives['literalws']
        definitions.append(('wspR__', self.WHITESPACE_KEYWORD
                            if 'right' in literalws else "''"))
        definitions.append(('wspL__', self.WHITESPACE_KEYWORD
                            if 'left' in literalws else "''"))
        definitions.append((self.WHITESPACE_KEYWORD,
                            ("mixin_comment(whitespace="
                             "r'{whitespace}', comment=r'{comment}')").
                            format(**self.directives)))
        definitions.append((self.COMMENT_KEYWORD, "r'{comment}'".format(**self.directives)))

        # prepare parser class header and docstring and
        # add EBNF grammar to the doc string of the parser class
        article = 'an ' if self.grammar_name[0:1] in "AaEeIiOoUu" else 'a '  # what about 'hour', 'universe' etc.?
        declarations = ['class ' + self.grammar_name +
                        'Grammar(Grammar):',
                        'r"""Parser for ' + article + self.grammar_name +
                        ' source file' +
                        (', with this grammar:' if self.grammar_source else '.')]
        # NOTE(review): the literal below is spelled "instatiation", while
        # EBNFGrammar in this module carries "upon instantiation". Left
        # unchanged, because the Grammar base class may compare against it.
        definitions.append(('parser_initialization__', '"upon instatiation"'))
        if self.grammar_source:
            definitions.append(('source_hash__',
                                '"%s"' % md5(self.grammar_source, __version__)))
            declarations.append('')
            declarations.extend(self.grammar_source.split('\n'))
            # strip trailing blank lines of the grammar source
            while declarations[-1].strip() == '':
                declarations.pop()
        declarations.append('"""')

        # add default functions for retrieve_filter filters of pop or retrieve operators

        # for symbol, fun in self.directives['filter']:
        #     declarations.append(symbol + '_filter = lambda value: value.replace("(", ")")'
        #                         '.replace("[", "]").replace("{", "}").replace(">", "<")')

        # turn definitions into declarations in reverse order

        # The root is the first definition of the grammar. It is taken from
        # `definition_names`, because `definitions` already contains the
        # helper entries appended above and thus is never empty.
        self.root = self.definition_names[0] if self.definition_names else ""
        definitions.reverse()
        declarations += [symbol + ' = Forward()'
                         for symbol in sorted(self.recursive)]
        for symbol, statement in definitions:
            if symbol in self.recursive:
                declarations.append(symbol + '.set(' + statement + ')')
            else:
                declarations.append(symbol + ' = ' + statement)

        # report symbols that are used but never defined
        known_symbols = self.rules | self.RESERVED_SYMBOLS
        for nd in self.symbol_nodes:
            if nd.result not in known_symbols:
                nd.add_error("Missing production for symbol '%s'" % nd.result)
                root_node.error_flag = True

        if self.root and 'root__' not in self.rules:
            declarations.append('root__ = ' + self.root)
        declarations.append('')
        self._result = '\n    '.join(declarations) \
                       + GRAMMAR_FACTORY.format(NAME=self.grammar_name)
        return self._result
447

448
    def on_syntax(self, node: Node) -> str:
        """Compile the root node of an EBNF syntax tree.

        Resets the compiler state, compiles every definition and
        directive below ``node`` and returns the result of
        `assemble_parser()`.
        """
        self._reset()
        compiled_definitions = []

        # drop the wrapping sequence node
        only_child_is_anonymous = (len(node.children) == 1
                                   and not node.children[0].parser.name)
        if only_child_is_anonymous:
            node = node.children[0]

        # compile definitions and directives and collect definitions
        for child in node.children:
            if child.parser.name == "definition":
                compiled_definitions.append(self._compile(child))
                continue
            assert child.parser.name == "directive", child.as_sexpr()
            self._compile(child)
            # propagate errors of the directive to the enclosing node
            node.error_flag = node.error_flag or child.error_flag

        return self.assemble_parser(compiled_definitions, node)
466

467
    def on_definition(self, node: Node) -> Tuple[str, str]:
        """Compile one rule of the grammar.

        Returns (Tuple[str, str]):
            The rule name and the Python expression of its parser. If the
            definition's subtree raises a ``TypeError`` during
            compilation, the name gets the suffix ``:error`` and the
            expression is the quoted error message.
        """
        rule = str(node.children[0])  # cast(str, node.children[0].result)

        # Report at most one naming problem; compilation continues anyway.
        if rule in self.rules:
            node.add_error('A rule with name "%s" has already been defined.' % rule)
        elif rule in EBNFCompiler.RESERVED_SYMBOLS:
            node.add_error('Symbol "%s" is a reserved symbol.' % rule)
        elif not sane_parser_name(rule):
            node.add_error('Illegal symbol "%s". Symbols must not start or '
                           ' end with a doube underscore "__".' % rule)
        elif rule in self.directives['tokens']:
            node.add_error('Symbol "%s" has already been defined as '
                           'a scanner token.' % rule)
        elif keyword.iskeyword(rule):
            node.add_error('Python keyword "%s" may not be used as a symbol. '
                           % rule + '(This may change in the furute.)')

        try:
            # Register the rule before compiling its body.
            self.rules.add(rule)
            defn = self._compile(node.children[1])
            if rule in self.variables:
                # the symbol has been retrieved before its definition
                defn = 'Capture(%s)' % defn
                self.variables.remove(rule)
        except TypeError as error:
            errmsg = "%s (%s)\n%s" % (EBNFCompiler.AST_ERROR, error, node.as_sexpr())
            node.add_error(errmsg)
            rule = rule + ':error'
            defn = '"' + errmsg + '"'
        return rule, defn
493
494

    @staticmethod
    def _check_rx(node: Node, rx: str) -> str:
        """Validate the regular expression ``rx`` and return it.

        A regular expression containing a linefeed is prefixed with the
        verbose flag ``(?x)`` unless it already starts with it. If the
        (possibly prefixed) expression does not compile, an error is
        added to ``node``; the expression is returned in either case.
        """
        if '\n' in rx and not rx.startswith('(?x)'):
            rx = '(?x)' + rx
        try:
            re.compile(rx)
        except Exception as re_error:
            message = "malformed regular expression %s: %s" % (repr(rx), str(re_error))
            node.add_error(message)
        return rx

508
    def on_directive(self, node: Node) -> str:
        """Evaluate a directive (``@ key = value``) and store its value in
        ``self.directives``.

        Handled keys: ``comment``, ``whitespace``, ``literalws``,
        ``tokens`` / ``scanner_tokens`` and ``<symbol>_filter``. Problems
        are reported with ``node.add_error()``.

        Returns (str):
            Always the empty string; directives produce no parser code.
        """
        key = str(node.children[0]).lower()  # cast(str, node.children[0].result).lower()
        assert key not in self.directives['tokens']

        if key in {'comment', 'whitespace'}:
            if node.children[1].parser.name == "list_":
                if len(node.children[1].result) != 1:
                    node.add_error('Directive "%s" must have one, but not %i values.' %
                                   (key, len(node.children[1].result)))
                value = self._compile(node.children[1]).pop()
                if key == 'whitespace' and value in EBNFCompiler.WHITESPACE:
                    value = EBNFCompiler.WHITESPACE[value]  # replace whitespace-name by regex
                else:
                    node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
            else:
                value = str(node.children[1]).strip("~")  # cast(str, node.children[1].result).strip("~")
                if value != str(node.children[1]):  # cast(str, node.children[1].result):
                    node.add_error("Whitespace marker '~' not allowed in definition of "
                                   "%s regular expression." % key)
                # slices instead of indices: no IndexError on an empty value
                delimiters = value[:1] + value[-1:]
                if delimiters in {'""', "''"}:
                    value = escape_re(value[1:-1])
                elif delimiters == '//':
                    value = self._check_rx(node, value[1:-1])
                if key == 'whitespace':
                    try:
                        matches_empty = bool(re.match(value, ''))
                    except Exception:
                        # Mirrors `_check_rx()`, which has already reported
                        # the malformed expression; do not crash on it here.
                        matches_empty = True
                    if not matches_empty:
                        node.add_error("Implicit whitespace should always match the empty string, "
                                       "/%s/ does not." % value)
            self.directives[key] = value

        elif key == 'literalws':
            value = {item.lower() for item in self._compile(node.children[1])}
            if (len(value - {'left', 'right', 'both', 'none'}) > 0
                    or ('none' in value and len(value) > 1)):
                node.add_error('Directive "literalws" allows the values '
                               '`left`, `right`, `both` or `none`, '
                               'but not `%s`' % ", ".join(value))
            if 'both' in value:
                ws = {'left', 'right'}
            elif 'none' in value:
                ws = set()
            else:
                ws = value
            self.directives[key] = list(ws)

        elif key in {'tokens', 'scanner_tokens'}:
            self.directives['tokens'] |= self._compile(node.children[1])

        elif key.endswith('_filter'):
            filter_set = self._compile(node.children[1])
            if not isinstance(filter_set, set) or len(filter_set) != 1:
                node.add_error('Directive "%s" accepts exactly on symbol, not %s'
                               % (key, str(filter_set)))
            else:
                # Only register the filter if the value is well-formed;
                # calling pop() on anything else would raise.
                self.directives['filter'][key[:-7]] = filter_set.pop()

        else:
            node.add_error('Unknown directive %s ! (Known ones are %s .)' %
                           (key,
                            ', '.join(list(self.directives.keys()))))
        return ""

562
    def non_terminal(self, node: Node, parser_class: str, custom_args: List[str] = None) -> str:
        """Compiles any non-terminal, where `parser_class` indicates the Parser class
        name for the particular non-terminal.

        Parameters:
            node:  the node whose children become the parser's arguments
            parser_class:  name of the parser class to be instantiated
            custom_args:  additional argument strings that are appended
                after the compiled children; ``None`` means none.

        Returns (str):
            Python source of the form ``ParserClass(arg1, arg2, ...)``.
        """
        # `None` instead of a mutable `[]` default. The annotation cannot
        # use `typing.Optional`: this module imports the `Optional` parser
        # from DHParser.parsers under that name.
        arguments = [self._compile(r) for r in node.children]
        if custom_args:
            arguments.extend(custom_args)
        return parser_class + '(' + ', '.join(arguments) + ')'

569
    def on_expression(self, node) -> str:
        """Compile an `expression` node into an ``Alternative(...)`` call."""
        return self.non_terminal(node, parser_class='Alternative')

572
    def on_term(self, node) -> str:
        """Compile a `term` node into a ``Sequence(...)`` call."""
        return self.non_terminal(node, parser_class='Sequence')

575
    def on_factor(self, node: Node) -> str:
        """Compile a `factor` node that carries a prefix (flow marker or
        retrieve operator) into a call of the parser class listed for
        that prefix in ``PREFIX_TABLE``.

        The node is modified in place: the prefix child is removed and,
        if more than one child follows the prefix, those children are
        first folded into a single child.

        Returns (str):
            The parser expression. If a retrieve operator is applied to
            something other than a symbol, an error is added and
            ``str(arg.result)`` is returned. For an unknown prefix an
            error is added and the empty string is returned.
        """
        assert node.children
        assert len(node.children) >= 2, node.as_sexpr()
        prefix = str(node.children[0])  # cast(str, node.children[0].result)
        custom_args = []  # type: List[str]

        if prefix in {'::', ':'}:
            # retrieve operators ('::' pop, ':' retrieve) take exactly one symbol
            assert len(node.children) == 2
            arg = node.children[-1]
            if arg.parser.name != 'symbol':
                node.add_error(('Retrieve Operator "%s" requires a symbol, '
                                'and not a %s.') % (prefix, str(arg.parser)))
                return str(arg.result)
            if str(arg) in self.directives['filter']:
                # a `<symbol>_filter` directive has been given for this symbol
                custom_args = ['retrieve_filter=%s' % self.directives['filter'][str(arg)]]
            # remember the symbol, so that its definition gets wrapped in Capture()
            self.variables.add(str(arg))  # cast(str, arg.result)

        elif len(node.children) > 2:
            # Fold all children after the prefix into the second child:
            # its old content becomes the first grandchild, the remaining
            # siblings follow, and it takes over this node's parser.
            # shift = (Node(node.parser, node.result[1].result),)
            # node.result[1].result = shift + node.result[2:]
            node.children[1].result = (Node(node.children[1].parser, node.children[1].result),) \
                                    + node.children[2:]
            node.children[1].parser = node.parser
            node.result = (node.children[0], node.children[1])

        # drop the prefix, so that only the operand(s) get compiled
        node.result = node.children[1:]
        try:
            parser_class = self.PREFIX_TABLE[prefix]
            return self.non_terminal(node, parser_class, custom_args)
        except KeyError:
            # NOTE(review): the try-block also covers `non_terminal()`, so a
            # KeyError raised while compiling the children would be
            # reported as an unknown prefix as well.
            node.add_error('Unknown prefix "%s".' % prefix)
        return ""
607

608
    def on_option(self, node) -> str:
        """Compile an `option` node into an ``Optional(...)`` call."""
        return self.non_terminal(node, parser_class='Optional')

611
    def on_repetition(self, node) -> str:
        """Compile a `repetition` node into a ``ZeroOrMore(...)`` call."""
        return self.non_terminal(node, parser_class='ZeroOrMore')

614
    def on_oneormore(self, node) -> str:
        """Compile a `oneormore` node into a ``OneOrMore(...)`` call."""
        return self.non_terminal(node, parser_class='OneOrMore')

617
    def on_regexchain(self, node) -> str:
        """Regex chains are parsed by the grammar but cannot be compiled
        yet.

        Raises:
            EBNFCompilerError: always.
        """
        message = "Not yet implemented!"
        raise EBNFCompilerError(message)

620
    def on_group(self, node) -> str:
        """Guard against `group` nodes that survived the AST
        transformation.

        Raises:
            EBNFCompilerError: always.
        """
        message = ("Group nodes should have been eliminated by "
                   "AST transformation!")
        raise EBNFCompilerError(message)

624
    def on_symbol(self, node: Node) -> str:
        """Compile a reference to a symbol.

        A symbol declared as scanner token yields a ``ScannerToken(...)``
        call. Any other symbol is returned as is; its node is recorded for
        the missing-production check in `assemble_parser()`, and the
        symbol is marked as recursive if its rule has already been
        registered.
        """
        symbol = str(node)  # ; assert result == cast(str, node.result)
        if symbol in self.directives['tokens']:
            return 'ScannerToken("' + symbol + '")'

        self.symbol_nodes.append(node)
        if symbol in self.rules:
            self.recursive.add(symbol)
        return symbol
633

634
    def on_literal(self, node) -> str:
        """Compile a literal into a ``Token(...)`` call. The literal keeps
        its own quotation marks; every backslash in it is doubled."""
        quoted = str(node).replace('\\', r'\\')
        return 'Token(' + quoted + ')'  # return 'Token(' + ', '.join([node.result]) + ')' ?
636

637
    def on_regexp(self, node: Node) -> str:
        """Compile a regular expression into an ``RE(...)`` call.

        Whitespace markers (``~``) before or after the slashes are
        stripped. A ``wL=`` / ``wR=`` keyword argument is emitted only
        where the marker deviates from the ``literalws`` directive: the
        whitespace keyword if a marker is present but that side is not
        covered by ``literalws``, ``''`` if the marker is absent but the
        side is covered.

        Returns (str):
            The ``RE(...)`` expression, or the quoted error message if
            checking the expression raises an ``AttributeError``.
        """
        rx = str(node)  # ; assert rx == cast(str, node.result)
        literalws = self.directives['literalws']
        ws_args = []   # type: List[str]

        if rx.startswith('~/'):
            if 'left' not in literalws:
                ws_args.insert(0, 'wL=' + self.WHITESPACE_KEYWORD)
            rx = rx[1:]
        elif 'left' in literalws:
            ws_args.insert(0, "wL=''")

        if rx.endswith('/~'):
            if 'right' not in literalws:
                ws_args.insert(0, 'wR=' + self.WHITESPACE_KEYWORD)
            rx = rx[:-1]
        elif 'right' in literalws:
            ws_args.insert(0, "wR=''")

        try:
            # strip the enclosing slashes and unescape embedded ones
            pattern = rx[1:-1].replace(r'\/', '/')
            arg = repr(self._check_rx(node, pattern))
        except AttributeError as error:
            errmsg = EBNFCompiler.AST_ERROR + " (" + str(error) + ")\n" + \
                     node.as_sexpr()
            node.add_error(errmsg)
            return '"' + errmsg + '"'
        return 'RE(' + ', '.join([arg] + ws_args) + ')'

661
    def on_list_(self, node) -> Set[str]:
        """Return the items of a `list_` node as a set of strings, each
        stripped of surrounding whitespace."""
        assert node.children
        return {item.result.strip() for item in node.children}
664
665


666
def get_ebnf_compiler(grammar_name="", grammar_source="") -> EBNFCompiler:
    """Return the shared ``EBNFCompiler`` instance, set up for the given
    grammar name and source.

    The instance is cached in the module-level global
    ``thread_local_ebnf_compiler_singleton``. An existing instance is
    re-targeted with ``set_grammar_name()``; a ``NameError`` signals that
    none exists yet, in which case a new one is created.
    """
    global thread_local_ebnf_compiler_singleton
    try:
        compiler = thread_local_ebnf_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
    except NameError:
        compiler = EBNFCompiler(grammar_name, grammar_source)
        thread_local_ebnf_compiler_singleton = compiler
    return compiler