
Commit d18f157c authored by Eckhart Arnold

- preprocessing tests + some bug fixes

parent f2162cfb
DHParser/__init__.py
@@ -18,19 +18,20 @@ implied. See the License for the specific language governing
permissions and limitations under the License.
"""
# Flat namespace for the DHParser Package. Is this a good idea...?
from .error import *
from .dsl import *
from .ebnf import *
from .parsers import *
# Flat namespace for the DHParser Package. Is this a good idea...?
from .error import *
from .parse import *
from .preprocess import *
from .stringview import *
from .syntaxtree import *
from .testing import *
from .toolkit import *
from .transform import *
from .testing import *
from .versionnumber import __version__
__author__ = "Eckhart Arnold <arnold@badw.de>"
__copyright__ = "http://www.apache.org/licenses/LICENSE-2.0"
# __all__ = ['toolkit', 'stringview', 'error', 'syntaxtree', 'parser', 'transform', 'ebnf', 'dsl', 'testing',
# 'versionnumber'] # flat namespace
# __all__ = ['toolkit', 'stringview', 'error', 'syntaxtree', 'preprocess', 'parse',
# 'transform', 'ebnf', 'dsl', 'testing', 'versionnumber']
DHParser/dsl.py
@@ -20,18 +20,20 @@ compilation of domain specific languages based on an EBNF-grammar.
"""
import os
from typing import Any, cast, List, Tuple, Union, Iterator, Iterable
from DHParser.ebnf import EBNFCompiler, grammar_changed, \
get_ebnf_preprocessor, get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, \
PreprocessorFactoryFunc, ParserFactoryFunc, TransformerFactoryFunc, CompilerFactoryFunc
from DHParser.error import Error, is_error, has_errors, only_errors
from DHParser.parsers import Grammar, Compiler, compile_source, nil_preprocessor, PreprocessorFunc
from DHParser.parse import Grammar, Compiler, compile_source
from DHParser.preprocess import nil_preprocessor, PreprocessorFunc
from DHParser.syntaxtree import Node, TransformationFunc
from DHParser.toolkit import logging, load_if_file, is_python_code, compile_python_object, \
re, typing
from typing import Any, cast, List, Tuple, Union, Iterator, Iterable
re
__all__ = ('GrammarError',
__all__ = ('DHPARSER_IMPORTS',
'GrammarError',
'CompilationError',
'load_compiler_suite',
'compileDSL',
@@ -70,7 +72,7 @@ try:
except ImportError:
import re
from DHParser import logging, is_filename, load_if_file, \\
Grammar, Compiler, nil_preprocessor, \\
Grammar, Compiler, nil_preprocessor, PreprocessorToken, \\
Lookbehind, Lookahead, Alternative, Pop, Token, Synonym, AllOf, SomeOf, Unordered, \\
Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, RE, Capture, \\
ZeroOrMore, Forward, NegativeLookahead, mixin_comment, compile_source, \\
@@ -495,14 +497,15 @@ def compile_on_disk(source_file: str, compiler_suite="", extension=".xml") -> It
+ "\n# ".join(str(error).split('\n)')))
print(result)
finally:
if f: f.close()
if f:
f.close()
return messages
def recompile_grammar(ebnf_filename, force=False) -> bool:
"""
Recompiles an ebnf-grammar if necessary, that is, if either no
Re-compiles an EBNF-grammar if necessary, that is, if either no
corresponding 'XXXXCompiler.py'-file exists or if that file is
outdated.
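For readers of this diff, a minimal usage sketch of recompile_grammar() as described by the docstring above. The grammar file name is made up, and the assumption that the returned boolean signals an error-free recompile is the editor's, not something the diff states:

    from DHParser.dsl import recompile_grammar

    # Recompiles 'Arithmetic.ebnf' only if no up-to-date ArithmeticCompiler.py exists;
    # force=True would presumably recompile unconditionally.
    if not recompile_grammar('Arithmetic.ebnf', force=False):
        print("Recompiling the grammar reported errors.")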
DHParser/ebnf.py
@@ -19,18 +19,19 @@ permissions and limitations under the License.
import keyword
from collections import OrderedDict
from functools import partial
from typing import Callable, Dict, List, Set, Tuple
from DHParser.error import Error
from DHParser.parsers import Grammar, mixin_comment, nil_preprocessor, Forward, RegExp, RE, \
from DHParser.parse import Grammar, mixin_comment, Forward, RegExp, RE, \
NegativeLookahead, Alternative, Series, Option, OneOrMore, ZeroOrMore, Token, \
Compiler, PreprocessorFunc
Compiler
from DHParser.preprocess import nil_preprocessor, PreprocessorFunc
from DHParser.syntaxtree import Node, TransformationFunc, WHITESPACE_PTYPE, TOKEN_PTYPE
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name, re, typing
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name, re
from DHParser.transform import traverse, remove_brackets, \
reduce_single_child, replace_by_single_child, remove_expendables, \
remove_tokens, flatten, forbid, assert_content, remove_infix_operator
from DHParser.versionnumber import __version__
from typing import Callable, Dict, List, Set, Tuple
__all__ = ('get_ebnf_preprocessor',
'get_ebnf_grammar',
@@ -332,7 +333,7 @@ class EBNFCompiler(Compiler):
`alternative = a | b`
Now `[str(node) for node in self.rules['alternative']]`
Now `[node.content for node in self.rules['alternative']]`
yields `['alternative = a | b', 'a', 'b']`
symbols: A mapping of symbol names to their first usage (not
@@ -597,7 +598,7 @@ class EBNFCompiler(Compiler):
def on_definition(self, node: Node) -> Tuple[str, str]:
rule = str(node.children[0])
rule = node.children[0].content
if rule in self.rules:
first = self.rules[rule][0]
if not first._errors:
@@ -652,7 +653,7 @@ class EBNFCompiler(Compiler):
def on_directive(self, node: Node) -> str:
key = str(node.children[0]).lower()
key = node.children[0].content.lower()
assert key not in self.directives['tokens']
if key not in self.REPEATABLE_DIRECTIVES:
@@ -674,8 +675,9 @@ class EBNFCompiler(Compiler):
else:
node.add_error('Value "%s" not allowed for directive "%s".' % (value, key))
else:
value = str(node.children[1]).strip("~") # cast(str, node.children[1].result).strip("~")
if value != str(node.children[1]): # cast(str, node.children[1].result):
value = node.children[1].content.strip("~") # cast(str, node.children[
# 1].result).strip("~")
if value != node.children[1].content: # cast(str, node.children[1].result):
node.add_error("Whitespace marker '~' not allowed in definition of "
"%s regular expression." % key)
if value[0] + value[-1] in {'""', "''"}:
@@ -688,11 +690,11 @@ class EBNFCompiler(Compiler):
self.directives[key] = value
elif key == 'ignorecase':
if str(node.children[1]).lower() not in {"off", "false", "no"}:
if node.children[1].content.lower() not in {"off", "false", "no"}:
self.re_flags.add('i')
# elif key == 'testing':
# value = str(node.children[1])
# value = node.children[1].content
# self.directives['testing'] = value.lower() not in {"off", "false", "no"}
elif key == 'literalws':
@@ -708,7 +710,7 @@ class EBNFCompiler(Compiler):
elif key in {'tokens', 'preprocessor_tokens'}:
tokens = self.compile(node.children[1])
redeclared = self.directives['tokes'] & tokens
redeclared = self.directives['tokens'] & tokens
if redeclared:
node.add_error('Tokens %s have already been declared earlier. '
% str(redeclared) + 'Later declaration will be ignored',
@@ -752,7 +754,7 @@ class EBNFCompiler(Compiler):
filtered_children = []
i = 0
for nd in node.children:
if nd.parser.ptype == TOKEN_PTYPE and str(nd) == "§":
if nd.parser.ptype == TOKEN_PTYPE and nd.content == "§":
mandatory_marker.append(i)
if i == 0:
nd.add_error('First item of a series should not be mandatory.',
@@ -774,7 +776,7 @@ class EBNFCompiler(Compiler):
def on_factor(self, node: Node) -> str:
assert node.children
assert len(node.children) >= 2, node.as_sxpr()
prefix = str(node.children[0]) # cast(str, node.children[0].result)
prefix = node.children[0].content
custom_args = [] # type: List[str]
if prefix in {'::', ':'}:
@@ -806,15 +808,15 @@ class EBNFCompiler(Compiler):
if len(nd.children) >= 1:
nd = nd.children[0]
while nd.parser.name == "symbol":
symlist = self.rules.get(str(nd), [])
symlist = self.rules.get(nd.content, [])
if len(symlist) == 2:
nd = symlist[1]
else:
if len(symlist) == 1:
nd = symlist[0].children[1]
break
if (nd.parser.name != "regexp" or str(nd)[:1] != '/'
or str(nd)[-1:] != '/'):
if (nd.parser.name != "regexp" or nd.content[:1] != '/'
or nd.content[-1:] != '/'):
node.add_error("Lookbehind-parser can only be used with plain RegExp-"
"parsers, not with: " + nd.parser.name + nd.parser.ptype)
@@ -838,10 +840,6 @@ class EBNFCompiler(Compiler):
return self.non_terminal(node, 'OneOrMore')
def on_regexchain(self, node) -> str:
raise EBNFCompilerError("Not yet implemented!")
def on_group(self, node) -> str:
raise EBNFCompilerError("Group nodes should have been eliminated by "
"AST transformation!")
@@ -851,7 +849,7 @@ class EBNFCompiler(Compiler):
assert len(node.children) == 1
nd = node.children[0]
for child in nd.children:
if child.parser.ptype == TOKEN_PTYPE and str(nd) == "§":
if child.parser.ptype == TOKEN_PTYPE and nd.content == "§":
node.add_error("Unordered parser lists cannot contain mandatory (§) items.")
args = ', '.join(self.compile(child) for child in nd.children)
if nd.parser.name == "term":
@@ -863,7 +861,7 @@ class EBNFCompiler(Compiler):
return ""
def on_symbol(self, node: Node) -> str: # called only for symbols on the right hand side!
symbol = str(node) # ; assert result == cast(str, node.result)
symbol = node.content # ; assert result == cast(str, node.result)
if symbol in self.directives['tokens']:
return 'PreprocessorToken("' + symbol + '")'
else:
@@ -878,11 +876,12 @@ class EBNFCompiler(Compiler):
def on_literal(self, node) -> str:
return 'Token(' + str(node).replace('\\', r'\\') + ')' # return 'Token(' + ', '.merge_children([node.result]) + ')' ?
return 'Token(' + node.content.replace('\\', r'\\') + ')' # return 'Token(' + ',
# '.merge_children([node.result]) + ')' ?
def on_regexp(self, node: Node) -> str:
rx = str(node)
rx = node.content
name = [] # type: List[str]
if rx[0] == '/' and rx[-1] == '/':
parser = 'RegExp('
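The 'tokes' → 'tokens' fix above makes the redeclaration check read the intended directive entry; the check itself is just a set intersection. A stand-alone sketch with made-up names (plain Python sets standing in for the compiler's bookkeeping):

    declared_earlier = {'INDENT', 'DEDENT'}     # stand-in for self.directives['tokens']
    newly_declared = {'DEDENT', 'LINEFEED'}     # stand-in for the freshly compiled tokens
    redeclared = declared_earlier & newly_declared
    assert redeclared == {'DEDENT'}             # exactly these trigger the 'already declared' warning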
DHParser/error.py
@@ -18,11 +18,9 @@ permissions and limitations under the License.
import bisect
import functools
from typing import Iterable, Iterator, Union, Tuple, List
from DHParser.stringview import StringView
from DHParser.toolkit import typing
from typing import Hashable, Iterable, Iterator, Union, Tuple, List
__all__ = ('Error',
'is_error',
@@ -71,10 +69,16 @@ class Error:
@property
def level_str(self):
"""Returns a string representation of the error level, e.g. "warning".
"""
"""Returns a string representation of the error level, e.g. "warning"."""
return "Warning" if is_warning(self.code) else "Error"
def visualize(self, document: str) -> str:
"""Shows the line of the document and the position where the error
occurred."""
start = document.rfind('\n', 0, self.pos) + 1
stop = document.find('\n', self.pos)
return document[start:stop] + '\n' + ' ' * (self.pos - start) + '^\n'
def is_warning(code: int) -> bool:
"""Returns True, if error is merely a warning."""
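The new Error.visualize() method is short enough to trace by hand; restating its body as free-standing code with a made-up document and error position:

    document = "alpha\nbeta\ngamma\n"
    pos = 8                                   # the 't' of 'beta'
    start = document.rfind('\n', 0, pos) + 1  # 6, first column of the error line
    stop = document.find('\n', pos)           # 10, end of the error line
    print(document[start:stop] + '\n' + ' ' * (pos - start) + '^')
    # beta
    #   ^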
DHParser/parsers.py → DHParser/parse.py
@@ -59,26 +59,20 @@ import collections
import copy
import html
import os
from functools import partial
from DHParser.error import Error, is_error, has_errors, linebreaks, line_col
from DHParser.stringview import StringView, EMPTY_STRING_VIEW
from DHParser.syntaxtree import Node, TransformationFunc, ParserBase, WHITESPACE_PTYPE, \
TOKEN_PTYPE, ZOMBIE_PARSER
from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME, \
PreprocessorFunc
from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name, \
load_if_file, re, typing
escape_control_characters, load_if_file, re, typing
from typing import Any, Callable, cast, Dict, List, Set, Tuple, Union, Optional
__all__ = ('PreprocessorFunc',
'HistoryRecord',
__all__ = ('HistoryRecord',
'Parser',
'Grammar',
'RX_PREPROCESSOR_TOKEN',
'BEGIN_TOKEN',
'END_TOKEN',
'make_token',
'nil_preprocessor',
'PreprocessorToken',
'RegExp',
'RE',
@@ -117,9 +111,6 @@ __all__ = ('PreprocessorFunc',
########################################################################
PreprocessorFunc = Union[Callable[[str], str], partial]
LEFT_RECURSION_DEPTH = 8 # type: int
# because of python's recursion depth limit, this value ought not to be
# set too high. PyPy allows higher values than CPython
@@ -242,7 +233,7 @@ class HistoryRecord:
def excerpt(self):
length = len(self.node) if self.node else len(self.text)
excerpt = str(self.node)[:min(length, 20)] if self.node else self.text[:20]
excerpt = excerpt.replace('\n', '\\n')
excerpt = escape_control_characters(excerpt)
if length > 20:
excerpt += '...'
return excerpt
@@ -1007,27 +998,28 @@ class Grammar:
if html and len(log) % 100 == 0:
log.append('\n</table>\n<table>\n' + HistoryRecord.COLGROUP)
if is_logging():
assert self.history__, \
"Parser did not yet run or logging was turned off when running parser!"
if not log_file_name:
name = self.__class__.__name__
log_file_name = name[:-7] if name.lower().endswith('grammar') else name
elif log_file_name.lower().endswith('.log'):
log_file_name = log_file_name[:-4]
full_history, match_history, errors_only = [], [], []
for record in self.history__:
line = record.as_html_tr() if html else str(record)
append_line(full_history, line)
if record.node and record.node.parser.ptype != WHITESPACE_PTYPE:
append_line(match_history, line)
if record.node.error_flag:
append_line(errors_only, line)
write_log(full_history, log_file_name + '_full')
if len(full_history) > 250:
write_log(full_history[-200:], log_file_name + '_full.tail')
write_log(match_history, log_file_name + '_match')
write_log(errors_only, log_file_name + '_errors')
if not is_logging():
raise AssertionError("Cannot log history when logging is turned off!")
assert self.history__, \
"Parser did not yet run or logging was turned off when running parser!"
if not log_file_name:
name = self.__class__.__name__
log_file_name = name[:-7] if name.lower().endswith('grammar') else name
elif log_file_name.lower().endswith('.log'):
log_file_name = log_file_name[:-4]
full_history, match_history, errors_only = [], [], []
for record in self.history__:
line = record.as_html_tr() if html else str(record)
append_line(full_history, line)
if record.node and record.node.parser.ptype != WHITESPACE_PTYPE:
append_line(match_history, line)
if record.node.error_flag:
append_line(errors_only, line)
write_log(full_history, log_file_name + '_full')
if len(full_history) > 250:
write_log(full_history[-200:], log_file_name + '_full.tail')
write_log(match_history, log_file_name + '_match')
write_log(errors_only, log_file_name + '_errors')
def dsl_error_msg(parser: Parser, error_str: str) -> str:
@@ -1059,31 +1051,6 @@ def dsl_error_msg(parser: Parser, error_str: str) -> str:
########################################################################
RX_PREPROCESSOR_TOKEN = re.compile(r'\w+')
BEGIN_TOKEN = '\x1b'
END_TOKEN = '\x1c'
def make_token(token: str, argument: str = '') -> str:
"""
Turns the ``token`` and ``argument`` into a special token that
will be caught by the `PreprocessorToken`-parser.
This function is a support function that should be used by
preprocessors to inject preprocessor tokens into the source text.
"""
assert RX_PREPROCESSOR_TOKEN.match(token)
assert argument.find(BEGIN_TOKEN) < 0
assert argument.find(END_TOKEN) < 0
return BEGIN_TOKEN + token + argument + END_TOKEN
def nil_preprocessor(text: str) -> str:
"""A preprocessor that does nothing, i.e. just returns the input."""
return text
class PreprocessorToken(Parser):
"""
Parses tokens that have been inserted by a preprocessor.
@@ -1097,7 +1064,7 @@ class PreprocessorToken(Parser):
def __init__(self, token: str) -> None:
assert token and token.isupper()
assert RX_PREPROCESSOR_TOKEN.match(token)
assert RX_TOKEN_NAME.match(token)
super(PreprocessorToken, self).__init__(token)
def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
@@ -1121,8 +1088,7 @@ class PreprocessorToken(Parser):
'(Most likely due to a preprocessor bug!)')
return node, text[end:]
if text[1:len(self.name) + 1] == self.name:
return Node(self, text[len(self.name) + 1:end]), \
text[end + 1:]
return Node(self, text[len(self.name) + 2:end]), text[end + 1:]
return None, text
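The changed slice in PreprocessorToken.__call__ follows from the new token layout introduced in preprocess.py below (BEGIN_TOKEN, name, TOKEN_DELIMITER, argument, END_TOKEN): the argument now begins len(name) + 2 characters into the text instead of len(name) + 1. A free-standing check with a made-up token:

    name, argument = 'INDENT', '    '
    token = '\x1b' + name + '\x1c' + argument + '\x1d'  # BEGIN_TOKEN ... TOKEN_DELIMITER ... END_TOKEN
    end = token.find('\x1d')
    assert token[1:len(name) + 1] == name               # name test as in __call__
    assert token[len(name) + 2:end] == argument         # argument starts after the delimiter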
@@ -1157,15 +1123,21 @@ class RegExp(Parser):
return RegExp(regexp, self.name)
def __call__(self, text: StringView) -> Tuple[Optional[Node], StringView]:
if text[0:1] != BEGIN_TOKEN: # ESC starts a preprocessor token.
match = text.match(self.regexp)
if match:
end = text.index(match.end())
return Node(self, match.group(0), True), text[end:]
match = text.match(self.regexp)
if match:
capture = match.group(0)
end = text.index(match.end())
# regular expressions must never match preprocessor-tokens!
# TODO: Find a better solution here, e.g. static checking/re-mangling at compile time
i = capture.find(BEGIN_TOKEN)
if i >= 0:
capture = capture[:i]
end = i
return Node(self, capture, True), text[end:]
return None, text
def __repr__(self):
return '/%s/' % self.regexp.pattern
return escape_control_characters('/%s/' % self.regexp.pattern)
class Whitespace(RegExp):
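The RegExp change above truncates a match at the first BEGIN_TOKEN so that regular expressions never swallow (part of) a preprocessor token. The truncation is plain string slicing; a minimal illustration with a made-up capture:

    BEGIN_TOKEN = '\x1b'
    capture = 'foo\x1bINDENT\x1c    \x1d'   # a regex that accidentally matched into a token
    i = capture.find(BEGIN_TOKEN)
    if i >= 0:
        capture = capture[:i]               # keep only the text before the token marker
    assert capture == 'foo'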
""" preprocess.py - preprocessing of source files for DHParser
Copyright 2016 by Eckhart Arnold (arnold@badw.de)
Bavarian Academy of Sciences and Humanities (badw.de)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
"""
import bisect
import collections
import functools
from DHParser.toolkit import typing, re
from typing import Union, Callable
__all__ = ('RX_TOKEN_NAME',
'BEGIN_TOKEN',
'TOKEN_DELIMITER',
'END_TOKEN',
'PreprocessorFunc',
'make_token',
'nil_preprocessor',
'pp_tokenized',
'tokenized_to_original_mapping',
'source_map')
BEGIN_TOKEN = '\x1b'
TOKEN_DELIMITER = '\x1c'
END_TOKEN = '\x1d'
RESERVED_TOKEN_CHARS = BEGIN_TOKEN + TOKEN_DELIMITER + END_TOKEN
RX_TOKEN_NAME = re.compile(r'\w+')
RX_TOKEN_ARGUMENT = re.compile(r'[^\x1b\x1c\x1d]*')
RX_TOKEN = re.compile(r'\x1b(?P<name>\w+)\x1c(?P<argument>[^\x1b\x1c\x1d]*)\x1d')
PreprocessorFunc = Union[Callable[[str], str], functools.partial]
def make_token(token: str, argument: str = '') -> str:
"""
Turns the ``token`` and ``argument`` into a special token that
will be caught by the `PreprocessorToken`-parser.
This function is a support function that should be used by
preprocessors to inject preprocessor tokens into the source text.
"""
assert RX_TOKEN_NAME.match(token)
assert RX_TOKEN_ARGUMENT.match(argument)
return BEGIN_TOKEN + token + TOKEN_DELIMITER + argument + END_TOKEN
def nil_preprocessor(text: str) -> str:
"""A preprocessor that does nothing, i.e. just returns the input."""
return text
def pp_tokenized(tokenized: str) -> str:
"""Returns a pretty-printable version of a document that contains tokens."""
return tokenized.replace('\x1b', '<').replace('\x1c', '|').replace('\x1d', '>')
#######################################################################
#
# Source Maps - mapping source code positions between different
# transformations of the source text
#
#######################################################################
SourceMap = collections.namedtuple('SourceMap', ['positions', 'offsets'])
def tokenized_to_original_mapping(tokenized_source: str) -> SourceMap:
"""
Generates a source map for mapping positions in a text that has
been enriched with token markers to their original positions.
Args:
tokenized_source: the source text enriched with token markers
Returns:
a source map, i.e. a list of positions and a list of corresponding
offsets. The list of positions is ordered from smallest to highest.
An offset is valid for its associated position and all following
positions until (and excluding) the next position in the list of
positions.
"""
positions, offsets = [0], [0]
o = 0
i = tokenized_source.find(BEGIN_TOKEN)
while i >= 0:
d = tokenized_source.find(TOKEN_DELIMITER, i)
e = tokenized_source.find(END_TOKEN, i)
assert 0 <= d < e
o -= (d - i + 2)
positions.extend([d + 1, e + 1])
offsets.extend([o, o - 1])
i = tokenized_source.find(BEGIN_TOKEN, e + 1)
# post conditions
assert len(positions) == len(offsets), '\n' + str(positions) + '\n' + str(offsets)
assert positions[0] == 0
assert all(positions[i] < positions[i + 1] for i in range(len(positions) - 1))
assert all(offsets[i] > offsets[i + 1] for i in range(len(offsets) - 1))
return SourceMap(positions, offsets)
def source_map(position: int, srcmap: SourceMap) -> int:
"""
Maps a position in a (pre-)processed text to its corresponding
position in the original document according to the given source map.
Args:
position: the position in the processed text
srcmap: the source map, i.e. a mapping of locations to
offset values
Returns:
the mapped position
"""
i = bisect.bisect_right(srcmap[0], position)
if i:
return position + srcmap[1][i - 1]
raise ValueError
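To make the token format and the source-map machinery of the new preprocess module concrete, a small usage sketch; the sample text is made up and only the invariants stated in the code above are asserted (the exact mapped offsets are left unasserted):

    from DHParser.preprocess import (BEGIN_TOKEN, TOKEN_DELIMITER, END_TOKEN,
                                     make_token, pp_tokenized,
                                     tokenized_to_original_mapping, source_map)

    token = make_token('INDENT', '    ')
    assert token == BEGIN_TOKEN + 'INDENT' + TOKEN_DELIMITER + '    ' + END_TOKEN
    assert pp_tokenized(token) == '<INDENT|    >'

    tokenized = "line one\n" + token + "line two"
    srcmap = tokenized_to_original_mapping(tokenized)
    # the documented post conditions: positions ascend, offsets descend
    assert all(a < b for a, b in zip(srcmap.positions, srcmap.positions[1:]))
    assert all(a > b for a, b in zip(srcmap.offsets, srcmap.offsets[1:]))
    # positions before the first token marker map onto themselves ...
    assert source_map(0, srcmap) == 0
    # ... positions behind the token are shifted back towards the original text
    print(source_map(len(tokenized) - 1, srcmap))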
"""cstringview.pyx - a cython-version of the stringview class for speedup
slicing strings without copying
"""stringview.py - a string class where slices are views not copies as
with the standard Python strings.
stringview.pxd - declarations for the cython Python to C compiler
to speed up handling of StringViews.
Copyright 2016 by Eckhart Arnold (arnold@badw.de)
Bavarian Academy of Sciences and Humanities (badw.de)
@@ -24,6 +27,7 @@ time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.
"""
import collections
from DHParser.toolkit import typing
@@ -81,7 +85,7 @@ def real_indices(begin: Optional[int],
class StringView(collections.abc.Sized):
"""
A rudimentary StringView class, just enough for the use cases
in parsers.py. The difference between a StringView and the python
in parse.py. The difference between a StringView and the python
builtin strings is that StringView-objects do slicing without
copying, i.e. slices are just a view on a section of the sliced
string.
DHParser/syntaxtree.py
@@ -399,7 +399,6 @@ class Node(collections.abc.Sized):
return self