Currently job artifacts in CI/CD pipelines on LRZ GitLab never expire. Starting from Wed 26.1.2022 the default expiration time will be 30 days (GitLab default). Currently existing artifacts in already completed jobs will not be affected by the change. The latest artifacts for all jobs in the latest successful pipelines will be kept. More information: https://gitlab.lrz.de/help/user/admin_area/settings/continuous_integration.html#default-artifacts-expiration

Commit a471a1e0 authored by di68kap's avatar di68kap
Browse files

- parsers.py: string slicing replaced by toolkit.StringView

parent 481891e3
......@@ -77,7 +77,8 @@ except ImportError:
from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name
from DHParser.syntaxtree import WHITESPACE_PTYPE, TOKEN_PTYPE, ZOMBIE_PARSER, ParserBase, \
Node, TransformationFunc
from DHParser.toolkit import TextView, load_if_file, error_messages, line_col
from DHParser.toolkit import StringView, EMPTY_STRING_VIEW, sv_match, sv_index, sv_search, \
load_if_file, error_messages, line_col
__all__ = ('PreprocessorFunc',
'HistoryRecord',
......@@ -161,7 +162,7 @@ class HistoryRecord:
# type: List['Parser']
self.node = node # type: Node
self.remaining = remaining # type: int
document = call_stack[-1].grammar.document__ if call_stack else ''
document = call_stack[-1].grammar.document__.text if call_stack else ''
self.line_col = line_col(document, len(document) - remaining) # type: Tuple[int, int]
def __str__(self):
......@@ -229,11 +230,13 @@ def add_parser_guard(parser_func):
that takes care of memoizing, left recursion and optionally tracing
(aka "history tracking") of parser calls. Returns the wrapped call.
"""
def guarded_call(parser: 'Parser', text: str) -> Tuple[Node, str]:
def guarded_call(parser: 'Parser', text: StringView) -> Tuple[Node, StringView]:
assert isinstance(text, StringView)
def memoized(parser, location):
node = parser.visited[location]
rlen = location - (0 if node is None else node.len)
rest = TextView(grammar.document__, -rlen) if rlen else ''
rest = grammar.document__[-rlen:] if rlen else EMPTY_STRING_VIEW
return node, rest
# NOTE: An older and simpler implementation of memoization
# relied on `parser.visited[location] == node, rest`. Although,
......@@ -267,6 +270,7 @@ def add_parser_guard(parser_func):
# run original __call__ method
node, rest = parser_func(parser, text)
assert isinstance(rest, StringView)
if node is None:
# retrieve an earlier match result (from left recursion) if it exists
......@@ -302,7 +306,7 @@ def add_parser_guard(parser_func):
node = Node(None, text[:min(10, max(1, text.find("\n")))] + " ...")
node.add_error("maximum recursion depth of parser reached; "
"potentially due to too many errors!")
rest = ''
rest = EMPTY_STRING_VIEW
return node, rest
......@@ -409,7 +413,7 @@ class Parser(ParserBase, metaclass=ParserMetaClass):
self.cycle_detection = set() # type: Set[Callable]
return self
def __call__(self, text: TextView) -> Tuple[Node, TextView]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
"""Applies the parser to the given `text` and returns a node with
the results or None as well as the text at the position right behind
the matching string."""
......@@ -724,8 +728,8 @@ class Grammar:
def _reset__(self):
self.document__ = "" # type: str
self._reversed__ = "" # type: str
self.document__ = EMPTY_STRING_VIEW # type: StringView
self._reversed__ = EMPTY_STRING_VIEW # type: StringView
# variables stored and recalled by Capture and Retrieve parsers
self.variables__ = dict() # type: Dict[str, List[str]]
self.rollback__ = [] # type: List[Tuple[int, Callable]]
......@@ -742,7 +746,7 @@ class Grammar:
@property
def reversed__(self) -> str:
if not self._reversed__:
self._reversed__ = self.document__[::-1]
self._reversed__ = StringView(self.document__.text[::-1])
return self._reversed__
......@@ -784,13 +788,13 @@ class Grammar:
else:
self._dirty_flag__ = True
self.history_tracking__ = is_logging()
self.document__ = document
self.last_rb__loc__ = len(document) + 1 # rollback location
self.document__ = StringView(document)
self.last_rb__loc__ = len(self.document__) + 1 # rollback location
parser = self[start_parser] if isinstance(start_parser, str) else start_parser
assert parser.grammar == self, "Cannot run parsers from a different grammar object!" \
" %s vs. %s" % (str(self), str(parser.grammar))
stitches = [] # type: List[Node]
rest = document
rest = self.document__
if not rest:
result, ignore = parser(rest)
if result is None:
......@@ -883,7 +887,7 @@ class Grammar:
document.
"""
def prepare_line(record):
excerpt = self.document__.__getitem__(record.extent)[:25].replace('\n', '\\n')
excerpt = self.document__.text.__getitem__(record.extent)[:25].replace('\n', '\\n')
excerpt = "'%s'" % excerpt if len(excerpt) < 25 else "'%s...'" % excerpt
return record.stack, record.status, excerpt
......@@ -985,7 +989,7 @@ class PreprocessorToken(Parser):
assert RX_PREPROCESSOR_TOKEN.match(token)
super(PreprocessorToken, self).__init__(token)
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
if text[0:1] == BEGIN_TOKEN:
end = text.find(END_TOKEN, 1)
if end < 0:
......@@ -1040,10 +1044,10 @@ class RegExp(Parser):
regexp = self.regexp.pattern
return RegExp(regexp, self.name)
def __call__(self, text: str) -> Tuple[Node, str]:
match = text[0:1] != BEGIN_TOKEN and self.regexp.match(text) # ESC starts a preprocessor token.
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
match = text[0:1] != BEGIN_TOKEN and sv_match(self.regexp, text) # ESC starts a preprocessor token.
if match:
end = match.end()
end = sv_index(match.end(), text)
return Node(self, text[:end]), text[end:]
return None, text
......@@ -1114,9 +1118,9 @@ class RE(Parser):
regexp = self.main.regexp.pattern
return self.__class__(regexp, self.wL, self.wR, self.name)
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
# assert self.main.regexp.pattern != "@"
t = text # type: str
t = text # type: StringView
wL, t = self.wspLeft(t)
main, t = self.main(t)
if main:
......@@ -1264,7 +1268,7 @@ class Optional(UnaryOperator):
"Nesting options with required elements is contradictory: " \
"%s(%s)" % (str(name), str(parser.name))
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
node, text = self.parser(text)
if node:
return Node(self, node), text
......@@ -1289,7 +1293,7 @@ class ZeroOrMore(Optional):
EBNF-Notation: `{ ... }`
EBNF-Example: `sentence = { /\w+,?/ } "."`
"""
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
results = () # type: Tuple[Node, ...]
n = len(text) + 1
while text and len(text) < n:
......@@ -1314,9 +1318,9 @@ class OneOrMore(UnaryOperator):
"Use ZeroOrMore instead of nesting OneOrMore and Optional: " \
"%s(%s)" % (str(name), str(parser.name))
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
results = () # type: Tuple[Node, ...]
text_ = text # type: str
text_ = text # type: StringView
n = len(text) + 1
while text_ and len(text_) < n:
n = len(text_)
......@@ -1340,9 +1344,9 @@ class Series(NaryOperator):
super(Series, self).__init__(*parsers, name=name)
assert len(self.parsers) >= 1
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
results = () # type: Tuple[Node, ...]
text_ = text # type: str
text_ = text # type: StringView
for parser in self.parsers:
node, text_ = parser(text_)
if not node:
......@@ -1400,7 +1404,7 @@ class Alternative(NaryOperator):
assert all(not isinstance(p, Optional) for p in self.parsers[:-1])
self.been_here = dict() # type: Dict[int, int]
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
for parser in self.parsers:
node, text_ = parser(text)
if node:
......@@ -1447,11 +1451,13 @@ class FlowOperator(UnaryOperator):
class Required(FlowOperator):
# Add constructor that checks for logical errors, like `Required(Optional(...))` constructs ?
def __call__(self, text: str) -> Tuple[Node, str]:
RX_ARGUMENT = re.compile(r'\s(\S)')
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
node, text_ = self.parser(text)
if not node:
m = re.search(r'\s(\S)', text)
i = max(1, m.regs[1][0]) if m else 1
m = sv_search(Required.RX_ARGUMENT, text) # re.search(r'\s(\S)', text)
i = max(1, sv_index(m.regs[1][0], text)) if m else 1
node = Node(self, text[:i])
text_ = text[i:]
# assert False, "*"+text[:i]+"*"
......@@ -1467,7 +1473,7 @@ class Lookahead(FlowOperator):
def __init__(self, parser: Parser, name: str = '') -> None:
super(Lookahead, self).__init__(parser, name)
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
node, text_ = self.parser(text)
if self.sign(node is not None):
return Node(self, ''), text
......@@ -1512,9 +1518,9 @@ class Lookbehind(FlowOperator):
self.regexp = p.main.regexp if isinstance(p, RE) else p.regexp
super(Lookbehind, self).__init__(parser, name)
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
backwards_text = self.grammar.reversed__[len(text):] # self.grammar.document__[-len(text) - 1::-1]
if self.sign(self.regexp.match(backwards_text)):
if self.sign(sv_match(self.regexp, backwards_text)):
return Node(self, ''), text
else:
return None, text
......@@ -1548,7 +1554,7 @@ class Capture(UnaryOperator):
def __init__(self, parser: Parser, name: str = '') -> None:
super(Capture, self).__init__(parser, name)
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
node, text_ = self.parser(text)
if node:
stack = self.grammar.variables__.setdefault(self.name, [])
......@@ -1590,13 +1596,13 @@ class Retrieve(Parser):
def __deepcopy__(self, memo):
return self.__class__(self.symbol, self.filter, self.name)
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
return self.call(text) # allow call method to be called from subclass circumventing the parser guard
def __repr__(self):
return ':' + self.symbol.repr
def call(self, text: str) -> Tuple[Node, str]:
def call(self, text: StringView) -> Tuple[Node, StringView]:
try:
stack = self.grammar.variables__[self.symbol.name]
value = self.filter(stack)
......@@ -1612,7 +1618,7 @@ class Retrieve(Parser):
class Pop(Retrieve):
"""STILL EXPERIMENTAL!!!"""
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
nd, txt = super(Pop, self).call(text) # call() instead of __call__() to avoid parser guard
if nd and not nd.error_flag:
stack = self.grammar.variables__[self.symbol.name]
......@@ -1644,7 +1650,7 @@ class Synonym(UnaryOperator):
class, in which case it would be unclear whether the parser
RE('\d\d\d\d') carries the name 'JAHRESZAHL' or 'jahr'.
"""
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
node, text = self.parser(text)
if node:
return Node(self, node), text
......@@ -1684,7 +1690,7 @@ class Forward(Parser):
duplicate.set(parser)
return duplicate
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
return self.parser(text)
def __repr__(self):
......
......@@ -31,7 +31,7 @@ except ImportError:
from .typing34 import AbstractSet, Any, ByteString, Callable, cast, Container, Dict, \
Iterator, List, NamedTuple, Sequence, Union, Text, Tuple
from DHParser.toolkit import is_logging, log_dir, TextView, line_col, identity
from DHParser.toolkit import is_logging, log_dir, StringView, line_col, identity
__all__ = ('WHITESPACE_PTYPE',
'MockParser',
......@@ -129,8 +129,8 @@ ZOMBIE_PARSER = ZombieParser()
Error = NamedTuple('Error', [('pos', int), ('msg', str)])
ChildrenType = Tuple['Node', ...]
StrictResultType = Union[ChildrenType, TextView, str]
ResultType = Union[ChildrenType, 'Node', TextView, str, None]
StrictResultType = Union[ChildrenType, StringView, str]
ResultType = Union[ChildrenType, 'Node', StringView, str, None]
def flatten_sxpr(sxpr: str) -> str:
......@@ -189,6 +189,7 @@ class Node:
__slots__ = ['_result', 'children', '_errors', '_len', '_pos', 'parser', 'error_flag']
def __init__(self, parser, result: ResultType) -> None:
"""Initializes the ``Node``-object with the ``Parser``-Instance
that generated the node and the parser's result.
......@@ -251,7 +252,7 @@ class Node:
# or isinstance(result, Node)
# or isinstance(result, str)), str(result)
self._result = (result,) if isinstance(result, Node) else str(result) \
if isinstance(result, TextView) else result or '' # type: StrictResultType
if isinstance(result, StringView) else result or '' # type: StrictResultType
self.children = cast(ChildrenType, self._result) \
if isinstance(self._result, tuple) else cast(ChildrenType, ()) # type: ChildrenType
self.error_flag = any(r.error_flag for r in self.children) # type: bool
......
......@@ -43,14 +43,18 @@ except ImportError:
import sys
try:
from typing import Any, List, Tuple, Optional
from typing import Any, List, Tuple, Collection, Union, Optional
except ImportError:
from .typing34 import Any, List, Tuple, Optional
from .typing34 import Any, List, Tuple, Collection, Union, Optional
__all__ = ('logging',
'is_logging',
'log_dir',
'logfile_basename',
'StringView',
'sv_match',
'sv_index',
'sv_search',
# 'supress_warnings',
# 'warnings',
# 'repr_call',
......@@ -150,22 +154,93 @@ def clear_logs(logfile_types={'.cst', '.ast', '.log'}):
os.rmdir(log_dirname)
class StringView:
    """A rudimentary string view class, just enough for the use cases
    in the parser module (DHParser.parser).

    Slicing Python strings always yields copies of a segment of the original
    string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
    However, this becomes costly (in terms of space and, as a consequence, also
    time) when parsing longer documents. Unfortunately, Python's `memoryview`
    does not work for unicode strings. Hence, the StringView class.

    A view stores a reference to the complete string (`text`) plus the
    absolute offsets `begin` and `end` of the window it represents; `len`
    caches the window size. Slicing a view yields another view onto the
    same string without copying any characters.
    """
    __slots__ = ['text', 'begin', 'end', 'len']

    def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
        """Create a view on `text[begin:end]`.

        `begin` and `end` follow slice conventions: `None` means "from the
        start" / "to the end", negative values count from the end of `text`,
        and out-of-range values are clamped.
        """
        self.text = text  # type: str
        self.begin, self.end = StringView.real_indices(begin, end, len(text))
        # An inverted window (end < begin) is treated as empty.
        self.len = max(self.end - self.begin, 0)

    @staticmethod
    def real_indices(begin, end, len):
        """Normalize slice-style indices for a sequence of length `len`.

        Returns a tuple `(begin, end)` of non-negative indices that are both
        clamped to the range `0..len`. `None` stands for 0 (begin) or `len`
        (end); negative values are counted from the end. The result is not
        reordered, i.e. `begin` may still be greater than `end`.
        """
        # NOTE: the parameter name `len` shadows the builtin inside this
        # method; it is kept unchanged so that existing callers keep working.
        def pack(index, len):
            index = index if index >= 0 else index + len
            return 0 if index < 0 else len if index > len else index
        if begin is None:
            begin = 0
        if end is None:
            end = len
        return pack(begin, len), pack(end, len)

    def __bool__(self):
        """A view is true if and only if it covers at least one character."""
        return bool(self.text) and self.end > self.begin

    def __len__(self):
        return self.len

    def __str__(self):
        """Return a copy of the viewed segment as an ordinary string."""
        return self.text[self.begin:self.end]

    def __getitem__(self, index):
        """Return a new view for the slice `index` (relative to this view).

        Only slices with a step of 1 are supported; no characters are copied.
        """
        assert isinstance(index, slice), "As of now, StringView only allows slicing."
        assert index.step is None or index.step == 1, \
            "Step sizes other than 1 are not yet supported by StringView"
        start, stop = StringView.real_indices(index.start, index.stop, self.len)
        return StringView(self.text, self.begin + start, self.begin + stop)

    def __eq__(self, other):
        """Compare by content with another view or a plain string."""
        return str(self) == str(other)  # PERFORMANCE WARNING: This creates copies of the strings

    def find(self, sub, start=None, end=None) -> int:
        """Like `str.find`, restricted to the viewed segment.

        `start` and `end` are relative to the view. Returns the index of the
        first occurrence of `sub` relative to the view, or -1 if `sub` does
        not occur.
        """
        if start is None and end is None:
            lo, hi = self.begin, self.end
        else:
            start, end = StringView.real_indices(start, end, self.len)
            lo, hi = self.begin + start, self.begin + end
        pos = self.text.find(sub, lo, hi)
        # Only translate real hits; a miss must stay -1 as with `str.find`
        # instead of becoming `-1 - self.begin`.
        return pos - self.begin if pos >= 0 else -1

    def startswith(self, prefix: str, start: int = 0, end: Optional[int] = None) -> bool:
        """Like `str.startswith`, restricted to the viewed segment.

        `start` and `end` are relative to the view. They are normalized and
        clamped to the view, so the test never looks at characters of the
        underlying string that lie outside the view.
        """
        start, end = StringView.real_indices(start, end, self.len)
        return self.text.startswith(prefix, self.begin + start, self.begin + end)
def sv_match(regex, sv: StringView):
    """Apply `regex.match` to the segment of the string covered by `sv`.

    The underlying string is not sliced; the view's bounds are handed to
    the regular expression as `pos` and `endpos`. Returns whatever
    `regex.match` returns. Positions reported by a match object are
    therefore offsets into `sv.text`, not into the view.
    """
    first, last = sv.begin, sv.end
    return regex.match(sv.text, first, last)
def sv_index(absolute_index: Union[int, Collection], sv: StringView) -> Union[int, tuple]:
    """
    Converts an index into the string watched by a StringView object
    into an index relative to the string view object, e.g.:

    >>> sv = StringView('xxIxx')[2:3]
    >>> match = sv_match(re.compile('I'), sv)
    >>> match.end()
    3
    >>> sv_index(match.end(), sv)
    1

    If `absolute_index` does not support subtraction (e.g. it is a
    collection of indices), every element is converted and the results
    are returned as a tuple.
    """
    offset = sv.begin
    try:
        relative = absolute_index - offset
    except TypeError:
        # Not a plain number: treat the argument as an iterable of indices.
        relative = tuple(position - offset for position in absolute_index)
    return relative
def sv_search(regex, sv: StringView):
    """Apply `regex.search` to the segment of the string covered by `sv`.

    The underlying string is not sliced; the view's bounds are handed to
    the regular expression as `pos` and `endpos`. Returns whatever
    `regex.search` returns. Positions reported by a match object are
    therefore offsets into `sv.text`, not into the view.
    """
    first, last = sv.begin, sv.end
    return regex.search(sv.text, first, last)
# Module-level view onto the empty string; it is falsy and has length 0.
EMPTY_STRING_VIEW = StringView('')
# def repr_call(f, parameter_list) -> str:
......
......@@ -49,7 +49,7 @@ def fail_on_error(src, result):
sys.exit(1)
def test():
def tst_func():
with toolkit.logging(False):
files = os.listdir('testdata')
files.sort()
......@@ -87,7 +87,8 @@ def mem_profile(func):
print(stat)
if __name__ == "__main__":
cpu_profile(test)
cpu_profile(tst_func)
......@@ -26,4 +26,5 @@ sys.path.extend(['../', './'])
if __name__ == "__main__":
from DHParser.testing import runner
runner("", globals())
\ No newline at end of file
runner("", globals())
......@@ -24,7 +24,7 @@ from functools import partial
sys.path.extend(['../', './'])
from DHParser.toolkit import is_logging, logging, compile_python_object
from DHParser.toolkit import is_logging, logging, StringView, compile_python_object
from DHParser.parser import compile_source, Retrieve, Grammar, Forward, Token, ZeroOrMore, RE, \
RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler
......@@ -152,7 +152,7 @@ class TestRegex:
assert result
assert not messages, str(messages)
parser = compile_python_object(DHPARSER_IMPORTS + result, '\w+Grammar$')()
node, rest = parser.regex('abc+def')
node, rest = parser.regex(StringView('abc+def'))
assert rest == ''
assert node.parser.name == "regex"
assert str(node) == 'abc+def'
......
......@@ -97,6 +97,7 @@ class TestNode:
transform = get_ebnf_transformer()
compiler = get_ebnf_compiler()
tree = parser(ebnf)
print(tree.as_sxpr())
tree_copy = copy.deepcopy(tree)
transform(tree_copy)
res1 = compiler(tree_copy)
......
......@@ -23,9 +23,112 @@ limitations under the License.
import concurrent.futures
import os
import sys
try:
import regex as re
except ImportError:
import re
sys.path.extend(['../', './'])
from DHParser.toolkit import load_if_file, logging, log_dir, is_logging
from DHParser.toolkit import load_if_file, logging, log_dir, is_logging, StringView, \
sv_match, sv_search, EMPTY_STRING_VIEW
class TestStringView:
    """Unit tests for StringView and the regex helpers sv_match/sv_search."""

    # Digits surrounded by one blank on each side; most tests look at the
    # digits only, through a view that cuts off both blanks.
    PADDED = " 0123456789 "

    def _digits_view(self):
        """Return a fresh view onto the ten digits inside PADDED."""
        return StringView(self.PADDED, 1, -1)

    def test_real_indices(self):
        # (begin, end, length) -> expected normalized (begin, end)
        expectations = [
            ((3, 5, 10), (3, 5)),
            ((None, None, 10), (0, 10)),
            ((-2, -1, 10), (8, 9)),
            ((-3, 11, 10), (7, 10)),
            ((-5, -12, 10), (5, 0)),
            ((-12, -5, 10), (0, 5)),
            ((7, 6, 10), (7, 6)),
            ((None, 0, 10), (0, 0)),
        ]
        for arguments, expected in expectations:
            assert StringView.real_indices(*arguments) == expected

    def test_creation(self):
        digits = "0123456789"
        assert str(StringView(digits)) == digits
        assert str(StringView(digits, 3, 4)) == '3'
        assert str(StringView(digits, -4)) == '6789'

    def test_equality(self):
        digits = "0123456789"
        assert StringView(digits) == digits
        assert StringView(digits, 3, 4) == '3'
        assert StringView(digits, -4) == '6789'

    def test_slicing(self):
        view = self._digits_view()
        assert view == '0123456789'
        assert view[3:4] == '3'
        assert view[-3:-1] == '78'
        assert view[4:3] == ''
        assert view[:4] == '0123'
        assert view[4:] == '456789'
        assert view[-2:] == '89'
        assert view[:-5] == '01234'
        assert isinstance(view[3:5], StringView)

    def test_len(self):
        view = self._digits_view()
        assert len(view) == 10
        assert view.len == 10
        assert len(view[5:5]) == 0
        assert len(view[7:4]) == 0
        assert len(view[-12:-2]) == 8
        assert len(view[-12:12]) == 10

    def test_bool(self):
        assert not StringView('')
        assert StringView('x')
        view = self._digits_view()
        assert not view[5:4]
        assert view[4:5], str(view[4:5])
        assert not view[3:3]
        assert not view[12:13]
        assert view[0:20]

    def test_sv_match(self):
        view = self._digits_view()
        assert sv_match(re.compile(r'\d'), view)
        assert sv_match(re.compile(r'\d+'), view)
        assert not sv_match(re.compile(r' '), view)
        assert sv_match(re.compile(r'45'), view[4:])

    def test_sv_search(self):
        view = self._digits_view()
        assert sv_search(re.compile(r'5'), view)
        assert not sv_search(re.compile(r' '), view)
        assert sv_search(re.compile(r'5'), view[5:])
        assert not sv_search(re.compile(r'9'), view[:9])

    def test_find(self):
        view = self._digits_view()
        assert view.find('5') == 5
        assert view.find(' ') < 0
        assert view.find('0', 1) < 0
        assert view.find('9', 0, 8) < 0
        assert view.find('45', 1, 8) == 4

    def test_startswith(self):
        view = self._digits_view()
        assert view.startswith('012')
        assert view.startswith('123', 1)
        assert not view.startswith('123', 1, 3)

    def test_EMPTY_STRING_VIEW(self):
        assert len(EMPTY_STRING_VIEW) == 0
        assert EMPTY_STRING_VIEW.find('x') < 0
        assert not sv_match(re.compile(r'x'), EMPTY_STRING_VIEW)
        assert sv_match(re.compile(r'.*'), EMPTY_STRING_VIEW)
        assert len(EMPTY_STRING_VIEW[0:1]) == 0