Commit a471a1e0 authored by di68kap's avatar di68kap

- parsers.py: string slicing replaced by toolkit.StringView

parent 481891e3
......@@ -77,7 +77,8 @@ except ImportError:
from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name
from DHParser.syntaxtree import WHITESPACE_PTYPE, TOKEN_PTYPE, ZOMBIE_PARSER, ParserBase, \
Node, TransformationFunc
from DHParser.toolkit import TextView, load_if_file, error_messages, line_col
from DHParser.toolkit import StringView, EMPTY_STRING_VIEW, sv_match, sv_index, sv_search, \
load_if_file, error_messages, line_col
__all__ = ('PreprocessorFunc',
'HistoryRecord',
......@@ -161,7 +162,7 @@ class HistoryRecord:
# type: List['Parser']
self.node = node # type: Node
self.remaining = remaining # type: int
document = call_stack[-1].grammar.document__ if call_stack else ''
document = call_stack[-1].grammar.document__.text if call_stack else ''
self.line_col = line_col(document, len(document) - remaining) # type: Tuple[int, int]
def __str__(self):
......@@ -229,11 +230,13 @@ def add_parser_guard(parser_func):
that takes care of memoizing, left recursion and optionally tracing
(aka "history tracking") of parser calls. Returns the wrapped call.
"""
def guarded_call(parser: 'Parser', text: str) -> Tuple[Node, str]:
def guarded_call(parser: 'Parser', text: StringView) -> Tuple[Node, StringView]:
assert isinstance(text, StringView)
def memoized(parser, location):
node = parser.visited[location]
rlen = location - (0 if node is None else node.len)
rest = TextView(grammar.document__, -rlen) if rlen else ''
rest = grammar.document__[-rlen:] if rlen else EMPTY_STRING_VIEW
return node, rest
# NOTE: An older and simpler implementation of memoization
# relied on `parser.visited[location] == node, rest`. Although,
......@@ -267,6 +270,7 @@ def add_parser_guard(parser_func):
# run original __call__ method
node, rest = parser_func(parser, text)
assert isinstance(rest, StringView)
if node is None:
# retrieve an earlier match result (from left recursion) if it exists
......@@ -302,7 +306,7 @@ def add_parser_guard(parser_func):
node = Node(None, text[:min(10, max(1, text.find("\n")))] + " ...")
node.add_error("maximum recursion depth of parser reached; "
"potentially due to too many errors!")
rest = ''
rest = EMPTY_STRING_VIEW
return node, rest
......@@ -409,7 +413,7 @@ class Parser(ParserBase, metaclass=ParserMetaClass):
self.cycle_detection = set() # type: Set[Callable]
return self
def __call__(self, text: TextView) -> Tuple[Node, TextView]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
"""Applies the parser to the given `text` and returns a node with
the results or None as well as the text at the position right behind
the matching string."""
......@@ -724,8 +728,8 @@ class Grammar:
def _reset__(self):
self.document__ = "" # type: str
self._reversed__ = "" # type: str
self.document__ = EMPTY_STRING_VIEW # type: StringView
self._reversed__ = EMPTY_STRING_VIEW # type: StringView
# variables stored and recalled by Capture and Retrieve parsers
self.variables__ = dict() # type: Dict[str, List[str]]
self.rollback__ = [] # type: List[Tuple[int, Callable]]
......@@ -742,7 +746,7 @@ class Grammar:
@property
def reversed__(self) -> StringView:
    """Return the document in reverse character order as a StringView.

    The reversed view is built on first access from the text underlying
    ``self.document__`` and cached in ``self._reversed__``. Subsequent
    accesses return the cached object for as long as it is non-empty.
    """
    if not self._reversed__:
        # StringView only supports step-1 slicing, so the reversal is done
        # on the underlying plain string and wrapped afterwards.
        self._reversed__ = StringView(self.document__.text[::-1])
    return self._reversed__
......@@ -784,13 +788,13 @@ class Grammar:
else:
self._dirty_flag__ = True
self.history_tracking__ = is_logging()
self.document__ = document
self.last_rb__loc__ = len(document) + 1 # rollback location
self.document__ = StringView(document)
self.last_rb__loc__ = len(self.document__) + 1 # rollback location
parser = self[start_parser] if isinstance(start_parser, str) else start_parser
assert parser.grammar == self, "Cannot run parsers from a different grammar object!" \
" %s vs. %s" % (str(self), str(parser.grammar))
stitches = [] # type: List[Node]
rest = document
rest = self.document__
if not rest:
result, ignore = parser(rest)
if result is None:
......@@ -883,7 +887,7 @@ class Grammar:
document.
"""
def prepare_line(record):
excerpt = self.document__.__getitem__(record.extent)[:25].replace('\n', '\\n')
excerpt = self.document__.text.__getitem__(record.extent)[:25].replace('\n', '\\n')
excerpt = "'%s'" % excerpt if len(excerpt) < 25 else "'%s...'" % excerpt
return record.stack, record.status, excerpt
......@@ -985,7 +989,7 @@ class PreprocessorToken(Parser):
assert RX_PREPROCESSOR_TOKEN.match(token)
super(PreprocessorToken, self).__init__(token)
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
if text[0:1] == BEGIN_TOKEN:
end = text.find(END_TOKEN, 1)
if end < 0:
......@@ -1040,10 +1044,10 @@ class RegExp(Parser):
regexp = self.regexp.pattern
return RegExp(regexp, self.name)
def __call__(self, text: str) -> Tuple[Node, str]:
match = text[0:1] != BEGIN_TOKEN and self.regexp.match(text) # ESC starts a preprocessor token.
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
match = text[0:1] != BEGIN_TOKEN and sv_match(self.regexp, text) # ESC starts a preprocessor token.
if match:
end = match.end()
end = sv_index(match.end(), text)
return Node(self, text[:end]), text[end:]
return None, text
......@@ -1114,9 +1118,9 @@ class RE(Parser):
regexp = self.main.regexp.pattern
return self.__class__(regexp, self.wL, self.wR, self.name)
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
# assert self.main.regexp.pattern != "@"
t = text # type: str
t = text # type: StringView
wL, t = self.wspLeft(t)
main, t = self.main(t)
if main:
......@@ -1264,7 +1268,7 @@ class Optional(UnaryOperator):
"Nesting options with required elements is contradictory: " \
"%s(%s)" % (str(name), str(parser.name))
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
node, text = self.parser(text)
if node:
return Node(self, node), text
......@@ -1289,7 +1293,7 @@ class ZeroOrMore(Optional):
EBNF-Notation: `{ ... }`
EBNF-Example: `sentence = { /\w+,?/ } "."`
"""
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
results = () # type: Tuple[Node, ...]
n = len(text) + 1
while text and len(text) < n:
......@@ -1314,9 +1318,9 @@ class OneOrMore(UnaryOperator):
"Use ZeroOrMore instead of nesting OneOrMore and Optional: " \
"%s(%s)" % (str(name), str(parser.name))
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
results = () # type: Tuple[Node, ...]
text_ = text # type: str
text_ = text # type: StringView
n = len(text) + 1
while text_ and len(text_) < n:
n = len(text_)
......@@ -1340,9 +1344,9 @@ class Series(NaryOperator):
super(Series, self).__init__(*parsers, name=name)
assert len(self.parsers) >= 1
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
results = () # type: Tuple[Node, ...]
text_ = text # type: str
text_ = text # type: StringView
for parser in self.parsers:
node, text_ = parser(text_)
if not node:
......@@ -1400,7 +1404,7 @@ class Alternative(NaryOperator):
assert all(not isinstance(p, Optional) for p in self.parsers[:-1])
self.been_here = dict() # type: Dict[int, int]
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
for parser in self.parsers:
node, text_ = parser(text)
if node:
......@@ -1447,11 +1451,13 @@ class FlowOperator(UnaryOperator):
class Required(FlowOperator):
# Add constructor that checks for logical errors, like `Required(Optional(...))` constructs ?
def __call__(self, text: str) -> Tuple[Node, str]:
RX_ARGUMENT = re.compile(r'\s(\S)')
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
node, text_ = self.parser(text)
if not node:
m = re.search(r'\s(\S)', text)
i = max(1, m.regs[1][0]) if m else 1
m = sv_search(Required.RX_ARGUMENT, text) # re.search(r'\s(\S)', text)
i = max(1, sv_index(m.regs[1][0], text)) if m else 1
node = Node(self, text[:i])
text_ = text[i:]
# assert False, "*"+text[:i]+"*"
......@@ -1467,7 +1473,7 @@ class Lookahead(FlowOperator):
def __init__(self, parser: Parser, name: str = '') -> None:
super(Lookahead, self).__init__(parser, name)
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
node, text_ = self.parser(text)
if self.sign(node is not None):
return Node(self, ''), text
......@@ -1512,9 +1518,9 @@ class Lookbehind(FlowOperator):
self.regexp = p.main.regexp if isinstance(p, RE) else p.regexp
super(Lookbehind, self).__init__(parser, name)
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
backwards_text = self.grammar.reversed__[len(text):] # self.grammar.document__[-len(text) - 1::-1]
if self.sign(self.regexp.match(backwards_text)):
if self.sign(sv_match(self.regexp, backwards_text)):
return Node(self, ''), text
else:
return None, text
......@@ -1548,7 +1554,7 @@ class Capture(UnaryOperator):
def __init__(self, parser: Parser, name: str = '') -> None:
super(Capture, self).__init__(parser, name)
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
node, text_ = self.parser(text)
if node:
stack = self.grammar.variables__.setdefault(self.name, [])
......@@ -1590,13 +1596,13 @@ class Retrieve(Parser):
def __deepcopy__(self, memo):
return self.__class__(self.symbol, self.filter, self.name)
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
return self.call(text) # allow call method to be called from subclass circumventing the parser guard
def __repr__(self):
return ':' + self.symbol.repr
def call(self, text: str) -> Tuple[Node, str]:
def call(self, text: StringView) -> Tuple[Node, StringView]:
try:
stack = self.grammar.variables__[self.symbol.name]
value = self.filter(stack)
......@@ -1612,7 +1618,7 @@ class Retrieve(Parser):
class Pop(Retrieve):
"""STILL EXPERIMENTAL!!!"""
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
nd, txt = super(Pop, self).call(text) # call() instead of __call__() to avoid parser guard
if nd and not nd.error_flag:
stack = self.grammar.variables__[self.symbol.name]
......@@ -1644,7 +1650,7 @@ class Synonym(UnaryOperator):
class, in which case it would be unclear whether the parser
RE('\d\d\d\d') carries the name 'JAHRESZAHL' or 'jahr'.
"""
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
node, text = self.parser(text)
if node:
return Node(self, node), text
......@@ -1684,7 +1690,7 @@ class Forward(Parser):
duplicate.set(parser)
return duplicate
def __call__(self, text: str) -> Tuple[Node, str]:
def __call__(self, text: StringView) -> Tuple[Node, StringView]:
return self.parser(text)
def __repr__(self):
......
......@@ -31,7 +31,7 @@ except ImportError:
from .typing34 import AbstractSet, Any, ByteString, Callable, cast, Container, Dict, \
Iterator, List, NamedTuple, Sequence, Union, Text, Tuple
from DHParser.toolkit import is_logging, log_dir, TextView, line_col, identity
from DHParser.toolkit import is_logging, log_dir, StringView, line_col, identity
__all__ = ('WHITESPACE_PTYPE',
'MockParser',
......@@ -129,8 +129,8 @@ ZOMBIE_PARSER = ZombieParser()
Error = NamedTuple('Error', [('pos', int), ('msg', str)])
ChildrenType = Tuple['Node', ...]
StrictResultType = Union[ChildrenType, TextView, str]
ResultType = Union[ChildrenType, 'Node', TextView, str, None]
StrictResultType = Union[ChildrenType, StringView, str]
ResultType = Union[ChildrenType, 'Node', StringView, str, None]
def flatten_sxpr(sxpr: str) -> str:
......@@ -189,6 +189,7 @@ class Node:
__slots__ = ['_result', 'children', '_errors', '_len', '_pos', 'parser', 'error_flag']
def __init__(self, parser, result: ResultType) -> None:
"""Initializes the ``Node``-object with the ``Parser``-Instance
that generated the node and the parser's result.
......@@ -251,7 +252,7 @@ class Node:
# or isinstance(result, Node)
# or isinstance(result, str)), str(result)
self._result = (result,) if isinstance(result, Node) else str(result) \
if isinstance(result, TextView) else result or '' # type: StrictResultType
if isinstance(result, StringView) else result or '' # type: StrictResultType
self.children = cast(ChildrenType, self._result) \
if isinstance(self._result, tuple) else cast(ChildrenType, ()) # type: ChildrenType
self.error_flag = any(r.error_flag for r in self.children) # type: bool
......
......@@ -43,14 +43,18 @@ except ImportError:
import sys
try:
from typing import Any, List, Tuple, Optional
from typing import Any, List, Tuple, Collection, Union, Optional
except ImportError:
from .typing34 import Any, List, Tuple, Optional
from .typing34 import Any, List, Tuple, Collection, Union, Optional
__all__ = ('logging',
'is_logging',
'log_dir',
'logfile_basename',
'StringView',
'sv_match',
'sv_index',
'sv_search',
# 'supress_warnings',
# 'warnings',
# 'repr_call',
......@@ -150,22 +154,93 @@ def clear_logs(logfile_types={'.cst', '.ast', '.log'}):
os.rmdir(log_dirname)
class TextView:
__slots__ = ['text', 'begin', 'end']
class StringView:
    """A rudimentary StringView class, just enough for the use cases
    in the parser module.

    Slicing Python-strings always yields copies of a segment of the original
    string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
    However, this becomes costly (in terms of space and as a consequence also
    time) when parsing longer documents. Unfortunately, Python's `memoryview`
    does not work for unicode strings. Hence, the StringView class.

    A view stores the complete underlying string in ``text`` together with
    the window ``[begin, end)`` it covers; ``len`` is the (non-negative)
    size of that window.
    """
    __slots__ = ['text', 'begin', 'end', 'len']

    def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
        """Create a view on ``text[begin:end]``.

        ``begin`` and ``end`` follow slice conventions: ``None`` means
        "from the start" / "to the end", negative values count from the
        end of ``text``, and out-of-range values are clamped.
        """
        self.text = text  # type: str
        self.begin, self.end = StringView.real_indices(begin, end, len(text))
        self.len = max(self.end - self.begin, 0)

    @staticmethod
    def real_indices(begin, end, len):
        """Translate slice-style indices into absolute ones.

        Returns a tuple ``(begin, end)`` in which ``None`` has been replaced
        by ``0`` or ``len`` respectively, negative indices have been counted
        from ``len``, and both values have been clamped to ``0 .. len``.
        The result is not reordered, so ``begin`` may exceed ``end``.
        """
        def pack(index, size):
            index = index if index >= 0 else index + size
            return 0 if index < 0 else size if index > size else index
        if begin is None:
            begin = 0
        if end is None:
            end = len
        return pack(begin, len), pack(end, len)

    def __bool__(self):
        return bool(self.text) and self.end > self.begin

    def __len__(self):
        return self.len

    def __str__(self):
        return self.text[self.begin:self.end]

    def __getitem__(self, index):
        """Return a new view for a step-1 slice of this view (no copying)."""
        assert isinstance(index, slice), "As of now, StringView only allows slicing."
        assert index.step is None or index.step == 1, \
            "Step sizes other than 1 are not yet supported by StringView"
        start, stop = StringView.real_indices(index.start, index.stop, self.len)
        return StringView(self.text, self.begin + start, self.begin + stop)

    def __eq__(self, other):
        return str(self) == str(other)  # PERFORMANCE WARNING: This creates copies of the strings

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable. The hash is
        # that of the viewed string so that it agrees with __eq__.
        return hash(str(self))  # PERFORMANCE WARNING: This creates a copy of the string

    def __add__(self, other):
        """Concatenate with a string (yields ``str``) or a view (yields a view)."""
        if isinstance(other, str):
            return str(self) + other
        if isinstance(other, StringView):
            return StringView(str(self) + str(other))
        return NotImplemented

    def __radd__(self, other):
        """Support ``str + StringView``; the result is a plain ``str``."""
        if isinstance(other, str):
            return other + str(self)
        return NotImplemented

    def find(self, sub, start=None, end=None) -> int:
        """Return the lowest index, relative to the view, at which ``sub``
        occurs within ``self[start:end]``, or ``-1`` if it does not occur.
        """
        if start is None and end is None:
            lo, hi = self.begin, self.end
        else:
            start, end = StringView.real_indices(start, end, self.len)
            lo, hi = self.begin + start, self.begin + end
        pos = self.text.find(sub, lo, hi)
        # Only a real hit is translated into view coordinates. Subtracting
        # the offset from str.find's -1 would yield an arbitrary negative.
        return pos - self.begin if pos >= 0 else -1

    def startswith(self, prefix: str, start: int = 0, end: Optional[int] = None) -> bool:
        """Return True if ``self[start:end]`` starts with ``prefix``."""
        # Resolve the indices against the view so that negative values work
        # and the test can never look past the end of the view.
        start, end = StringView.real_indices(start, end, self.len)
        return self.text.startswith(prefix, self.begin + start, self.begin + end)
def sv_match(regex, sv: StringView):
    """Apply ``regex.match`` to the window of text covered by ``sv``.

    The pattern is run against the underlying string, restricted by the
    ``pos`` and ``endpos`` arguments, so no slice of the text is copied.
    Positions reported by the returned match object therefore refer to the
    underlying string, not to the view.
    """
    whole_text, first, last = sv.text, sv.begin, sv.end
    return regex.match(whole_text, pos=first, endpos=last)
def sv_index(absolute_index: Union[int, Collection], sv: StringView) -> Union[int, tuple]:
    """
    Converts an index into the string watched by a StringView object
    to an index relative to the string view object, e.g.:
    >>> sv = StringView('xxIxx')[2:3]
    >>> match = sv_match(re.compile('I'), sv)
    >>> match.end()
    3
    >>> sv_index(match.end(), sv)
    1

    If ``absolute_index`` is a collection of indices instead of a single
    number, a tuple with the converted indices is returned.
    """
    offset = sv.begin
    try:
        relative = absolute_index - offset
    except TypeError:
        # Not a number: treat the argument as a collection of indices.
        relative = tuple(position - offset for position in absolute_index)
    return relative
def sv_search(regex, sv: StringView):
    """Apply ``regex.search`` to the window of text covered by ``sv``.

    The search runs on the underlying string, restricted by the ``pos``
    and ``endpos`` arguments, so positions in the returned match object
    refer to the underlying string rather than to the view.
    """
    whole_text, first, last = sv.text, sv.begin, sv.end
    return regex.search(whole_text, pos=first, endpos=last)
EMPTY_STRING_VIEW = StringView('')
# def repr_call(f, parameter_list) -> str:
......
......@@ -49,7 +49,7 @@ def fail_on_error(src, result):
sys.exit(1)
def test():
def tst_func():
with toolkit.logging(False):
files = os.listdir('testdata')
files.sort()
......@@ -87,7 +87,8 @@ def mem_profile(func):
print(stat)
if __name__ == "__main__":
cpu_profile(test)
cpu_profile(tst_func)
......@@ -26,4 +26,5 @@ sys.path.extend(['../', './'])
if __name__ == "__main__":
from DHParser.testing import runner
runner("", globals())
\ No newline at end of file
runner("", globals())
......@@ -24,7 +24,7 @@ from functools import partial
sys.path.extend(['../', './'])
from DHParser.toolkit import is_logging, logging, compile_python_object
from DHParser.toolkit import is_logging, logging, StringView, compile_python_object
from DHParser.parser import compile_source, Retrieve, Grammar, Forward, Token, ZeroOrMore, RE, \
RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series
from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler
......@@ -152,7 +152,7 @@ class TestRegex:
assert result
assert not messages, str(messages)
parser = compile_python_object(DHPARSER_IMPORTS + result, '\w+Grammar$')()
node, rest = parser.regex('abc+def')
node, rest = parser.regex(StringView('abc+def'))
assert rest == ''
assert node.parser.name == "regex"
assert str(node) == 'abc+def'
......
......@@ -97,6 +97,7 @@ class TestNode:
transform = get_ebnf_transformer()
compiler = get_ebnf_compiler()
tree = parser(ebnf)
print(tree.as_sxpr())
tree_copy = copy.deepcopy(tree)
transform(tree_copy)
res1 = compiler(tree_copy)
......
......@@ -23,9 +23,112 @@ limitations under the License.
import concurrent.futures
import os
import sys
try:
import regex as re
except ImportError:
import re
sys.path.extend(['../', './'])
from DHParser.toolkit import load_if_file, logging, log_dir, is_logging
from DHParser.toolkit import load_if_file, logging, log_dir, is_logging, StringView, \
sv_match, sv_search, EMPTY_STRING_VIEW
class TestStringView:
    """Unit tests for StringView and the sv_match/sv_search helpers."""

    def test_real_indices(self):
        """None, negative and out-of-range indices are resolved and clamped."""
        assert StringView.real_indices(3, 5, 10) == (3, 5)
        assert StringView.real_indices(None, None, 10) == (0, 10)
        assert StringView.real_indices(-2, -1, 10) == (8, 9)
        assert StringView.real_indices(-3, 11, 10) == (7, 10)
        assert StringView.real_indices(-5, -12, 10) == (5, 0)
        assert StringView.real_indices(-12, -5, 10) == (0, 5)
        assert StringView.real_indices(7, 6, 10) == (7, 6)
        assert StringView.real_indices(None, 0, 10) == (0, 0)

    def test_creation(self):
        """Views created with explicit bounds render as the matching substring."""
        s = "0123456789"
        assert str(StringView(s)) == s
        assert str(StringView(s, 3, 4)) == '3'
        assert str(StringView(s, -4)) == '6789'

    def test_equality(self):
        """A view compares equal to the plain string it covers."""
        s = "0123456789"
        assert StringView(s) == s
        assert StringView(s, 3, 4) == '3'
        assert StringView(s, -4) == '6789'

    def test_slicing(self):
        """Slices are taken relative to the view and yield StringView objects."""
        s = " 0123456789 "
        sv = StringView(s, 1, -1)
        assert sv == '0123456789'
        assert sv[3:4] == '3'
        assert sv[-3:-1] == '78'
        assert sv[4:3] == ''
        assert sv[:4] == '0123'
        assert sv[4:] == '456789'
        assert sv[-2:] == '89'
        assert sv[:-5] == '01234'
        assert isinstance(sv[3:5], StringView)

    def test_len(self):
        """len() reports the size of the view; empty or inverted slices have length 0."""
        s = " 0123456789 "
        sv = StringView(s, 1, -1)
        assert len(sv) == 10
        assert sv.len == 10
        assert len(sv[5:5]) == 0
        assert len(sv[7:4]) == 0
        assert len(sv[-12:-2]) == 8
        assert len(sv[-12:12]) == 10

    def test_bool(self):
        """A view is truthy exactly when it covers at least one character."""
        assert not StringView('')
        assert StringView('x')
        s = " 0123456789 "
        sv = StringView(s, 1, -1)
        assert not sv[5:4]
        assert sv[4:5], str(sv[4:5])
        assert not sv[3:3]
        assert not sv[12:13]
        assert sv[0:20]

    def test_sv_match(self):
        """sv_match anchors at the start of the view, not of the underlying string."""
        s = " 0123456789 "
        sv = StringView(s, 1, -1)
        assert sv_match(re.compile(r'\d'), sv)
        assert sv_match(re.compile(r'\d+'), sv)
        assert not sv_match(re.compile(r' '), sv)
        assert sv_match(re.compile(r'45'), sv[4:])

    def test_sv_search(self):
        """sv_search only finds matches inside the view."""
        s = " 0123456789 "
        sv = StringView(s, 1, -1)
        assert sv_search(re.compile(r'5'), sv)
        assert not sv_search(re.compile(r' '), sv)
        assert sv_search(re.compile(r'5'), sv[5:])
        assert not sv_search(re.compile(r'9'), sv[:9])

    def test_find(self):
        """find() returns view-relative positions and a negative value on failure."""
        s = " 0123456789 "
        sv = StringView(s, 1, -1)
        assert sv.find('5') == 5
        assert sv.find(' ') < 0
        assert sv.find('0', 1) < 0
        assert sv.find('9', 0, 8) < 0
        assert sv.find('45', 1, 8) == 4

    def test_startswith(self):
        """startswith() honours the optional start and end arguments."""
        s = " 0123456789 "
        sv = StringView(s, 1, -1)
        assert sv.startswith('012')
        assert sv.startswith('123', 1)
        assert not sv.startswith('123', 1, 3)

    def test_EMPTY_STRING_VIEW(self):
        """The shared empty view behaves like an empty string."""
        assert len(EMPTY_STRING_VIEW) == 0
        assert EMPTY_STRING_VIEW.find('x') < 0
        assert not sv_match(re.compile(r'x'), EMPTY_STRING_VIEW)
        assert sv_match(re.compile(r'.*'), EMPTY_STRING_VIEW)
        assert len(EMPTY_STRING_VIEW[0:1]) == 0
class TestToolkit:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment