Commit bcaa931b authored by eckhart
Browse files

- slightly better cython optimizations

parent 5249535b
......@@ -748,7 +748,7 @@ class Grammar:
this is done lazily.
"""
if not self._reversed__:
self._reversed__ = StringView(self.document__.text[::-1])
self._reversed__ = StringView(self.document__.get_text()[::-1])
return self._reversed__
......
......@@ -9,73 +9,20 @@ import cython
# type hints for Cython python -> C compiler to speed up the most
# critical code paths of stringview.py.
cdef int first_char(text, int begin, int end)
cdef int last_char(text, int begin, int end)
cdef int first_char(str text, int begin, int end)
cdef int last_char(str text, int begin, int end)
cdef int pack_index(int index, int length)
@cython.locals(cbegin=cython.int, cend=cython.int)
cdef real_indices(begin, end, int length)
# cpdef real_indices(begin, end, int length)
cpdef real_indices(begin, end, int length)
# cdefs for class StringView: https://cython.readthedocs.io/en/latest/src/tutorial/pure.html
cdef class StringView:
cdef str text
cdef int begin, end, len
cdef str fullstring
cpdef __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None
cpdef __bool__(self) -> bool
cpdef __len__(self) -> int
cpdef __str__(self) -> str
cpdef __eq__(self, other) -> bool
cpdef __hash__(self) -> int
cpdef __add__(self, other) -> Union[str, 'StringView']
cpdef __radd__(self, other) -> Union[str, 'StringView']
@cython.locals(start=cython.int, stop=cython.int)
cpdef __getitem__(self, index: Optional[slice, int]) -> StringView
cpdef count(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int
cpdef find(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int
cpdef rfind(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int
cpdef startswith(self, prefix: str, start: int = 0, end: Optional[int] = None) -> bool
cpdef endswith(self, suffix: str, start: int = 0, end: Optional[int] = None) -> bool
cpdef match(self, regex, flags=0)
cpdef index(self, absolute_index: int) -> int
cpdef indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]
cpdef search(self, regex)
cpdef finditer(self, regex)
@cython.locals(begin=cython.int, end=cython.int)
cpdef strip(self)
@cython.locals(begin=cython.int)
cpdef lstrip(self)
@cython.locals(end=cython.int)
cpdef rstrip(self)
@cython.locals(length=cython.int, k=cython.int, i=cython.int)
cpdef split(self, sep=None)
cdef str _text
cdef int _begin, _end, _len
cdef str _fullstring
cpdef replace(self, old, new)
......@@ -37,8 +37,16 @@ import collections
from DHParser.toolkit import typing
from typing import Optional, Union, Iterable, Tuple
try:
import cython
cython_optimized = cython.compiled # type: bool
except ImportError:
# import DHParser.Shadow as cython
cython_optimized = False # type: bool
import DHParser.shadow_cython as cython
__all__ = ('StringView', 'EMPTY_STRING_VIEW')
__all__ = ('StringView', 'EMPTY_STRING_VIEW', 'cython_optimized')
def first_char(text, begin: int, end: int) -> int:
......@@ -80,7 +88,7 @@ def pack_index(index: int, length: int) -> int:
def real_indices(begin: Optional[int],
end: Optional[int],
length) -> Tuple[int, int]: # "length: int" fails with cython!?
length) -> Tuple[int, int]:
"""Returns the tuple of real (i.e. positive) indices from the slice
indices `begin`, `end`, assuming a string of size `length`.
"""
......@@ -89,7 +97,7 @@ def real_indices(begin: Optional[int],
return pack_index(cbegin, length), pack_index(cend, length)
class StringView: # (collections.abc.Sized):
class StringView: # collections.abc.Sized
"""
A rudimentary StringView class, just enough for the use cases
in parse.py. The difference between a StringView and the python
......@@ -97,33 +105,33 @@ class StringView: # (collections.abc.Sized):
copying, i.e. slices are just a view on a section of the sliced
string.
"""
__slots__ = ['text', 'begin', 'end', 'len', 'fullstring']
__slots__ = ['_text', '_begin', '_end', '_len', '_fullstring']
def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
# assert isinstance(text, str)
self.text = text # type: str
self.begin, self.end = real_indices(begin, end, len(text))
self.len = max(self.end - self.begin, 0) # type: int
if (self.begin == 0 and self.len == len(self.text)):
self.fullstring = self.text # type: str
self._text = text # type: str
self._begin, self._end = real_indices(begin, end, len(text))
self._len = max(self._end - self._begin, 0) # type: int
if (self._begin == 0 and self._len == len(self._text)):
self._fullstring = self._text # type: str
else:
self.fullstring = ''
self._fullstring = ''
def __bool__(self) -> bool:
return self.end > self.begin # and bool(self.text)
return self._end > self._begin # and bool(self.text)
def __len__(self) -> int:
return self.len
return self._len
def __str__(self) -> str:
# PERFORMANCE WARNING: This creates a copy of the string-slice
if self.fullstring: # optimization: avoid slicing/copying
return self.fullstring
if self._fullstring: # optimization: avoid slicing/copying
return self._fullstring
# since the slice is being copied now, anyway, the copy might
# as well be stored in the string view
# return self.text[self.begin:self.end] # use this for debugging!
self.fullstring = self.text[self.begin:self.end]
return self.fullstring
self._fullstring = self._text[self._begin:self._end]
return self._fullstring
def __eq__(self, other) -> bool:
# PERFORMANCE WARNING: This creates copies of the strings
......@@ -145,28 +153,43 @@ class StringView: # (collections.abc.Sized):
else:
return StringView(str(other) + str(self))
@cython.locals(start=cython.int, end=cython.int)
def __getitem__(self, index: Union[slice, int]) -> 'StringView':
# assert isinstance(index, slice), "As of now, StringView only allows slicing."
# assert index.step is None or index.step == 1, \
# "Step sizes other than 1 are not yet supported by StringView"
try:
start, stop = real_indices(index.start, index.stop, self.len)
return StringView(self.text, self.begin + start, self.begin + stop)
start, stop = real_indices(index.start, index.stop, self._len)
return StringView(self._text, self._begin + start, self._begin + stop)
except AttributeError:
return StringView(self.text, self.begin + index, self.begin + index + 1)
return StringView(self._text, self._begin + index, self._begin + index + 1)
def get_begin(self) -> int:
"""Returns the offset of the StringView. This is needed to correct
the absolute offsets that the match objects of regular expression
objects return.
"""
return self._begin
def get_text(self) -> str:
"""Returns the underlying string."""
return self._text
def count(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
"""Returns the number of non-overlapping occurrences of substring
`sub` in StringView S[start:end]. Optional arguments start and end
are interpreted as in slice notation.
"""
if self.fullstring:
return self.fullstring.count(sub, start, end)
if self._fullstring:
if cython_optimized:
return self._fullstring.count(sub, start or 0, self._len if end is None else end)
else:
return self._fullstring.count(sub, start, end)
elif start is None and end is None:
return self.text.count(sub, self.begin, self.end)
return self._text.count(sub, self._begin, self._end)
else:
start, end = real_indices(start, end, self.len)
return self.text.count(sub, self.begin + start, self.begin + end)
start, end = real_indices(start, end, self._len)
return self._text.count(sub, self._begin + start, self._begin + end)
def find(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
"""Returns the lowest index in S where substring `sub` is found,
......@@ -174,13 +197,16 @@ class StringView: # (collections.abc.Sized):
arguments `start` and `end` are interpreted as in slice notation.
Returns -1 on failure.
"""
if self.fullstring:
return self.fullstring.find(sub, start, end)
if self._fullstring:
if cython_optimized:
return self._fullstring.find(sub, start or 0, self._len if end is None else end)
else:
return self._fullstring.find(sub, start, end)
elif start is None and end is None:
return self.text.find(sub, self.begin, self.end) - self.begin
return self._text.find(sub, self._begin, self._end) - self._begin
else:
start, end = real_indices(start, end, self.len)
return self.text.find(sub, self.begin + start, self.begin + end) - self.begin
start, end = real_indices(start, end, self._len)
return self._text.find(sub, self._begin + start, self._begin + end) - self._begin
def rfind(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
"""Returns the highest index in S where substring `sub` is found,
......@@ -188,13 +214,16 @@ class StringView: # (collections.abc.Sized):
arguments `start` and `end` are interpreted as in slice notation.
Returns -1 on failure.
"""
if self.fullstring:
return self.fullstring.rfind(sub, start, end)
if self._fullstring:
if cython_optimized:
return self._fullstring.rfind(sub, start or 0, self._len if end is None else end)
else:
return self._fullstring.rfind(sub, start, end)
if start is None and end is None:
return self.text.rfind(sub, self.begin, self.end) - self.begin
return self._text.rfind(sub, self._begin, self._end) - self._begin
else:
start, end = real_indices(start, end, self.len)
return self.text.rfind(sub, self.begin + start, self.begin + end) - self.begin
start, end = real_indices(start, end, self._len)
return self._text.rfind(sub, self._begin + start, self._begin + end) - self._begin
def startswith(self,
prefix: str,
......@@ -204,9 +233,9 @@ class StringView: # (collections.abc.Sized):
With optional `start`, test S beginning at that position.
With optional `end`, stop comparing S at that position.
"""
start += self.begin
end = self.end if end is None else self.begin + end
return self.text.startswith(prefix, start, end)
start += self._begin
end = self._end if end is None else self._begin + end
return self._text.startswith(prefix, start, end)
def endswith(self,
suffix: str,
......@@ -216,17 +245,17 @@ class StringView: # (collections.abc.Sized):
With optional `start`, test S beginning at that position.
With optional `end`, stop comparing S at that position.
"""
start += self.begin
end = self.end if end is None else self.begin + end
return self.text.endswith(suffix, start, end)
start += self._begin
end = self._end if end is None else self._begin + end
return self._text.endswith(suffix, start, end)
def match(self, regex, flags=0):
def match(self, regex, flags: int = 0):
"""Executes `regex.match` on the StringView object and returns the
result, which is either a match-object or None. Keep in mind that
match.end(), match.span() etc. are mapped to the underlying text,
not the StringView-object!!!
"""
return regex.match(self.text, pos=self.begin, endpos=self.end)
return regex.match(self._text, pos=self._begin, endpos=self._end)
def index(self, absolute_index: int) -> int:
"""Converts an index for a string watched by a StringView object
......@@ -240,13 +269,13 @@ class StringView: # (collections.abc.Sized):
>>> sv.index(match.end())
1
"""
return absolute_index - self.begin
return absolute_index - self._begin
def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]:
"""Converts indices for a string watched by a StringView object
to indices relative to the string view object. See also: `sv_index()`
"""
return tuple(index - self.begin for index in absolute_indices)
return tuple(index - self._begin for index in absolute_indices)
def search(self, regex):
"""Executes regex.search on the StringView object and returns the
......@@ -254,32 +283,32 @@ class StringView: # (collections.abc.Sized):
match.end(), match.span() etc. are mapped to the underlying text,
not the StringView-object!!!
"""
return regex.search(self.text, pos=self.begin, endpos=self.end)
return regex.search(self._text, pos=self._begin, endpos=self._end)
def finditer(self, regex):
"""Executes regex.finditer on the StringView object and returns the
iterator of match objects. Keep in mind that match.end(), match.span()
etc. are mapped to the underlying text, not the StringView-object!!!
"""
return regex.finditer(self.text, pos=self.begin, endpos=self.end)
return regex.finditer(self._text, pos=self._begin, endpos=self._end)
def strip(self):
"""Returns a copy of the StringView `self` with leading and trailing
whitespace removed.
"""
begin = first_char(self.text, self.begin, self.end) - self.begin
end = last_char(self.text, self.begin, self.end) - self.begin
return self if begin == 0 and end == self.len else self[begin:end]
begin = first_char(self._text, self._begin, self._end) - self._begin
end = last_char(self._text, self._begin, self._end) - self._begin
return self if begin == 0 and end == self._len else self[begin:end]
def lstrip(self):
"""Returns a copy of `self` with leading whitespace removed."""
begin = first_char(self.text, self.begin, self.end) - self.begin
begin = first_char(self._text, self._begin, self._end) - self._begin
return self if begin == 0 else self[begin:]
def rstrip(self):
"""Returns a copy of `self` with trailing whitespace removed."""
end = last_char(self.text, self.begin, self.end) - self.begin
return self if end == self.len else self[:end]
end = last_char(self._text, self._begin, self._end) - self._begin
return self if end == self._len else self[:end]
def split(self, sep=None):
"""Returns a list of the words in `self`, using `sep` as the
......@@ -287,18 +316,18 @@ class StringView: # (collections.abc.Sized):
whitespace string is a separator and empty strings are
removed from the result.
"""
if self.fullstring:
return self.fullstring.split(sep)
if self._fullstring:
return self._fullstring.split(sep)
else:
pieces = []
length = len(sep)
k = 0
i = self.find(sep, k)
while i >= 0:
pieces.append(self.text[self.begin + k: self.begin + i])
pieces.append(self._text[self._begin + k: self._begin + i])
k = i + length
i = self.find(sep, k)
pieces.append(self.text[self.begin + k: self.end])
pieces.append(self._text[self._begin + k: self._end])
return pieces
def replace(self, old, new):
......
......@@ -952,7 +952,7 @@ def parse_sxpr(sxpr: Union[str, StringView]) -> Node:
if match is None:
raise AssertionError('Malformed S-expression Node-tagname or identifier expected, '
'not "%s"' % sxpr[:40].replace('\n', ''))
end = match.end() - sxpr.begin
end = match.end() - sxpr.get_begin()
tagname = sxpr[:end]
name, class_name = (tagname.split(':') + [''])[:2]
sxpr = sxpr[end:].strip()
......@@ -986,14 +986,14 @@ def parse_sxpr(sxpr: Union[str, StringView]) -> Node:
for qtmark in ['"""', "'''", '"', "'"]:
match = sxpr.match(re.compile(qtmark + r'.*?' + qtmark, re.DOTALL))
if match:
end = match.end() - sxpr.begin
end = match.end() - sxpr.get_begin()
i = len(qtmark)
lines.append(str(sxpr[i:end - i]))
sxpr = sxpr[end:].strip()
break
else:
match = sxpr.match(re.compile(r'(?:(?!\)).)*', re.DOTALL))
end = match.end() - sxpr.begin
end = match.end() - sxpr.get_begin()
lines.append(str(sxpr[:end]))
sxpr = sxpr[end:]
result = "\n".join(lines)
......@@ -1026,7 +1026,7 @@ def parse_xml(xml: Union[str, StringView]) -> Node:
for match in s.finditer(re.compile(r'\s*(?P<attr>\w+)\s*=\s*"(?P<value>.*)"\s*')):
d = match.groupdict()
attributes[d['attr']] = d['value']
restart = match.end() - s.begin
restart = match.end() - s.get_begin()
return (s[restart:], attributes)
def parse_opening_tag(s: StringView) -> Tuple[StringView, str, OrderedDict, bool]:
......@@ -1038,7 +1038,7 @@ def parse_xml(xml: Union[str, StringView]) -> Node:
match = s.match(re.compile(r'<\s*(?P<tagname>[\w:]+)\s*'))
assert match
tagname = match.groupdict()['tagname']
section = s[match.end() - s.begin:]
section = s[match.end() - s.get_begin():]
s, attributes = parse_attributes(section)
i = s.find('>')
assert i >= 0
......@@ -1051,7 +1051,7 @@ def parse_xml(xml: Union[str, StringView]) -> Node:
match = s.match(re.compile(r'</\s*(?P<tagname>[\w:]+)>'))
assert match
tagname = match.groupdict()['tagname']
return s[match.end() - s.begin:], tagname
return s[match.end() - s.get_begin():], tagname
def parse_leaf_content(s: StringView) -> Tuple[StringView, StringView]:
"""Parses a piece of the content of a tag, just until the next opening,
......
......@@ -234,7 +234,7 @@ def create_project(path: str):
def selftest() -> bool:
"""Run a simple self-text of DHParser.
"""Run a simple self-test of DHParser.
"""
print("DHParser selftest...")
print("\nSTAGE I: Trying to compile EBNF-Grammar:\n")
......
......@@ -7,7 +7,7 @@ import multiprocessing
import os
import platform
import time
sys
import sys
def run_tests(command):
testtype = 'DOCTEST' if command.find('doctest') >= 0 else 'UNITTEST'
......
......@@ -67,7 +67,7 @@ class TestStringView:
s = " 0123456789 "
sv = StringView(s, 1, -1)
assert len(sv) == 10
assert sv.len == 10
# assert sv.len == 10
assert len(sv[5:5]) == 0
assert len(sv[7:4]) == 0
assert len(sv[-12:-2]) == 8
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment