Commit e90ff942 authored by eckhart's avatar eckhart
Browse files

- README.txt updated

parent cf39d143
"""cstringview.pyx - a cython-version of the stringview class for speedup
slicing strings without copying
Copyright 2016 by Eckhart Arnold (
Bavarian Academy of Sciences an Humanities (
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
implied. See the License for the specific language governing
permissions and limitations under the License.
StringView provides string-slicing without copying.
Slicing Python-strings always yields copies of a segment of the original
string. See:
However, this becomes costly (in terms of space and as a consequence also
time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.
import collections
from typing import Optional, Iterable, Tuple
__all__ = ('StringView', 'EMPTY_STRING_VIEW')
cdef inline int pack_index(int index, int len):
index = index if index >= 0 else index + len
return 0 if index < 0 else len if index > len else index
cdef real_indices(begin, end, int len):
cdef int begin_i = 0 if begin is None else begin
cdef int end_i = len if end is None else end
return pack_index(begin_i, len), pack_index(end_i, len)
class StringView(
A rudimentary StringView class, just enough for the use cases
in The difference between a StringView and the python
builtin strings is that StringView-objects do slicing without
copying, i.e. slices are just a view on a section of the sliced
__slots__ = ['text', 'begin', 'end', 'len', 'fullstring_flag']
def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
self.text = text
self.begin, self.end = real_indices(begin, end, len(text))
self.len = max(self.end - self.begin, 0)
self.fullstring_flag = (self.begin == 0 and self.len == len(self.text))
def __bool__(self):
return self.end > self.begin # and bool(self.text)
def __len__(self):
return self.len
def __str__(self):
if self.fullstring_flag: # optimization: avoid slicing/copying
return self.text
# since the slice is being copyied now, anyway, the copy might
# as well be stored in the string view
self.text = self.text[self.begin:self.end]
self.begin = 0
self.len = len(self.text)
self.end = self.len
self.fullstring_flag = True
return self.text
def __eq__(self, other):
return len(other) == len(self) and str(self) == str(other) # PERFORMANCE WARNING: This creates copies of the strings
def __hash__(self):
return hash(str(self)) # PERFORMANCE WARNING: This creates a copy of the string-slice
def __add__(self, other):
if isinstance(other, str):
return (str(self) + other)
return StringView(str(self) + str(other))
def __radd__(self, other):
if isinstance(other, str):
return (other + str(self))
return StringView(str(other) + str(self))
def __getitem__(self, index):
# assert isinstance(index, slice), "As of now, StringView only allows slicing."
# assert index.step is None or index.step == 1, \
# "Step sizes other than 1 are not yet supported by StringView"
start, stop = real_indices(index.start, index.stop, self.len)
return StringView(self.text, self.begin + start, self.begin + stop)
def count(self, sub, start=None, end=None) -> int:
if self.fullstring_flag:
return self.text.count(sub, start, end)
elif start is None and end is None:
return self.text.count(sub, self.begin, self.end)
start, end = real_indices(start, end, self.len)
return self.text.count(sub, self.begin + start, self.begin + end)
def find(self, sub, start=None, end=None) -> int:
if self.fullstring_flag:
return self.text.find(sub, start, end)
elif start is None and end is None:
return self.text.find(sub, self.begin, self.end) - self.begin
start, end = real_indices(start, end, self.len)
return self.text.find(sub, self.begin + start, self.begin + end) - self.begin
def rfind(self, sub, start=None, end=None) -> int:
if self.fullstring_flag:
return self.text.rfind(sub, start, end)
if start is None and end is None:
return self.text.rfind(sub, self.begin, self.end) - self.begin
start, end = real_indices(start, end, self.len)
return self.text.rfind(sub, self.begin + start, self.begin + end) - self.begin
def startswith(self, prefix: str, start: int = 0, end: Optional[int] = None) -> bool:
start += self.begin
end = self.end if end is None else self.begin + end
return self.text.startswith(prefix, start, end)
def match(self, regex):
return regex.match(self.text, pos=self.begin, endpos=self.end)
def index(self, absolute_index: int) -> int:
Converts an index for a string watched by a StringView object
to an index relative to the string view object, e.g.:
>>> sv = StringView('xxIxx')[2:3]
>>> match = sv.match(re.compile('I'))
>>> match.end()
>>> sv.index(match.end())
return absolute_index - self.begin
def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]:
"""Converts indices for a string watched by a StringView object
to indices relative to the string view object. See also: `sv_index()`
return tuple(index - self.begin for index in absolute_indices)
def search(self, regex):
return, pos=self.begin, endpos=self.end)
def strip(self):
cdef int begin, end
if self.fullstring_flag:
return self.text.strip()
begin = self.begin
end = self.end
while begin < end and self.text[begin] in ' \n\t':
begin += 1
while end > begin and self.text[end] in ' \n\t':
end -= 1
return self.text[begin:end]
# return str(self).strip() # PERFORMANCE WARNING: This creates a copy of the string
def split(self, sep=None):
cdef int i, k, l
if self.fullstring_flag:
return self.text.split(sep)
pieces = []
l = len(sep)
k = 0
i = self.find(sep, k)
while i >= 0:
pieces.append(self.text[self.begin + k : self.begin + i])
k = i + l
i = self.find(sep, k)
pieces.append(self.text[self.begin + k : self.end])
return pieces
# return str(self).split(sep, maxsplit) # PERFORMANCE WARNING: This creates a copy of the string
EMPTY_STRING_VIEW = StringView('')
# This module has been shamelessly stolen from the Cython Python
# to C compiler ( so that DHParser does not
# cause an import error on machines where Cython is not installed
# The copyrigh to Cython is held by Greg Ewing. It is licensed
# under the Apache 2.0 licnese. (see for mor information
# cython.* namespace for pure mode.
from __future__ import absolute_import
__version__ = "0.27"
from __builtin__ import basestring
except ImportError:
basestring = str
# BEGIN shameless copy from Cython/minivect/
class _ArrayType(object):
is_array = True
subtypes = ['dtype']
def __init__(self, dtype, ndim, is_c_contig=False, is_f_contig=False,
inner_contig=False, broadcasting=None):
self.dtype = dtype
self.ndim = ndim
self.is_c_contig = is_c_contig
self.is_f_contig = is_f_contig
self.inner_contig = inner_contig or is_c_contig or is_f_contig
self.broadcasting = broadcasting
def __repr__(self):
axes = [":"] * self.ndim
if self.is_c_contig:
axes[-1] = "::1"
elif self.is_f_contig:
axes[0] = "::1"
return "%s[%s]" % (self.dtype, ", ".join(axes))
def index_type(base_type, item):
Support array type creation by slicing, e.g. double[:, :] specifies
a 2D strided array of doubles. The syntax is the same as for
Cython memoryviews.
class InvalidTypeSpecification(Exception):
def verify_slice(s):
if s.start or s.stop or s.step not in (None, 1):
raise InvalidTypeSpecification(
"Only a step of 1 may be provided to indicate C or "
"Fortran contiguity")
if isinstance(item, tuple):
step_idx = None
for idx, s in enumerate(item):
if s.step and (step_idx or idx not in (0, len(item) - 1)):
raise InvalidTypeSpecification(
"Step may only be provided once, and only in the "
"first or last dimension.")
if s.step == 1:
step_idx = idx
return _ArrayType(base_type, len(item),
is_c_contig=step_idx == len(item) - 1,
is_f_contig=step_idx == 0)
elif isinstance(item, slice):
return _ArrayType(base_type, 1, is_c_contig=bool(item.step))
# int[8] etc.
assert int(item) == item # array size must be a plain integer
array(base_type, item)
# END shameless copy
compiled = False
_Unspecified = object()
# Function decorators
def _empty_decorator(x):
return x
def locals(**arg_types):
return _empty_decorator
def test_assert_path_exists(*paths):
return _empty_decorator
def test_fail_if_path_exists(*paths):
return _empty_decorator
class _EmptyDecoratorAndManager(object):
def __call__(self, x):
return x
def __enter__(self):
def __exit__(self, exc_type, exc_value, traceback):
class _Optimization(object):
cclass = ccall = cfunc = _EmptyDecoratorAndManager()
returns = wraparound = boundscheck = initializedcheck = nonecheck = \
overflowcheck = embedsignature = cdivision = cdivision_warnings = \
always_allows_keywords = profile = linetrace = infer_type = \
unraisable_tracebacks = freelist = \
lambda _: _EmptyDecoratorAndManager()
exceptval = lambda _=None, check=True: _EmptyDecoratorAndManager()
optimization = _Optimization()
overflowcheck.fold = optimization.use_switch = \
optimization.unpack_method_calls = lambda arg: _EmptyDecoratorAndManager()
final = internal = type_version_tag = no_gc_clear = no_gc = _empty_decorator
_cython_inline = None
def inline(f, *args, **kwds):
if isinstance(f, basestring):
global _cython_inline
if _cython_inline is None:
from Cython.Build.Inline import cython_inline as _cython_inline
return _cython_inline(f, *args, **kwds)
assert len(args) == len(kwds) == 0
return f
def compile(f):
from Cython.Build.Inline import RuntimeCompiledFunction
return RuntimeCompiledFunction(f)
# Special functions
def cdiv(a, b):
q = a / b
if q < 0:
q += 1
return q
def cmod(a, b):
r = a % b
if (a * b) < 0:
r -= b
return r
# Emulated language constructs
def cast(type, *args, **kwargs):
kwargs.pop('typecheck', None)
assert not kwargs
if hasattr(type, '__call__'):
return type(*args)
return args[0]
def sizeof(arg):
return 1
def typeof(arg):
return arg.__class__.__name__
# return type(arg)
def address(arg):
return pointer(type(arg))([arg])
def declare(type=None, value=_Unspecified, **kwds):
if type not in (None, object) and hasattr(type, '__call__'):
if value is not _Unspecified:
return type(value)
return type()
return value
class _nogil(object):
"""Support for 'with nogil' statement
def __enter__(self):
def __exit__(self, exc_class, exc, tb):
return exc_class is None
nogil = _nogil()
gil = _nogil()
del _nogil
# Emulated types
class CythonMetaType(type):
def __getitem__(type, ix):
return array(type, ix)
CythonTypeObject = CythonMetaType('CythonTypeObject', (object,), {})
class CythonType(CythonTypeObject):
def _pointer(self, n=1):
for i in range(n):
self = pointer(self)
return self
class PointerType(CythonType):
def __init__(self, value=None):
if isinstance(value, (ArrayType, PointerType)):
self._items = [cast(self._basetype, a) for a in value._items]
elif isinstance(value, list):
self._items = [cast(self._basetype, a) for a in value]
elif value is None or value == 0:
self._items = []
raise ValueError
def __getitem__(self, ix):
if ix < 0:
raise IndexError("negative indexing not allowed in C")
return self._items[ix]
def __setitem__(self, ix, value):
if ix < 0:
raise IndexError("negative indexing not allowed in C")
self._items[ix] = cast(self._basetype, value)
def __eq__(self, value):
if value is None and not self._items:
return True
elif type(self) != type(value):
return False
return not self._items and not value._items
def __repr__(self):
return "%s *" % (self._basetype,)
class ArrayType(PointerType):
def __init__(self):
self._items = [None] * self._n
class StructType(CythonType):
def __init__(self, cast_from=_Unspecified, **data):
if cast_from is not _Unspecified:
# do cast
if len(data) > 0:
raise ValueError('Cannot accept keyword arguments when casting.')
if type(cast_from) is not type(self):
raise ValueError('Cannot cast from %s' % cast_from)
for key, value in cast_from.__dict__.items():
setattr(self, key, value)
for key, value in data.items():
setattr(self, key, value)
def __setattr__(self, key, value):
if key in self._members:
self.__dict__[key] = cast(self._members[key], value)
raise AttributeError("Struct has no member '%s'" % key)
class UnionType(CythonType):
def __init__(self, cast_from=_Unspecified, **data):
if cast_from is not _Unspecified:
# do type cast
if len(data) > 0:
raise ValueError('Cannot accept keyword arguments when casting.')
if isinstance(cast_from, dict):
datadict = cast_from
elif type(cast_from) is type(self):
datadict = cast_from.__dict__
raise ValueError('Cannot cast from %s' % cast_from)
datadict = data
if len(datadict) > 1:
raise AttributeError("Union can only store one field at a time.")
for key, value in datadict.items():
setattr(self, key, value)
def __setattr__(self, key, value):
if key in '__dict__':
CythonType.__setattr__(self, key, value)
elif key in self._members:
self.__dict__ = {key: cast(self._members[key], value)}
raise AttributeError("Union has no member '%s'" % key)
def pointer(basetype):
class PointerInstance(PointerType):
_basetype = basetype
return PointerInstance
def array(basetype, n):
class ArrayInstance(ArrayType):
_basetype = basetype
_n = n
return ArrayInstance
def struct(**members):
class StructInstance(StructType):
_members = members
for key in members:
setattr(StructInstance, key, None)
return StructInstance
def union(**members):
class UnionInstance(UnionType):
_members = members
for key in members:
setattr(UnionInstance, key, None)
return UnionInstance
class typedef(CythonType):
def __init__(self, type, name=None):
self._basetype = type = name
def __call__(self, *arg):
value = cast(self._basetype, *arg)
return value
def __repr__(self):
return or str(self._basetype)
__getitem__ = index_type
class _FusedType(CythonType):
def fused_type(*args):
if not args:
raise TypeError("Expected at least one type as argument")
# Find the numeric type with biggest rank if all types are numeric
rank = -1
for type in args:
if type not in (py_int, py_long, py_float, py_complex):
if type_ordering.index(type) > rank:
result_type = type
return result_type
# Not a simple numeric type, return a fused type instance. The result
# isn't really meant to be used, as we can't keep track of the context in
# pure-mode. Casting won't do anything in this case.
return _FusedType()
def _specialized_from_args(signatures, args, kwargs):
"Perhaps this should be implemented in a TreeFragment in Cython code"
raise Exception("yet to be implemented")
py_int = typedef(int, "int")
py_long = typedef(long, "long")
except NameError: # Py3
py_long = typedef(int, "long")