11.3.2021, 9:00 - 11:00: Due to updates GitLab may be unavailable for some minutes between 09:00 and 11:00.

stringview.py 10.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# stringview.py - a string class where slices are views not copies as
#                    with the standard Python strings.
#
#    stringview.pxd - declarations for the cython Python to C compiler
#                    to speed up handling of StringViews.
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.
di68kap's avatar
di68kap committed
21

22 23 24 25 26 27 28 29 30 31 32 33
"""
StringView provides string-slicing without copying.
Slicing Python-strings always yields copies of a segment of the original
string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
However, this becomes costly (in terms of space and as a consequence also
time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.

It is recommended to compile this modules with the Cython-compiler for
speedup. The modules comes with a ``stringview.pxd`` that contains some type
declarations to fully exploit the potential of the Cython-compiler.
"""
34

35
import collections
di68kap's avatar
di68kap committed
36 37

from DHParser.toolkit import typing
38
from typing import Optional, Union, Iterable, Tuple
39

40

41
__all__ = ('StringView', 'EMPTY_STRING_VIEW')
42 43


44 45 46 47
def first_char(text, begin: int, end: int) -> int:
    """Returns the index of the first non-whitespace character in string
     `text` within the bounds [begin, end].
    """
48 49 50 51 52
    while begin < end and text[begin] in ' \n\t':
        begin += 1
    return begin


53 54 55 56
def last_char(text, begin: int, end: int) -> int:
    """Returns the index of the first non-whitespace character in string
    `text` within the bounds [begin, end].
    """
57 58 59 60 61
    while end > begin and text[end] in ' \n\t':
        end -= 1
    return end


62 63 64 65 66 67 68 69 70 71 72 73 74 75
def pack_index(index: int, length: int) -> int:
    """Transforms `index` into a positive index counting from the beginning
    of the string, capping it at the boundaries [0, len].
    Examples:
    >>> pack_index(-1, 5)
    4
    >>> pack_index(6, 5)
    5
    >>> pack_index(-7, 5)
    0
    """
    # assert length >= 0
    index = index if index >= 0 else index + length
    return 0 if index < 0 else length if index > length else index
76 77


78 79 80 81 82 83
def real_indices(begin: Optional[int],
                 end: Optional[int],
                 length) -> Tuple[int, int]:   # "length: int" fails with cython!?
    """Returns the tuple of real (i.e. positive) indices from the slice
    indices `begin`,  `end`, assuming a string of size `length`.
    """
84
    cbegin = 0 if begin is None else begin
85 86
    cend = length if end is None else end
    return pack_index(cbegin, length), pack_index(cend, length)
87 88 89 90 91


class StringView(collections.abc.Sized):
    """
    A rudimentary StringView class, just enough for the use cases
92
    in parse.py. The difference between a StringView and the python
93 94 95 96 97 98 99
    builtin strings is that StringView-objects do slicing without
    copying, i.e. slices are just a view on a section of the sliced
    string.
    """
    __slots__ = ['text', 'begin', 'end', 'len', 'fullstring_flag']

    def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
eckhart's avatar
eckhart committed
100
        assert isinstance(text, str)
101 102 103 104 105 106 107 108 109 110 111 112
        self.text = text  # type: str
        self.begin, self.end = real_indices(begin, end, len(text))
        self.len = max(self.end - self.begin, 0)  # type: int
        self.fullstring_flag = (self.begin == 0 and self.len == len(self.text))  # type: bool

    def __bool__(self):
        return self.end > self.begin  # and bool(self.text)

    def __len__(self):
        return self.len

    def __str__(self):
113
        # PERFORMANCE WARNING: This creates a copy of the string-slice
114 115 116 117 118 119 120 121 122 123 124 125
        if self.fullstring_flag:  # optimization: avoid slicing/copying
            return self.text
        # since the slice is being copyied now, anyway, the copy might
        # as well be stored in the string view
        self.text = self.text[self.begin:self.end]
        self.begin = 0
        self.len = len(self.text)
        self.end = self.len
        self.fullstring_flag = True
        return self.text

    def __eq__(self, other):
126 127
        # PERFORMANCE WARNING: This creates copies of the strings
        return len(other) == len(self) and str(self) == str(other)
128 129

    def __hash__(self):
130 131
        # PERFORMANCE WARNING: This creates a copy of the string-slice
        return hash(str(self))
132 133 134

    def __add__(self, other):
        if isinstance(other, str):
135
            return str(self) + other
136 137 138 139 140
        else:
            return StringView(str(self) + str(other))

    def __radd__(self, other):
        if isinstance(other, str):
141
            return other + str(self)
142 143 144 145 146 147 148
        else:
            return StringView(str(other) + str(self))

    def __getitem__(self, index):
        # assert isinstance(index, slice), "As of now, StringView only allows slicing."
        # assert index.step is None or index.step == 1, \
        #     "Step sizes other than 1 are not yet supported by StringView"
149
        start, stop = real_indices(index.start, index.stop, self.len)
150 151
        return StringView(self.text, self.begin + start, self.begin + stop)

152 153 154 155 156
    def count(self, sub: str, start=None, end=None) -> int:
        """Returns the number of non-overlapping occurrences of substring
        `sub` in StringView S[start:end].  Optional arguments start and end
        are interpreted as in slice notation.
        """
157 158 159 160 161 162 163 164
        if self.fullstring_flag:
            return self.text.count(sub, start, end)
        elif start is None and end is None:
            return self.text.count(sub, self.begin, self.end)
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.count(sub, self.begin + start, self.begin + end)

165 166 167 168 169 170
    def find(self, sub: str, start=None, end=None) -> int:
        """Returns the lowest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
171 172 173 174 175 176 177 178
        if self.fullstring_flag:
            return self.text.find(sub, start, end)
        elif start is None and end is None:
            return self.text.find(sub, self.begin, self.end) - self.begin
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.find(sub, self.begin + start, self.begin + end) - self.begin

179 180 181 182 183 184
    def rfind(self, sub: str, start=None, end=None) -> int:
        """Returns the highest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
185 186 187 188 189 190 191 192
        if self.fullstring_flag:
            return self.text.rfind(sub, start, end)
        if start is None and end is None:
            return self.text.rfind(sub, self.begin, self.end) - self.begin
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.rfind(sub, self.begin + start, self.begin + end) - self.begin

193 194 195 196 197 198 199 200 201
    def startswith(self,
                   prefix: Union[str, Tuple[str, ...]],
                   start: int = 0,
                   end: Optional[int] = None) -> bool:
        """Return True if S starts with the specified prefix, False otherwise.
        With optional `start`, test S beginning at that position.
        With optional `end`, stop comparing S at that position.
        prefix can also be a tuple of strings to try.
        """
202 203 204 205 206
        start += self.begin
        end = self.end if end is None else self.begin + end
        return self.text.startswith(prefix, start, end)

    def match(self, regex):
207 208 209
        """Executes `regex.match` on the StringView object and returns the
        result, which is either a match-object or None.
        """
210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
        return regex.match(self.text, pos=self.begin, endpos=self.end)

    def index(self, absolute_index: int) -> int:
        """
        Converts an index for a string watched by a StringView object
        to an index relative to the string view object, e.g.:
        >>> import re
        >>> sv = StringView('xxIxx')[2:3]
        >>> match = sv.match(re.compile('I'))
        >>> match.end()
        3
        >>> sv.index(match.end())
        1
        """
        return absolute_index - self.begin

    def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]:
        """Converts indices for a string watched by a StringView object
        to indices relative to the string view object. See also: `sv_index()`
        """
        return tuple(index - self.begin for index in absolute_indices)

    def search(self, regex):
233 234 235
        """Executes regex.search on the StringView object and returns the
        result, which is either a match-object or None.
        """
236 237 238
        return regex.search(self.text, pos=self.begin, endpos=self.end)

    def strip(self):
239 240 241
        """Returns a copy of the StringView `self` with leading and trailing
        whitespace removed.
        """
242 243 244
        if self.fullstring_flag:
            return self.text.strip()
        else:
245 246
            begin = first_char(self.text, self.begin, self.end)
            end = last_char(self.text, self.begin, self.end)
247 248 249
            return self.text[begin:end]

    def split(self, sep=None):
250 251 252 253 254
        """Returns a list of the words in `self`, using `sep` as the
        delimiter string.  If `sep` is not specified or is None, any
        whitespace string is a separator and empty strings are
        removed from the result.
        """
255 256 257 258 259 260 261 262
        if self.fullstring_flag:
            return self.text.split(sep)
        else:
            pieces = []
            l = len(sep)
            k = 0
            i = self.find(sep, k)
            while i >= 0:
263
                pieces.append(self.text[self.begin + k: self.begin + i])
264 265
                k = i + l
                i = self.find(sep, k)
266
            pieces.append(self.text[self.begin + k: self.end])
267 268
            return pieces

269 270 271 272
    def replace(self, old, new):
        """Returns a string where `old` is replaced by `new`."""
        return str(self).replace(old, new)

273

274
EMPTY_STRING_VIEW = StringView('')