11.3.2021, 9:00 - 11:00: Due to updates GitLab may be unavailable for some minutes between 09:00 and 11:00.

stringview.py 11.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# stringview.py - a string class where slices are views not copies as
#                    with the standard Python strings.
#
#    stringview.pxd - declarations for the cython Python to C compiler
#                    to speed up handling of StringViews.
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.
di68kap's avatar
di68kap committed
21

22 23 24 25 26 27 28 29 30 31 32 33
"""
StringView provides string-slicing without copying.
Slicing Python-strings always yields copies of a segment of the original
string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
However, this becomes costly (in terms of space and as a consequence also
time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.

It is recommended to compile this modules with the Cython-compiler for
speedup. The modules comes with a ``stringview.pxd`` that contains some type
declarations to fully exploit the potential of the Cython-compiler.
"""
34

35
import collections
di68kap's avatar
di68kap committed
36 37

from DHParser.toolkit import typing
38
from typing import Optional, Union, Iterable, Tuple
39

40

41
__all__ = ('StringView', 'EMPTY_STRING_VIEW')
42 43


44 45 46 47
def first_char(text, begin: int, end: int) -> int:
    """Returns the index of the first non-whitespace character in string
     `text` within the bounds [begin, end].
    """
48 49 50 51 52
    while begin < end and text[begin] in ' \n\t':
        begin += 1
    return begin


53 54 55 56
def last_char(text, begin: int, end: int) -> int:
    """Returns the index of the first non-whitespace character in string
    `text` within the bounds [begin, end].
    """
57
    while end > begin and text[end-1] in ' \n\t':
58 59 60 61
        end -= 1
    return end


62 63 64 65 66 67 68 69 70 71 72 73 74 75
def pack_index(index: int, length: int) -> int:
    """Transforms `index` into a positive index counting from the beginning
    of the string, capping it at the boundaries [0, len].
    Examples:
    >>> pack_index(-1, 5)
    4
    >>> pack_index(6, 5)
    5
    >>> pack_index(-7, 5)
    0
    """
    # assert length >= 0
    index = index if index >= 0 else index + length
    return 0 if index < 0 else length if index > length else index
76 77


78 79 80 81 82 83
def real_indices(begin: Optional[int],
                 end: Optional[int],
                 length) -> Tuple[int, int]:   # "length: int" fails with cython!?
    """Returns the tuple of real (i.e. positive) indices from the slice
    indices `begin`,  `end`, assuming a string of size `length`.
    """
84
    cbegin = 0 if begin is None else begin
85 86
    cend = length if end is None else end
    return pack_index(cbegin, length), pack_index(cend, length)
87 88 89 90 91


class StringView(collections.abc.Sized):
    """
    A rudimentary StringView class, just enough for the use cases
92
    in parse.py. The difference between a StringView and the python
93 94 95 96
    builtin strings is that StringView-objects do slicing without
    copying, i.e. slices are just a view on a section of the sliced
    string.
    """
97
    __slots__ = ['text', 'begin', 'end', 'len', 'fullstring']
98 99

    def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
100
        # assert isinstance(text, str)
101 102 103
        self.text = text  # type: str
        self.begin, self.end = real_indices(begin, end, len(text))
        self.len = max(self.end - self.begin, 0)  # type: int
104 105 106 107
        if (self.begin == 0 and self.len == len(self.text)):
            self.fullstring = self.text  # type: str
        else:
            self.fullstring = ''
108 109 110 111 112 113 114 115

    def __bool__(self):
        return self.end > self.begin  # and bool(self.text)

    def __len__(self):
        return self.len

    def __str__(self):
116
        # PERFORMANCE WARNING: This creates a copy of the string-slice
117 118
        if self.fullstring:  # optimization: avoid slicing/copying
            return self.fullstring
119 120
        # since the slice is being copyied now, anyway, the copy might
        # as well be stored in the string view
121 122
        self.fullstring = self.text[self.begin:self.end]
        return self.fullstring
123 124

    def __eq__(self, other):
125 126
        # PERFORMANCE WARNING: This creates copies of the strings
        return len(other) == len(self) and str(self) == str(other)
127 128

    def __hash__(self):
129 130
        # PERFORMANCE WARNING: This creates a copy of the string-slice
        return hash(str(self))
131 132 133

    def __add__(self, other):
        if isinstance(other, str):
134
            return str(self) + other
135 136 137 138 139
        else:
            return StringView(str(self) + str(other))

    def __radd__(self, other):
        if isinstance(other, str):
140
            return other + str(self)
141 142 143 144 145 146 147
        else:
            return StringView(str(other) + str(self))

    def __getitem__(self, index):
        # assert isinstance(index, slice), "As of now, StringView only allows slicing."
        # assert index.step is None or index.step == 1, \
        #     "Step sizes other than 1 are not yet supported by StringView"
148 149 150 151 152
        try:
            start, stop = real_indices(index.start, index.stop, self.len)
            return StringView(self.text, self.begin + start, self.begin + stop)
        except AttributeError:
            return self.text[self.begin + index]
153

154 155 156 157 158
    def count(self, sub: str, start=None, end=None) -> int:
        """Returns the number of non-overlapping occurrences of substring
        `sub` in StringView S[start:end].  Optional arguments start and end
        are interpreted as in slice notation.
        """
159 160
        if self.fullstring:
            return self.fullstring.count(sub, start, end)
161 162 163 164 165 166
        elif start is None and end is None:
            return self.text.count(sub, self.begin, self.end)
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.count(sub, self.begin + start, self.begin + end)

167 168 169 170 171 172
    def find(self, sub: str, start=None, end=None) -> int:
        """Returns the lowest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
173 174
        if self.fullstring:
            return self.fullstring.find(sub, start, end)
175 176 177 178 179 180
        elif start is None and end is None:
            return self.text.find(sub, self.begin, self.end) - self.begin
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.find(sub, self.begin + start, self.begin + end) - self.begin

181 182 183 184 185 186
    def rfind(self, sub: str, start=None, end=None) -> int:
        """Returns the highest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
187 188
        if self.fullstring:
            return self.fullstring.rfind(sub, start, end)
189 190 191 192 193 194
        if start is None and end is None:
            return self.text.rfind(sub, self.begin, self.end) - self.begin
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.rfind(sub, self.begin + start, self.begin + end) - self.begin

195 196 197 198 199 200 201 202 203
    def startswith(self,
                   prefix: Union[str, Tuple[str, ...]],
                   start: int = 0,
                   end: Optional[int] = None) -> bool:
        """Return True if S starts with the specified prefix, False otherwise.
        With optional `start`, test S beginning at that position.
        With optional `end`, stop comparing S at that position.
        prefix can also be a tuple of strings to try.
        """
204 205 206 207
        start += self.begin
        end = self.end if end is None else self.begin + end
        return self.text.startswith(prefix, start, end)

208
    def match(self, regex, flags=0):
209 210
        """Executes `regex.match` on the StringView object and returns the
        result, which is either a match-object or None.
211 212
        WARNING:  match.end(), match.span() etc. are mapped to the underlying text,
                  not the StringView-object!!!
213
        """
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
        return regex.match(self.text, pos=self.begin, endpos=self.end)

    def index(self, absolute_index: int) -> int:
        """
        Converts an index for a string watched by a StringView object
        to an index relative to the string view object, e.g.:
        >>> import re
        >>> sv = StringView('xxIxx')[2:3]
        >>> match = sv.match(re.compile('I'))
        >>> match.end()
        3
        >>> sv.index(match.end())
        1
        """
        return absolute_index - self.begin

    def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]:
        """Converts indices for a string watched by a StringView object
        to indices relative to the string view object. See also: `sv_index()`
        """
        return tuple(index - self.begin for index in absolute_indices)

    def search(self, regex):
237 238
        """Executes regex.search on the StringView object and returns the
        result, which is either a match-object or None.
239 240
        WARNING:  match.end(), match.span() etc. are mapped to the underlying text,
                  not the StringView-object!!!
241
        """
242 243
        return regex.search(self.text, pos=self.begin, endpos=self.end)

244 245 246 247 248 249 250 251
    def finditer(self, regex):
        """Executes regex.finditer on the StringView object and returns the
        iterator of match objects.
        WARNING:  match.end(), match.span() etc. are mapped to the underlying text,
                  not the StringView-object!!!
        """
        return regex.finditer(self.text, pos=self.begin, endpos=self.end)

252
    def strip(self):
253 254 255
        """Returns a copy of the StringView `self` with leading and trailing
        whitespace removed.
        """
256 257 258 259 260 261 262 263 264 265 266 267 268
        begin = first_char(self.text, self.begin, self.end) - self.begin
        end = last_char(self.text, self.begin, self.end) - self.begin
        return self if begin == 0 and end == self.len else self[begin:end]

    def lstrip(self):
        """Returns a copy of `self` with leading whitespace removed."""
        begin = first_char(self.text, self.begin, self.end) - self.begin
        return self if begin == 0 else self[begin:]

    def rstrip(self):
        """Returns a copy of `self` with trailing whitespace removed."""
        end = last_char(self.text, self.begin, self.end) - self.begin
        return self if end == self.len else self[:end]
269 270

    def split(self, sep=None):
271 272 273 274 275
        """Returns a list of the words in `self`, using `sep` as the
        delimiter string.  If `sep` is not specified or is None, any
        whitespace string is a separator and empty strings are
        removed from the result.
        """
276 277
        if self.fullstring:
            return self.fullstring.split(sep)
278 279 280 281 282 283
        else:
            pieces = []
            l = len(sep)
            k = 0
            i = self.find(sep, k)
            while i >= 0:
284
                pieces.append(self.text[self.begin + k: self.begin + i])
285 286
                k = i + l
                i = self.find(sep, k)
287
            pieces.append(self.text[self.begin + k: self.end])
288 289
            return pieces

290 291 292 293
    def replace(self, old, new):
        """Returns a string where `old` is replaced by `new`."""
        return str(self).replace(old, new)

294

295
EMPTY_STRING_VIEW = StringView('')