11.3.2021, 9:00 - 11:00: Due to updates GitLab may be unavailable for some minutes between 09:00 and 11:00.

stringview.py 12.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# stringview.py - a string class where slices are views not copies as
#                    with the standard Python strings.
#
#    stringview.pxd - declarations for the cython Python to C compiler
#                    to speed up handling of StringViews.
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.
di68kap's avatar
di68kap committed
21

22 23 24 25 26 27 28 29 30 31 32 33
"""
StringView provides string-slicing without copying.
Slicing Python-strings always yields copies of a segment of the original
string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
However, this becomes costly (in terms of space and as a consequence also
time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.

It is recommended to compile this modules with the Cython-compiler for
speedup. The modules comes with a ``stringview.pxd`` that contains some type
declarations to fully exploit the potential of the Cython-compiler.
"""
34

35
import collections
di68kap's avatar
di68kap committed
36 37

from DHParser.toolkit import typing
38
from typing import Optional, Union, Iterable, Tuple
39

40

41
__all__ = ('StringView', 'EMPTY_STRING_VIEW')
42 43


44 45 46 47
def first_char(text, begin: int, end: int) -> int:
    """Returns the index of the first non-whitespace character in string
     `text` within the bounds [begin, end].
    """
48 49 50 51 52
    while begin < end and text[begin] in ' \n\t':
        begin += 1
    return begin


53 54 55 56
def last_char(text, begin: int, end: int) -> int:
    """Returns the index of the first non-whitespace character in string
    `text` within the bounds [begin, end].
    """
eckhart's avatar
eckhart committed
57
    while end > begin and text[end - 1] in ' \n\t':
58 59 60 61
        end -= 1
    return end


62 63 64 65 66 67 68 69 70 71 72 73 74 75
def pack_index(index: int, length: int) -> int:
    """Transforms `index` into a positive index counting from the beginning
    of the string, capping it at the boundaries [0, len].
    Examples:
    >>> pack_index(-1, 5)
    4
    >>> pack_index(6, 5)
    5
    >>> pack_index(-7, 5)
    0
    """
    # assert length >= 0
    index = index if index >= 0 else index + length
    return 0 if index < 0 else length if index > length else index
76 77


78 79 80 81 82 83
def real_indices(begin: Optional[int],
                 end: Optional[int],
                 length) -> Tuple[int, int]:   # "length: int" fails with cython!?
    """Returns the tuple of real (i.e. positive) indices from the slice
    indices `begin`,  `end`, assuming a string of size `length`.
    """
84
    cbegin = 0 if begin is None else begin
85 86
    cend = length if end is None else end
    return pack_index(cbegin, length), pack_index(cend, length)
87 88 89 90 91


class StringView(collections.abc.Sized):
    """
    A rudimentary StringView class, just enough for the use cases
92
    in parse.py. The difference between a StringView and the python
93 94 95 96
    builtin strings is that StringView-objects do slicing without
    copying, i.e. slices are just a view on a section of the sliced
    string.
    """
97
    __slots__ = ['text', 'begin', 'end', 'len', 'fullstring']
98 99

    def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
100
        # assert isinstance(text, str)
101 102 103
        self.text = text  # type: str
        self.begin, self.end = real_indices(begin, end, len(text))
        self.len = max(self.end - self.begin, 0)  # type: int
104 105 106 107
        if (self.begin == 0 and self.len == len(self.text)):
            self.fullstring = self.text  # type: str
        else:
            self.fullstring = ''
108 109 110 111 112 113 114 115

    def __bool__(self):
        return self.end > self.begin  # and bool(self.text)

    def __len__(self):
        return self.len

    def __str__(self):
116
        # PERFORMANCE WARNING: This creates a copy of the string-slice
117 118
        if self.fullstring:  # optimization: avoid slicing/copying
            return self.fullstring
119 120
        # since the slice is being copyied now, anyway, the copy might
        # as well be stored in the string view
di68kap's avatar
di68kap committed
121
        # return self.text[self.begin:self.end]  # use this for debugging!
122 123
        self.fullstring = self.text[self.begin:self.end]
        return self.fullstring
124 125

    def __eq__(self, other):
126 127
        # PERFORMANCE WARNING: This creates copies of the strings
        return len(other) == len(self) and str(self) == str(other)
128 129

    def __hash__(self):
130 131
        # PERFORMANCE WARNING: This creates a copy of the string-slice
        return hash(str(self))
132 133 134

    def __add__(self, other):
        if isinstance(other, str):
135
            return str(self) + other
136 137 138 139 140
        else:
            return StringView(str(self) + str(other))

    def __radd__(self, other):
        if isinstance(other, str):
141
            return other + str(self)
142 143 144 145 146 147 148
        else:
            return StringView(str(other) + str(self))

    def __getitem__(self, index):
        # assert isinstance(index, slice), "As of now, StringView only allows slicing."
        # assert index.step is None or index.step == 1, \
        #     "Step sizes other than 1 are not yet supported by StringView"
149 150 151 152 153
        try:
            start, stop = real_indices(index.start, index.stop, self.len)
            return StringView(self.text, self.begin + start, self.begin + stop)
        except AttributeError:
            return self.text[self.begin + index]
154

155 156 157 158 159
    def count(self, sub: str, start=None, end=None) -> int:
        """Returns the number of non-overlapping occurrences of substring
        `sub` in StringView S[start:end].  Optional arguments start and end
        are interpreted as in slice notation.
        """
160 161
        if self.fullstring:
            return self.fullstring.count(sub, start, end)
162 163 164 165 166 167
        elif start is None and end is None:
            return self.text.count(sub, self.begin, self.end)
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.count(sub, self.begin + start, self.begin + end)

168 169 170 171 172 173
    def find(self, sub: str, start=None, end=None) -> int:
        """Returns the lowest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
174 175
        if self.fullstring:
            return self.fullstring.find(sub, start, end)
176 177 178 179 180 181
        elif start is None and end is None:
            return self.text.find(sub, self.begin, self.end) - self.begin
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.find(sub, self.begin + start, self.begin + end) - self.begin

182 183 184 185 186 187
    def rfind(self, sub: str, start=None, end=None) -> int:
        """Returns the highest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
188 189
        if self.fullstring:
            return self.fullstring.rfind(sub, start, end)
190 191 192 193 194 195
        if start is None and end is None:
            return self.text.rfind(sub, self.begin, self.end) - self.begin
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.rfind(sub, self.begin + start, self.begin + end) - self.begin

196
    def startswith(self,
197
                   prefix: str,
198 199 200 201 202 203
                   start: int = 0,
                   end: Optional[int] = None) -> bool:
        """Return True if S starts with the specified prefix, False otherwise.
        With optional `start`, test S beginning at that position.
        With optional `end`, stop comparing S at that position.
        """
204 205 206 207
        start += self.begin
        end = self.end if end is None else self.begin + end
        return self.text.startswith(prefix, start, end)

208 209 210 211 212 213 214 215 216 217 218 219
    def endswith(self,
                 suffix: str,
                 start: int = 0,
                 end: Optional[int] = None) -> bool:
        """Return True if S ends with the specified suufix, False otherwise.
        With optional `start`, test S beginning at that position.
        With optional `end`, stop comparing S at that position.
        """
        start += self.begin
        end = self.end if end is None else self.begin + end
        return self.text.endswith(suffix, start, end)

220
    def match(self, regex, flags=0):
221
        """Executes `regex.match` on the StringView object and returns the
eckhart's avatar
eckhart committed
222 223 224
        result, which is either a match-object or None. Keep in mind that
        match.end(), match.span() etc. are mapped to the underlying text,
        not the StringView-object!!!
225
        """
226 227 228
        return regex.match(self.text, pos=self.begin, endpos=self.end)

    def index(self, absolute_index: int) -> int:
eckhart's avatar
eckhart committed
229 230 231 232 233 234 235 236 237 238
        """Converts an index for a string watched by a StringView object
        to an index relative to the string view object, e.g.::

            >>> import re
            >>> sv = StringView('xxIxx')[2:3]
            >>> match = sv.match(re.compile('I'))
            >>> match.end()
            3
            >>> sv.index(match.end())
            1
239 240 241 242 243 244 245 246 247 248
        """
        return absolute_index - self.begin

    def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]:
        """Converts indices for a string watched by a StringView object
        to indices relative to the string view object. See also: `sv_index()`
        """
        return tuple(index - self.begin for index in absolute_indices)

    def search(self, regex):
249
        """Executes regex.search on the StringView object and returns the
eckhart's avatar
eckhart committed
250 251 252
        result, which is either a match-object or None. Keep in mind that
        match.end(), match.span() etc. are mapped to the underlying text,
        not the StringView-object!!!
253
        """
254 255
        return regex.search(self.text, pos=self.begin, endpos=self.end)

256 257
    def finditer(self, regex):
        """Executes regex.finditer on the StringView object and returns the
eckhart's avatar
eckhart committed
258 259
        iterator of match objects. Keep in mind that match.end(), match.span()
        etc. are mapped to the underlying text, not the StringView-object!!!
260 261 262
        """
        return regex.finditer(self.text, pos=self.begin, endpos=self.end)

263
    def strip(self):
264 265 266
        """Returns a copy of the StringView `self` with leading and trailing
        whitespace removed.
        """
267 268 269 270 271 272 273 274 275 276 277 278 279
        begin = first_char(self.text, self.begin, self.end) - self.begin
        end = last_char(self.text, self.begin, self.end) - self.begin
        return self if begin == 0 and end == self.len else self[begin:end]

    def lstrip(self):
        """Returns a copy of `self` with leading whitespace removed."""
        begin = first_char(self.text, self.begin, self.end) - self.begin
        return self if begin == 0 else self[begin:]

    def rstrip(self):
        """Returns a copy of `self` with trailing whitespace removed."""
        end = last_char(self.text, self.begin, self.end) - self.begin
        return self if end == self.len else self[:end]
280 281

    def split(self, sep=None):
282 283 284 285 286
        """Returns a list of the words in `self`, using `sep` as the
        delimiter string.  If `sep` is not specified or is None, any
        whitespace string is a separator and empty strings are
        removed from the result.
        """
287 288
        if self.fullstring:
            return self.fullstring.split(sep)
289 290
        else:
            pieces = []
eckhart's avatar
eckhart committed
291
            length = len(sep)
292 293 294
            k = 0
            i = self.find(sep, k)
            while i >= 0:
295
                pieces.append(self.text[self.begin + k: self.begin + i])
eckhart's avatar
eckhart committed
296
                k = i + length
297
                i = self.find(sep, k)
298
            pieces.append(self.text[self.begin + k: self.end])
299 300
            return pieces

301 302 303 304
    def replace(self, old, new):
        """Returns a string where `old` is replaced by `new`."""
        return str(self).replace(old, new)

305

306
EMPTY_STRING_VIEW = StringView('')