11.3.2021, 9:00 - 11:00: Due to updates GitLab may be unavailable for some minutes between 09:00 and 11:00.

stringview.py 13.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# stringview.py - a string class where slices are views not copies as
#                    with the standard Python strings.
#
#    stringview.pxd - declarations for the cython Python to C compiler
#                    to speed up handling of StringViews.
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.
di68kap's avatar
di68kap committed
21

22 23 24 25 26 27 28 29 30 31 32 33
"""
StringView provides string-slicing without copying.
Slicing Python-strings always yields copies of a segment of the original
string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
However, this becomes costly (in terms of space and as a consequence also
time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.

It is recommended to compile this modules with the Cython-compiler for
speedup. The modules comes with a ``stringview.pxd`` that contains some type
declarations to fully exploit the potential of the Cython-compiler.
"""
34

35
import collections
di68kap's avatar
di68kap committed
36 37

from DHParser.toolkit import typing
38
from typing import Optional, Union, Iterable, Tuple
39

40 41 42 43 44 45 46
try:
    import cython
    cython_optimized = cython.compiled  # type: bool
except ImportError:
    # import DHParser.Shadow as cython
    cython_optimized = False            # type: bool
    import DHParser.shadow_cython as cython
47

48 49

__all__ = ('StringView', 'EMPTY_STRING_VIEW', 'cython_optimized')
50 51


52 53 54 55
def first_char(text, begin: int, end: int) -> int:
    """Returns the index of the first non-whitespace character in string
     `text` within the bounds [begin, end].
    """
56 57 58 59 60
    while begin < end and text[begin] in ' \n\t':
        begin += 1
    return begin


61 62 63 64
def last_char(text, begin: int, end: int) -> int:
    """Returns the index of the first non-whitespace character in string
    `text` within the bounds [begin, end].
    """
eckhart's avatar
eckhart committed
65
    while end > begin and text[end - 1] in ' \n\t':
66 67 68 69
        end -= 1
    return end


70 71 72 73 74 75 76 77 78 79 80 81 82
def pack_index(index: int, length: int) -> int:
    """Transforms `index` into a positive index counting from the beginning
    of the string, capping it at the boundaries [0, len].
    Examples:
    >>> pack_index(-1, 5)
    4
    >>> pack_index(6, 5)
    5
    >>> pack_index(-7, 5)
    0
    """
    # assert length >= 0
    index = index if index >= 0 else index + length
83 84 85
    # TODO: Test the following code for speedup
    # if index < 0:
    #     index += length
86
    return 0 if index < 0 else length if index > length else index
87 88


89 90
def real_indices(begin: Optional[int],
                 end: Optional[int],
91
                 length) -> Tuple[int, int]:
92 93 94
    """Returns the tuple of real (i.e. positive) indices from the slice
    indices `begin`,  `end`, assuming a string of size `length`.
    """
95
    cbegin = 0 if begin is None else begin
96 97
    cend = length if end is None else end
    return pack_index(cbegin, length), pack_index(cend, length)
98 99


100
class StringView:  # collections.abc.Sized
101 102
    """
    A rudimentary StringView class, just enough for the use cases
103
    in parse.py. The difference between a StringView and the python
104 105 106 107
    builtin strings is that StringView-objects do slicing without
    copying, i.e. slices are just a view on a section of the sliced
    string.
    """
108
    __slots__ = ['_text', '_begin', '_end', '_len', '_fullstring']
109 110

    def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
111
        # assert isinstance(text, str)
112 113 114
        self._text = text  # type: str
        self._begin, self._end = real_indices(begin, end, len(text))
        self._len = max(self._end - self._begin, 0)  # type: int
Eckhart Arnold's avatar
Eckhart Arnold committed
115 116 117 118 119
        self._fullstring = ''  # type: str
        # if (self._begin == 0 and self._len == len(self._text)):
        #     self._fullstring = self._text  # type: str
        # else:
        #     self._fullstring = ''
120

121
    def __bool__(self) -> bool:
122
        return self._end > self._begin  # and bool(self.text)
123

124
    def __len__(self) -> int:
125
        return self._len
126

127
    def __str__(self) -> str:
128
        # PERFORMANCE WARNING: This creates a copy of the string-slice
129 130
        if self._fullstring:  # optimization: avoid slicing/copying
            return self._fullstring
131 132
        # since the slice is being copyied now, anyway, the copy might
        # as well be stored in the string view
di68kap's avatar
di68kap committed
133
        # return self.text[self.begin:self.end]  # use this for debugging!
134 135
        self._fullstring = self._text[self._begin:self._end]
        return self._fullstring
136

137
    def __eq__(self, other) -> bool:
138 139
        # PERFORMANCE WARNING: This creates copies of the strings
        return len(other) == len(self) and str(self) == str(other)
140

141
    def __hash__(self) -> int:
142 143
        # PERFORMANCE WARNING: This creates a copy of the string-slice
        return hash(str(self))
144

145
    def __add__(self, other) -> Union[str, 'StringView']:
146
        if isinstance(other, str):
147
            return str(self) + other
148 149 150
        else:
            return StringView(str(self) + str(other))

151
    def __radd__(self, other) -> Union[str, 'StringView']:
152
        if isinstance(other, str):
153
            return other + str(self)
154 155 156
        else:
            return StringView(str(other) + str(self))

eckhart's avatar
eckhart committed
157
    @cython.locals(start=cython.int, stop=cython.int)
158
    def __getitem__(self, index: Union[slice, int]) -> 'StringView':
159 160 161
        # assert isinstance(index, slice), "As of now, StringView only allows slicing."
        # assert index.step is None or index.step == 1, \
        #     "Step sizes other than 1 are not yet supported by StringView"
162
        try:
163 164
            start, stop = real_indices(index.start, index.stop, self._len)
            return StringView(self._text, self._begin + start, self._begin + stop)
165
        except AttributeError:
166 167 168 169 170
            return StringView(self._text, self._begin + index, self._begin + index + 1)

    def get_text(self) -> str:
        """Returns the underlying string."""
        return self._text
171

172
    @cython.locals(_start=cython.int, _end=cython.int)
173
    def count(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
174 175 176 177
        """Returns the number of non-overlapping occurrences of substring
        `sub` in StringView S[start:end].  Optional arguments start and end
        are interpreted as in slice notation.
        """
178 179 180 181 182
        if self._fullstring:
            if cython_optimized:
                return self._fullstring.count(sub, start or 0, self._len if end is None else end)
            else:
                return self._fullstring.count(sub, start, end)
183
        elif start is None and end is None:
184
            return self._text.count(sub, self._begin, self._end)
185
        else:
186 187
            _start, _end = real_indices(start, end, self._len)
            return self._text.count(sub, self._begin + _start, self._begin + _end)
188

di68kap's avatar
di68kap committed
189
    @cython.locals(_start=cython.int, _end=cython.int)
190
    def find(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
191 192 193 194 195
        """Returns the lowest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
196 197 198 199 200
        if self._fullstring:
            if cython_optimized:
                return self._fullstring.find(sub, start or 0, self._len if end is None else end)
            else:
                return self._fullstring.find(sub, start, end)
201
        elif start is None and end is None:
202
            return self._text.find(sub, self._begin, self._end) - self._begin
203
        else:
di68kap's avatar
di68kap committed
204 205
            _start, _end = real_indices(start, end, self._len)
            return self._text.find(sub, self._begin + _start, self._begin + _end) - self._begin
206

207
    @cython.locals(_start=cython.int, _end=cython.int)
208
    def rfind(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
209 210 211 212 213
        """Returns the highest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
214 215 216 217 218
        if self._fullstring:
            if cython_optimized:
                return self._fullstring.rfind(sub, start or 0, self._len if end is None else end)
            else:
                return self._fullstring.rfind(sub, start, end)
219
        if start is None and end is None:
220
            return self._text.rfind(sub, self._begin, self._end) - self._begin
221
        else:
222 223
            _start, _end = real_indices(start, end, self._len)
            return self._text.rfind(sub, self._begin + _start, self._begin + _end) - self._begin
224

225
    def startswith(self,
226
                   prefix: str,
227 228 229 230 231 232
                   start: int = 0,
                   end: Optional[int] = None) -> bool:
        """Return True if S starts with the specified prefix, False otherwise.
        With optional `start`, test S beginning at that position.
        With optional `end`, stop comparing S at that position.
        """
233 234 235
        start += self._begin
        end = self._end if end is None else self._begin + end
        return self._text.startswith(prefix, start, end)
236

237

238 239 240 241 242 243 244 245
    def endswith(self,
                 suffix: str,
                 start: int = 0,
                 end: Optional[int] = None) -> bool:
        """Return True if S ends with the specified suufix, False otherwise.
        With optional `start`, test S beginning at that position.
        With optional `end`, stop comparing S at that position.
        """
246 247 248
        start += self._begin
        end = self._end if end is None else self._begin + end
        return self._text.endswith(suffix, start, end)
249

250
    def match(self, regex, flags: int = 0):
251
        """Executes `regex.match` on the StringView object and returns the
eckhart's avatar
eckhart committed
252 253 254
        result, which is either a match-object or None. Keep in mind that
        match.end(), match.span() etc. are mapped to the underlying text,
        not the StringView-object!!!
255
        """
256
        return regex.match(self._text, pos=self._begin, endpos=self._end)
257 258

    def index(self, absolute_index: int) -> int:
eckhart's avatar
eckhart committed
259 260 261 262 263 264 265 266 267 268
        """Converts an index for a string watched by a StringView object
        to an index relative to the string view object, e.g.::

            >>> import re
            >>> sv = StringView('xxIxx')[2:3]
            >>> match = sv.match(re.compile('I'))
            >>> match.end()
            3
            >>> sv.index(match.end())
            1
269
        """
270
        return absolute_index - self._begin
271 272 273 274 275

    def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]:
        """Converts indices for a string watched by a StringView object
        to indices relative to the string view object. See also: `sv_index()`
        """
276
        return tuple(index - self._begin for index in absolute_indices)
277 278

    def search(self, regex):
279
        """Executes regex.search on the StringView object and returns the
eckhart's avatar
eckhart committed
280 281 282
        result, which is either a match-object or None. Keep in mind that
        match.end(), match.span() etc. are mapped to the underlying text,
        not the StringView-object!!!
283
        """
284
        return regex.search(self._text, pos=self._begin, endpos=self._end)
285

286 287
    def finditer(self, regex):
        """Executes regex.finditer on the StringView object and returns the
eckhart's avatar
eckhart committed
288 289
        iterator of match objects. Keep in mind that match.end(), match.span()
        etc. are mapped to the underlying text, not the StringView-object!!!
290
        """
291
        return regex.finditer(self._text, pos=self._begin, endpos=self._end)
292

293
    @cython.locals(begin=cython.int, end=cython.int)
294
    def strip(self):
295 296 297
        """Returns a copy of the StringView `self` with leading and trailing
        whitespace removed.
        """
298 299 300
        begin = first_char(self._text, self._begin, self._end) - self._begin
        end = last_char(self._text, self._begin, self._end) - self._begin
        return self if begin == 0 and end == self._len else self[begin:end]
301

302
    @cython.locals(begin=cython.int)
303 304
    def lstrip(self):
        """Returns a copy of `self` with leading whitespace removed."""
305
        begin = first_char(self._text, self._begin, self._end) - self._begin
306 307
        return self if begin == 0 else self[begin:]

308
    @cython.locals(end=cython.int)
309 310
    def rstrip(self):
        """Returns a copy of `self` with trailing whitespace removed."""
311 312
        end = last_char(self._text, self._begin, self._end) - self._begin
        return self if end == self._len else self[:end]
313

314
    @cython.locals(length=cython.int, i=cython.int, k=cython.int)
315
    def split(self, sep=None):
316 317 318 319 320
        """Returns a list of the words in `self`, using `sep` as the
        delimiter string.  If `sep` is not specified or is None, any
        whitespace string is a separator and empty strings are
        removed from the result.
        """
321 322
        if self._fullstring:
            return self._fullstring.split(sep)
323 324
        else:
            pieces = []
eckhart's avatar
eckhart committed
325
            length = len(sep)
326 327 328
            k = 0
            i = self.find(sep, k)
            while i >= 0:
329
                pieces.append(self._text[self._begin + k: self._begin + i])
eckhart's avatar
eckhart committed
330
                k = i + length
331
                i = self.find(sep, k)
332
            pieces.append(self._text[self._begin + k: self._end])
333 334
            return pieces

335 336 337 338
    def replace(self, old, new):
        """Returns a string where `old` is replaced by `new`."""
        return str(self).replace(old, new)

339

340
EMPTY_STRING_VIEW = StringView('')