stringview.py 13.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# stringview.py - a string class where slices are views not copies as
#                    with the standard Python strings.
#
#    stringview.pxd - declarations for the cython Python to C compiler
#                    to speed up handling of StringViews.
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.
di68kap's avatar
di68kap committed
21

22 23 24 25 26 27 28 29 30 31 32 33
"""
StringView provides string-slicing without copying.
Slicing Python-strings always yields copies of a segment of the original
string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
However, this becomes costly (in terms of space and as a consequence also
time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.

It is recommended to compile this modules with the Cython-compiler for
speedup. The modules comes with a ``stringview.pxd`` that contains some type
declarations to fully exploit the potential of the Cython-compiler.
"""
34

35
import collections
36
from typing import Optional, Union, Iterable, Tuple
37

38 39 40 41 42
try:
    import cython
    cython_optimized = cython.compiled  # type: bool
except ImportError:
    # import DHParser.Shadow as cython
43
    cython_optimized = False
44
    import DHParser.shadow_cython as cython
45

46 47

__all__ = ('StringView', 'EMPTY_STRING_VIEW', 'cython_optimized')
48 49


50 51 52 53
def first_char(text, begin: int, end: int) -> int:
    """Returns the index of the first non-whitespace character in string
     `text` within the bounds [begin, end].
    """
54 55 56 57 58
    while begin < end and text[begin] in ' \n\t':
        begin += 1
    return begin


59 60 61 62
def last_char(text, begin: int, end: int) -> int:
    """Returns the index of the first non-whitespace character in string
    `text` within the bounds [begin, end].
    """
eckhart's avatar
eckhart committed
63
    while end > begin and text[end - 1] in ' \n\t':
64 65 66 67
        end -= 1
    return end


68 69 70 71 72 73 74 75 76 77 78 79 80
def pack_index(index: int, length: int) -> int:
    """Transforms `index` into a positive index counting from the beginning
    of the string, capping it at the boundaries [0, len].
    Examples:
    >>> pack_index(-1, 5)
    4
    >>> pack_index(6, 5)
    5
    >>> pack_index(-7, 5)
    0
    """
    # assert length >= 0
    index = index if index >= 0 else index + length
81 82 83
    # TODO: Test the following code for speedup
    # if index < 0:
    #     index += length
84
    return 0 if index < 0 else length if index > length else index
85 86


87 88
def real_indices(begin: Optional[int],
                 end: Optional[int],
89
                 length) -> Tuple[int, int]:
90 91 92
    """Returns the tuple of real (i.e. positive) indices from the slice
    indices `begin`,  `end`, assuming a string of size `length`.
    """
93
    cbegin = 0 if begin is None else begin
94 95
    cend = length if end is None else end
    return pack_index(cbegin, length), pack_index(cend, length)
96 97


98
class StringView:  # collections.abc.Sized
99 100
    """
    A rudimentary StringView class, just enough for the use cases
101
    in parse.py. The difference between a StringView and the python
102 103 104 105
    builtin strings is that StringView-objects do slicing without
    copying, i.e. slices are just a view on a section of the sliced
    string.
    """
106
    __slots__ = ['_text', '_begin', '_end', '_len', '_fullstring']
107 108

    def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
109
        # assert isinstance(text, str)
110 111 112
        self._text = text  # type: str
        self._begin, self._end = real_indices(begin, end, len(text))
        self._len = max(self._end - self._begin, 0)  # type: int
Eckhart Arnold's avatar
Eckhart Arnold committed
113 114 115 116 117
        self._fullstring = ''  # type: str
        # if (self._begin == 0 and self._len == len(self._text)):
        #     self._fullstring = self._text  # type: str
        # else:
        #     self._fullstring = ''
118

119
    def __bool__(self) -> bool:
120
        return self._end > self._begin  # and bool(self.text)
121

122
    def __len__(self) -> int:
123
        return self._len
124

125
    def __str__(self) -> str:
126
        # PERFORMANCE WARNING: This creates a copy of the string-slice
127 128
        if self._fullstring:  # optimization: avoid slicing/copying
            return self._fullstring
129 130
        # since the slice is being copyied now, anyway, the copy might
        # as well be stored in the string view
di68kap's avatar
di68kap committed
131
        # return self.text[self.begin:self.end]  # use this for debugging!
132 133
        self._fullstring = self._text[self._begin:self._end]
        return self._fullstring
134

135
    def __eq__(self, other) -> bool:
136 137
        # PERFORMANCE WARNING: This creates copies of the strings
        return len(other) == len(self) and str(self) == str(other)
138

139
    def __hash__(self) -> int:
140 141
        # PERFORMANCE WARNING: This creates a copy of the string-slice
        return hash(str(self))
142

143
    def __add__(self, other) -> Union[str, 'StringView']:
144
        if isinstance(other, str):
145
            return str(self) + other
146 147 148
        else:
            return StringView(str(self) + str(other))

149
    def __radd__(self, other) -> Union[str, 'StringView']:
150
        if isinstance(other, str):
151
            return other + str(self)
152 153 154
        else:
            return StringView(str(other) + str(self))

eckhart's avatar
eckhart committed
155
    @cython.locals(start=cython.int, stop=cython.int)
156
    def __getitem__(self, index: Union[slice, int]) -> 'StringView':
157 158 159
        # assert isinstance(index, slice), "As of now, StringView only allows slicing."
        # assert index.step is None or index.step == 1, \
        #     "Step sizes other than 1 are not yet supported by StringView"
160
        try:
161 162
            start, stop = real_indices(index.start, index.stop, self._len)
            return StringView(self._text, self._begin + start, self._begin + stop)
163
        except AttributeError:
164 165 166 167 168
            return StringView(self._text, self._begin + index, self._begin + index + 1)

    def get_text(self) -> str:
        """Returns the underlying string."""
        return self._text
169

170
    @cython.locals(_start=cython.int, _end=cython.int)
171
    def count(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
172 173 174 175
        """Returns the number of non-overlapping occurrences of substring
        `sub` in StringView S[start:end].  Optional arguments start and end
        are interpreted as in slice notation.
        """
176 177 178 179 180
        if self._fullstring:
            if cython_optimized:
                return self._fullstring.count(sub, start or 0, self._len if end is None else end)
            else:
                return self._fullstring.count(sub, start, end)
181
        elif start is None and end is None:
182
            return self._text.count(sub, self._begin, self._end)
183
        else:
184 185
            _start, _end = real_indices(start, end, self._len)
            return self._text.count(sub, self._begin + _start, self._begin + _end)
186

di68kap's avatar
di68kap committed
187
    @cython.locals(_start=cython.int, _end=cython.int)
188
    def find(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
189 190 191 192 193
        """Returns the lowest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
194 195 196 197 198
        if self._fullstring:
            if cython_optimized:
                return self._fullstring.find(sub, start or 0, self._len if end is None else end)
            else:
                return self._fullstring.find(sub, start, end)
199
        elif start is None and end is None:
200
            return self._text.find(sub, self._begin, self._end) - self._begin
201
        else:
di68kap's avatar
di68kap committed
202 203
            _start, _end = real_indices(start, end, self._len)
            return self._text.find(sub, self._begin + _start, self._begin + _end) - self._begin
204

205
    @cython.locals(_start=cython.int, _end=cython.int)
206
    def rfind(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
207 208 209 210 211
        """Returns the highest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
212 213 214 215 216
        if self._fullstring:
            if cython_optimized:
                return self._fullstring.rfind(sub, start or 0, self._len if end is None else end)
            else:
                return self._fullstring.rfind(sub, start, end)
217
        if start is None and end is None:
218
            return self._text.rfind(sub, self._begin, self._end) - self._begin
219
        else:
220 221
            _start, _end = real_indices(start, end, self._len)
            return self._text.rfind(sub, self._begin + _start, self._begin + _end) - self._begin
222

223
    def startswith(self,
224
                   prefix: str,
225 226 227 228 229 230
                   start: int = 0,
                   end: Optional[int] = None) -> bool:
        """Return True if S starts with the specified prefix, False otherwise.
        With optional `start`, test S beginning at that position.
        With optional `end`, stop comparing S at that position.
        """
231 232 233
        start += self._begin
        end = self._end if end is None else self._begin + end
        return self._text.startswith(prefix, start, end)
234

235

236 237 238 239 240 241 242 243
    def endswith(self,
                 suffix: str,
                 start: int = 0,
                 end: Optional[int] = None) -> bool:
        """Return True if S ends with the specified suufix, False otherwise.
        With optional `start`, test S beginning at that position.
        With optional `end`, stop comparing S at that position.
        """
244 245 246
        start += self._begin
        end = self._end if end is None else self._begin + end
        return self._text.endswith(suffix, start, end)
247

248
    def match(self, regex, flags: int = 0):
249
        """Executes `regex.match` on the StringView object and returns the
eckhart's avatar
eckhart committed
250 251 252
        result, which is either a match-object or None. Keep in mind that
        match.end(), match.span() etc. are mapped to the underlying text,
        not the StringView-object!!!
253
        """
254
        return regex.match(self._text, pos=self._begin, endpos=self._end)
255 256

    def index(self, absolute_index: int) -> int:
eckhart's avatar
eckhart committed
257 258 259 260 261 262 263 264 265 266
        """Converts an index for a string watched by a StringView object
        to an index relative to the string view object, e.g.::

            >>> import re
            >>> sv = StringView('xxIxx')[2:3]
            >>> match = sv.match(re.compile('I'))
            >>> match.end()
            3
            >>> sv.index(match.end())
            1
267
        """
268
        return absolute_index - self._begin
269 270 271 272 273

    def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]:
        """Converts indices for a string watched by a StringView object
        to indices relative to the string view object. See also: `sv_index()`
        """
274
        return tuple(index - self._begin for index in absolute_indices)
275 276

    def search(self, regex):
277
        """Executes regex.search on the StringView object and returns the
eckhart's avatar
eckhart committed
278 279 280
        result, which is either a match-object or None. Keep in mind that
        match.end(), match.span() etc. are mapped to the underlying text,
        not the StringView-object!!!
281
        """
282
        return regex.search(self._text, pos=self._begin, endpos=self._end)
283

284 285
    def finditer(self, regex):
        """Executes regex.finditer on the StringView object and returns the
eckhart's avatar
eckhart committed
286 287
        iterator of match objects. Keep in mind that match.end(), match.span()
        etc. are mapped to the underlying text, not the StringView-object!!!
288
        """
289
        return regex.finditer(self._text, pos=self._begin, endpos=self._end)
290

291
    @cython.locals(begin=cython.int, end=cython.int)
292
    def strip(self):
293 294 295
        """Returns a copy of the StringView `self` with leading and trailing
        whitespace removed.
        """
296 297 298
        begin = first_char(self._text, self._begin, self._end) - self._begin
        end = last_char(self._text, self._begin, self._end) - self._begin
        return self if begin == 0 and end == self._len else self[begin:end]
299

300
    @cython.locals(begin=cython.int)
301 302
    def lstrip(self):
        """Returns a copy of `self` with leading whitespace removed."""
303
        begin = first_char(self._text, self._begin, self._end) - self._begin
304 305
        return self if begin == 0 else self[begin:]

306
    @cython.locals(end=cython.int)
307 308
    def rstrip(self):
        """Returns a copy of `self` with trailing whitespace removed."""
309 310
        end = last_char(self._text, self._begin, self._end) - self._begin
        return self if end == self._len else self[:end]
311

312
    @cython.locals(length=cython.int, i=cython.int, k=cython.int)
313
    def split(self, sep=None):
314 315 316 317 318
        """Returns a list of the words in `self`, using `sep` as the
        delimiter string.  If `sep` is not specified or is None, any
        whitespace string is a separator and empty strings are
        removed from the result.
        """
319 320
        if self._fullstring:
            return self._fullstring.split(sep)
321 322
        else:
            pieces = []
eckhart's avatar
eckhart committed
323
            length = len(sep)
324 325 326
            k = 0
            i = self.find(sep, k)
            while i >= 0:
327
                pieces.append(self._text[self._begin + k: self._begin + i])
eckhart's avatar
eckhart committed
328
                k = i + length
329
                i = self.find(sep, k)
330
            pieces.append(self._text[self._begin + k: self._end])
331 332
            return pieces

333 334 335 336
    def replace(self, old, new):
        """Returns a string where `old` is replaced by `new`."""
        return str(self).replace(old, new)

337

338
EMPTY_STRING_VIEW = StringView('')