stringview.py 11.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# stringview.py - a string class where slices are views not copies as
#                    with the standard Python strings.
#
#    stringview.pxd - declarations for the cython Python to C compiler
#                    to speed up handling of StringViews.
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.
di68kap's avatar
di68kap committed
21

22
23
24
25
26
27
28
29
30
31
32
33
"""
StringView provides string-slicing without copying.
Slicing Python-strings always yields copies of a segment of the original
string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
However, this becomes costly (in terms of space and as a consequence also
time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.

It is recommended to compile this modules with the Cython-compiler for
speedup. The modules comes with a ``stringview.pxd`` that contains some type
declarations to fully exploit the potential of the Cython-compiler.
"""
34

35
import collections
di68kap's avatar
di68kap committed
36
37

from DHParser.toolkit import typing
38
from typing import Optional, Union, Iterable, Tuple
39

40

41
__all__ = ('StringView', 'EMPTY_STRING_VIEW')
42
43


44
45
46
47
def first_char(text, begin: int, end: int) -> int:
    """Returns the index of the first non-whitespace character in string
     `text` within the bounds [begin, end].
    """
48
49
50
51
52
    while begin < end and text[begin] in ' \n\t':
        begin += 1
    return begin


53
54
55
56
def last_char(text, begin: int, end: int) -> int:
    """Returns the index of the first non-whitespace character in string
    `text` within the bounds [begin, end].
    """
57
    while end > begin and text[end-1] in ' \n\t':
58
59
60
61
        end -= 1
    return end


62
63
64
65
66
67
68
69
70
71
72
73
74
75
def pack_index(index: int, length: int) -> int:
    """Transforms `index` into a positive index counting from the beginning
    of the string, capping it at the boundaries [0, len].
    Examples:
    >>> pack_index(-1, 5)
    4
    >>> pack_index(6, 5)
    5
    >>> pack_index(-7, 5)
    0
    """
    # assert length >= 0
    index = index if index >= 0 else index + length
    return 0 if index < 0 else length if index > length else index
76
77


78
79
80
81
82
83
def real_indices(begin: Optional[int],
                 end: Optional[int],
                 length) -> Tuple[int, int]:   # "length: int" fails with cython!?
    """Returns the tuple of real (i.e. positive) indices from the slice
    indices `begin`,  `end`, assuming a string of size `length`.
    """
84
    cbegin = 0 if begin is None else begin
85
86
    cend = length if end is None else end
    return pack_index(cbegin, length), pack_index(cend, length)
87
88
89
90
91


class StringView(collections.abc.Sized):
    """
    A rudimentary StringView class, just enough for the use cases
92
    in parse.py. The difference between a StringView and the python
93
94
95
96
    builtin strings is that StringView-objects do slicing without
    copying, i.e. slices are just a view on a section of the sliced
    string.
    """
97
    __slots__ = ['text', 'begin', 'end', 'len', 'fullstring']
98
99

    def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
100
        # assert isinstance(text, str)
101
102
103
        self.text = text  # type: str
        self.begin, self.end = real_indices(begin, end, len(text))
        self.len = max(self.end - self.begin, 0)  # type: int
104
105
106
107
        if (self.begin == 0 and self.len == len(self.text)):
            self.fullstring = self.text  # type: str
        else:
            self.fullstring = ''
108
109
110
111
112
113
114
115

    def __bool__(self):
        return self.end > self.begin  # and bool(self.text)

    def __len__(self):
        return self.len

    def __str__(self):
116
        # PERFORMANCE WARNING: This creates a copy of the string-slice
117
118
        if self.fullstring:  # optimization: avoid slicing/copying
            return self.fullstring
119
120
        # since the slice is being copyied now, anyway, the copy might
        # as well be stored in the string view
di68kap's avatar
di68kap committed
121
        # return self.text[self.begin:self.end]  # use this for debugging!
122
123
        self.fullstring = self.text[self.begin:self.end]
        return self.fullstring
124
125

    def __eq__(self, other):
126
127
        # PERFORMANCE WARNING: This creates copies of the strings
        return len(other) == len(self) and str(self) == str(other)
128
129

    def __hash__(self):
130
131
        # PERFORMANCE WARNING: This creates a copy of the string-slice
        return hash(str(self))
132
133
134

    def __add__(self, other):
        if isinstance(other, str):
135
            return str(self) + other
136
137
138
139
140
        else:
            return StringView(str(self) + str(other))

    def __radd__(self, other):
        if isinstance(other, str):
141
            return other + str(self)
142
143
144
145
146
147
148
        else:
            return StringView(str(other) + str(self))

    def __getitem__(self, index):
        # assert isinstance(index, slice), "As of now, StringView only allows slicing."
        # assert index.step is None or index.step == 1, \
        #     "Step sizes other than 1 are not yet supported by StringView"
149
150
151
152
153
        try:
            start, stop = real_indices(index.start, index.stop, self.len)
            return StringView(self.text, self.begin + start, self.begin + stop)
        except AttributeError:
            return self.text[self.begin + index]
154

155
156
157
158
159
    def count(self, sub: str, start=None, end=None) -> int:
        """Returns the number of non-overlapping occurrences of substring
        `sub` in StringView S[start:end].  Optional arguments start and end
        are interpreted as in slice notation.
        """
160
161
        if self.fullstring:
            return self.fullstring.count(sub, start, end)
162
163
164
165
166
167
        elif start is None and end is None:
            return self.text.count(sub, self.begin, self.end)
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.count(sub, self.begin + start, self.begin + end)

168
169
170
171
172
173
    def find(self, sub: str, start=None, end=None) -> int:
        """Returns the lowest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
174
175
        if self.fullstring:
            return self.fullstring.find(sub, start, end)
176
177
178
179
180
181
        elif start is None and end is None:
            return self.text.find(sub, self.begin, self.end) - self.begin
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.find(sub, self.begin + start, self.begin + end) - self.begin

182
183
184
185
186
187
    def rfind(self, sub: str, start=None, end=None) -> int:
        """Returns the highest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
188
189
        if self.fullstring:
            return self.fullstring.rfind(sub, start, end)
190
191
192
193
194
195
        if start is None and end is None:
            return self.text.rfind(sub, self.begin, self.end) - self.begin
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.rfind(sub, self.begin + start, self.begin + end) - self.begin

196
197
198
199
200
201
202
203
204
    def startswith(self,
                   prefix: Union[str, Tuple[str, ...]],
                   start: int = 0,
                   end: Optional[int] = None) -> bool:
        """Return True if S starts with the specified prefix, False otherwise.
        With optional `start`, test S beginning at that position.
        With optional `end`, stop comparing S at that position.
        prefix can also be a tuple of strings to try.
        """
205
206
207
208
        start += self.begin
        end = self.end if end is None else self.begin + end
        return self.text.startswith(prefix, start, end)

209
    def match(self, regex, flags=0):
210
        """Executes `regex.match` on the StringView object and returns the
eckhart's avatar
eckhart committed
211
212
213
        result, which is either a match-object or None. Keep in mind that
        match.end(), match.span() etc. are mapped to the underlying text,
        not the StringView-object!!!
214
        """
215
216
217
        return regex.match(self.text, pos=self.begin, endpos=self.end)

    def index(self, absolute_index: int) -> int:
eckhart's avatar
eckhart committed
218
219
220
221
222
223
224
225
226
227
        """Converts an index for a string watched by a StringView object
        to an index relative to the string view object, e.g.::

            >>> import re
            >>> sv = StringView('xxIxx')[2:3]
            >>> match = sv.match(re.compile('I'))
            >>> match.end()
            3
            >>> sv.index(match.end())
            1
228
229
230
231
232
233
234
235
236
237
        """
        return absolute_index - self.begin

    def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]:
        """Converts indices for a string watched by a StringView object
        to indices relative to the string view object. See also: `sv_index()`
        """
        return tuple(index - self.begin for index in absolute_indices)

    def search(self, regex):
238
        """Executes regex.search on the StringView object and returns the
eckhart's avatar
eckhart committed
239
240
241
        result, which is either a match-object or None. Keep in mind that
        match.end(), match.span() etc. are mapped to the underlying text,
        not the StringView-object!!!
242
        """
243
244
        return regex.search(self.text, pos=self.begin, endpos=self.end)

245
246
    def finditer(self, regex):
        """Executes regex.finditer on the StringView object and returns the
eckhart's avatar
eckhart committed
247
248
        iterator of match objects. Keep in mind that match.end(), match.span()
        etc. are mapped to the underlying text, not the StringView-object!!!
249
250
251
        """
        return regex.finditer(self.text, pos=self.begin, endpos=self.end)

252
    def strip(self):
253
254
255
        """Returns a copy of the StringView `self` with leading and trailing
        whitespace removed.
        """
256
257
258
259
260
261
262
263
264
265
266
267
268
        begin = first_char(self.text, self.begin, self.end) - self.begin
        end = last_char(self.text, self.begin, self.end) - self.begin
        return self if begin == 0 and end == self.len else self[begin:end]

    def lstrip(self):
        """Returns a copy of `self` with leading whitespace removed."""
        begin = first_char(self.text, self.begin, self.end) - self.begin
        return self if begin == 0 else self[begin:]

    def rstrip(self):
        """Returns a copy of `self` with trailing whitespace removed."""
        end = last_char(self.text, self.begin, self.end) - self.begin
        return self if end == self.len else self[:end]
269
270

    def split(self, sep=None):
271
272
273
274
275
        """Returns a list of the words in `self`, using `sep` as the
        delimiter string.  If `sep` is not specified or is None, any
        whitespace string is a separator and empty strings are
        removed from the result.
        """
276
277
        if self.fullstring:
            return self.fullstring.split(sep)
278
279
280
281
282
283
        else:
            pieces = []
            l = len(sep)
            k = 0
            i = self.find(sep, k)
            while i >= 0:
284
                pieces.append(self.text[self.begin + k: self.begin + i])
285
286
                k = i + l
                i = self.find(sep, k)
287
            pieces.append(self.text[self.begin + k: self.end])
288
289
            return pieces

290
291
292
293
    def replace(self, old, new):
        """Returns a string where `old` is replaced by `new`."""
        return str(self).replace(old, new)

294

295
EMPTY_STRING_VIEW = StringView('')