stringview.py 20.2 KB
Newer Older
1
2
3
# stringview.py - a string class where slices are views not copies as
#                    with the standard Python strings.
#
4
# stringview.pxd - declarations for the cython Python to C compiler
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#                    to speed up handling of StringViews.
#
# Copyright 2016  by Eckhart Arnold (arnold@badw.de)
#                 Bavarian Academy of Sciences an Humanities (badw.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.  See the License for the specific language governing
# permissions and limitations under the License.
di68kap's avatar
di68kap committed
21

22
23
24
25
26
27
28
29
"""
StringView provides string-slicing without copying.
Slicing Python-strings always yields copies of a segment of the original
string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
However, this becomes costly (in terms of space and as a consequence also
time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.

di68kap's avatar
di68kap committed
30
It is recommended to compile this module with the Cython-compiler for
31
speedup. The modules comes with a ``stringview.pxd`` that contains some type
di68kap's avatar
di68kap committed
32
declarations to more fully exploit the benefits of the Cython-compiler.
33
"""
34

35
from typing import Optional, Union, Iterable, Tuple, List, Sequence, cast
36

37
38
39
40
41
try:
    import cython
    cython_optimized = cython.compiled  # type: bool
except ImportError:
    # import DHParser.Shadow as cython
42
    cython_optimized = False
43
    import DHParser.externallibs.shadow_cython as cython
44

45

46
__all__ = ('StringView', 'real_indices', 'EMPTY_STRING_VIEW', 'TextBuffer')
47
48


di68kap's avatar
di68kap committed
49
50
51
52
@cython.cfunc
@cython.returns(cython.int)
@cython.locals(begin=cython.int, end=cython.int)
def first_char(text: str, begin: int, end: int, chars: str) -> int:
53
54
55
    """Returns the index of the first non-whitespace character in string
     `text` within the bounds [begin, end].
    """
56
    while begin < end and text[begin] in chars:
57
58
59
60
        begin += 1
    return begin


di68kap's avatar
di68kap committed
61
62
63
64
@cython.cfunc
@cython.returns(cython.int)
@cython.locals(begin=cython.int, end=cython.int)
def last_char(text: str, begin: int, end: int, chars: str) -> int:
eckhart's avatar
typo    
eckhart committed
65
    """Returns the index of the last non-whitespace character in string
66
67
    `text` within the bounds [begin, end].
    """
68
    while end > begin and text[end - 1] in chars:
69
70
71
72
        end -= 1
    return end


di68kap's avatar
di68kap committed
73
74
75
@cython.cfunc
@cython.returns(cython.int)
@cython.locals(index=cython.int, length=cython.int)
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def pack_index(index: int, length: int) -> int:
    """Transforms `index` into a positive index counting from the beginning
    of the string, capping it at the boundaries [0, len].
    Examples:
    >>> pack_index(-1, 5)
    4
    >>> pack_index(6, 5)
    5
    >>> pack_index(-7, 5)
    0
    """
    # assert length >= 0
    index = index if index >= 0 else index + length
    return 0 if index < 0 else length if index > length else index
90
91


92
93
@cython.cfunc
@cython.returns((cython.int, cython.int))
di68kap's avatar
di68kap committed
94
@cython.locals(cbegin=cython.int, cend=cython.int, length=cython.int)
95
96
97
def fast_real_indices(begin: Optional[int],
                      end: Optional[int],
                      length: int) -> Tuple[int, int]:
98
99
100
    """Returns the tuple of real (i.e. positive) indices from the slice
    indices `begin`,  `end`, assuming a string of size `length`.
    """
di68kap's avatar
di68kap committed
101
102
    cbegin = 0 if begin is None else begin
    # cbegin = begin if begin else 0
103
104
    cend = length if end is None else end
    return pack_index(cbegin, length), pack_index(cend, length)
105
106


107
108
109
def real_indices(begin: Optional[int],
                 end: Optional[int],
                 length: int) -> Tuple[int, int]:
110
    """Python callable real-indices function for testing."""
111
    return fast_real_indices(begin, end, length)
112
113


114
class StringView:  # collections.abc.Sized
115
116
    """
    A rudimentary StringView class, just enough for the use cases
117
    in parse.py. The difference between a StringView and the python
118
119
120
121
    builtin strings is that StringView-objects do slicing without
    copying, i.e. slices are just a view on a section of the sliced
    string.
    """
122
    __slots__ = ['_text', '_begin', '_end', '_len', '_fullstring']
123

di68kap's avatar
di68kap committed
124
    @cython.locals(_begin=cython.int, _end=cython.int, _len=cython.int)
125
    def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
126
        # assert isinstance(text, str)
di68kap's avatar
di68kap committed
127
128
129
130
131
132
        # self._text = text  # type: str
        # self._begin, self._end = fast_real_indices(begin, end, len(text))
        # self._len = self._end - self._begin  # type: int
        # if self._len < 0:
        #     self._len = 0
        # self._fullstring = ''  # type: str
133
        self._text = text  # type: str
di68kap's avatar
di68kap committed
134
135
136
137
138
139
140
        _begin, _end = fast_real_indices(begin, end, len(text))
        _len = _end - _begin  # type: int
        if _len < 0:
            _len = 0
        self._begin = _begin   # type: int
        self._end = _end       # type: int
        self._len = _len       # type: int
Eckhart Arnold's avatar
Eckhart Arnold committed
141
        self._fullstring = ''  # type: str
142

143
    def __bool__(self) -> bool:
di68kap's avatar
di68kap committed
144
        return self._len != 0  # self._end > self._begin  # and bool(self.text)
145

146
    def __len__(self) -> int:
147
        return self._len
148

149
    def __str__(self) -> str:
150
        """PERFORMANCE WARNING: This creates a copy of the string-slice!"""
di68kap's avatar
di68kap committed
151
152
153
        _fullstring = self._fullstring  # type: str
        if _fullstring or self._len == 0:  # optimization: avoid slicing/copying
            return _fullstring
154
        # since the slice is being copied now, anyway, the copy might
155
        # as well be stored in the string view
di68kap's avatar
di68kap committed
156
        # return self.text[self.begin:self.end]  # use this for debugging!
di68kap's avatar
di68kap committed
157
158
159
        _fullstring = self._text[self._begin:self._end]
        self._fullstring = _fullstring
        return _fullstring
160

161
    def __repr__(self) -> str:
162
        """PERFORMANCE WARNING: This creates a copy of the string-slice!"""
163
164
        return repr(str(self))

di68kap's avatar
di68kap committed
165
    @cython.locals(_len=cython.int)
166
    def __eq__(self, other) -> bool:
167
        """PERFORMANCE WARNING: This create copies of the compared string-slices!"""
168
        # one string copy could be avoided by using find...
di68kap's avatar
di68kap committed
169
170
171
172
173
        # return len(other) == self._len and str(self) == str(other)
        _len = self._len
        if len(other) == _len:
            if _len == 0:
                return True
174
175
176
            if isinstance(other, StringView) \
                    and self._text is other._text and self._begin == other._begin:
                return True
di68kap's avatar
di68kap committed
177
178
179
180
181
182
183
            _fullstring = self._fullstring  # type: str
            if _fullstring:
                return _fullstring == str(other)
            _fullstring = self._text[self._begin:self._end]
            self._fullstring = _fullstring
            return _fullstring == str(other)
        return False
184

185
    def __hash__(self) -> int:
186
        """PERFORMANCE WARNING: This creates a copy of the string-slice!"""
187
        return hash(str(self))
188

189
    def __add__(self, other) -> Union[str, 'StringView']:
190
        if isinstance(other, str):
191
            return str(self) + other
192
193
194
        else:
            return StringView(str(self) + str(other))

195
    def __radd__(self, other) -> Union[str, 'StringView']:
196
        if isinstance(other, str):
197
            return other + str(self)
198
199
200
        else:
            return StringView(str(other) + str(self))

di68kap's avatar
di68kap committed
201
    @cython.locals(start=cython.int, stop=cython.int, _begin=cython.int, _index=cython.int)
202
    def __getitem__(self, index: Union[slice, int]) -> 'StringView':
203
204
205
        # assert isinstance(index, slice), "As of now, StringView only allows slicing."
        # assert index.step is None or index.step == 1, \
        #     "Step sizes other than 1 are not yet supported by StringView"
206
        try:
207
            start, stop = fast_real_indices(index.start, index.stop, self._len)
di68kap's avatar
di68kap committed
208
209
            _begin = self._begin  # type: int
            return StringView(self._text, _begin + start, _begin + stop)
210
        except AttributeError:
di68kap's avatar
di68kap committed
211
212
213
214
215
            _index = index  # type: int
            if _index >= self._len:
                raise IndexError("StringView index %i out of range 0 - %i" % (_index, self._len))
            _begin = self._begin
            return StringView(self._text, self._begin + _index, self._begin + _index + 1)
di68kap's avatar
di68kap committed
216
            # return self._text[self._begin + index] # leads to type errors
217
218
219
220

    def get_text(self) -> str:
        """Returns the underlying string."""
        return self._text
221

222
    @cython.locals(_start=cython.int, _end=cython.int)
223
    def count(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
224
225
226
227
        """Returns the number of non-overlapping occurrences of substring
        `sub` in StringView S[start:end].  Optional arguments start and end
        are interpreted as in slice notation.
        """
228
229
230
231
232
        if self._fullstring:
            if cython_optimized:
                return self._fullstring.count(sub, start or 0, self._len if end is None else end)
            else:
                return self._fullstring.count(sub, start, end)
233
        elif start is None and end is None:
234
            return self._text.count(sub, self._begin, self._end)
235
        else:
236
            _start, _end = fast_real_indices(start, end, self._len)
237
            return self._text.count(sub, self._begin + _start, self._begin + _end)
238

di68kap's avatar
di68kap committed
239
    @cython.locals(_start=cython.int, _end=cython.int, _begin=cython.int)
240
    def find(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
241
242
243
244
245
        """Returns the lowest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
246
247
248
249
250
        if self._fullstring:
            if cython_optimized:
                return self._fullstring.find(sub, start or 0, self._len if end is None else end)
            else:
                return self._fullstring.find(sub, start, end)
251
        elif start is None and end is None:
di68kap's avatar
di68kap committed
252
253
            _begin = self._begin  # type: int
            return max(self._text.find(sub, _begin, self._end) - _begin, -1)
254
        else:
di68kap's avatar
di68kap committed
255
            _begin = self._begin  # type: int
256
            _start, _end = fast_real_indices(start, end, self._len)
di68kap's avatar
di68kap committed
257
            return max(self._text.find(sub, self._begin + _start, _begin + _end) - _begin, -1)
258

di68kap's avatar
di68kap committed
259
    @cython.locals(_start=cython.int, _end=cython.int, _begin=cython.int)
260
    def rfind(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
261
262
263
264
265
        """Returns the highest index in S where substring `sub` is found,
        such that `sub` is contained within S[start:end].  Optional
        arguments `start` and `end` are interpreted as in slice notation.
        Returns -1 on failure.
        """
266
267
268
269
270
        if self._fullstring:
            if cython_optimized:
                return self._fullstring.rfind(sub, start or 0, self._len if end is None else end)
            else:
                return self._fullstring.rfind(sub, start, end)
271
        if start is None and end is None:
di68kap's avatar
di68kap committed
272
273
            _begin = self._begin  # type: int
            return max(self._text.rfind(sub, _begin, self._end) - _begin, -1)
274
        else:
di68kap's avatar
di68kap committed
275
            _begin = self._begin  # type: int
276
            _start, _end = fast_real_indices(start, end, self._len)
di68kap's avatar
di68kap committed
277
            return max(self._text.rfind(sub, _begin + _start, _begin + _end) - _begin, -1)
278

di68kap's avatar
di68kap committed
279
    @cython.locals(start=cython.int)
280
    def startswith(self,
281
                   prefix: str,
282
283
284
285
286
287
                   start: int = 0,
                   end: Optional[int] = None) -> bool:
        """Return True if S starts with the specified prefix, False otherwise.
        With optional `start`, test S beginning at that position.
        With optional `end`, stop comparing S at that position.
        """
288
289
290
        start += self._begin
        end = self._end if end is None else self._begin + end
        return self._text.startswith(prefix, start, end)
291

di68kap's avatar
di68kap committed
292
    @cython.locals(start=cython.int)
293
294
295
296
    def endswith(self,
                 suffix: str,
                 start: int = 0,
                 end: Optional[int] = None) -> bool:
297
        """Return True if S ends with the specified suffix, False otherwise.
298
299
300
        With optional `start`, test S beginning at that position.
        With optional `end`, stop comparing S at that position.
        """
301
302
303
        start += self._begin
        end = self._end if end is None else self._begin + end
        return self._text.endswith(suffix, start, end)
304

di68kap's avatar
di68kap committed
305
    @cython.locals(flags=cython.int)
306
    def match(self, regex, flags: int = 0):
307
        """Executes `regex.match` on the StringView object and returns the
eckhart's avatar
eckhart committed
308
309
310
        result, which is either a match-object or None. Keep in mind that
        match.end(), match.span() etc. are mapped to the underlying text,
        not the StringView-object!!!
311
        """
312
        return regex.match(self._text, pos=self._begin, endpos=self._end)
313

di68kap's avatar
di68kap committed
314
    @cython.locals(absolute_index=cython.int)
315
    def index(self, absolute_index: int) -> int:
eckhart's avatar
eckhart committed
316
317
318
319
320
321
322
323
324
325
        """Converts an index for a string watched by a StringView object
        to an index relative to the string view object, e.g.::

            >>> import re
            >>> sv = StringView('xxIxx')[2:3]
            >>> match = sv.match(re.compile('I'))
            >>> match.end()
            3
            >>> sv.index(match.end())
            1
326
        """
327
        return absolute_index - self._begin
328

di68kap's avatar
di68kap committed
329
    @cython.locals(_begin=cython.int)
330
331
332
333
    def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]:
        """Converts indices for a string watched by a StringView object
        to indices relative to the string view object. See also: `sv_index()`
        """
di68kap's avatar
di68kap committed
334
335
        _begin = self._begin  # type: int
        return tuple(index - _begin for index in absolute_indices)
336

di68kap's avatar
di68kap committed
337
    @cython.locals(_begin=cython.int)
338
    def search(self, regex, start: Optional[int] = None, end: Optional[int] = None):
339
        """Executes regex.search on the StringView object and returns the
eckhart's avatar
eckhart committed
340
341
342
        result, which is either a match-object or None. Keep in mind that
        match.end(), match.span() etc. are mapped to the underlying text,
        not the StringView-object!!!
343
        """
di68kap's avatar
di68kap committed
344
345
346
        _begin = self._begin  # type: int
        start = _begin if start is None else max(_begin, min(_begin + start, self._end))
        end = self._end if end is None else max(_begin, min(_begin + end, self._end))
347
        return regex.search(self._text, start, end)
348

349
350
    def finditer(self, regex):
        """Executes regex.finditer on the StringView object and returns the
eckhart's avatar
eckhart committed
351
352
        iterator of match objects. Keep in mind that match.end(), match.span()
        etc. are mapped to the underlying text, not the StringView-object!!!
353
        """
354
        return regex.finditer(self._text, pos=self._begin, endpos=self._end)
355

356
    @cython.locals(begin=cython.int, end=cython.int)
357
    def strip(self, chars: str = ' \n\r\t') -> 'StringView':
358
359
360
        """Returns a copy of the StringView `self` with leading and trailing
        whitespace removed.
        """
361
362
        begin = first_char(self._text, self._begin, self._end, chars) - self._begin
        end = last_char(self._text, self._begin, self._end, chars) - self._begin
363
        return self if begin == 0 and end == self._len else self[begin:end]
364

365
    @cython.locals(begin=cython.int)
Eckhart Arnold's avatar
Eckhart Arnold committed
366
    def lstrip(self, chars=' \n\t') -> 'StringView':
367
        """Returns a copy of `self` with leading whitespace removed."""
368
        begin = first_char(self._text, self._begin, self._end, chars) - self._begin
369
370
        return self if begin == 0 else self[begin:]

371
    @cython.locals(end=cython.int)
Eckhart Arnold's avatar
Eckhart Arnold committed
372
    def rstrip(self, chars=' \n\t') -> 'StringView':
373
        """Returns a copy of `self` with trailing whitespace removed."""
374
        end = last_char(self._text, self._begin, self._end, chars) - self._begin
375
        return self if end == self._len else self[:end]
376

377
    @cython.locals(length=cython.int, i=cython.int, k=cython.int)
378
    def split(self, sep=None) -> Sequence[Union['StringView', str]]:
379
380
381
382
383
        """Returns a list of the words in `self`, using `sep` as the
        delimiter string.  If `sep` is not specified or is None, any
        whitespace string is a separator and empty strings are
        removed from the result.
        """
384
385
        if self._fullstring:
            return self._fullstring.split(sep)
386
        else:
Eckhart Arnold's avatar
Eckhart Arnold committed
387
            pieces = []  # type: List[Union['StringView', str]]
eckhart's avatar
eckhart committed
388
            length = len(sep)
389
390
            k = 0
            i = self.find(sep, k)
391
392
393
394
395
            # while i >= 0:
            #     pieces.append(self._text[self._begin + k: self._begin + i])
            #     k = i + length
            #     i = self.find(sep, k)
            # pieces.append(self._text[self._begin + k: self._end])
396
            while i >= 0:
397
                pieces.append(self[k:i])
eckhart's avatar
eckhart committed
398
                k = i + length
399
                i = self.find(sep, k)
400
            pieces.append(self[k:])
401
402
            return pieces

Eckhart Arnold's avatar
Eckhart Arnold committed
403
    def replace(self, old, new) -> str:
404
405
406
        """Returns a string where `old` is replaced by `new`."""
        return str(self).replace(old, new)

407

408
EMPTY_STRING_VIEW = StringView('')
409
410
411
412
413
414
415
416
417
418
419
420
421


class TextBuffer:
    """TextBuffer class manages a copy of an edited text for a language
    server. The text can be changed  via incremental edits. TextBuffer
    keeps track of the state of the complete text at any point in time.
    It works line oriented and lines of text can be retrieved via
    indexing or slicing.
    """

    def __init__(self, text: Union[str, StringView], version: int = 0):
        self._text = text       # type: Union[str, StringView]
        self._buffer = []       # type: List[Union[str, StringView]]
422
        self.version = version if version >= 0 else 0  # type: int
423
424
425
426

    def _lazy_init(self):
        self._buffer = [line.strip('\r') for line in self._text.split('\n')]

427
428
    def __getitem__(self, index: Union[slice, int]) \
            -> Union[List[Union[str, StringView]], str, StringView]:
429
430
431
432
433
434
        if not self._buffer:
            self._lazy_init()
        return self._buffer.__getitem__(index)

    def __str__(self) -> str:
        if self._text:
435
436
            return str(self._text)
        return str(self.snapshot('\n'))
437

438
    @cython.locals(l1=cython.int, c1=cython.int, l2=cython.int, c2=cython.int)
439
440
    def update(self, l1: int, c1: int, l2: int, c2: int, replacement: Union[str, StringView]):
        """Replaces the text-range from line and column (l1, c1) to
441
        line and column (l2, c2) with the replacement-string.
442
443
444
445
446
447
448
449
450
        """
        if not self._buffer:
            self._lazy_init()
        lines = [line.strip('\r') for line in replacement.split('\n')]
        head = self._buffer[l1][:c1]
        tail = self._buffer[l2][c2:]
        lines[0] = head + lines[0]
        lines[-1] += tail
        self._buffer[l1:l2 + 1] = lines
451
        self._text = ''  # invalidate single-string copy
452
453
        self.version += 1

454
    @cython.locals(version=cython.int)
455
    def text_edits(self, edits: Union[list, dict], version: int = -1):
456
        """Incorporates the one or more text-edits or change-events into the text.
457
        A Text-Edit is a dictionary of this form::
458
459

            {"range": {"start": {"line": 0, "character": 0 },
460
                       "end":   {"line": 0, "character": 0 } },
461
462
463
464
465
466
             "newText": "..."}

        In case of a change-event, the key "newText" is replaced by "text".
        """
        def edit(ed: dict):
            """Weaves a single edit into the text-buffer."""
467
468
469
            rng = ed["range"]
            start = rng["start"]
            end = rng["end"]
470
471
472
473
            try:
                replacement = ed['text']
            except KeyError:
                replacement = ed['newText']
474
475
            self.update(start["line"], start["character"],
                        end["line"], end["character"], replacement)
476
477
478
479
480
481
482
483
484

        if isinstance(edits, list):
            for ed in edits:
                edit(ed)
        else:
            edit(cast(dict, edits))
        if version >= 0:
            self.version = version

485
    def snapshot(self, eol: str = '\n') -> Union[str, StringView]:
486
        """Returns the current state of the entire text, using the given
487
        end of line marker (``\\n`` or ``\\r\\n``)"""
488
489
490
491
        if self._text:
            return self._text
        self._text = eol.join(str(line) for line in self._buffer)
        return self._text