Commit f27dba11 authored by Eckhart Arnold's avatar Eckhart Arnold

- cstringview finished (some (enough?) optimizations)

parent 802fcc4a
"""cstringview.pyx - a cython-version of the stringview class for speedup
slicing strings without copying
Copyright 2016 by Eckhart Arnold (arnold@badw.de)
Bavarian Academy of Sciences and Humanities (badw.de)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
StringView provides string-slicing without copying.
Slicing Python-strings always yields copies of a segment of the original
string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
However, this becomes costly (in terms of space and as a consequence also
time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.
"""
import collections
from typing import Optional, Iterable, Tuple
__all__ = ('StringView', 'EMPTY_STRING_VIEW')
# Pair of C-int indices as returned by real_indices(). The values are
# used as slice bounds: `begin` is inclusive, `end` is exclusive.
cdef struct Range:
    int begin
    int end
cdef inline int pack_index(int index, int len):
    # Normalise a slice-style index: negative values count from the end
    # of a sequence of length `len`; the result is clamped to [0, len].
    if index < 0:
        index += len
    if index < 0:
        return 0
    if index > len:
        return len
    return index
cdef Range real_indices(begin, end, int len):
    # Translate slice-style bounds (either may be None or negative) into
    # a Range of clamped, non-negative indices for a sequence of length
    # `len`. None stands for 0 (begin) or `len` (end).
    cdef Range bounds
    bounds.begin = pack_index(0 if begin is None else begin, len)
    bounds.end = pack_index(len if end is None else end, len)
    return bounds
class StringView(collections.abc.Sized):
    """
    A rudimentary StringView class, just enough for the use cases
    in parser.py. The difference between a StringView and the python
    builtin strings is that StringView-objects do slicing without
    copying, i.e. slices are just a view on a section of the sliced
    string.

    Attributes (see ``__slots__``):
        text:   the underlying string; replaced by the copied slice once
                ``str()`` has been called on the view
        begin:  index into ``text`` where the view starts
        end:    index into ``text`` where the view ends (exclusive)
        len:    number of characters covered by the view
        fullstring_flag:  True if the view covers all of ``text``
    """
    __slots__ = ['text', 'begin', 'end', 'len', 'fullstring_flag']

    def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
        """Creates a view on ``text[begin:end]`` without copying ``text``.

        ``begin`` and ``end`` follow slice conventions: ``None`` means
        "from the start" / "to the end", negative values count from the
        end of ``text`` and out-of-range values are clamped.
        """
        cdef Range r
        self.text = text  # type: str
        self.begin = 0  # type: int
        self.end = 0  # type: int
        r = real_indices(begin, end, len(text))
        self.begin = r.begin
        self.end = r.end
        self.len = max(self.end - self.begin, 0)
        self.fullstring_flag = (self.begin == 0 and self.len == len(self.text))

    def __bool__(self):
        """A view is true if it covers at least one character."""
        return self.end > self.begin  # and bool(self.text)

    def __len__(self):
        return self.len

    def __str__(self):
        """Returns the viewed section as a string.

        Side effect: unless the view already covers the whole of
        ``self.text``, the copied slice replaces ``self.text`` and the
        view is re-based onto it (``begin == 0``).
        """
        if self.fullstring_flag:  # optimization: avoid slicing/copying
            return self.text
        # since the slice is being copied now, anyway, the copy might
        # as well be stored in the string view
        self.text = self.text[self.begin:self.end]
        self.begin = 0
        self.len = len(self.text)
        self.end = self.len
        self.fullstring_flag = True
        return self.text

    def __eq__(self, other):
        """Compares the viewed text with ``str(other)``.

        Operands that have no length (e.g. ``None`` or numbers) are not
        considered equal instead of raising a TypeError.
        """
        # PERFORMANCE WARNING: This creates copies of the strings
        try:
            if len(other) != self.len:
                return False
        except TypeError:
            return NotImplemented
        return str(self) == str(other)

    def __hash__(self):
        # PERFORMANCE WARNING: This creates a copy of the string-slice
        return hash(str(self))

    def __add__(self, other):
        """Returns a ``str`` if ``other`` is a ``str``, else a StringView."""
        if isinstance(other, str):
            return (str(self) + other)
        else:
            return StringView(str(self) + str(other))

    def __radd__(self, other):
        """Returns a ``str`` if ``other`` is a ``str``, else a StringView."""
        if isinstance(other, str):
            return (other + str(self))
        else:
            return StringView(str(other) + str(self))

    def __getitem__(self, index):
        """Returns a new view on the slice ``index`` of this view.

        Only slice objects are supported; ``index.step`` is ignored.
        """
        # assert isinstance(index, slice), "As of now, StringView only allows slicing."
        # assert index.step is None or index.step == 1, \
        #     "Step sizes other than 1 are not yet supported by StringView"
        cdef Range r = real_indices(index.start, index.stop, self.len)
        return StringView(self.text, self.begin + r.begin, self.begin + r.end)

    def count(self, sub, start=None, end=None) -> int:
        """Like ``str.count``; ``start`` and ``end`` are relative to the view."""
        cdef Range r
        if self.fullstring_flag:
            return self.text.count(sub, start, end)
        elif start is None and end is None:
            return self.text.count(sub, self.begin, self.end)
        else:
            r = real_indices(start, end, self.len)
            return self.text.count(sub, self.begin + r.begin, self.begin + r.end)

    def find(self, sub, start=None, end=None) -> int:
        """Like ``str.find``; indices are relative to the view.

        Returns -1 if ``sub`` is not found.
        """
        cdef Range r
        if self.fullstring_flag:
            return self.text.find(sub, start, end)
        if start is None and end is None:
            pos = self.text.find(sub, self.begin, self.end)
        else:
            r = real_indices(start, end, self.len)
            pos = self.text.find(sub, self.begin + r.begin, self.begin + r.end)
        # only a real hit may be shifted; "not found" must stay -1
        return pos - self.begin if pos >= 0 else -1

    def rfind(self, sub, start=None, end=None) -> int:
        """Like ``str.rfind``; indices are relative to the view.

        Returns -1 if ``sub`` is not found.
        """
        cdef Range r
        if self.fullstring_flag:
            return self.text.rfind(sub, start, end)
        if start is None and end is None:
            pos = self.text.rfind(sub, self.begin, self.end)
        else:
            r = real_indices(start, end, self.len)
            pos = self.text.rfind(sub, self.begin + r.begin, self.begin + r.end)
        # only a real hit may be shifted; "not found" must stay -1
        return pos - self.begin if pos >= 0 else -1

    def startswith(self, prefix: str, start: int = 0, end: Optional[int] = None) -> bool:
        """Like ``str.startswith``; ``start`` and ``end`` are relative to
        the view and are clamped so that nothing outside the view is read.
        """
        cdef Range r = real_indices(start, end, self.len)
        return self.text.startswith(prefix, self.begin + r.begin, self.begin + r.end)

    def match(self, regex):
        """Matches ``regex`` at the beginning of the view. Positions of
        the returned match object refer to the underlying string, see
        ``index()``.
        """
        return regex.match(self.text, pos=self.begin, endpos=self.end)

    def index(self, absolute_index: int) -> int:
        """
        Converts an index for a string watched by a StringView object
        to an index relative to the string view object, e.g.:
        >>> import re
        >>> sv = StringView('xxIxx')[2:3]
        >>> match = sv.match(re.compile('I'))
        >>> match.end()
        3
        >>> sv.index(match.end())
        1
        """
        return absolute_index - self.begin

    def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]:
        """Converts indices for a string watched by a StringView object
        to indices relative to the string view object. See also: `index()`
        """
        return tuple(index - self.begin for index in absolute_indices)

    def search(self, regex):
        """Searches the view for ``regex``. Positions of the returned
        match object refer to the underlying string, see ``index()``.
        """
        return regex.search(self.text, pos=self.begin, endpos=self.end)

    def strip(self):
        """Returns the viewed text without leading and trailing
        whitespace as a ``str``.
        """
        if self.fullstring_flag:
            return self.text.strip()
        begin = self.begin
        end = self.end
        # str.isspace() keeps this branch consistent with str.strip() above
        while begin < end and self.text[begin].isspace():
            begin += 1
        # `end` is exclusive: the last character of the view is at end - 1
        while end > begin and self.text[end - 1].isspace():
            end -= 1
        return self.text[begin:end]

    def split(self, sep=None):
        """Like ``str.split(sep)``; returns a list of ``str`` objects.

        Raises:
            ValueError: if ``sep`` is the empty string.
        """
        if self.fullstring_flag:
            return self.text.split(sep)
        if sep is None:
            # splitting on runs of whitespace has special rules; leave it to str
            return self.text[self.begin:self.end].split()
        if not sep:
            raise ValueError('empty separator')
        pieces = []
        sep_len = len(sep)
        pos = self.begin
        hit = self.text.find(sep, pos, self.end)
        while hit >= 0:
            pieces.append(self.text[pos:hit])
            pos = hit + sep_len
            hit = self.text.find(sep, pos, self.end)
        pieces.append(self.text[pos:self.end])
        return pieces
# Module-level StringView on the empty string; exported via __all__.
EMPTY_STRING_VIEW = StringView('')
......@@ -29,12 +29,12 @@ from typing import Optional, Iterable, Tuple
__all__ = ('StringView', 'EMPTY_STRING_VIEW')
def pack_index(index, len):
cdef inline int pack_index(int index, int len):
index = index if index >= 0 else index + len
return 0 if index < 0 else len if index > len else index
def real_indices(begin, end, len):
cdef real_indices(begin, end, len):
if begin is None: begin = 0
if end is None: end = len
return pack_index(begin, len), pack_index(end, len)
......@@ -78,10 +78,10 @@ class StringView(collections.abc.Sized):
return self.text
def __eq__(self, other):
return str(self) == str(other) # PERFORMANCE WARNING: This creates copies of the strings
return len(other) == len(self) and str(self) == str(other) # PERFORMANCE WARNING: This creates copies of the strings
def __hash__(self):
return hash(str(self)) # PERFORMANCE WARNING: This creates a copy of the string
return hash(str(self)) # PERFORMANCE WARNING: This creates a copy of the string-slice
def __add__(self, other):
if isinstance(other, str):
......
......@@ -21,23 +21,17 @@ compilation of domain specific languages based on an EBNF-grammar.
import os
try:
import regex as re
except ImportError:
import re
try:
from typing import Any, cast, Tuple, Union, Iterator, Iterable
except ImportError:
from .typing34 import Any, cast, Tuple, Union, Iterator, Iterable
from DHParser.ebnf import EBNFCompiler, grammar_changed, \
get_ebnf_preprocessor, get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler, \
PreprocessorFactoryFunc, ParserFactoryFunc, TransformerFactoryFunc, CompilerFactoryFunc
from DHParser.toolkit import logging, load_if_file, is_python_code, compile_python_object
from DHParser.toolkit import logging, load_if_file, is_python_code, compile_python_object, \
re, typing
from DHParser.parser import Grammar, Compiler, compile_source, nil_preprocessor, PreprocessorFunc
from DHParser.syntaxtree import Node, TransformationFunc
from DHParser.error import Error, is_error, has_errors, only_errors
from typing import Any, cast, Tuple, Union, Iterator, Iterable
__all__ = ('GrammarError',
'CompilationError',
'load_compiler_suite',
......
......@@ -20,16 +20,7 @@ import keyword
from collections import OrderedDict
from functools import partial
try:
import regex as re
except ImportError:
import re
try:
from typing import Callable, Dict, List, Set, Tuple, Union
except ImportError:
from .typing34 import Callable, Dict, List, Set, Tuple, Union
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name
from DHParser.toolkit import load_if_file, escape_re, md5, sane_parser_name, re, typing
from DHParser.parser import Grammar, mixin_comment, nil_preprocessor, Forward, RegExp, RE, \
NegativeLookahead, Alternative, Series, Option, OneOrMore, ZeroOrMore, Token, \
Required, Compiler, PreprocessorFunc
......@@ -40,6 +31,8 @@ from DHParser.transform import traverse, remove_brackets, \
remove_tokens, flatten, forbid, assert_content, remove_infix_operator
from DHParser.versionnumber import __version__
from typing import Callable, Dict, List, Set, Tuple, Union
__all__ = ('get_ebnf_preprocessor',
'get_ebnf_grammar',
'get_ebnf_transformer',
......
......@@ -61,29 +61,13 @@ import copy
import os
from functools import partial
try:
import regex as re
except ImportError:
import re
try:
from typing import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union, Optional
# try:
# from typing import Collection
# except ImportError:
# pass
except ImportError:
from .typing34 import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union, Optional
from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name
from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name, load_if_file, \
re, typing
from DHParser.stringview import StringView, EMPTY_STRING_VIEW
from DHParser.syntaxtree import Node, TransformationFunc, ParserBase, WHITESPACE_PTYPE, TOKEN_PTYPE, \
ZOMBIE_PARSER
from DHParser.error import Error, is_error, has_errors, linebreaks, line_col
from DHParser.toolkit import load_if_file
try:
import pyximport; pyximport.install()
from DHParser.cstringview import StringView, EMPTY_STRING_VIEW
except ImportError:
from DHParser.stringview import StringView, EMPTY_STRING_VIEW
from typing import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union, Optional
__all__ = ('PreprocessorFunc',
'HistoryRecord',
......
"""cstringview.pyx - a cython-version of the stringview class for speedup
slicing strings without copying
Copyright 2016 by Eckhart Arnold (arnold@badw.de)
Bavarian Academy of Sciences and Humanities (badw.de)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
StringView provides string-slicing without copying.
Slicing Python-strings always yields copies of a segment of the original
string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html
However, this becomes costly (in terms of space and as a consequence also
time) when parsing longer documents. Unfortunately, Python's `memoryview`
does not work for unicode strings. Hence, the StringView class.
"""
import collections
from typing import Optional, Iterable, Tuple
__all__ = ('StringView', 'EMPTY_STRING_VIEW')
def pack_index(index, len):
    """Normalise a slice-style index for a sequence of length ``len``.

    Negative values count from the end of the sequence; the result is
    clamped to the range ``0 .. len``.
    """
    if index < 0:
        index += len
    if index < 0:
        return 0
    if index > len:
        return len
    return index
def real_indices(begin, end, len):
    """Translate slice-style bounds into a ``(begin, end)`` tuple of
    clamped, non-negative indices for a sequence of length ``len``.

    ``None`` stands for ``0`` (begin) or ``len`` (end).
    """
    first = 0 if begin is None else begin
    last = len if end is None else end
    return pack_index(first, len), pack_index(last, len)
class StringView(collections.abc.Sized):
    """
    A rudimentary StringView class, just enough for the use cases
    in parser.py. The difference between a StringView and the python
    builtin strings is that StringView-objects do slicing without
    copying, i.e. slices are just a view on a section of the sliced
    string.

    Attributes (see ``__slots__``):
        text:   the underlying string; replaced by the copied slice once
                ``str()`` has been called on the view
        begin:  index into ``text`` where the view starts
        end:    index into ``text`` where the view ends (exclusive)
        len:    number of characters covered by the view
        fullstring_flag:  True if the view covers all of ``text``
    """
    __slots__ = ['text', 'begin', 'end', 'len', 'fullstring_flag']

    def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
        """Creates a view on ``text[begin:end]`` without copying ``text``.

        ``begin`` and ``end`` follow slice conventions: ``None`` means
        "from the start" / "to the end", negative values count from the
        end of ``text`` and out-of-range values are clamped.
        """
        self.text = text  # type: str
        self.begin = 0  # type: int
        self.end = 0  # type: int
        self.begin, self.end = real_indices(begin, end, len(text))
        self.len = max(self.end - self.begin, 0)
        self.fullstring_flag = (self.begin == 0 and self.len == len(self.text))

    def __bool__(self):
        """A view is true if it covers at least one character."""
        return self.end > self.begin  # and bool(self.text)

    def __len__(self):
        return self.len

    def __str__(self):
        """Returns the viewed section as a string.

        Side effect: unless the view already covers the whole of
        ``self.text``, the copied slice replaces ``self.text`` and the
        view is re-based onto it (``begin == 0``).
        """
        if self.fullstring_flag:  # optimization: avoid slicing/copying
            return self.text
        # since the slice is being copied now, anyway, the copy might
        # as well be stored in the string view
        self.text = self.text[self.begin:self.end]
        self.begin = 0
        self.len = len(self.text)
        self.end = self.len
        self.fullstring_flag = True
        return self.text

    def __eq__(self, other):
        """Compares the viewed text with ``str(other)``.

        Operands that have no length (e.g. ``None`` or numbers) are not
        considered equal instead of raising a TypeError.
        """
        # PERFORMANCE WARNING: This creates copies of the strings
        try:
            if len(other) != self.len:
                return False
        except TypeError:
            return NotImplemented
        return str(self) == str(other)

    def __hash__(self):
        # PERFORMANCE WARNING: This creates a copy of the string-slice
        return hash(str(self))

    def __add__(self, other):
        """Returns a ``str`` if ``other`` is a ``str``, else a StringView."""
        if isinstance(other, str):
            return (str(self) + other)
        else:
            return StringView(str(self) + str(other))

    def __radd__(self, other):
        """Returns a ``str`` if ``other`` is a ``str``, else a StringView."""
        if isinstance(other, str):
            return (other + str(self))
        else:
            return StringView(str(other) + str(self))

    def __getitem__(self, index):
        """Returns a new view on the slice ``index`` of this view.

        Only slice objects are supported; ``index.step`` is ignored.
        """
        # assert isinstance(index, slice), "As of now, StringView only allows slicing."
        # assert index.step is None or index.step == 1, \
        #     "Step sizes other than 1 are not yet supported by StringView"
        start, stop = real_indices(index.start, index.stop, self.len)
        return StringView(self.text, self.begin + start, self.begin + stop)

    def count(self, sub, start=None, end=None) -> int:
        """Like ``str.count``; ``start`` and ``end`` are relative to the view."""
        if self.fullstring_flag:
            return self.text.count(sub, start, end)
        elif start is None and end is None:
            return self.text.count(sub, self.begin, self.end)
        else:
            start, end = real_indices(start, end, self.len)
            return self.text.count(sub, self.begin + start, self.begin + end)

    def find(self, sub, start=None, end=None) -> int:
        """Like ``str.find``; indices are relative to the view.

        Returns -1 if ``sub`` is not found.
        """
        if self.fullstring_flag:
            return self.text.find(sub, start, end)
        if start is None and end is None:
            pos = self.text.find(sub, self.begin, self.end)
        else:
            start, end = real_indices(start, end, self.len)
            pos = self.text.find(sub, self.begin + start, self.begin + end)
        # only a real hit may be shifted; "not found" must stay -1
        return pos - self.begin if pos >= 0 else -1

    def rfind(self, sub, start=None, end=None) -> int:
        """Like ``str.rfind``; indices are relative to the view.

        Returns -1 if ``sub`` is not found.
        """
        if self.fullstring_flag:
            return self.text.rfind(sub, start, end)
        if start is None and end is None:
            pos = self.text.rfind(sub, self.begin, self.end)
        else:
            start, end = real_indices(start, end, self.len)
            pos = self.text.rfind(sub, self.begin + start, self.begin + end)
        # only a real hit may be shifted; "not found" must stay -1
        return pos - self.begin if pos >= 0 else -1

    def startswith(self, prefix: str, start: int = 0, end: Optional[int] = None) -> bool:
        """Like ``str.startswith``; ``start`` and ``end`` are relative to
        the view and are clamped so that nothing outside the view is read.
        """
        start, end = real_indices(start, end, self.len)
        return self.text.startswith(prefix, self.begin + start, self.begin + end)

    def match(self, regex):
        """Matches ``regex`` at the beginning of the view. Positions of
        the returned match object refer to the underlying string, see
        ``index()``.
        """
        return regex.match(self.text, pos=self.begin, endpos=self.end)

    def index(self, absolute_index: int) -> int:
        """
        Converts an index for a string watched by a StringView object
        to an index relative to the string view object, e.g.:
        >>> import re
        >>> sv = StringView('xxIxx')[2:3]
        >>> match = sv.match(re.compile('I'))
        >>> match.end()
        3
        >>> sv.index(match.end())
        1
        """
        return absolute_index - self.begin

    def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]:
        """Converts indices for a string watched by a StringView object
        to indices relative to the string view object. See also: `index()`
        """
        return tuple(index - self.begin for index in absolute_indices)

    def search(self, regex):
        """Searches the view for ``regex``. Positions of the returned
        match object refer to the underlying string, see ``index()``.
        """
        return regex.search(self.text, pos=self.begin, endpos=self.end)

    def strip(self):
        """Returns the viewed text without leading and trailing
        whitespace as a ``str``.
        """
        if self.fullstring_flag:
            return self.text.strip()
        begin = self.begin
        end = self.end
        # str.isspace() keeps this branch consistent with str.strip() above
        while begin < end and self.text[begin].isspace():
            begin += 1
        # `end` is exclusive: the last character of the view is at end - 1
        while end > begin and self.text[end - 1].isspace():
            end -= 1
        return self.text[begin:end]

    def split(self, sep=None):
        """Like ``str.split(sep)``; returns a list of ``str`` objects.

        Raises:
            ValueError: if ``sep`` is the empty string.
        """
        if self.fullstring_flag:
            return self.text.split(sep)
        if sep is None:
            # splitting on runs of whitespace has special rules; leave it to str
            return self.text[self.begin:self.end].split()
        if not sep:
            raise ValueError('empty separator')
        pieces = []
        sep_len = len(sep)
        pos = self.begin
        hit = self.text.find(sep, pos, self.end)
        while hit >= 0:
            pieces.append(self.text[pos:hit])
            pos = hit + sep_len
            hit = self.text.find(sep, pos, self.end)
        pieces.append(self.text[pos:self.end])
        return pieces
# Module-level StringView on the empty string; exported via __all__.
EMPTY_STRING_VIEW = StringView('')
"""stringview.py - a stringview class: slicing strings without copying
(This module merely passes through the Python or Cython version of
string views. The real implementations are to be found in the
pstringview.py and cstringview.pyx modules, respectively.)
Copyright 2016 by Eckhart Arnold (arnold@badw.de)
Bavarian Academy of Sciences and Humanities (badw.de)
......@@ -27,114 +30,9 @@ from typing import Optional, Iterable, Tuple
__all__ = ('StringView', 'EMPTY_STRING_VIEW')
try:
import pyximport; pyximport.install()
from DHParser.cstringview import StringView, EMPTY_STRING_VIEW
except ImportError:
from DHParser.pstringview import StringView, EMPTY_STRING_VIEW
def pack_index(index, len):
    """Return ``index`` as a non-negative position within ``0 .. len``.

    A negative ``index`` is interpreted relative to the end of a
    sequence of length ``len``; values outside the range are clamped.
    """
    shifted = index + len if index < 0 else index
    if shifted < 0:
        return 0
    return len if shifted > len else shifted
def real_indices(begin, end, len):
    """Return the pair of clamped, non-negative indices that the
    slice bounds ``begin`` and ``end`` denote for a sequence of length
    ``len``. A bound of ``None`` means 0 for ``begin`` and ``len`` for
    ``end``.
    """
    lower = pack_index(0 if begin is None else begin, len)
    upper = pack_index(len if end is None else end, len)
    return lower, upper
class StringView(collections.abc.Sized):
""""
A rudimentary StringView class, just enough for the use cases
in parser.py. The difference between a StringView and the python
builtin strings is that StringView-objects do slicing without
copying, i.e. slices are just a view on a section of the sliced
string.
"""
__slots__ = ['text', 'begin', 'end', 'len', 'fullstring_flag']
def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None:
self.text = text # type: str
self.begin = 0 # type: int
self.end = 0 # type: int
self.begin, self.end = real_indices(begin, end, len(text))
self.len = max(self.end - self.begin, 0)
self.fullstring_flag = (self.begin == 0 and self.len == len(self.text))
def __bool__(self):
return bool(self.text) and self.end > self.begin
def __len__(self):
return self.len
def __str__(self):
if self.fullstring_flag: # optimization: avoid slicing/copying
return self.text
return self.text[self.begin:self.end]
def __getitem__(self, index):
# assert isinstance(index, slice), "As of now, StringView only allows slicing."
# assert index.step is None or index.step == 1, \
# "Step sizes other than 1 are not yet supported by StringView"
start, stop = real_indices(index.start, index.stop, self.len)
return StringView(self.text, self.begin + start, self.begin + stop)
def __eq__(self, other):
return str(self) == str(other) # PERFORMANCE WARNING: This creates copies of the strings
def count(self, sub, start=None, end=None) -> int:
if self.fullstring_flag:
return self.text.count(sub, start, end)
elif start is None and end is None:
return self.text.count(sub, self.begin, self.end)
else:
start, end = real_indices(start, end, self.len)
return self.text.count(sub, self.begin + start, self.begin + end)
def find(self, sub, start=None, end=None) -> int:
if self.fullstring_flag:
return self.text.find(sub, start, end)
elif start is None and end is None:
return self.text.find(sub, self.begin, self.end) - self.begin
else:
start, end = real_indices(start, end, self.len)
return self.text.find(sub, self.begin + start, self.begin + end) - self.begin
def rfind(self, sub, start=None, end=None) -> int:
if self.fullstring_flag:
return self.text.rfind(sub, start, end)
if start is None and end is None:
return self.text.rfind(sub, self.begin, self.end) - self.begin
else:
start, end = real_indices(start, end, self.len)
return self.text.rfind(sub, self.begin + start, self.begin + end) - self.begin