From 802fcc4aa76b8c67a7d11c631b0393f34710ad7e Mon Sep 17 00:00:00 2001 From: Eckhart Arnold Date: Sun, 24 Sep 2017 16:14:04 +0200 Subject: [PATCH] - cstringview basic implementation (no optimizations yet) --- DHParser/cstringview.pyx | 192 +++++++++++++++++++++++++++++++++++++++ DHParser/parser.py | 6 +- DHParser/syntaxtree.py | 15 ++- setup.py | 6 ++ test/test_cstringview.py | 133 +++++++++++++++++++++++++++ test/test_parser.py | 6 +- 6 files changed, 352 insertions(+), 6 deletions(-) create mode 100644 DHParser/cstringview.pyx create mode 100644 test/test_cstringview.py diff --git a/DHParser/cstringview.pyx b/DHParser/cstringview.pyx new file mode 100644 index 0000000..8c9a57f --- /dev/null +++ b/DHParser/cstringview.pyx @@ -0,0 +1,192 @@ +"""cstringview.pyx - a cython-version of the stringview class for speedup + slicing strings without copying + +Copyright 2016 by Eckhart Arnold (arnold@badw.de) + Bavarian Academy of Sciences and Humanities (badw.de) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied. See the License for the specific language governing +permissions and limitations under the License. + +StringView provides string-slicing without copying. +Slicing Python-strings always yields copies of a segment of the original +string. See: https://mail.python.org/pipermail/python-dev/2008-May/079699.html +However, this becomes costly (in terms of space and as a consequence also +time) when parsing longer documents. Unfortunately, Python's `memoryview` +does not work for unicode strings. Hence, the StringView class. 
+""" +import collections +from typing import Optional, Iterable, Tuple + +__all__ = ('StringView', 'EMPTY_STRING_VIEW') + + +def pack_index(index, len): + index = index if index >= 0 else index + len + return 0 if index < 0 else len if index > len else index + + +def real_indices(begin, end, len): + if begin is None: begin = 0 + if end is None: end = len + return pack_index(begin, len), pack_index(end, len) + + +class StringView(collections.abc.Sized): + """" + A rudimentary StringView class, just enough for the use cases + in parser.py. The difference between a StringView and the python + builtin strings is that StringView-objects do slicing without + copying, i.e. slices are just a view on a section of the sliced + string. + """ + + __slots__ = ['text', 'begin', 'end', 'len', 'fullstring_flag'] + + def __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None: + self.text = text # type: str + self.begin = 0 # type: int + self.end = 0 # type: int + self.begin, self.end = real_indices(begin, end, len(text)) + self.len = max(self.end - self.begin, 0) + self.fullstring_flag = (self.begin == 0 and self.len == len(self.text)) + + def __bool__(self): + return self.end > self.begin # and bool(self.text) + + def __len__(self): + return self.len + + def __str__(self): + if self.fullstring_flag: # optimization: avoid slicing/copying + return self.text + # since the slice is being copyied now, anyway, the copy might + # as well be stored in the string view + self.text = self.text[self.begin:self.end] + self.begin = 0 + self.len = len(self.text) + self.end = self.len + self.fullstring_flag = True + return self.text + + def __eq__(self, other): + return str(self) == str(other) # PERFORMANCE WARNING: This creates copies of the strings + + def __hash__(self): + return hash(str(self)) # PERFORMANCE WARNING: This creates a copy of the string + + def __add__(self, other): + if isinstance(other, str): + return (str(self) + other) + else: + return 
StringView(str(self) + str(other)) + + def __radd__(self, other): + if isinstance(other, str): + return (other + str(self)) + else: + return StringView(str(other) + str(self)) + + def __getitem__(self, index): + # assert isinstance(index, slice), "As of now, StringView only allows slicing." + # assert index.step is None or index.step == 1, \ + # "Step sizes other than 1 are not yet supported by StringView" + start, stop = real_indices(index.start, index.stop, self.len) + return StringView(self.text, self.begin + start, self.begin + stop) + + def count(self, sub, start=None, end=None) -> int: + if self.fullstring_flag: + return self.text.count(sub, start, end) + elif start is None and end is None: + return self.text.count(sub, self.begin, self.end) + else: + start, end = real_indices(start, end, self.len) + return self.text.count(sub, self.begin + start, self.begin + end) + + def find(self, sub, start=None, end=None) -> int: + if self.fullstring_flag: + return self.text.find(sub, start, end) + elif start is None and end is None: + return self.text.find(sub, self.begin, self.end) - self.begin + else: + start, end = real_indices(start, end, self.len) + return self.text.find(sub, self.begin + start, self.begin + end) - self.begin + + def rfind(self, sub, start=None, end=None) -> int: + if self.fullstring_flag: + return self.text.rfind(sub, start, end) + if start is None and end is None: + return self.text.rfind(sub, self.begin, self.end) - self.begin + else: + start, end = real_indices(start, end, self.len) + return self.text.rfind(sub, self.begin + start, self.begin + end) - self.begin + + def startswith(self, prefix: str, start: int = 0, end: Optional[int] = None) -> bool: + start += self.begin + end = self.end if end is None else self.begin + end + return self.text.startswith(prefix, start, end) + + def match(self, regex): + return regex.match(self.text, pos=self.begin, endpos=self.end) + + def index(self, absolute_index: int) -> int: + """ + Converts an index for a 
string watched by a StringView object + to an index relative to the string view object, e.g.: + >>> sv = StringView('xxIxx')[2:3] + >>> match = sv.match(re.compile('I')) + >>> match.end() + 3 + >>> sv.index(match.end()) + 1 + """ + return absolute_index - self.begin + + def indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]: + """Converts indices for a string watched by a StringView object + to indices relative to the string view object. See also: `sv_index()` + """ + return tuple(index - self.begin for index in absolute_indices) + + def search(self, regex): + return regex.search(self.text, pos=self.begin, endpos=self.end) + + def strip(self): + if self.fullstring_flag: + return self.text.strip() + else: + begin = self.begin + end = self.end + while begin < end and self.text[begin] in ' \n\t': + begin += 1 + while end > begin and self.text[end] in ' \n\t': + end -= 1 + return self.text[begin:end] + # return str(self).strip() # PERFORMANCE WARNING: This creates a copy of the string + + def split(self, sep=None): + if self.fullstring_flag: + return self.text.split(sep) + else: + pieces = [] + l = len(sep) + k = 0 + i = self.find(sep, k) + while i >= 0: + pieces.append(self.text[self.begin + k : self.begin + i]) + k = i + l + i = self.find(sep, k) + pieces.append(self.text[self.begin + k : self.end]) + return pieces + # return str(self).split(sep, maxsplit) # PERFORMANCE WARNING: This creates a copy of the string + + +EMPTY_STRING_VIEW = StringView('') diff --git a/DHParser/parser.py b/DHParser/parser.py index fdc6620..0af7834 100644 --- a/DHParser/parser.py +++ b/DHParser/parser.py @@ -75,11 +75,15 @@ except ImportError: from .typing34 import Any, Callable, cast, Dict, Iterator, List, Set, Tuple, Union, Optional from DHParser.toolkit import is_logging, log_dir, logfile_basename, escape_re, sane_parser_name -from DHParser.stringview import StringView, EMPTY_STRING_VIEW from DHParser.syntaxtree import Node, TransformationFunc, ParserBase, 
WHITESPACE_PTYPE, TOKEN_PTYPE, \ ZOMBIE_PARSER from DHParser.error import Error, is_error, has_errors, linebreaks, line_col from DHParser.toolkit import load_if_file +try: + import pyximport; pyximport.install() + from DHParser.cstringview import StringView, EMPTY_STRING_VIEW +except ImportError: + from DHParser.stringview import StringView, EMPTY_STRING_VIEW __all__ = ('PreprocessorFunc', 'HistoryRecord', diff --git a/DHParser/syntaxtree.py b/DHParser/syntaxtree.py index 86a7f27..915975b 100644 --- a/DHParser/syntaxtree.py +++ b/DHParser/syntaxtree.py @@ -33,8 +33,12 @@ except ImportError: Iterator, Iterable, List, NamedTuple, Sequence, Union, Text, Tuple, Hashable from DHParser.toolkit import is_logging, log_dir, identity -from DHParser.stringview import StringView from DHParser.error import Error, linebreaks, line_col +try: + import pyximport; pyximport.install() + from DHParser.cstringview import StringView +except ImportError: + from DHParser.stringview import StringView __all__ = ('ParserBase', 'WHITESPACE_PTYPE', @@ -224,7 +228,9 @@ class Node(collections.abc.Sized): def __str__(self): if self.children: return "".join(str(child) for child in self.children) - return str(self.result) + elif isinstance(self.result, StringView): + self.result = str(self.result) + return self.result def __repr__(self): @@ -277,8 +283,9 @@ class Node(collections.abc.Sized): # or isinstance(result, str)), str(result) # Possible optimization: Do not allow single nodes as argument: # assert not isinstance(result, Node) - self._result = (result,) if isinstance(result, Node) else str(result) \ - if isinstance(result, StringView) else result or '' # type: StrictResultType + self._result = (result,) if isinstance(result, Node) else result or '' # type: StrictResultType + # self._result = (result,) if isinstance(result, Node) else str(result) \ + # if isinstance(result, StringView) else result or '' # type: StrictResultType self.children = cast(ChildrenType, self._result) \ if 
isinstance(self._result, tuple) else cast(ChildrenType, ()) # type: ChildrenType if self.children: diff --git a/setup.py b/setup.py index 8cdb8e8..a3e9a69 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,10 @@ #from distutils.core import setup from setuptools import setup +try: + from Cython.Build import cythonize +except ImportError: + def cythonize(filename): + return [] from DHParser.versionnumber import __version__ @@ -10,6 +15,7 @@ setup( name='DHParser', version=__version__, packages=['DHParser'], + ext_modules = cythonize('DHParser/cstringview.pyx') url='https://gitlab.lrz.de/badw-it/DHParser', license='MIT License (https://opensource.org/licenses/MIT)', author='Eckhart Arnold', diff --git a/test/test_cstringview.py b/test/test_cstringview.py new file mode 100644 index 0000000..9e64e61 --- /dev/null +++ b/test/test_cstringview.py @@ -0,0 +1,133 @@ +#!/usr/bin/python3 + +"""test_stringview.py - tests of the stringview-module of DHParser + +Author: Eckhart Arnold + +Copyright 2017 Bavarian Academy of Sciences and Humanities + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import sys +try: + import regex as re +except ImportError: + import re + +sys.path.extend(['../', './']) + +import pyximport; pyximport.install() +from DHParser.cstringview import StringView, EMPTY_STRING_VIEW, real_indices + + +class TestStringView: + def test_real_indices(self): + assert real_indices(3, 5, 10) == (3, 5) + assert real_indices(None, None, 10) == (0, 10) + assert real_indices(-2, -1, 10) == (8, 9) + assert real_indices(-3, 11, 10) == (7, 10) + assert real_indices(-5, -12, 10) == (5, 0) + assert real_indices(-12, -5, 10) == (0, 5) + assert real_indices(7, 6, 10) == (7, 6) + assert real_indices(None, 0, 10) == (0, 0) + + def test_creation(self): + s = "0123456789" + assert str(StringView(s)) == s + assert str(StringView(s, 3, 4)) == '3' + assert str(StringView(s, -4)) == '6789' + + def test_equality(self): + s = "0123456789" + assert StringView(s) == s + assert StringView(s, 3, 4) == '3' + assert StringView(s, -4) == '6789' + + def test_slicing(self): + s = " 0123456789 " + sv = StringView(s, 1, -1) + assert sv == '0123456789' + assert sv[3:4] == '3' + assert sv[-3:-1] == '78' + assert sv[4:3] == '' + assert sv[:4] == '0123' + assert sv[4:] == '456789' + assert sv[-2:] == '89' + assert sv[:-5] == '01234' + assert isinstance(sv[3:5], StringView) + + def test_len(self): + s = " 0123456789 " + sv = StringView(s, 1, -1) + assert len(sv) == 10 + assert sv.len == 10 + assert len(sv[5:5]) == 0 + assert len(sv[7:4]) == 0 + assert len(sv[-12:-2]) == 8 + assert len(sv[-12:12]) == 10 + + def test_bool(self): + assert not StringView('') + assert StringView('x') + s = " 0123456789 " + sv = StringView(s, 1, -1) + assert not sv[5:4] + assert sv[4:5], str(sv[4:5]) + assert not sv[3:3] + assert not sv[12:13] + assert sv[0:20] + + def test_sv_match(self): + s = " 0123456789 " + sv = StringView(s, 1, -1) + assert sv.match(re.compile(r'\d')) + assert sv.match(re.compile(r'\d+')) + assert not sv.match(re.compile(r' ')) + assert sv[4:].match(re.compile(r'45')) + + 
def test_sv_search(self): + s = " 0123456789 " + sv = StringView(s, 1, -1) + assert sv.search(re.compile(r'5')) + assert not sv.search(re.compile(r' ')) + assert sv[5:].search(re.compile(r'5')) + assert not sv[:9].search(re.compile(r'9')) + + def test_find(self): + s = " 0123456789 " + sv = StringView(s, 1, -1) + assert sv.find('5') == 5 + assert sv.find(' ') < 0 + assert sv.find('0', 1) < 0 + assert sv.find('9', 0, 8) < 0 + assert sv.find('45', 1, 8) == 4 + + def test_startswith(self): + s = " 0123456789 " + sv = StringView(s, 1, -1) + assert sv.startswith('012') + assert sv.startswith('123', 1) + assert not sv.startswith('123', 1, 3) + + def test_EMPTY_STRING_VIEW(self): + assert len(EMPTY_STRING_VIEW) == 0 + assert EMPTY_STRING_VIEW.find('x') < 0 + assert not EMPTY_STRING_VIEW.match(re.compile(r'x')) + assert EMPTY_STRING_VIEW.match(re.compile(r'.*')) + assert len(EMPTY_STRING_VIEW[0:1]) == 0 + + +if __name__ == "__main__": + from DHParser.testing import runner + runner("", globals()) diff --git a/test/test_parser.py b/test/test_parser.py index 448712f..e28946a 100644 --- a/test/test_parser.py +++ b/test/test_parser.py @@ -25,12 +25,16 @@ from functools import partial sys.path.extend(['../', './']) from DHParser.toolkit import is_logging, logging, compile_python_object -from DHParser.stringview import StringView from DHParser.error import Error from DHParser.parser import compile_source, Retrieve, Grammar, Forward, Token, ZeroOrMore, RE, \ RegExp, Lookbehind, NegativeLookahead, OneOrMore, Series, Alternative from DHParser.ebnf import get_ebnf_grammar, get_ebnf_transformer, get_ebnf_compiler from DHParser.dsl import grammar_provider, DHPARSER_IMPORTS +try: + import pyximport; pyximport.install() + from DHParser.cstringview import StringView +except ImportError: + from DHParser.stringview import StringView class TestInfiLoopsAndRecursion: -- GitLab