Commit 5249535b authored by eckhart's avatar eckhart

merge stringview.pxd

parents 64d393a4 dbe1a013
......@@ -619,7 +619,7 @@ class EBNFCompiler(Compiler):
# prepare and add resume-rules
resume_rules = dict() # type: Dict[str, List[Union[str, unrpr]]]
resume_rules = dict() # type: Dict[str, List[Union[str, unrepr]]]
for symbol, raw_rules in self.directives['resume'].items():
refined_rules = []
for rule in raw_rules:
......
......@@ -16,4 +16,66 @@ cdef int last_char(text, int begin, int end)
cdef int pack_index(int index, int length)
@cython.locals(cbegin=cython.int, cend=cython.int)
cpdef real_indices(begin, end, int length)
cdef real_indices(begin, end, int length)
# cpdef real_indices(begin, end, int length)
# cdefs for class StringView: https://cython.readthedocs.io/en/latest/src/tutorial/pure.html
# Cython declaration stub for the pure-Python StringView class (see the
# stringview.py hunks later in this commit): a lightweight view onto a
# substring of `text` delimited by [begin, end), avoiding string copies.
# NOTE(review): the page dump has stripped the member indentation; in the
# real .pxd these declarations are indented under the class.
cdef class StringView:
# C-level attributes: the viewed string, the slice bounds, the cached
# length, and the lazily materialized slice (see __str__ in stringview.py).
cdef str text
cdef int begin, end, len
cdef str fullstring
# Dunder protocol: construction, truthiness, length, materialization,
# equality/hashing (both materialize the slice — see the PERFORMANCE
# WARNINGs in stringview.py), concatenation, and slicing/indexing.
cpdef __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None
cpdef __bool__(self) -> bool
cpdef __len__(self) -> int
cpdef __str__(self) -> str
cpdef __eq__(self, other) -> bool
cpdef __hash__(self) -> int
cpdef __add__(self, other) -> Union[str, 'StringView']
cpdef __radd__(self, other) -> Union[str, 'StringView']
@cython.locals(start=cython.int, stop=cython.int)
cpdef __getitem__(self, index: Optional[slice, int]) -> StringView
# str-like search methods; offsets are relative to the view, delegated to
# the underlying text with self.begin added (see stringview.py bodies).
cpdef count(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int
cpdef find(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int
cpdef rfind(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int
cpdef startswith(self, prefix: str, start: int = 0, end: Optional[int] = None) -> bool
cpdef endswith(self, suffix: str, start: int = 0, end: Optional[int] = None) -> bool
# Regex support and absolute/relative index translation helpers.
cpdef match(self, regex, flags=0)
cpdef index(self, absolute_index: int) -> int
cpdef indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...]
cpdef search(self, regex)
cpdef finditer(self, regex)
# Whitespace stripping and splitting; @cython.locals types the loop/bound
# variables of the corresponding Python implementations as C ints.
@cython.locals(begin=cython.int, end=cython.int)
cpdef strip(self)
@cython.locals(begin=cython.int)
cpdef lstrip(self)
@cython.locals(end=cython.int)
cpdef rstrip(self)
@cython.locals(length=cython.int, k=cython.int, i=cython.int)
cpdef split(self, sep=None)
cpdef replace(self, old, new)
......@@ -72,6 +72,9 @@ def pack_index(index: int, length: int) -> int:
"""
# assert length >= 0
index = index if index >= 0 else index + length
# TODO: Test the following code for speedup
# if index < 0:
# index += length
return 0 if index < 0 else length if index > length else index
......@@ -86,7 +89,7 @@ def real_indices(begin: Optional[int],
return pack_index(cbegin, length), pack_index(cend, length)
class StringView(collections.abc.Sized):
class StringView: # (collections.abc.Sized):
"""
A rudimentary StringView class, just enough for the use cases
in parse.py. The difference between a StringView and the python
......@@ -106,13 +109,13 @@ class StringView(collections.abc.Sized):
else:
self.fullstring = ''
def __bool__(self):
def __bool__(self) -> bool:
return self.end > self.begin # and bool(self.text)
def __len__(self):
def __len__(self) -> int:
return self.len
def __str__(self):
def __str__(self) -> str:
# PERFORMANCE WARNING: This creates a copy of the string-slice
if self.fullstring: # optimization: avoid slicing/copying
return self.fullstring
......@@ -122,27 +125,27 @@ class StringView(collections.abc.Sized):
self.fullstring = self.text[self.begin:self.end]
return self.fullstring
def __eq__(self, other):
def __eq__(self, other) -> bool:
# PERFORMANCE WARNING: This creates copies of the strings
return len(other) == len(self) and str(self) == str(other)
def __hash__(self):
def __hash__(self) -> int:
# PERFORMANCE WARNING: This creates a copy of the string-slice
return hash(str(self))
def __add__(self, other):
def __add__(self, other) -> Union[str, 'StringView']:
if isinstance(other, str):
return str(self) + other
else:
return StringView(str(self) + str(other))
def __radd__(self, other):
def __radd__(self, other) -> Union[str, 'StringView']:
if isinstance(other, str):
return other + str(self)
else:
return StringView(str(other) + str(self))
def __getitem__(self, index):
def __getitem__(self, index: Union[slice, int]) -> 'StringView':
# assert isinstance(index, slice), "As of now, StringView only allows slicing."
# assert index.step is None or index.step == 1, \
# "Step sizes other than 1 are not yet supported by StringView"
......@@ -150,9 +153,9 @@ class StringView(collections.abc.Sized):
start, stop = real_indices(index.start, index.stop, self.len)
return StringView(self.text, self.begin + start, self.begin + stop)
except AttributeError:
return self.text[self.begin + index]
return StringView(self.text, self.begin + index, self.begin + index + 1)
def count(self, sub: str, start=None, end=None) -> int:
def count(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
"""Returns the number of non-overlapping occurrences of substring
`sub` in StringView S[start:end]. Optional arguments start and end
are interpreted as in slice notation.
......@@ -165,7 +168,7 @@ class StringView(collections.abc.Sized):
start, end = real_indices(start, end, self.len)
return self.text.count(sub, self.begin + start, self.begin + end)
def find(self, sub: str, start=None, end=None) -> int:
def find(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
"""Returns the lowest index in S where substring `sub` is found,
such that `sub` is contained within S[start:end]. Optional
arguments `start` and `end` are interpreted as in slice notation.
......@@ -179,7 +182,7 @@ class StringView(collections.abc.Sized):
start, end = real_indices(start, end, self.len)
return self.text.find(sub, self.begin + start, self.begin + end) - self.begin
def rfind(self, sub: str, start=None, end=None) -> int:
def rfind(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int:
"""Returns the highest index in S where substring `sub` is found,
such that `sub` is contained within S[start:end]. Optional
arguments `start` and `end` are interpreted as in slice notation.
......
......@@ -915,7 +915,7 @@ def parse_sxpr(sxpr: Union[str, StringView]) -> Node:
"""
sxpr = StringView(sxpr).strip() if isinstance(sxpr, str) else sxpr.strip()
mock_parsers = dict() # type: Dict[str, MockParser]
mock_parsers = dict() # type: Dict[StringView, MockParser]
def next_block(s: StringView):
"""Generator that yields all characters until the next closing bracket
......
......@@ -390,13 +390,13 @@ scroll down to the AST section, you'll see something like this:
"*": replace_by_single_child
}
As you can see, AST-transformations a specified declaratively (with the
As you can see, AST-transformations are specified declaratively (with the
option to add your own Python-programmed transformation rules). This
keeps the specification of the AST-transformation simple and concise. At
the same, we avoid adding hints for the AST-transformation in the
grammar specification, which would render the grammar less readable.
Now that you have seen how DHParser basically works, it is time to go
through the process of desining and testing a domain specific notation
through the process of designing and testing a domain specific notation
step by step from the very start. Head over to the documentation in
subdirectory and read the step by step guide.
......@@ -18,7 +18,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
......
......@@ -54,8 +54,8 @@ EBNF_TEMPLATE = r"""-grammar
#
#######################################################################
document = ~ { WORD } §EOF # root parser: a sequence of words preceded by whitespace
# until the end of file
document = ~ { WORD } §EOF # root parser: a sequence of words preceded
# by whitespace until the end of file
#######################################################################
#
......
Workshop: DHParser - Domain Specific Languages for the Digital Humanities
=========================================================================
Proposal for a workshop (180 min) for the [deRSE2019
Conference](https://derse19.uni-jena.de/)
by Eckhart Arnold, Bavarian Academy of Sciences and Humanities, arnold@badw.de
Abstract
--------
Domain specific languages have become a ubiquitous tool in the
software-industry, in many cases replacing XML as configuration or
data description language. By now, there exist quite a few mature
DSL-construction toolkits and DSL-parser generators out there
([Xtext], [MPS], [ANTLR], [pyparsing]) that support the creation of
DSLs.
Nonetheless, DSLs are strangely underused in Digital Humanities
Projects, even though they can provide a great addition, if not in
some cases a viable alternative, to the omnipresent XML-toolchains. One
possible reason why DSLs have not yet become popular in the Digital
Humanities is that the common DSL construction kits and parser
generators are geared towards different application domains, and do
not fulfill the specific demands of Digital Humanities contexts. In
the Digital Humanities, DSLs, just like the XML-data-structures, say,
for a historical-critical edition, can become quite complex, evolve
over time, result from an iterative testing and discussion process in
which users interact with programmers and must be understandable and
usable with ease by researchers who are not necessarily accustomed to
computer technology.
[DHParser] is a parser generator for DSLs, developed at the Bavarian
Academy of Sciences and Humanities, that specifically addresses the
Digital Humanities. In particular, it offers support for:
- unit testing of DSLs
- specifying meaningful error messages for the user of the DSL and
locating errors correctly
- debugging support for the DSL-specification and parsing process
- support for abstract-syntax-tree-generation
- a basic framework for compiler construction with XML-output as the
most common use case in mind
- programming in Python, the most commonly known and used programming
language in the Digital Humanities
In the workshop, I am going to explain how to develop a Frontend-DSL
for the “[DTA-Basisformat]” (or, for the purpose of introduction, a
subset thereof). We will assume the “DTA-Basisformat” as a given
target-format and run through the whole development process from
designing the syntax of the DSL through examples, specifying it
formally with [EBNF], directing abstract-syntax-tree generation,
generating XML-output, writing test-cases and specifying error
messages. If time permits, we will also look into the process of
preparing an editor / development environment for our DTA-DSL with
[Visual Studio Code].
In the end, every participant will have learned:
- what a DSL is and what the steps for creating one are
- how the syntax of a DSL can be specified in an EBNF-like formalism
- how a simple DSL-XML-compiler is programmed in Python with the
DHParser-framework
- how important practical concerns like unit-testing of DSLs and
error-reporting can be addressed
- How DSLs relate to XML: Basically, XML allows you to declare and encode
the domain specific semantics of any kind of data, DSLs also enable you
to specify a domain specific syntax for your data, rendering the encoded
data much more human-readable (and -writable) than XML.
- how to use DHParser ;-)
We will close the workshop with a discussion about the benefits as
well as possible disadvantages of employing DSLs in DH-projects in
relation to the necessary effort in comparison to the
ordinary XML-workflows.
**Requirements for participating and benefiting from the workshop:**
- good working knowledge of [Python] and [regular expressions]
- a laptop with python installed
Suggested Reading:
- [Introduction to DHParser]
- or, more detailed, the [Step by Step Guide to DHParser]
- or, for a real world example, though work in progress, the [DSL for
the medival latin dictionary]
[Xtext]: https://www.eclipse.org/Xtext/
[MPS]: https://www.jetbrains.com/mps/
[ANTLR]: https://www.antlr.org/
[pyparsing]: https://pypi.org/project/pyparsing/
[DHParser]: https://gitlab.lrz.de/badw-it/DHParser
[DTA-Basisformat]: http://www.deutschestextarchiv.de/doku/basisformat/
[EBNF]: https://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf
[Visual Studio Code]: https://code.visualstudio.com/
[Python]: https://www.python.org/
[regular expressions]: https://docs.python.org/3/library/re.html
[Introduction to DHParser]: https://gitlab.lrz.de/badw-it/DHParser/blob/development/Introduction.md
[step by step guide to DHParser]: https://gitlab.lrz.de/badw-it/DHParser/blob/development/documentation/StepByStepGuide.rst
[DSL for the medival latin dictionary]: https://gitlab.lrz.de/badw-it/MLW-DSL
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment