diff --git a/DHParser/ebnf.py b/DHParser/ebnf.py index a028ea24994a175dc9e9c0bb1d1cc956c1f6bb09..6b80c385dd8ebc6cd0b932ea7373d0e3830eca8d 100644 --- a/DHParser/ebnf.py +++ b/DHParser/ebnf.py @@ -619,7 +619,7 @@ class EBNFCompiler(Compiler): # prepare and add resume-rules - resume_rules = dict() # type: Dict[str, List[Union[str, unrpr]]] + resume_rules = dict() # type: Dict[str, List[Union[str, unrepr]]] for symbol, raw_rules in self.directives['resume'].items(): refined_rules = [] for rule in raw_rules: diff --git a/DHParser/stringview.pxd b/DHParser/stringview.pxd index b2a436cb6c0853d19ac21f5f04bcdf28b5675b4a..27e6177f09f6272a4881d85a4d9615077b7fb091 100644 --- a/DHParser/stringview.pxd +++ b/DHParser/stringview.pxd @@ -16,5 +16,66 @@ cdef int last_char(text, int begin, int end) cdef int pack_index(int index, int length) @cython.locals(cbegin=cython.int, cend=cython.int) -cpdef real_indices(begin, end, int length) +cdef real_indices(begin, end, int length) +# cpdef real_indices(begin, end, int length) + +# cdefs for class StringView: https://cython.readthedocs.io/en/latest/src/tutorial/pure.html + +cdef class StringView: + cdef str text + cdef int begin, end, len + cdef str fullstring + + cpdef __init__(self, text: str, begin: Optional[int] = 0, end: Optional[int] = None) -> None + + cpdef __bool__(self) -> bool + + cpdef __len__(self) -> int + + cpdef __str__(self) -> str + + cpdef __eq__(self, other) -> bool + + cpdef __hash__(self) -> int + + cpdef __add__(self, other) -> Union[str, 'StringView'] + + cpdef __radd__(self, other) -> Union[str, 'StringView'] + + @cython.locals(start=cython.int, stop=cython.int) + cpdef __getitem__(self, index: Optional[slice, int]) -> StringView + + cpdef count(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int + + cpdef find(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int + + cpdef rfind(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int + + 
cpdef startswith(self, prefix: str, start: int = 0, end: Optional[int] = None) -> bool + + cpdef endswith(self, suffix: str, start: int = 0, end: Optional[int] = None) -> bool + + cpdef match(self, regex, flags=0) + + cpdef index(self, absolute_index: int) -> int + + cpdef indices(self, absolute_indices: Iterable[int]) -> Tuple[int, ...] + + cpdef search(self, regex) + + cpdef finditer(self, regex) + + @cython.locals(begin=cython.int, end=cython.int) + cpdef strip(self) + + @cython.locals(begin=cython.int) + cpdef lstrip(self) + + @cython.locals(end=cython.int) + cpdef rstrip(self) + + @cython.locals(length=cython.int, k=cython.int, i=cython.int) + cpdef split(self, sep=None) + + cpdef replace(self, old, new) \ No newline at end of file diff --git a/DHParser/stringview.py b/DHParser/stringview.py index 30453fb294dda1511a6986ce0bcaa0810b55d092..a91ba64d2c50e1b41adb57d9c10f96a28e48a53b 100644 --- a/DHParser/stringview.py +++ b/DHParser/stringview.py @@ -72,6 +72,9 @@ def pack_index(index: int, length: int) -> int: """ # assert length >= 0 index = index if index >= 0 else index + length + # TODO: Test the following code for speedup + # if index < 0: + # index += length return 0 if index < 0 else length if index > length else index @@ -86,7 +89,7 @@ def real_indices(begin: Optional[int], return pack_index(cbegin, length), pack_index(cend, length) -class StringView(collections.abc.Sized): +class StringView: # (collections.abc.Sized): """ A rudimentary StringView class, just enough for the use cases in parse.py. 
The difference between a StringView and the python @@ -106,13 +109,13 @@ class StringView(collections.abc.Sized): else: self.fullstring = '' - def __bool__(self): + def __bool__(self) -> bool: return self.end > self.begin # and bool(self.text) - def __len__(self): + def __len__(self) -> int: return self.len - def __str__(self): + def __str__(self) -> str: # PERFORMANCE WARNING: This creates a copy of the string-slice if self.fullstring: # optimization: avoid slicing/copying return self.fullstring @@ -122,27 +125,27 @@ class StringView(collections.abc.Sized): self.fullstring = self.text[self.begin:self.end] return self.fullstring - def __eq__(self, other): + def __eq__(self, other) -> bool: # PERFORMANCE WARNING: This creates copies of the strings return len(other) == len(self) and str(self) == str(other) - def __hash__(self): + def __hash__(self) -> int: # PERFORMANCE WARNING: This creates a copy of the string-slice return hash(str(self)) - def __add__(self, other): + def __add__(self, other) -> Union[str, 'StringView']: if isinstance(other, str): return str(self) + other else: return StringView(str(self) + str(other)) - def __radd__(self, other): + def __radd__(self, other) -> Union[str, 'StringView']: if isinstance(other, str): return other + str(self) else: return StringView(str(other) + str(self)) - def __getitem__(self, index): + def __getitem__(self, index: Union[slice, int]) -> 'StringView': # assert isinstance(index, slice), "As of now, StringView only allows slicing." 
# assert index.step is None or index.step == 1, \ # "Step sizes other than 1 are not yet supported by StringView" @@ -150,9 +153,9 @@ class StringView(collections.abc.Sized): start, stop = real_indices(index.start, index.stop, self.len) return StringView(self.text, self.begin + start, self.begin + stop) except AttributeError: - return self.text[self.begin + index] + return StringView(self.text, self.begin + index, self.begin + index + 1) - def count(self, sub: str, start=None, end=None) -> int: + def count(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int: """Returns the number of non-overlapping occurrences of substring `sub` in StringView S[start:end]. Optional arguments start and end are interpreted as in slice notation. @@ -165,7 +168,7 @@ class StringView(collections.abc.Sized): start, end = real_indices(start, end, self.len) return self.text.count(sub, self.begin + start, self.begin + end) - def find(self, sub: str, start=None, end=None) -> int: + def find(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int: """Returns the lowest index in S where substring `sub` is found, such that `sub` is contained within S[start:end]. Optional arguments `start` and `end` are interpreted as in slice notation. @@ -179,7 +182,7 @@ class StringView(collections.abc.Sized): start, end = real_indices(start, end, self.len) return self.text.find(sub, self.begin + start, self.begin + end) - self.begin - def rfind(self, sub: str, start=None, end=None) -> int: + def rfind(self, sub: str, start: Optional[int] = None, end: Optional[int] = None) -> int: """Returns the highest index in S where substring `sub` is found, such that `sub` is contained within S[start:end]. Optional arguments `start` and `end` are interpreted as in slice notation. 
diff --git a/DHParser/syntaxtree.py b/DHParser/syntaxtree.py index 2918efdbe079dbf0266675847d4633a0890ebb46..d570c2820238148f6cbec7446184243699c9a370 100644 --- a/DHParser/syntaxtree.py +++ b/DHParser/syntaxtree.py @@ -915,7 +915,7 @@ def parse_sxpr(sxpr: Union[str, StringView]) -> Node: """ sxpr = StringView(sxpr).strip() if isinstance(sxpr, str) else sxpr.strip() - mock_parsers = dict() # type: Dict[str, MockParser] + mock_parsers = dict() # type: Dict[StringView, MockParser] def next_block(s: StringView): """Generator that yields all characters until the next closing bracket diff --git a/Introduction.md b/Introduction.md index 9efc5fe924c5af7c5325ed684c093fcaa91712e3..f925f1d66e44e642d50fefbeb5b3d6c5deb776f1 100644 --- a/Introduction.md +++ b/Introduction.md @@ -390,13 +390,13 @@ scroll down to the AST section, you'll see something like this: "*": replace_by_single_child } -As you can see, AST-transformations a specified declaratively (with the +As you can see, AST-transformations are specified declaratively (with the option to add your own Python-programmed transformation rules). This keeps the specification of the AST-transformation simple and concise. At the same, we avoid adding hints for the AST-transformation in the grammar specification, which would render the grammar less readable. Now that you have seen how DHParser basically works, it is time to go -through the process of desining and testing a domain specific notation +through the process of designing and testing a domain specific notation step by step from the very start. Head over to the documentation in subdirectory and read the step by step guide. diff --git a/README.md b/README.md index 1fa3c7bfc9dc184b6df1ba3184bee5535ab6ac59..9a04f1e7f5624d26d415905eb7160745221f40a8 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - https://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/documentation/Presentations/deRSE2019/cuts.txt b/documentation/Presentations/deRSE2019/cuts.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/documentation/Presentations/deRSE2019/deRSE2019_proposal_EA.md b/documentation/Presentations/deRSE2019/deRSE2019_proposal_EA.md new file mode 100644 index 0000000000000000000000000000000000000000..dc4bd0bc3e05c06dbfe11abeb9803f16d8e86126 --- /dev/null +++ b/documentation/Presentations/deRSE2019/deRSE2019_proposal_EA.md @@ -0,0 +1,111 @@ +Workshop: DHParser - Domain Specific Languages for the Digital Humanities +========================================================================= + +Proposal for a workshop (180 min) for the [deRSE2019 +Conference](https://derse19.uni-jena.de/) + +by Eckhart Arnold, Bavarian Academy of Sciences and Humanities, arnold@badw.de + +Abstract +-------- + +Domain specific languages have become a ubiquitous tool in the +software-industry, in many cases replacing XML as configuration or +data description language. By now, there exist quite a few mature +DSL-construction toolkits and DSL-parser generators out there +([Xtext], [MPS], [ANTLR], [pyparsing]) that support the creation of +DSLs. + +Nonetheless, DSLs are strangely underused in Digital Humanities +Projects, even though they can provide a great addition, if not in +some cases a viable alternative to the omnipresent XML-toolchains. 
One +possible reason why DSLs have not yet become popular in the Digital +Humanities is that the common DSL construction kits and parser +generators are geared towards different application domains, and do +not fulfill the specific demands of Digital Humanities contexts. In +the Digital Humanities DSLs, just like the XML-data-structures, say, +for a historical-critical edition, can become quite complex, evolve +over time, result from an iterative testing and discussion process in +which users interact with programmers and must be understandable and +usable with ease by researchers that are not necessarily accustomed to +computer technology. + +[DHParser] is a parser generator for DSLs, developed at the Bavarian +Academy of Sciences and Humanities, that specifically addresses the +Digital Humanities. In particular, it offers support for: + +- unit testing of DSLs + +- specifying meaningful error messages for the user of the DSL and + locating errors correctly + +- debugging support for the DSL-specification and parsing process + +- support for abstract-syntax-tree-generation + +- a basic framework for compiler construction with XML-output as the + most common use case in mind + +- programming in Python, the most commonly known and used programming + language in the Digital Humanities + +In the workshop, I am going to explain how to develop a Frontend-DSL +for the “[DTA-Basisformat]” (or, for the purpose of introduction, a +subset thereof). We will assume the “DTA-Basisformat” as a given +target-format and run through the whole development process from +designing the syntax of the DSL through examples, specifying it +formally with [EBNF], directing abstract-syntax-tree generation, +generating XML-output, writing test-cases and specifying error +messages. If time permits, we will also look into the process of +preparing an editor / development environment for our DTA-DSL with +[Visual Studio Code]. 
+ +In the end, every participant will have learned: + +- what a DSL is and what the steps for creating one are + +- how the syntax of a DSL can be specified in an EBNF-like formalism + +- how a simple DSL-XML-compiler is programmed in Python with the + DHParser-framework + +- how important practical concerns like unit-testing of DSLs and + error-reporting can be addressed + +- How DSLs relate to XML: Basically, XML allows you to declare and encode + the domain specific semantics of any kind of data; DSLs also enable you + to specify a domain specific syntax for your data, rendering the encoded + data much more human-readable (and -writable) than XML. + +- how to use DHParser ;-) + +We will close the workshop with a discussion about the benefits as +well as possible disadvantages of employing DSLs in DH-projects in +relation to the necessary effort in comparison to the +ordinary XML-workflows. + +**Requirements for participating and benefiting from the workshop:** + +- good working knowledge of [Python] and [regular expressions] +- a laptop with Python installed + +Suggested Reading: + +- [Introduction to DHParser] +- or, more detailed, the [Step by Step Guide to DHParser] +- or, for a real world example, though work in progress, the [DSL for + the medieval Latin dictionary] + +[Xtext]: https://www.eclipse.org/Xtext/ +[MPS]: https://www.jetbrains.com/mps/ +[ANTLR]: https://www.antlr.org/ +[pyparsing]: https://pypi.org/project/pyparsing/ +[DHParser]: https://gitlab.lrz.de/badw-it/DHParser +[DTA-Basisformat]: http://www.deutschestextarchiv.de/doku/basisformat/ +[EBNF]: https://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf +[Visual Studio Code]: https://code.visualstudio.com/ +[Python]: https://www.python.org/ +[regular expressions]: https://docs.python.org/3/library/re.html +[Introduction to DHParser]: https://gitlab.lrz.de/badw-it/DHParser/blob/development/Introduction.md +[step by step guide to DHParser]: 
https://gitlab.lrz.de/badw-it/DHParser/blob/development/documentation/StepByStepGuide.rst +[DSL for the medieval Latin dictionary]: https://gitlab.lrz.de/badw-it/MLW-DSL