12.8.2021, 9:00 - 11:00: Due to updates GitLab may be unavailable for some minutes between 09:00 and 11:00.

Commit 2c57bc34 authored by eckhart's avatar eckhart
Browse files

StepByStepGuide extended

parent cfe87bfb
......@@ -489,7 +489,7 @@ class EBNFCompiler(Compiler):
' return self.fallback_compiler(node)', '']
else:
compiler += [' # def ' + method_name + '(self, node):',
' # return self.fallback_compiler(node)', '']
' # return node', '']
compiler += [COMPILER_FACTORY.format(NAME=self.grammar_name)]
return '\n'.join(compiler)
......
......@@ -1312,7 +1312,7 @@ class Series(NaryOperator):
i = max(1, text.index(match.regs[1][0])) if match else 1
node = Node(self, text_[:i]).init_pos(self.grammar.document_length__
- len(text_))
node.add_error('%s expected; "%s"... found!'
node.add_error('%s expected; "%s" found!'
% (parser.repr, text_[:10].replace('\n', '\\n ')),
code=Error.MANDATORY_CONTINUATION)
text_ = text_[i:]
......
......@@ -298,7 +298,7 @@ def grammar_unit(test_unit, parser_factory, transformer_factory, report=True, ve
cst = parser(test_code, parser_name)
except UnknownParserError as upe:
cst = Node(ZOMBIE_PARSER, "").add_error(str(upe)).init_pos(0)
clean_test_name = test_name.replace('*', '')
clean_test_name = str(test_name).replace('*', '')
log_ST(cst, "match_%s_%s.cst" % (parser_name, clean_test_name))
tests.setdefault('__cst__', {})[test_name] = cst
if "ast" in tests or report:
......@@ -366,8 +366,10 @@ def grammar_unit(test_unit, parser_factory, transformer_factory, report=True, ve
def reset_unit(test_unit):
"""Resets the tests in ``test_unit`` by removing all results and
error messages."""
"""
Resets the tests in ``test_unit`` by removing all results and error
messages.
"""
for parser, tests in test_unit.items():
for key in list(tests.keys()):
if key not in UNIT_STAGES:
......
......@@ -63,12 +63,12 @@ a few drawbacks to this approach:
On the other hand, there are good reasons why XML is used in the
humanities: Important encoding standards like
[TEI-XML](http://www.tei-c.org/index.xml) are defined in
XML. Its strict syntax and the possibility to check data against
schema help to detect and avoiding encoding errors. If the schema is
well-defined, it is unambiguous, and it is easy to parse for a computer.
Most of these advantages, however, are on a technical level and few of
them are actually exclusive advantages of XML.
[TEI-XML](http://www.tei-c.org/index.xml) are defined in XML. Its strict
syntax and the possibility to check data against schema help to detect
and avoid encoding errors. If the schema is well-defined, it is
unambiguous, and it is easy to parse for a computer. Most of these
advantages, however, are on a technical level and few of them are
actually exclusive advantages of XML.
All in all this means, that while XML is a solid back-end-technology, it
still is a pain to work with XML as a frontend-technology. This is where
......@@ -107,11 +107,11 @@ same information as the XML version. How, for example, would the
computer know for sure where a verse starts and ends or a stanza or what
is title and what stanza? Well, for all these matters there exist
conventions that poets have been using for several thousand years. For
example, a verse always starts and ends on the same line. There
is always a gap between stanzas. And the title is always written above
the poem and not in the middle of it. So, if there is a title at all, we
can be sure that what is written in the first line is the title and not
a stanza.
example, a verse always starts and ends on the same line. There is
always a gap between stanzas. And the title is always written above the
poem and not in the middle of it. So, if there is a title at all, we can
be sure that what is written in the first line is the title and not a
stanza.
DHParser is able to exploit all those hints in order to gather much the
same information as was encoded in the XML-Version. Don't believe it?
......@@ -119,8 +119,8 @@ You can try: Download DHParser from the
[gitlab-repository](https://gitlab.lrz.de/badw-it/DHParser) and enter
the directory `examples/Tutorial` on the command line interface (shell).
Just run `python LyrikCompiler_example.py` (you need to have installed
[Python](https://www.python.org/) Version 3.4 or higher on your computer).
The output will be something like this:
[Python](https://www.python.org/) Version 3.4 or higher on your
computer). The output will be something like this:
<gedicht>
<bibliographisches>
......@@ -158,19 +158,21 @@ The output will be something like this:
Now, you might notice that this is not exactly the XML-encoding as shown
above. (Can you spot the differences?) But you will probably believe me
without further proof that it can easily be converted into the other version
and contains all the information that the other version contains.
How does DHParser achieve this? Well, there is the rub. In order to convert
the poem in the domain specific version into the XML-version, DHParser
requires a structural description of the domain specific encoding. This is a
bit similar to a document type definition (DTD) in XML. This structural
description uses a slightly enhanced version of the [Extended-Backus-Naur-Form
without further proof that it can easily be converted into the other
version and contains all the information that the other version
contains.
How does DHParser achieve this? Well, there is the rub. In order to
convert the poem in the domain specific version into the XML-version,
DHParser requires a structural description of the domain specific
encoding. This is a bit similar to a document type definition (DTD) in
XML. This structural description uses a slightly enhanced version of the
[Extended-Backus-Naur-Form
(EBNF)](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form),
which is a well-established formalism for the structural description of formal
languages in computer sciences. An excerpt of the EBNF-definition of our
domain-specific encoding for the poem looks like this. (We leave out the
meta-data here. See
which is a well-established formalism for the structural description of
formal languages in computer sciences. An excerpt of the EBNF-definition
of our domain-specific encoding for the poem looks like this. (We leave
out the meta-data here. See
[`examples/Tutorial/Lyrik.ebnf`](https://gitlab.lrz.de/badw-it/DHParser/blob/master/examples/Tutorial/Lyrik.ebnf)
for the full EBNF):
......@@ -188,45 +190,47 @@ for the full EBNF):
LEERZEILE = /\n[ \t]*(?=\n)/~
ENDE = !/./
Without going into too much detail here, let me just explain a few basics of
this formal description: The slashes `/` enclose ordinary regular expressions.
Thus, `NZ` for ("Neue Zeile", German for: "new line") is defined as `/\n/~`
which is the newline-token `\n` in a regular expression, plus further
horizontal whitespace (signified by the tilde `~`), if there is any.
The braces `{` `}` enclose items that can be repeated zero or more times; with
a `+` appended to the closing brace it means one or more times. Now, look at
the definition of `text` in the 6th line: `{ strophe {LEERZEILE} }+`. This
reads as follows: The text of the poem consists of a sequence of stanzas, each
of which is followed by a sequence of empty lines (German: "Leerzeilen"). If
you now look at the structural definition of a stanza, you find that it
consists of a sequence of verses, each of which starts, i.e. is preceded by a
new line.
Can you figure out the rest? Hint: The angular brackets `[` and `]` mean that
and item is optional and the `§` sign means that it is obligatory. (Strictly
speaking, the §-signs are not necessary, because an item that is not optional
is always obligatory, but the §-signs help the converter to produce more
useful error messages.)
Without going into too much detail here, let me just explain a few
basics of this formal description: The slashes `/` enclose ordinary
regular expressions. Thus, `NZ` for ("Neue Zeile", German for: "new
line") is defined as `/\n/~` which is the newline-token `\n` in a
regular expression, plus further horizontal whitespace (signified by the
tilde `~`), if there is any.
The braces `{` `}` enclose items that can be repeated zero or more
times; with a `+` appended to the closing brace it means one or more
times. Now, look at the definition of `text` in the 6th line: `{ strophe
{LEERZEILE} }+`. This reads as follows: The text of the poem consists of
a sequence of stanzas, each of which is followed by a sequence of empty
lines (German: "Leerzeilen"). If you now look at the structural
definition of a stanza, you find that it consists of a sequence of
verses, each of which starts, i.e. is preceded by a new line.
Can you figure out the rest? Hint: The angular brackets `[` and `]` mean
that an item is optional and the `§` sign means that it is obligatory.
(Strictly speaking, the §-signs are not necessary, because an item that
is not optional is always obligatory, but the §-signs help the converter
to produce more useful error messages.)
This should be enough for an introduction to the purpose of DSLs in the
humanities. It has shown the probably most important use case of DHParser,
i.e. as a frontend-technology form XML-encodings. Of course, it can just as
well be used as a frontend for any other kind of structured data, like SQL or
graph-structured data. The latter is by the way is a very reasonable
alternative to XML for edition projects with a complex transmission history.
See Andreas Kuczera's Blog-entry on ["Graphdatenbanken für
humanities. It has shown the probably most important use case of
DHParser, i.e. as a frontend-technology for XML-encodings. Of course,
it can just as well be used as a frontend for any other kind of
structured data, like SQL or graph-structured data. The latter is, by
the way, a very reasonable alternative to XML for edition projects with
a complex transmission history. See Andreas Kuczera's Blog-entry on
["Graphdatenbanken für
Historiker"](http://mittelalter.hypotheses.org/5995).
Tutorial: First Steps with DHParser
-----------------------------------
*You'll need to be able to use a shell and have some basic knowledge of Python
programming to be able to follow this section!* Also, you need to have
[git](https://git-scm.com/) and [python 3](https://www.python.org/) installed
on you system. It is important that you have at least python version 3.5.
DHParser will not work with python 2. You can simply start python to find out
which version you have got.
*You'll need to be able to use a shell and have some basic knowledge of
Python programming to be able to follow this section!* Also, you need to
have [git](https://git-scm.com/) and [python 3](https://www.python.org/)
installed on your system. It is important that you have at least python
version 3.5. DHParser will not work with python 2. You can simply start
python to find out which version you have got.
In order to try the example above, you should fetch DHParsers from its
git-repository. Open a shell and type:
......@@ -334,21 +338,21 @@ strictly separated steps:
what you like.
DHParser automatically only generates a parser for the very first step.
The other steps have to be programmed by hand, though DHParser
tries to make those parts as easy as possible. What you have just seen
in your editor is a Pseudo-XML-representation of the concrete syntax
tree. (The output of a parser is always a tree structure, just like
XML.) It is called concrete syntax tree, because it contains all the
syntactic details that have been described in the `Lyrik.ebnf`-grammar;
and the grammar needs to describe all those details, because otherwise
it would not be possible to parse the text. On the other hand most of
these details do not carry any important information. This is the reason
why in the second step the transformation into an abstract syntax tree
that leaves out the unimportant details. There is now general rule of
how to derive abstract syntax trees from concrete syntax trees, and
there cannot be, because it depends on the particular domain of
application which details are important and which not. For poems these
might be different from, say, for a catalogue entry. Therefore, the
The other steps have to be programmed by hand, though DHParser tries to
make those parts as easy as possible. What you have just seen in your
editor is a Pseudo-XML-representation of the concrete syntax tree. (The
output of a parser is always a tree structure, just like XML.) It is
called concrete syntax tree, because it contains all the syntactic
details that have been described in the `Lyrik.ebnf`-grammar; and the
grammar needs to describe all those details, because otherwise it would
not be possible to parse the text. On the other hand most of these
details do not carry any important information. This is the reason why
in the second step a transformation into an abstract syntax tree is
performed that leaves out the unimportant details. There is no general
rule of how to derive abstract syntax trees from concrete syntax trees,
and there
cannot be, because it depends on the particular domain of application
which details are important and which not. For poems these might be
different from, say, for a catalogue entry. Therefore, the
AST-transformation has to be specified for each grammar separately, just
as the grammar has to be specified for each application domain.
......
......@@ -31,12 +31,13 @@ in a pre-first-release state, it is for the time being more recommendable to
clone the most current version of DHParser from the git-repository rather than
installing the packages from the Python Package Index (PyPI).
This section takes you from cloning the DHParser git repository to setting up a
new DHParser-project in the ``experimental``-subdirectory and testing whether
the setup works. Similarily to current web development practices, most of the
work with DHParser is done from the shell. In the following, we assume a Unix
(Linux) environment. The same can most likely be done on other operating systems
in a very similar way, but there might be subtle differences.
This section takes you from cloning the DHParser git repository to setting up
a new DHParser-project in the ``experimental``-subdirectory and testing
whether the setup works. Similarly to current web development practices, most
of the work with DHParser is done from the shell. In the following, we assume
a Unix (Linux) environment. The same can most likely be done on other
operating systems in a very similar way, but there might be subtle
differences.
Installing DHParser from the git repository
-------------------------------------------
......@@ -120,13 +121,13 @@ The output is a block of pseudo-XML, looking like this::
...
Now, this does not look too helpful yet, partly, because it is cluttered with
all sorts of seemingly superflous pseudo-XML-tags like "<:ZeroOrMore>". However,
you might notice that it contains the original sequence of words "Life is but a
walkting shadow" in a structured form, where each word is (among other things)
surrounded by <WORD>-tags. In fact, the output of the compiler script is a
pseudo-XML-representation of the *contrete syntax tree* of our
"example.dsl"-document according the grammar specified in "poetry.ebnf" (which
we haven't looked into yet, but we will do so soon).
all sorts of seemingly superfluous pseudo-XML-tags like "<:ZeroOrMore>".
However, you might notice that it contains the original sequence of words
"Life is but a walkting shadow" in a structured form, where each word is
(among other things) surrounded by <WORD>-tags. In fact, the output of the
compiler script is a pseudo-XML-representation of the *concrete syntax
tree* of our "example.dsl"-document according to the grammar specified in
"poetry.ebnf" (which we haven't looked into yet, but we will do so soon).
If you see the pseudo-XML on screen, the setup of the new DHParser-project
has been successful.
......@@ -146,13 +147,13 @@ Generally speaking, the compilation process consists of three stages:
XML-representation or a relational database record.
Now, DHParser can fully automize the generation of a parser from a
syntax-description in EBNF-form, like our "poetry.ebnf", but it cannot automize
the transformation from the concrete into the abstract syntax tree (which for
the sake of brevity we will simply call "AST-Transformation" in the following),
and neither can it automize the compilation of the abstract syntax tree into
something more useful. Therefore, the AST-Transformation in the autogenerated
compile-script is simply left empty, while the compiling stage simply converts
the syntax tree into a pseudo-XML-format.
syntax-description in EBNF-form, like our "poetry.ebnf", but it cannot
automize the transformation from the concrete into the abstract syntax tree
(which for the sake of brevity we will simply call "AST-Transformation" in the
following), and neither can it automize the compilation of the abstract syntax
tree into something more useful. Therefore, the AST-Transformation in the
autogenerated compile-script is simply left empty, while the compiling stage
simply converts the syntax tree into a pseudo-XML-format.
The latter two stages have to be coded into the compile-script by hand, with
the support of templates within this script. If the grammar of the DSL is
......@@ -165,11 +166,55 @@ by hand. Stubs for theses parts of the compile-script will only be generated
if the compile-script does not yet exist, that is, on the very first calling
of the test-script.
Usually, if you have adjusted the grammar, you will want
to run the unit tests anyway. Therefore, the regeneration of the parser-part
of the compile-script is triggered by the test-script.
Usually, if you have adjusted the grammar, you will want to run the unit tests
anyway. Therefore, the regeneration of the parser-part of the compile-script
is triggered by the test-script.
The development workflow for DSLs
---------------------------------
[TO BE CONTINUED...]
\ No newline at end of file
When developing a domain specific notation it is recommendable to first develop the grammar and the parser for that notation, then the
abstract syntax tree transformations, and finally implement the compiler. Of course one can always come back and change the grammar later. But in order to avoid revising the AST-transformations and the compiler time and again it helps if the grammar has been worked out before. A bit of interlocking between these steps does not hurt, though.
A reasonable workflow for developing the grammar proceeds like this:
1. Set out by writing down a few example documents for your DSL. It is
advisable to start with a few simple examples that use only a subset of the
intended features of your DSL.
2. Next you sketch a grammar for your DSL that is just rich enough to capture
those examples.
3. Right after sketching the grammar you should write test cases for your
grammar. The test cases can be small parts or snippets of your example
documents. You could also use your example documents as test cases, but
usually the test cases should have a smaller granularity to make locating
errors easier.
4. Next, you should run the test script. Usually, some tests will fail at
the first attempt. So you'll keep revising the EBNF-grammar, adjusting and adding test cases until all tests pass.
5. Now it is time to try and compile the example documents. By this time the
test-script should have generated the compile-script, which can be
called with the example documents. Don't worry too much about the output, yet. What is important at this stage is merely whether the parser can
handle the examples or not. If not, further test cases and adjustments to the
EBNF grammar will be needed - or revision of the examples in case you decide to use different syntactic constructs.
If all examples can be parsed, you go back to step one and add further more complex examples, and continue to do so until you have the feeling that your DSL's grammar is rich enough for all intended application cases.
Let's try this with the trivial demo example that comes with creating a new project with the "dhparser.py"-script. Now, you have already seen that the
"example.dsl"-document merely contains a simple sequence of words: "Life is but a walking shadow" Now, wouldn't it be nice, if we could end this sequence with a full stop to turn it into a proper sentence. So, open "examples.dsl" with a text editor and add a full stop::
Life is but a walking shadow.
Now, try to compile "examples.dsl" with the compile-script::
$ python poetryCompiler.py example.dsl
example.dsl:1:29: Error: EOF expected; ".\n " found!
Since the grammar, obviously, did not allow full stops so far, the parser
returns an error message. The error message is pretty self-explanatory in this
case. (Often, you will unfortunately find that the error messages are somewhat
difficult to decipher. In particular, because it so happens that an error the parser complains about is just the consequence of an error made at an earlier location that the parser may not have been able to recognize as such. We will learn more about how to avoid such situations, later.) EOF is actually the name of a parser that captures the end of the file, thus "EOF"! But instead of the expected end of file an, as of now, unparsable construct, namely a full stop followed by a line feed, signified by "\n", was found.
#!/usr/bin/python
#######################################################################
#
# SYMBOLS SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
from functools import partial
import os
import sys
try:
import regex as re
except ImportError:
import re
from DHParser import logging, is_filename, load_if_file, \
Grammar, Compiler, nil_preprocessor, PreprocessorToken, \
Lookbehind, Lookahead, Alternative, Pop, Token, Synonym, AllOf, SomeOf, Unordered, \
Option, NegativeLookbehind, OneOrMore, RegExp, Retrieve, Series, RE, Capture, \
ZeroOrMore, Forward, NegativeLookahead, mixin_comment, compile_source, \
last_value, counterpart, accumulate, PreprocessorFunc, \
Node, TransformationFunc, TransformationDict, \
traverse, remove_children_if, merge_children, is_anonymous, \
reduce_single_child, replace_by_single_child, replace_or_reduce, remove_whitespace, \
remove_expendables, remove_empty, remove_tokens, flatten, is_whitespace, \
is_empty, is_expendable, collapse, replace_content, WHITESPACE_PTYPE, TOKEN_PTYPE, \
remove_nodes, remove_content, remove_brackets, replace_parser, \
keep_children, is_one_of, has_content, apply_if, remove_first, remove_last, \
remove_anonymous_empty, keep_nodes, traverse_locally, strip
#######################################################################
#
# PREPROCESSOR SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
def ArithmeticPreprocessor(text):
    """Identity preprocessor for Arithmetic sources.

    The Arithmetic DSL needs no preprocessing, so the source text is
    returned unchanged together with an identity source-position mapping.
    """
    def identity_mapping(pos):
        # Positions in the "preprocessed" text equal the original ones.
        return pos

    return text, identity_mapping
def get_preprocessor() -> PreprocessorFunc:
    """Return the preprocessor function for Arithmetic sources.

    ``ArithmeticPreprocessor`` is an identity preprocessor (it returns
    the text unchanged), since no preprocessing is needed for this DSL.
    """
    return ArithmeticPreprocessor
#######################################################################
#
# PARSER SECTION - Don't edit! CHANGES WILL BE OVERWRITTEN!
#
#######################################################################
class ArithmeticGrammar(Grammar):
    r"""Parser for an Arithmetic source file, with this grammar:

    expression = term { ("+" | "-") term}
    term = factor { ("*"|"/") factor}
    factor = constant | variable | "(" expression ")"
    variable = "x" | "y" | "z"
    constant = digit {digit}
    digit = "0" | "1" | "..." | "9"
    test = digit constant variable
    """
    # NOTE: this section is auto-generated by DHParser from the EBNF
    # grammar above -- manual changes will be overwritten on regeneration.

    # Forward declarations for parsers that are referenced before they
    # are defined (recursive/mutual references are resolved via .set()).
    constant = Forward()
    digit = Forward()
    expression = Forward()
    variable = Forward()
    source_hash__ = "3064cea87c9ceb59ade35566a31c3d75"  # fingerprint of the EBNF source
    parser_initialization__ = "upon instantiation"
    COMMENT__ = r''              # no comment syntax defined for this DSL
    WHITESPACE__ = r'[\t ]*'     # horizontal whitespace only, no newlines
    WSP__ = mixin_comment(whitespace=WHITESPACE__, comment=COMMENT__)
    wspL__ = ''                  # no implicit whitespace to the left of tokens
    wspR__ = WSP__               # implicit whitespace to the right of tokens
    test = Series(digit, constant, variable)
    digit.set(Alternative(Token("0"), Token("1"), Token("..."), Token("9")))
    constant.set(Series(digit, ZeroOrMore(digit)))
    variable.set(Alternative(Token("x"), Token("y"), Token("z")))
    factor = Alternative(constant, variable, Series(Token("("), expression, Token(")")))
    term = Series(factor, ZeroOrMore(Series(Alternative(Token("*"), Token("/")), factor)))
    expression.set(Series(term, ZeroOrMore(Series(Alternative(Token("+"), Token("-")), term))))
    root__ = expression          # the grammar's start symbol
def get_grammar() -> ArithmeticGrammar:
    """Hand out the module-wide ``ArithmeticGrammar`` instance,
    instantiating it lazily on the first call.

    NOTE(review): despite its name, the singleton variable is a plain
    module global, not actually thread-local -- confirm whether
    concurrent use is intended.
    """
    global thread_local_Arithmetic_grammar_singleton
    if 'thread_local_Arithmetic_grammar_singleton' not in globals():
        thread_local_Arithmetic_grammar_singleton = ArithmeticGrammar()
    return thread_local_Arithmetic_grammar_singleton
#######################################################################
#
# AST SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
# Mapping from node (parser) names to lists of transformation functions
# that ``traverse`` applies to the concrete syntax tree.
# NOTE(review): per DHParser convention, "+" presumably applies to all
# nodes and "*" is the fallback for nodes without a specific entry --
# confirm against the DHParser.transform documentation.
Arithmetic_AST_transformation_table = {
    # AST Transformations for the Arithmetic-grammar
    "+": remove_empty,
    "expression": [],
    "term": [],
    "factor": [replace_or_reduce],
    "variable": [replace_or_reduce],
    "constant": [],
    "digit": [replace_or_reduce],
    "test": [],
    ":Token, :RE": reduce_single_child,
    "*": replace_by_single_child
}
def ArithmeticTransform() -> TransformationDict:
    """Create a fresh AST-transformation function.

    The transformation table is copied so that each transformer gets
    its own, independently modifiable processing table.
    """
    return partial(traverse, processing_table=Arithmetic_AST_transformation_table.copy())
def get_transformer() -> TransformationFunc:
    """Hand out the module-wide AST-transformation function for the
    Arithmetic grammar, creating it lazily on the first call.

    NOTE(review): despite its name, the singleton variable is a plain
    module global, not actually thread-local -- confirm whether
    concurrent use is intended.
    """
    global thread_local_Arithmetic_transformer_singleton
    if 'thread_local_Arithmetic_transformer_singleton' not in globals():
        thread_local_Arithmetic_transformer_singleton = ArithmeticTransform()
    return thread_local_Arithmetic_transformer_singleton
#######################################################################
#
# COMPILER SECTION - Can be edited. Changes will be preserved.
#
#######################################################################
class ArithmeticCompiler(Compiler):
    """Compiler for the abstract-syntax-tree of an Arithmetic source file.

    Only ``on_expression`` is implemented so far; the commented-out
    stubs below can be filled in as the compiler is developed.  Nodes
    without a dedicated ``on_...``-method are handled by the base
    class's fallback (see the DHParser compiler template).
    """

    def __init__(self, grammar_name="Arithmetic", grammar_source=""):
        super(ArithmeticCompiler, self).__init__(grammar_name, grammar_source)
        # BUGFIX: the pattern must be a raw string; in a plain string
        # '\w' and '\Z' are invalid escape sequences, which newer Python
        # versions flag with a SyntaxWarning (and may eventually reject).
        assert re.match(r'\w+\Z', grammar_name)

    def on_expression(self, node):
        return node

    # def on_term(self, node):
    #     return node

    # def on_factor(self, node):
    #     return node

    # def on_variable(self, node):
    #     return node

    # def on_constant(self, node):
    #     return node

    # def on_digit(self, node):
    #     return node

    # def on_test(self, node):
    #     return node
def get_compiler(grammar_name="Arithmetic", grammar_source="") -> ArithmeticCompiler:
    """Hand out the module-wide ``ArithmeticCompiler`` instance.

    The compiler is created lazily on the first call; on subsequent
    calls the cached instance is re-used and merely updated with the
    given grammar name and source.
    """
    global thread_local_Arithmetic_compiler_singleton
    if 'thread_local_Arithmetic_compiler_singleton' in globals():
        compiler = thread_local_Arithmetic_compiler_singleton
        compiler.set_grammar_name(grammar_name, grammar_source)
    else:
        thread_local_Arithmetic_compiler_singleton = \
            ArithmeticCompiler(grammar_name, grammar_source)
        compiler = thread_local_Arithmetic_compiler_singleton
    return compiler
#######################################################################
#
# END OF DHPARSER-SECTIONS
#
#######################################################################
def compile_src(source, log_dir=''):
    """Compiles ``source`` and returns (result, errors, ast).

    Args:
        source: either the source text itself or the path of a source file.
        log_dir: directory for parsing/AST logs; '' disables log output.
    """
    with logging(log_dir):
        compiler = get_compiler()
        cname = compiler.__class__.__name__
        # Base the log file name on the source file if one was given,
        # otherwise on the compiler class name.
        # BUGFIX: the original tested ``is_filename(source) < 0``, which is
        # always False for a boolean result, so the filename branch was
        # dead code.  (Assumes is_filename returns a bool -- TODO confirm.)
        log_file_name = os.path.basename(os.path.splitext(source)[0]) \
            if is_filename(source) else cname[:cname.find('.')] + '_out'
        result = compile_source(source, get_preprocessor(),
                                get_grammar(),
                                get_transformer(), compiler)
        return result
if __name__ == "__main__":
    # Command line: ArithmeticCompiler.py [-d | --debug] FILENAME
    if len(sys.argv) <= 1:
        print("Usage: ArithmeticCompiler.py [FILENAME]")
    else:
        file_name, log_dir = sys.argv[1], ''
        # A leading '-d'/'--debug' switches logging on and shifts the
        # file name to the next argument.
        if file_name in ['-d', '--debug'] and len(sys.argv) > 2:
            file_name, log_dir = sys.argv[2], 'LOGS'
        result, errors, ast = compile_src(file_name, log_dir)
        if errors:
            # Report errors with a path relative to the current directory
            # and signal failure via the exit code.
            cwd = os.getcwd()
            rel_path = file_name[len(cwd):] if file_name.startswith(cwd) else file_name
            for error in errors:
                print(rel_path + ':' + str(error))
            sys.exit(1)
        print(result.as_xml() if isinstance(result, Node) else result)
......@@ -138,7 +138,7 @@ class TestCompilerGeneration:
sys.path.append(self.tmp)
from TestCompilerGenerationCompiler import compile_src
print(compile_src)
# print(compile_src)
result, errors, ast = compile_src(self.trivial_text)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment