
Commit ce067809 authored by Eckhart Arnold

- syntaxtree: S-expression serialization now supports attributes

parent 06de1882
@@ -25,6 +25,7 @@ parser classes are defined in the ``parse`` module.
import collections.abc
+from collections import OrderedDict
import copy
from DHParser.error import Error, linebreaks, line_col
@@ -516,6 +517,13 @@ class Node(collections.abc.Sized):
        return errors

+    @property
+    def attributes(self):
+        """Returns a dictionary of XML-Attributes attached to the Node."""
+        if not hasattr(self, '_xml_attr'):
+            self._xml_attr = OrderedDict()
+        return self._xml_attr
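The dictionary is created lazily on first access, so nodes without attributes carry no extra data. A minimal usage sketch (the ``mock_syntax_tree`` helper is defined further down in this same file; everything else is ordinary dictionary access)::

    node = mock_syntax_tree('(A "B")')
    node.attributes['attr'] = "value"   # creates the OrderedDict on first access
    print(node.attributes)              # OrderedDict([('attr', 'value')])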
def _tree_repr(self, tab, open_fn, close_fn, data_fn=lambda i: i, density=0) -> str:
"""
@@ -580,16 +588,17 @@ class Node(collections.abc.Sized):
        def opening(node) -> str:
            """Returns the opening string for the representation of `node`."""
-           txt = left_bracket + node.tag_name
+           txt = [left_bracket, node.tag_name]
            # s += " '(pos %i)" % node.pos
+           if hasattr(node, '_xml_attr'):
+               txt.extend(""" `(%s "%s")""" % (k, v) for k, v in node.attributes.items())
            if src:
-               txt += " '(pos %i " % node.pos  # + " %i %i)" % line_col(src, node.pos)
+               txt.append(" `(pos %i %i %i)" % (node.pos, *line_col(src, node.pos)))
            # if node.error_flag:  # just for debugging error collecting
            #     txt += " HAS ERRORS"
            if showerrors and node.errors:
-               txt += " '(err '(%s))" % ' '.join(str(err).replace('"', r'\"')
-                                                 for err in node.errors)
-           return txt + '\n'
+               txt.append(" `(err %s)" % ' '.join(str(err) for err in node.errors))
+           return "".join(txt) + '\n'
def closing(node) -> str:
"""Returns the closing string for the representation of `node`."""
@@ -604,14 +613,6 @@ class Node(collections.abc.Sized):
return self._tree_repr(' ', opening, closing, pretty, density=density)
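With this change, attributes are serialized as backtick-quoted pairs directly after the tag name. For illustration, a node tagged ``A`` with the attribute ``attr="value"`` and the string content "B" now serializes as (cf. the new test case at the bottom of this commit)::

    (A `(attr "value") "B")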
-   @property
-   def attributes(self):
-       """Returns a dictionary of XML-Attributes attached to the Node."""
-       if not hasattr(self, '_xml_attr'):
-           self._xml_attr = dict()
-       return self._xml_attr
def as_xml(self, src: str = None, showerrors: bool = True) -> str:
"""
Returns content as XML-tree.
@@ -625,12 +626,13 @@ class Node(collections.abc.Sized):
        def opening(node) -> str:
            """Returns the opening string for the representation of `node`."""
            txt = ['<', node.tag_name]
            # s += ' pos="%i"' % node.pos
+           has_reserved_attrs = hasattr(node, '_xml_attr') \
+                                and any(r in node.attributes for r in {'err', 'line', 'col'})
            if hasattr(node, '_xml_attr'):
                txt.extend(' %s="%s"' % (k, v) for k, v in node.attributes.items())
-           if src:
+           if src and not has_reserved_attrs:
                txt.append(' line="%i" col="%i"' % line_col(line_breaks, node.pos))
-           if showerrors and node.errors:
+           if showerrors and node.errors and not has_reserved_attrs:
                txt.append(' err="%s"' % ''.join(str(err).replace('"', r'\"')
                                                 for err in node.errors))
            return "".join(txt + [">\n"])
@@ -728,7 +730,7 @@ class Node(collections.abc.Sized):
ZOMBIE_NODE = Node(ZOMBIE_PARSER, '')
-def mock_syntax_tree(sxpr):
+def mock_syntax_tree(sxpr: str) -> Node:
"""
Generates a tree of nodes from an S-expression. The main purpose of this is
to generate test data.
@@ -763,7 +765,7 @@ def mock_syntax_tree(sxpr):
else 'Malformed S-expression. Closing bracket(s) ")" missing.'
raise AssertionError(errmsg)
-   sxpr = sxpr.strip()
+   sxpr = StringView(sxpr).strip()
if sxpr[0] != '(':
raise ValueError('"(" expected, not ' + sxpr[:10])
# assert sxpr[0] == '(', sxpr
@@ -774,15 +776,33 @@ def mock_syntax_tree(sxpr):
'not "%s"' % sxpr[:40].replace('\n', ''))
name, class_name = (sxpr[:match.end()].split(':') + [''])[:2]
sxpr = sxpr[match.end():].strip()
pos = 0
attributes = OrderedDict()
if sxpr[0] == '(':
result = tuple(mock_syntax_tree(block) for block in next_block(sxpr))
pos = 0
for node in result:
node._pos = pos
pos += len(node)
else:
lines = []
while sxpr and sxpr[0] != ')':
# parse attributes
while sxpr[:2] == "`(":
i = sxpr.find('"')
k = sxpr.find(')')
if sxpr[2:5] == "pos" and (i < 0 or k < i):
pos = int(sxpr[5:k].strip().split(' ')[0])
elif sxpr[2:5] == "err":
m = sxpr.find('(', 5)
while m >= 0 and m < k:
m = sxpr.find('(', k)
k = max(k, sxpr.find(')', max(m, 0)))
else:
attr = sxpr[2:i].strip()
value = sxpr[i:k].strip()[1:-1]
attributes[attr] = value
sxpr = sxpr[k+1:].strip()
# parse content
for qtmark in ['"""', "'''", '"', "'"]:
match = re.match(qtmark + r'.*?' + qtmark, sxpr, re.DOTALL)
if match:
@@ -796,7 +816,9 @@ def mock_syntax_tree(sxpr):
                    sxpr = sxpr[match.end():]
        result = "\n".join(lines)
    node = Node(MockParser(name, ':' + class_name), result)
-   node._pos = 0
+   if attributes:
+       node.attributes.update(attributes)
+   node._pos = pos
    return node
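``mock_syntax_tree`` can now round-trip the attribute notation that ``as_sxpr`` emits, e.g. (adapted from the new test case below)::

    tree = mock_syntax_tree('(A `(attr "value") "B")')
    print(tree.attributes['attr'])   # value
    print(tree.as_sxpr())            # serializes the attribute in the same notation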
# if __name__ == "__main__":
@@ -260,6 +260,22 @@ our DSL. Rules in general always consist of a symbol on the left hand side of
a "="-sign (which in this context can be unterstood as a definition signifier)
and the definition of the rule on the right hand side.
.. note:: Traditional parser technology for context-free grammars often
distinguishes two phases, *scanning* and *parsing*, where a lexical scanner
would take a stream of characters and yield a sequence of tokens and the
actual parser would then operate on the stream of tokens. DHParser,
however, is an instance of a *scannerless parser* where the functionality
of the lexical scanner is seamlessly integrated into the
parser. This is done by allowing regular expressions in the definiendum of
grammar symbols. The regular expressions do the work of the lexical
scanner.
Theoretically, one could do without scanners or regular expressions.
Because regular languages are a subset of context-free languages, parsers
for context-free languages can do all the work that regular expressions can
do. But it makes things easier - and, in the case of DHParser, also faster
- to have them.
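For instance, in the grammar of our demo DSL below, a single rule does the scanner's job::

    WORD = /\w+/~

The regular expression ``/\w+/`` captures the characters of a word directly, while the trailing ``~`` silently consumes any insignificant whitespace that follows.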
In our case the text as a whole, conveniently named "document" (any other name
would be allowed, too), consists of a leading whitespace and a possibly empty
sequence of an arbitrary number of words, ending only if the end of file
@@ -495,7 +511,7 @@ luckily not in the case of our short demo example::
$ firefox LOGS/macbeth_full_parser.log.html &
-..picture parsing_history.png
+.. image:: parsing_history.png
What you see is a representation of the parsing history. It might look a bit
tedious in the beginning, especially the column that contains the parser
@@ -519,7 +535,90 @@ parser matched, the last column displays exactly that section of the text that
the parser did match. If the parser did not match, the last column displays
the text that still lies ahead and has not yet been parsed.
-In our concrete example, we can see that the parser "WORD" matches "Life", but not "Life’s" or "’s". And this ultimately leads to the failure of the parsing process as a whole.
In our concrete example, we can see that the parser "WORD" matches "Life", but
not "Life’s" or "’s". And this ultimately leads to the failure of the parsing
process as a whole. The simplest solution would be to add the apostrophe to
the list of allowed characters in a word by changing the respective line in
the grammar definition to ``WORD = /[\w’]+/``. But before we change the
grammar, let us first add another test case that captures this kind of error.
Since we have decided that "Life’s" should be parsed as a single word, let's
open the file "grammar_tests/01_test_word.ini" and add the following test::
[match:WORD]
M3: Life’s
To be sure that the new test captures the error we have found, you might want
to run the script "tst_poetry_grammar.py" and verify that it reports the
failure of test "M3" in the suite "01_test_word.ini". After that, change the
regular expression for the symbol WORD in the grammar file "poetry.ebnf" as
just described. Now both the tests and the compilation of the file
"macbeth.dsl" should run through smoothly.
.. caution:: Depending on the purpose of your DSL, the simple solution of
allowing apostrophes within words might not be what you want. After all,
"Life’s" is but a shorthand for the two-word phrase "Life is". Whatever
alternative solution comes to your mind, be aware that there are also cases
like Irish names, say "O’Dolan", where the apostrophe is actually part of the
word, and cases like "don’t" which, if expanded, would be two words *not*
separated at the position of the apostrophe.
We leave it as an exercise, first, to figure out what different cases exist
for the use of apostrophes in the middle of a word; secondly, to make a
reasonable decision which of these should be treated as single and which as
separate words; and, finally, if possible, to write a grammar that provides
for these cases. These steps are quite typical of the kind of challenges that
occur while designing a DSL for a Digital Humanities project.
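As a hypothetical starting point for this exercise (not a full solution), one could restrict apostrophes to word-internal positions, which accepts "O’Dolan" and "don’t" while still rejecting a bare leading or trailing "’"::

    WORD = /\w+(?:’\w+)*/~

Whether forms like "Life’s" should nevertheless be split into two words is a semantic question that a regular expression alone cannot decide.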
Controlling abstract-syntax-tree generation
-------------------------------------------
Compiling the example "macbeth.dsl" with the command ``python poetryCompiler.py
macbeth.dsl``, you can hardly avoid the impression that the output is rather
verbose. Just looking at the beginning of the output, we find::
<document>
<:ZeroOrMore>
<sentence>
<part>
<WORD>
<:RegExp>Life’s</:RegExp>
<:Whitespace> </:Whitespace>
</WORD>
<WORD>
<:RegExp>but</:RegExp>
<:Whitespace> </:Whitespace>
</WORD>
...
But why do we need to know all those details? Why would we need a
":ZeroOrMore" element inside the "<document>" element, if the
"<sentence>"-elements could just as well be direct descendants of the
"<document>"-element? Why do we need the information that "Life’s" has been
captured by a regular expression parser? Wouldn't it suffice to know that the
word captured is "Life’s"? And is the whitespace really needed at all? If the
words in a sequence are by definition separated by whitespace, then it would
suffice to have the words without whitespace in our tree and to add the
whitespace only later, when transforming the tree into some output format. (On
the other hand, it might be convenient to have it in the tree nevertheless...)
Well, the answer to most of these questions is that what our compilation
script yields is more or less the output of the parser, which in turn is the
*concrete syntax tree* of the parsed text. Being a concrete syntax tree, it is
by its very nature verbose, because it captures every minute syntactic detail
described in the grammar and found in the text, no matter how irrelevant that
detail is when we are primarily interested in the structure of our text.
In order to make the tree more manageable, we first have to transform it into
an *abstract syntax tree*, which is called thus because it abstracts from all
details that we deem irrelevant. Now, which details we consider irrelevant is
almost entirely up to us. And we should think carefully about which features
must be included in the abstract syntax tree, because the abstract syntax tree
more or less reflects the data model (or is at most one step away from it)
with which we want to capture our material.
For the sake of our example, let's assume that we are not interested in
whitespace and that we want to get rid of all uninformative nodes, i.e.
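In DHParser, such clean-up steps are typically declared in a transformation table that maps tag names to lists of transformations, which ``traverse`` then applies to the tree. The following sketch is hypothetical; the table entries and helper names are meant only to illustrate the idea and should be checked against DHParser's ``transform`` module::

    from DHParser.transform import (traverse, flatten, remove_whitespace,
                                    reduce_single_child)

    # hypothetical clean-up table for the poetry grammar
    poetry_AST_transformation_table = {
        "WORD": [remove_whitespace, reduce_single_child],  # drop ~, unwrap :RegExp
        ":ZeroOrMore": [flatten],                          # splice children into parent
    }

    traverse(syntax_tree, poetry_AST_transformation_table)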
@@ -32,7 +32,7 @@ import DHParser.log
from DHParser.log import log_parsing_history
-LOGGING = True
+LOGGING = False
if not DHParser.dsl.recompile_grammar('LaTeX.ebnf', force=False): # recompiles Grammar only if it has changed
print('\nErrors while recompiling "LaTeX.ebnf":\n--------------------------------------\n\n')
@@ -2,6 +2,9 @@
M1: word
M2: one_word_with_underscores

+[match:WORD]
+M3: Life’s

[fail:WORD]
F1: two words
F2: ""
document = ~ { sentence } §EOF
sentence = part {"," part } "."
part = { WORD }+
-WORD = /\w+/~
+WORD = /[\w’]+/~
EOF = !/./
@@ -59,10 +59,10 @@ class new2Grammar(Grammar):
document = ~ { sentence } §EOF
sentence = part {"," part } "."
part = { WORD }+
-WORD = /\w+/~
+WORD = /[\w’]+/~
EOF = !/./
"""
source_hash__ = "42443aabc6dfc68ae4567289b74ab085"
source_hash__ = "7a9984368b1c959222099d389d18c54f"
parser_initialization__ = "upon instantiation"
COMMENT__ = r''
WHITESPACE__ = r'\s*'
@@ -71,7 +71,7 @@ class new2Grammar(Grammar):
wspR__ = WSP__
whitespace__ = Whitespace(WSP__)
EOF = NegativeLookahead(RegExp('.'))
-WORD = RE('\\w+')
+WORD = RE('[\\w’]+')
part = OneOrMore(WORD)
sentence = Series(part, ZeroOrMore(Series(Token(","), part)), Token("."))
document = Series(whitespace__, ZeroOrMore(sentence), EOF, mandatory=2)
@@ -202,6 +202,17 @@ class TestNodeFind():
assert any(tree.select_by_tag('c', False))
+class TestSerialization:
+    def test_attributes(self):
+        tree = mock_syntax_tree('(A "B")')
+        tree.attributes['attr'] = "value"
+        tree2 = mock_syntax_tree('(A `(attr "value") "B")')
+        assert tree.as_sxpr() == tree2.as_sxpr()
+        tree.attributes['attr2'] = "value2"
+        tree3 = mock_syntax_tree('(A `(attr "value") `(attr2 "value2") "B")')
+        assert tree.as_sxpr() == tree3.as_sxpr()
if __name__ == "__main__":
    from DHParser.testing import runner
    runner("", globals())