
Commit ce067809 authored by Eckhart Arnold

- syntaxtree: S-expression serialization now supports attributes

parent 06de1882
@@ -25,6 +25,7 @@ parser classes are defined in the ``parse`` module.
 import collections.abc
+from collections import OrderedDict
 import copy
 from DHParser.error import Error, linebreaks, line_col

@@ -516,6 +517,13 @@ class Node(collections.abc.Sized):
         return errors

+    @property
+    def attributes(self):
+        """Returns a dictionary of XML-Attributes attached to the Node."""
+        if not hasattr(self, '_xml_attr'):
+            self._xml_attr = OrderedDict()
+        return self._xml_attr
+
     def _tree_repr(self, tab, open_fn, close_fn, data_fn=lambda i: i, density=0) -> str:
         """
@@ -580,16 +588,17 @@ class Node(collections.abc.Sized):
         def opening(node) -> str:
             """Returns the opening string for the representation of `node`."""
-            txt = left_bracket + node.tag_name
+            txt = [left_bracket, node.tag_name]
             # s += " '(pos %i)" % node.pos
+            if hasattr(node, '_xml_attr'):
+                txt.extend(""" `(%s "%s")""" % (k, v) for k, v in node.attributes.items())
             if src:
-                txt += " '(pos %i " % node.pos  # + " %i %i)" % line_col(src, node.pos)
+                txt.append(" `(pos %i %i %i)" % (node.pos, *line_col(src, node.pos)))
             # if node.error_flag:  # just for debugging error collecting
             #     txt += " HAS ERRORS"
             if showerrors and node.errors:
-                txt += " '(err '(%s))" % ' '.join(str(err).replace('"', r'\"')
-                                                  for err in node.errors)
-            return txt + '\n'
+                txt.append(" `(err %s)" % ' '.join(str(err) for err in node.errors))
+            return "".join(txt) + '\n'

         def closing(node) -> str:
             """Returns the closing string for the representation of `node`."""
@@ -604,14 +613,6 @@ class Node(collections.abc.Sized):
         return self._tree_repr(' ', opening, closing, pretty, density=density)

-    @property
-    def attributes(self):
-        """Returns a dictionary of XML-Attributes attached to the Node."""
-        if not hasattr(self, '_xml_attr'):
-            self._xml_attr = dict()
-        return self._xml_attr
-
     def as_xml(self, src: str = None, showerrors: bool = True) -> str:
         """
         Returns content as XML-tree.
@@ -625,12 +626,13 @@ class Node(collections.abc.Sized):
         def opening(node) -> str:
             """Returns the opening string for the representation of `node`."""
             txt = ['<', node.tag_name]
-            # s += ' pos="%i"' % node.pos
+            has_reserved_attrs = hasattr(node, '_xml_attr') \
+                and any(r in node.attributes for r in {'err', 'line', 'col'})
             if hasattr(node, '_xml_attr'):
                 txt.extend(' %s="%s"' % (k, v) for k, v in node.attributes.items())
-            if src:
+            if src and not has_reserved_attrs:
                 txt.append(' line="%i" col="%i"' % line_col(line_breaks, node.pos))
-            if showerrors and node.errors:
+            if showerrors and node.errors and not has_reserved_attrs:
                 txt.append(' err="%s"' % ''.join(str(err).replace('"', r'\"')
                                                  for err in node.errors))
             return "".join(txt + [">\n"])
@@ -728,7 +730,7 @@ class Node(collections.abc.Sized):

 ZOMBIE_NODE = Node(ZOMBIE_PARSER, '')

-def mock_syntax_tree(sxpr):
+def mock_syntax_tree(sxpr: str) -> Node:
     """
     Generates a tree of nodes from an S-expression. The main purpose of this is
     to generate test data.

@@ -763,7 +765,7 @@ def mock_syntax_tree(sxpr):
               else 'Malformed S-expression. Closing bracket(s) ")" missing.'
         raise AssertionError(errmsg)

-    sxpr = sxpr.strip()
+    sxpr = StringView(sxpr).strip()
     if sxpr[0] != '(':
         raise ValueError('"(" expected, not ' + sxpr[:10])
     # assert sxpr[0] == '(', sxpr
@@ -774,15 +776,33 @@ def mock_syntax_tree(sxpr):
                          'not "%s"' % sxpr[:40].replace('\n', ''))
     name, class_name = (sxpr[:match.end()].split(':') + [''])[:2]
     sxpr = sxpr[match.end():].strip()
+    pos = 0
+    attributes = OrderedDict()
     if sxpr[0] == '(':
         result = tuple(mock_syntax_tree(block) for block in next_block(sxpr))
-        pos = 0
         for node in result:
             node._pos = pos
             pos += len(node)
     else:
         lines = []
         while sxpr and sxpr[0] != ')':
+            # parse attributes
+            while sxpr[:2] == "`(":
+                i = sxpr.find('"')
+                k = sxpr.find(')')
+                if sxpr[2:5] == "pos" and (i < 0 or k < i):
+                    pos = int(sxpr[5:k].strip().split(' ')[0])
+                elif sxpr[2:5] == "err":
+                    m = sxpr.find('(', 5)
+                    while m >= 0 and m < k:
+                        m = sxpr.find('(', k)
+                        k = max(k, sxpr.find(')', max(m, 0)))
+                else:
+                    attr = sxpr[2:i].strip()
+                    value = sxpr[i:k].strip()[1:-1]
+                    attributes[attr] = value
+                sxpr = sxpr[k + 1:].strip()
+            # parse content
             for qtmark in ['"""', "'''", '"', "'"]:
                 match = re.match(qtmark + r'.*?' + qtmark, sxpr, re.DOTALL)
                 if match:

@@ -796,7 +816,9 @@ def mock_syntax_tree(sxpr):
             sxpr = sxpr[match.end():]
         result = "\n".join(lines)
     node = Node(MockParser(name, ':' + class_name), result)
-    node._pos = 0
+    if attributes:
+        node.attributes.update(attributes)
+    node._pos = pos
     return node

 # if __name__ == "__main__":
...
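Taken together, these changes let node attributes survive a round trip through the
S-expression serialization: ``as_sxpr()`` now writes each attribute as a
backtick-prefixed sub-expression of the form `(attr "value"), and
``mock_syntax_tree`` reads that syntax back in. (In the XML serialization, "err",
"line" and "col" act as reserved names: if a node carries attributes with these
names, the automatically generated position and error attributes are suppressed.)
A minimal usage sketch, mirroring the new unit test below and assuming
``mock_syntax_tree`` can be imported from ``DHParser.syntaxtree`` (the import path
is an assumption)::

    from DHParser.syntaxtree import mock_syntax_tree

    # attributes can be set programmatically ...
    tree = mock_syntax_tree('(A "B")')
    tree.attributes['attr'] = "value"

    # ... or written inline in the S-expression itself
    tree2 = mock_syntax_tree('(A `(attr "value") "B")')

    # both spellings produce the same serialization
    assert tree.as_sxpr() == tree2.as_sxpr()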
@@ -260,6 +260,22 @@ our DSL. Rules in general always consist of a symbol on the left hand side of
 a "="-sign (which in this context can be understood as a definition signifier)
 and the definition of the rule on the right hand side.
+.. note:: Traditional parser technology for context-free grammars often
+   distinguishes two phases, *scanning* and *parsing*, where a lexical scanner
+   would take a stream of characters and yield a sequence of tokens and the
+   actual parser would then operate on the stream of tokens. DHParser,
+   however, is an instance of a *scannerless parser*, where the functionality
+   of the lexical scanner is seamlessly integrated into the parser. This is
+   done by allowing regular expressions in the definiendum of grammar
+   symbols. The regular expressions do the work of the lexical scanner.
+
+   Theoretically, one could do without scanners or regular expressions.
+   Because regular languages are a subset of context-free languages, parsers
+   for context-free languages can do all the work that regular expressions
+   can do. But it makes things easier - and, in the case of DHParser, also
+   faster - to have them.
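To make the division of labour concrete: in a scannerless parser, a regular
expression is simply one more parser that consumes characters directly from the
input. A minimal sketch in plain Python (illustrative only, not DHParser's
actual implementation)::

    import re

    WORD = re.compile(r'\w+')   # the "scanner": recognizes word tokens

    def parse_word(text: str, pos: int):
        """The "parser" invokes the regex exactly where a token is needed."""
        m = WORD.match(text, pos)
        return (m.group(0), m.end()) if m else (None, pos)

    print(parse_word("Life is but a walking shadow", 0))   # ('Life', 4)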
 In our case the text as a whole, conveniently named "document" (any other name
 would be allowed, too), consists of a leading whitespace, a possibly empty
 sequence of an arbitrary number of words ending only if the end of file
@@ -495,7 +511,7 @@ luckily not in the case of our short demo example::

     $ firefox LOGS/macbeth_full_parser.log.html &

-..picture parsing_history.png
+.. image:: parsing_history.png
 What you see is a representation of the parsing history. It might look a bit
 tedious in the beginning, especially this column that contains the parser

@@ -519,7 +535,90 @@ parser matched, the last column displays exactly that section of the text that
 the parser did match. If the parser did not match, the last column displays
 the text that still lies ahead and has not yet been parsed.
-In our concrete example, we can see that the parser "WORD" matches "Life", but not "Life’s" or "’s". And this ultimately leads to the failure of the parsing process as a whole.
+In our concrete example, we can see that the parser "WORD" matches "Life", but
+not "Life’s" or "’s". And this ultimately leads to the failure of the parsing
+process as a whole. The simplest solution would be to add the apostrophe to
+the list of allowed characters in a word by changing the respective line in
+the grammar definition to ``WORD = /[\w’]+/``. Now, before we even change the
+grammar, we first add another test case to capture this kind of error. Since
+we have decided that "Life’s" should be parsed as a single word, let's open
+the file "grammar_tests/01_test_word.ini" and add the following test::
+
+    [match:WORD]
+    M3: Life’s
+
+To be sure that the new test captures the error we have found, you might want
+to run the script "tst_poetry_grammar.py" and verify that it reports the
+failure of test "M3" in the suite "01_test_word.ini". After that, change the
+regular expression for the symbol WORD in the grammar file "poetry.ebnf" as
+just described. Now both the tests and the compilation of the file
+"macbeth.dsl" should run through smoothly.
+.. caution:: Depending on the purpose of your DSL, the simple solution of
+   allowing apostrophes within words might not be what you want. After all,
+   "Life’s" is but a shorthand for the two-word phrase "Life is". Whatever
+   alternative solution comes to your mind, be aware that there are also
+   cases like Irish names, say "O’Dolan", where the apostrophe is actually
+   a part of a word, and cases like "don’t" which, if expanded, would be
+   two words *not* separated at the position of the apostrophe.
+
+We leave it as an exercise, first, to figure out what different cases for the
+use of apostrophes in the middle of a word exist; secondly, to make a
+reasonable decision which of these should be treated as single and which as
+separate words; and, finally, if possible, to write a grammar that provides
+for these cases. These steps are quite typical of the kind of challenges that
+occur during the design of a DSL for a Digital Humanities project.
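As a starting point for that exercise, here is a hedged sketch of one possible
refinement: a regular expression that admits apostrophes only *between* word
characters. It keeps "O’Dolan" and "don’t" intact and rejects leading or
trailing apostrophes, but note that it still treats the contraction "Life’s" as
a single word - telling contractions apart from genuine one-word names would
require a lexicon rather than a regex::

    import re

    # apostrophes allowed only between word characters
    WORD_RE = re.compile(r'\w+(?:’\w+)*')

    for sample in ["O’Dolan", "don’t", "Life’s", "’tis"]:
        m = WORD_RE.match(sample)
        print(sample, '->', m.group(0) if m else None)
    # O’Dolan -> O’Dolan, don’t -> don’t, Life’s -> Life’s, ’tis -> None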
 Controlling abstract-syntax-tree generation
 -------------------------------------------
+Compiling the example "macbeth.dsl" with the command ``python poetryCompiler.py
+macbeth.dsl``, you can hardly avoid the impression that the output is rather
+verbose. Just looking at the beginning of the output, we find::
+
+    <document>
+        <:ZeroOrMore>
+            <sentence>
+                <part>
+                    <WORD>
+                        <:RegExp>Life’s</:RegExp>
+                        <:Whitespace> </:Whitespace>
+                    </WORD>
+                    <WORD>
+                        <:RegExp>but</:RegExp>
+                        <:Whitespace> </:Whitespace>
+                    </WORD>
+                    ...
+But why do we need to know all those details? Why would we need a
+":ZeroOrMore" element inside the "<document>" element, if the
+"<sentence>"-elements could just as well be direct descendants of the
+"<document>"-element? Why do we need the information that "Life’s" has been
+captured by a regular expression parser? Wouldn't it suffice to know that the
+word captured is "Life’s"? And is the whitespace really needed at all? If the
+words in a sequence are by definition separated by whitespace, then it would
+suffice to have the words without whitespace in our tree and to add the
+whitespace only later, when transforming the tree into some kind of output
+format. (On the other hand, it might be convenient to have it in the tree
+nevertheless...)
+Well, the answer to most of these questions is that what our compilation
+script yields is more or less the output of the parser, which in turn is the
+*concrete syntax tree* of the parsed text. Being a concrete syntax tree, it
+is by its very nature verbose, because it captures every minute syntactic
+detail described in the grammar and found in the text, no matter how
+irrelevant that detail is when we are primarily interested in the structure
+of our text. In order to make the tree more manageable, we first have to
+transform it into an *abstract syntax tree*, which is called thus because it
+abstracts from all details that we deem irrelevant. Now, which details we
+consider irrelevant is almost entirely up to us. And we should think
+carefully about what features must be included in the abstract syntax tree,
+because the abstract syntax tree more or less reflects the data model (or is
+at most one step away from it) with which we want to capture our material.
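In DHParser, this transformation step is typically driven by a table that maps
node tag names to lists of transformation functions, which a traversal routine
then applies to the tree. The following is a rough sketch only; it assumes
helpers along the lines of DHParser's ``transform`` module, and the exact
function names may differ between versions::

    # assumed API - check your DHParser version for the actual names
    from DHParser.transform import traverse, remove_whitespace, \
        reduce_single_child, replace_by_single_child

    poetry_AST_transformation_table = {
        # drop :Whitespace children, then unwrap the single :RegExp child
        "WORD": [remove_whitespace, reduce_single_child],
        # replace anonymous container nodes by their single child
        ":ZeroOrMore": [replace_by_single_child],
    }

    # cst is the concrete syntax tree returned by the parser
    traverse(cst, poetry_AST_transformation_table)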
+For the sake of our example, let's assume that we are not interested in
+whitespace and that we want to get rid of all uninformative Nodes, i.e.
@@ -32,7 +32,7 @@ import DHParser.log
 from DHParser.log import log_parsing_history

-LOGGING = True
+LOGGING = False

 if not DHParser.dsl.recompile_grammar('LaTeX.ebnf', force=False):  # recompiles Grammar only if it has changed
     print('\nErrors while recompiling "LaTeX.ebnf":\n--------------------------------------\n\n')
...
@@ -2,6 +2,9 @@
 M1: word
 M2: one_word_with_underscores

+[match:WORD]
+M3: Life’s
+
 [fail:WORD]
 F1: two words
 F2: ""
 document = ~ { sentence } §EOF
 sentence = part {"," part } "."
 part = { WORD }+
-WORD = /\w+/~
+WORD = /[\w’]+/~
 EOF = !/./
@@ -59,10 +59,10 @@ class new2Grammar(Grammar):
     document = ~ { sentence } §EOF
     sentence = part {"," part } "."
     part = { WORD }+
-    WORD = /\w+/~
+    WORD = /[\w’]+/~
     EOF = !/./
     """
-    source_hash__ = "42443aabc6dfc68ae4567289b74ab085"
+    source_hash__ = "7a9984368b1c959222099d389d18c54f"
     parser_initialization__ = "upon instantiation"
     COMMENT__ = r''
     WHITESPACE__ = r'\s*'

@@ -71,7 +71,7 @@ class new2Grammar(Grammar):
     wspR__ = WSP__
     whitespace__ = Whitespace(WSP__)
     EOF = NegativeLookahead(RegExp('.'))
-    WORD = RE('\\w+')
+    WORD = RE('[\\w’]+')
     part = OneOrMore(WORD)
     sentence = Series(part, ZeroOrMore(Series(Token(","), part)), Token("."))
     document = Series(whitespace__, ZeroOrMore(sentence), EOF, mandatory=2)
...
@@ -202,6 +202,17 @@ class TestNodeFind():
         assert any(tree.select_by_tag('c', False))

+class TestSerialization:
+    def test_attributes(self):
+        tree = mock_syntax_tree('(A "B")')
+        tree.attributes['attr'] = "value"
+        tree2 = mock_syntax_tree('(A `(attr "value") "B")')
+        assert tree.as_sxpr() == tree2.as_sxpr()
+        tree.attributes['attr2'] = "value2"
+        tree3 = mock_syntax_tree('(A `(attr "value") `(attr2 "value2") "B")')
+        assert tree.as_sxpr() == tree3.as_sxpr()
+
 if __name__ == "__main__":
     from DHParser.testing import runner
     runner("", globals())