Commit 73c9aa65 authored by eckhart

- XML-Parser verbessert

parent 7db02298
......@@ -523,7 +523,8 @@ class Node(collections.abc.Sized):
tail = tail.lstrip()
usetab, sep = '', ''
else:
usetab, sep = tab, '\n'
usetab = tab if head else '' # no indentation if tag is already omitted
sep = '\n'
if self.children:
content = []
......@@ -537,6 +538,7 @@ class Node(collections.abc.Sized):
res = cast(str, self.result) # safe, because if there are no children, result is a string
if not inline and not head:
# strip whitespace for omitted non inline node, e.g. CharData in mixed elements
res = res.strip()
if density & 1 and res.find('\n') < 0: # and head[0] == "<":
# except for XML, add a gap between opening statement and content
......@@ -635,7 +637,7 @@ class Node(collections.abc.Sized):
if node.tag_name in empty_tags:
assert not node.result, ("Node %s with content %s is not an empty element!" %
(node.tag_name, str(node)))
ending = "/>\n"
ending = "/>\n" if not node.tag_name[0] == '?' else "?>\n"
else:
ending = ">\n"
return "".join(txt + [ending])
......
......@@ -527,7 +527,7 @@ def get_transformer() -> TransformationFunc:
XML_AST_transformation_table = {
# AST Transformations for the XML-grammar
"+": [remove_empty, remove_anonymous_tokens, remove_whitespace, remove_nodes("S")],
"document": [],
"document": [flatten(lambda context: context[-1].tag_name == 'prolog', recursive=False)],
"prolog": [],
"XMLDecl": [],
"VersionInfo": [reduce_single_child],
......@@ -675,16 +675,12 @@ class XMLCompiler(Compiler):
def __init__(self, grammar_name="XML", grammar_source=""):
    """Initialize the compiler for the grammar called `grammar_name`.

    :param grammar_name: name of the grammar; it must consist of word
        characters only, otherwise the assertion below fails.
    :param grammar_source: handed on unchanged to the base-class
        initializer.
    """
    super(XMLCompiler, self).__init__(grammar_name, grammar_source)
    # Raw string: `\w` and `\Z` are regular-expression escapes, not string
    # escapes. The pattern value is the same as before, but the literal no
    # longer triggers an "invalid escape sequence" warning.
    assert re.match(r'\w+\Z', grammar_name)
    self.cleanup_whitespace = True  # remove empty CharData from mixed elements
def _reset(self):
    """Reset the base-class state and start over with an empty mapping of
    mock parsers."""
    super()._reset()
    self.mock_parsers = {}
def on_document(self, node):
    """Register serialization hints on the tree and compile `node` with the
    fallback compiler.

    Adds 'CharData' to the tree's `omit_tags`, adds the tag names of the
    example document to its `inline_tags`, and returns whatever
    `fallback_compiler` returns for `node`.
    """
    tree = self.tree
    tree.omit_tags.add('CharData')
    tree.inline_tags.update(('to', 'from', 'heading', 'body', 'remark'))
    return self.fallback_compiler(node)
def extract_attributes(self, node_sequence):
attributes = OrderedDict()
for node in node_sequence:
......@@ -698,11 +694,42 @@ class XMLCompiler(Compiler):
"""Returns a mock parser with the given tag_name as parser name."""
return self.mock_parsers.setdefault(tag_name, MockParser(tag_name))
def validity_constraint(self, node, condition, err_msg):
    """Add `err_msg` as an error for `node` to the tree unless `condition`
    is true."""
    if condition:
        return
    self.tree.add_error(node, err_msg)
def value_constraint(self, node, value, allowed):
    """If `value` is not in `allowed`, an error is issued for `node`.

    :param node: the node the error is attached to.
    :param value: the value to check.
    :param allowed: container of admissible values; its `str()` form is
        included in the error message.
    """
    # Delegate to `validity_constraint`, the helper defined next to this
    # method. The former call went to `self.constraint`, for which no
    # definition is visible in this class.
    self.validity_constraint(
        node, value in allowed,
        'Invalid value "%s" for "standalone"! Must be one of %s.' % (value, str(allowed)))
def on_document(self, node):
    """Register serialization hints on the tree and compile `node` with the
    fallback compiler.

    Adds 'CharData' and 'document' to the tree's `omit_tags`, adds the tag
    names of the example document to its `inline_tags`, and returns
    whatever `fallback_compiler` returns for `node`.
    """
    tree = self.tree
    tree.omit_tags.update(('CharData', 'document'))
    # TODO: Remove the following line. It is specific for testing with example.xml!
    tree.inline_tags.update(('to', 'from', 'heading', 'body', 'remark'))
    return self.fallback_compiler(node)
# def on_prolog(self, node):
# return node
# def on_XMLDecl(self, node):
# return node
def on_XMLDecl(self, node):
    """Turn an XMLDecl node into an empty `?xml` node carrying attributes.

    The content of the VersionInfo, EncodingDecl and SDDecl children is
    stored under the attribute names 'version', 'encoding' and
    'standalone'. The standalone value is additionally checked against
    'yes'/'no' via `value_constraint`. The node's result is then cleared,
    '?xml' is registered as an empty tag on the tree, and the node gets the
    parser returned by `get_parser('?xml')`.

    :returns: the same `node` object, modified in place.
    """
    attribute_names = {
        "VersionInfo": 'version',
        "EncodingDecl": 'encoding',
        "SDDecl": 'standalone',
    }
    collected = {}
    for part in node.children:
        text = part.content
        key = attribute_names.get(part.tag_name)
        if key is None:
            continue  # children with other tag names contribute no attribute
        collected[key] = text
        if key == 'standalone':
            self.value_constraint(node, text, {'yes', 'no'})
    if collected:
        node.attributes.update(collected)
    node.result = ''
    self.tree.empty_tags.add('?xml')
    node.parser = self.get_parser('?xml')
    return node
# def on_VersionInfo(self, node):
# return node
......@@ -874,11 +901,23 @@ class XMLCompiler(Compiler):
# Compiles an `element` node: reads the tag name and the attributes from its
# STag child, assigns the mock parser for that tag name and stores the
# compiled children as the node's result.
# NOTE(review): indentation is missing in this view and the span appears to
# interleave removed and added lines of a diff (node.parser and node.result
# are each assigned twice) -- confirm the final statement order and the
# nesting of the `elif` below against the actual file.
def on_element(self, node):
stag = node['STag']
tag_name = stag['Name'].content
attributes = self.extract_attributes(stag.children)
# elements whose tag name is listed in the tree's inline_tags are excluded
# from the whitespace cleanup further down
preserve_whitespace = tag_name in self.tree.inline_tags
if attributes:
node.attributes.update(attributes)
node.parser = self.get_parser(stag['Name'].content)
node.result = self.compile_children(node.get('content', ZOMBIE_NODE))
# an xml:space attribute with the value 'preserve' also disables the cleanup
preserve_whitespace |= attributes.get('xml:space', '') == 'preserve'
node.parser = self.get_parser(tag_name)
# presumably ZOMBIE_NODE stands in when the element has no 'content' child -- TODO confirm
content = self.compile_children(node.get('content', ZOMBIE_NODE))
if len(content) == 1:
if content[0].tag_name == "CharData":
# reduce single CharData children
content = content[0].content
elif self.cleanup_whitespace and not preserve_whitespace:
# remove CharData that consists only of whitespace from mixed elements
content = tuple(child for child in content
if child.tag_name != "CharData" or child.content.strip() != '')
node.result = content
return node
# def on_STag(self, node):
......@@ -1050,6 +1089,7 @@ if __name__ == "__main__":
print(rel_path + ':' + str(error))
sys.exit(1)
else:
print(result.as_sxpr(compact=True))
print(result.customized_XML() if isinstance(result, Node) else result)
else:
print("Usage: XMLCompiler.py [FILENAME]")
......@@ -3,7 +3,7 @@
<to>Tove</to>
<from>Jani</from>
<heading>Reminder</heading>
<body>Don't forget me this weekend!</body>
<body> Don't forget me this weekend! </body>
<priority level="high" />
<remark></remark>
</note>
\ No newline at end of file
......@@ -23,7 +23,6 @@ import copy
import sys
sys.path.extend(['../', './'])
from DHParser.error import Error
from DHParser.syntaxtree import Node, RootNode, parse_sxpr, parse_xml, flatten_sxpr, flatten_xml, TOKEN_PTYPE
from DHParser.transform import traverse, reduce_single_child, \
replace_by_single_child, flatten, remove_expendables
......@@ -166,37 +165,6 @@ class TestRootNode:
assert error_str.find("A") < error_str.find("B")
# class TestErrorHandling:
# def test_error_flag_propagation(self):
# tree = parse_sxpr('(a (b c) (d (e (f (g h)))))')
#
# def find_h(context):
# node = context[-1]
# if node.result == "h":
# node.new_error("an error deep inside the syntax tree")
#
# assert not tree.error_flag
# traverse(tree, {"*": find_h})
# assert tree.error_flag, tree.as_sxpr()
#
# def test_collect_errors(self):
# tree = parse_sxpr('(A (B 1) (C (D (E 2) (F 3))))')
# A = tree
# B = next(tree.select(lambda node: str(node) == "1"))
# D = next(tree.select(lambda node: node.parser.name == "D"))
# F = next(tree.select(lambda node: str(node) == "3"))
# B.new_error("Error in child node")
# F.new_error("Error in child's child node")
# tree.error_flag = Error.ERROR
# errors = tree.collect_errors()
# assert len(errors) == 2, str(errors)
# assert A.error_flag
# assert D.error_flag
# errors = tree.collect_errors(clear_errors=True)
# assert len(errors) == 2
# assert not D.error_flag
class TestNodeFind():
"""Test the select-functions of class Node.
"""
......@@ -268,6 +236,14 @@ class TestSerialization:
assert s == '(A\n (B\n (C\n "D"\n "X"\n )' \
'\n (E\n "F"\n )\n )\n (G\n " H "\n " Y "\n )\n)', s
def test_compact_representation(self):
    """Check `as_sxpr(compact=True)` for a tree with single-line leaves and
    for a tree whose leaves contain line breaks."""
    single_line = parse_sxpr('(A (B (C "D") (E "F")) (G "H"))')
    rendered = single_line.as_sxpr(compact=True)
    assert rendered == 'A\n B\n C "D"\n E "F"\n G "H"', rendered

    multi_line = parse_sxpr('(A (B (C "D\nX") (E "F")) (G " H \n Y "))')
    rendered = multi_line.as_sxpr(compact=True)
    expected = ('A\n B\n C\n "D"\n "X"\n E "F"'
                '\n G\n " H "\n " Y "')
    assert rendered == expected, rendered
def test_xml_inlining(self):
tree = parse_sxpr('(A (B "C") (D "E"))')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment