Commit 73c9aa65 authored by eckhart's avatar eckhart
Browse files

- XML-Parser verbessert

parent 7db02298
...@@ -523,7 +523,8 @@ class Node(collections.abc.Sized): ...@@ -523,7 +523,8 @@ class Node(collections.abc.Sized):
tail = tail.lstrip() tail = tail.lstrip()
usetab, sep = '', '' usetab, sep = '', ''
else: else:
usetab, sep = tab, '\n' usetab = tab if head else '' # no indentation if tag is already omitted
sep = '\n'
if self.children: if self.children:
content = [] content = []
...@@ -537,6 +538,7 @@ class Node(collections.abc.Sized): ...@@ -537,6 +538,7 @@ class Node(collections.abc.Sized):
res = cast(str, self.result) # safe, because if there are no children, result is a string res = cast(str, self.result) # safe, because if there are no children, result is a string
if not inline and not head: if not inline and not head:
# strip whitespace for omitted non inline node, e.g. CharData in mixed elements
res = res.strip() res = res.strip()
if density & 1 and res.find('\n') < 0: # and head[0] == "<": if density & 1 and res.find('\n') < 0: # and head[0] == "<":
# except for XML, add a gap between opening statement and content # except for XML, add a gap between opening statement and content
...@@ -635,7 +637,7 @@ class Node(collections.abc.Sized): ...@@ -635,7 +637,7 @@ class Node(collections.abc.Sized):
if node.tag_name in empty_tags: if node.tag_name in empty_tags:
assert not node.result, ("Node %s with content %s is not an empty element!" % assert not node.result, ("Node %s with content %s is not an empty element!" %
(node.tag_name, str(node))) (node.tag_name, str(node)))
ending = "/>\n" ending = "/>\n" if not node.tag_name[0] == '?' else "?>\n"
else: else:
ending = ">\n" ending = ">\n"
return "".join(txt + [ending]) return "".join(txt + [ending])
......
...@@ -527,7 +527,7 @@ def get_transformer() -> TransformationFunc: ...@@ -527,7 +527,7 @@ def get_transformer() -> TransformationFunc:
XML_AST_transformation_table = { XML_AST_transformation_table = {
# AST Transformations for the XML-grammar # AST Transformations for the XML-grammar
"+": [remove_empty, remove_anonymous_tokens, remove_whitespace, remove_nodes("S")], "+": [remove_empty, remove_anonymous_tokens, remove_whitespace, remove_nodes("S")],
"document": [], "document": [flatten(lambda context: context[-1].tag_name == 'prolog', recursive=False)],
"prolog": [], "prolog": [],
"XMLDecl": [], "XMLDecl": [],
"VersionInfo": [reduce_single_child], "VersionInfo": [reduce_single_child],
...@@ -675,16 +675,12 @@ class XMLCompiler(Compiler): ...@@ -675,16 +675,12 @@ class XMLCompiler(Compiler):
def __init__(self, grammar_name="XML", grammar_source=""): def __init__(self, grammar_name="XML", grammar_source=""):
super(XMLCompiler, self).__init__(grammar_name, grammar_source) super(XMLCompiler, self).__init__(grammar_name, grammar_source)
assert re.match('\w+\Z', grammar_name) assert re.match('\w+\Z', grammar_name)
self.cleanup_whitespace = True # remove empty CharData from mixed elements
def _reset(self): def _reset(self):
super()._reset() super()._reset()
self.mock_parsers = dict() self.mock_parsers = dict()
def on_document(self, node):
self.tree.omit_tags.add('CharData')
self.tree.inline_tags.update({'to', 'from', 'heading', 'body', 'remark'})
return self.fallback_compiler(node)
def extract_attributes(self, node_sequence): def extract_attributes(self, node_sequence):
attributes = OrderedDict() attributes = OrderedDict()
for node in node_sequence: for node in node_sequence:
...@@ -698,11 +694,42 @@ class XMLCompiler(Compiler): ...@@ -698,11 +694,42 @@ class XMLCompiler(Compiler):
"""Returns a mock parser with the given tag_name as parser name.""" """Returns a mock parser with the given tag_name as parser name."""
return self.mock_parsers.setdefault(tag_name, MockParser(tag_name)) return self.mock_parsers.setdefault(tag_name, MockParser(tag_name))
def validity_constraint(self, node, condition, err_msg):
"""If `condition` is False an error is issued."""
if not condition:
self.tree.add_error(node, err_msg)
def value_constraint(self, node, value, allowed):
"""If value is not in allowed, an error is issued."""
self.constraint(node, value in allowed,
'Invalid value "%s" for "standalone"! Must be one of %s.' % (value, str(allowed)))
def on_document(self, node):
self.tree.omit_tags.update({'CharData', 'document'})
# TODO: Remove the following line. It is specific for testing with example.xml!
self.tree.inline_tags.update({'to', 'from', 'heading', 'body', 'remark'})
return self.fallback_compiler(node)
# def on_prolog(self, node): # def on_prolog(self, node):
# return node # return node
# def on_XMLDecl(self, node): def on_XMLDecl(self, node):
# return node attributes = dict()
for child in node.children:
s = child.content
if child.tag_name == "VersionInfo":
attributes['version'] = s
elif child.tag_name == "EncodingDecl":
attributes['encoding'] = s
elif child.tag_name == "SDDecl":
attributes['standalone'] = s
self.value_constraint(node, s, {'yes', 'no'})
if attributes:
node.attributes.update(attributes)
node.result = ''
self.tree.empty_tags.add('?xml')
node.parser = self.get_parser('?xml')
return node
# def on_VersionInfo(self, node): # def on_VersionInfo(self, node):
# return node # return node
...@@ -874,11 +901,23 @@ class XMLCompiler(Compiler): ...@@ -874,11 +901,23 @@ class XMLCompiler(Compiler):
def on_element(self, node): def on_element(self, node):
stag = node['STag'] stag = node['STag']
tag_name = stag['Name'].content
attributes = self.extract_attributes(stag.children) attributes = self.extract_attributes(stag.children)
preserve_whitespace = tag_name in self.tree.inline_tags
if attributes: if attributes:
node.attributes.update(attributes) node.attributes.update(attributes)
node.parser = self.get_parser(stag['Name'].content) preserve_whitespace |= attributes.get('xml:space', '') == 'preserve'
node.result = self.compile_children(node.get('content', ZOMBIE_NODE)) node.parser = self.get_parser(tag_name)
content = self.compile_children(node.get('content', ZOMBIE_NODE))
if len(content) == 1:
if content[0].tag_name == "CharData":
# reduce single CharData children
content = content[0].content
elif self.cleanup_whitespace and not preserve_whitespace:
# remove CharData that consists only of whitespace from mixed elements
content = tuple(child for child in content
if child.tag_name != "CharData" or child.content.strip() != '')
node.result = content
return node return node
# def on_STag(self, node): # def on_STag(self, node):
...@@ -1050,6 +1089,7 @@ if __name__ == "__main__": ...@@ -1050,6 +1089,7 @@ if __name__ == "__main__":
print(rel_path + ':' + str(error)) print(rel_path + ':' + str(error))
sys.exit(1) sys.exit(1)
else: else:
print(result.as_sxpr(compact=True))
print(result.customized_XML() if isinstance(result, Node) else result) print(result.customized_XML() if isinstance(result, Node) else result)
else: else:
print("Usage: XMLCompiler.py [FILENAME]") print("Usage: XMLCompiler.py [FILENAME]")
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<to>Tove</to> <to>Tove</to>
<from>Jani</from> <from>Jani</from>
<heading>Reminder</heading> <heading>Reminder</heading>
<body>Don't forget me this weekend!</body> <body> Don't forget me this weekend! </body>
<priority level="high" /> <priority level="high" />
<remark></remark> <remark></remark>
</note> </note>
\ No newline at end of file
...@@ -23,7 +23,6 @@ import copy ...@@ -23,7 +23,6 @@ import copy
import sys import sys
sys.path.extend(['../', './']) sys.path.extend(['../', './'])
from DHParser.error import Error
from DHParser.syntaxtree import Node, RootNode, parse_sxpr, parse_xml, flatten_sxpr, flatten_xml, TOKEN_PTYPE from DHParser.syntaxtree import Node, RootNode, parse_sxpr, parse_xml, flatten_sxpr, flatten_xml, TOKEN_PTYPE
from DHParser.transform import traverse, reduce_single_child, \ from DHParser.transform import traverse, reduce_single_child, \
replace_by_single_child, flatten, remove_expendables replace_by_single_child, flatten, remove_expendables
...@@ -166,37 +165,6 @@ class TestRootNode: ...@@ -166,37 +165,6 @@ class TestRootNode:
assert error_str.find("A") < error_str.find("B") assert error_str.find("A") < error_str.find("B")
# class TestErrorHandling:
# def test_error_flag_propagation(self):
# tree = parse_sxpr('(a (b c) (d (e (f (g h)))))')
#
# def find_h(context):
# node = context[-1]
# if node.result == "h":
# node.new_error("an error deep inside the syntax tree")
#
# assert not tree.error_flag
# traverse(tree, {"*": find_h})
# assert tree.error_flag, tree.as_sxpr()
#
# def test_collect_errors(self):
# tree = parse_sxpr('(A (B 1) (C (D (E 2) (F 3))))')
# A = tree
# B = next(tree.select(lambda node: str(node) == "1"))
# D = next(tree.select(lambda node: node.parser.name == "D"))
# F = next(tree.select(lambda node: str(node) == "3"))
# B.new_error("Error in child node")
# F.new_error("Error in child's child node")
# tree.error_flag = Error.ERROR
# errors = tree.collect_errors()
# assert len(errors) == 2, str(errors)
# assert A.error_flag
# assert D.error_flag
# errors = tree.collect_errors(clear_errors=True)
# assert len(errors) == 2
# assert not D.error_flag
class TestNodeFind(): class TestNodeFind():
"""Test the select-functions of class Node. """Test the select-functions of class Node.
""" """
...@@ -268,6 +236,14 @@ class TestSerialization: ...@@ -268,6 +236,14 @@ class TestSerialization:
assert s == '(A\n (B\n (C\n "D"\n "X"\n )' \ assert s == '(A\n (B\n (C\n "D"\n "X"\n )' \
'\n (E\n "F"\n )\n )\n (G\n " H "\n " Y "\n )\n)', s '\n (E\n "F"\n )\n )\n (G\n " H "\n " Y "\n )\n)', s
def test_compact_representation(self):
tree = parse_sxpr('(A (B (C "D") (E "F")) (G "H"))')
compact = tree.as_sxpr(compact=True)
assert compact == 'A\n B\n C "D"\n E "F"\n G "H"', compact
tree = parse_sxpr('(A (B (C "D\nX") (E "F")) (G " H \n Y "))')
compact = tree.as_sxpr(compact=True)
assert compact == 'A\n B\n C\n "D"\n "X"\n E "F"' \
'\n G\n " H "\n " Y "', compact
def test_xml_inlining(self): def test_xml_inlining(self):
tree = parse_sxpr('(A (B "C") (D "E"))') tree = parse_sxpr('(A (B "C") (D "E"))')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment