10.12., 09:00 - 11:00: Due to updates, GitLab may be unavailable for a few minutes between 09:00 and 11:00.

Commit d8cc42fb authored by di68kap's avatar di68kap

- syntaxtree.py: fixed "parse_xml()"

parent 9f1872d8
......@@ -38,7 +38,7 @@ from DHParser.log import is_logging, HistoryRecord
from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME
from DHParser.stringview import StringView, EMPTY_STRING_VIEW
from DHParser.syntaxtree import Node, RootNode, ParserBase, WHITESPACE_PTYPE, \
TOKEN_PTYPE, ZOMBIE_PARSER
PLAINTEXT_PTYPE, TOKEN_PTYPE, ZOMBIE_PARSER
from DHParser.toolkit import sane_parser_name, escape_control_characters, re, typing
from typing import Callable, cast, Dict, DefaultDict, List, Set, Tuple, Union, Optional
......@@ -894,6 +894,7 @@ class PlainText(Parser):
>>> Grammar(while_token)("while").content
'while'
"""
assert PLAINTEXT_PTYPE == ":PlainText"
def __init__(self, text: str, name: str = '') -> None:
super().__init__(name)
......
......@@ -118,6 +118,7 @@ class StringView(collections.abc.Sized):
return self.fullstring
# since the slice is being copied now anyway, the copy might
# as well be stored in the string view
# return self.text[self.begin:self.end] # use this for debugging!
self.fullstring = self.text[self.begin:self.end]
return self.fullstring
......
......@@ -36,6 +36,7 @@ from typing import Callable, cast, Iterator, List, AbstractSet, Set, Union, Tupl
__all__ = ('ParserBase',
'WHITESPACE_PTYPE',
'PLAINTEXT_PTYPE',
'TOKEN_PTYPE',
'MockParser',
'ZombieParser',
......@@ -109,6 +110,7 @@ class ParserBase:
WHITESPACE_PTYPE = ':Whitespace'
PLAINTEXT_PTYPE = ':PlainText'
TOKEN_PTYPE = ':Token'
......@@ -873,13 +875,16 @@ def parse_sxpr(sxpr: str) -> Node:
return inner_parser(sxpr)
RX_WHITESPACE_TAIL = re.compile(r'\s*$')
def parse_xml(xml: str) -> Node:
"""
Generates a tree of nodes from a (Pseudo-)XML-source.
"""
xml = StringView(xml)
PlainText = MockParser('', ':PlainText')
mock_parsers = {':PlainText': PlainText}
PlainText = MockParser('', PLAINTEXT_PTYPE)
mock_parsers = {PLAINTEXT_PTYPE: PlainText}
def parse_attributes(s: StringView) -> Tuple[StringView, OrderedDict]:
"""Parses a sqeuence of XML-Attributes. Returns the string-slice
......@@ -900,7 +905,8 @@ def parse_xml(xml: str) -> Node:
match = s.match(re.compile(r'<\s*(?P<tagname>[\w:]+)\s*'))
assert match
tagname = match.groupdict()['tagname']
s, attributes = parse_attributes(s[match.end() - s.begin:])
section = s[match.end() - s.begin:]
s, attributes = parse_attributes(section)
i = s.find('>')
assert i >= 0
return s[i+1:], tagname, attributes, s[i-1] == "/"
......@@ -931,22 +937,24 @@ def parse_xml(xml: str) -> Node:
if not solitary:
while s and not s[:2] == "</":
s, leaf = parse_leaf_content(s)
if not s.match(re.compile("\s*$")):
if not leaf.match(RX_WHITESPACE_TAIL):
result.append(Node(PlainText, leaf))
if s[:1] == "<" and s[:2] != "</":
s, child = parse_full_content(s)
result.append(child)
s, closing_tagname = parse_closing_tag(s)
assert tagname == closing_tagname
if len(result) == 1 and isinstance(result[0].parser == PlainText):
if len(result) == 1 and result[0].parser.ptype == PLAINTEXT_PTYPE:
result = result[0].result
else:
result = tuple(result)
return Node(mock_parsers.setdefault(tagname, MockParser(name, ":" + class_name)), result)
return s, Node(mock_parsers.setdefault(tagname, MockParser(name, ":" + class_name)), result)
match_header = xml.search(re.compile(r'<(?!\?)'))
start = match_header.start() if match_header else 0
return parse_full_content(xml[start:])
_, tree = parse_full_content(xml[start:])
assert _.match(RX_WHITESPACE_TAIL)
return tree
# if __name__ == "__main__":
# st = parse_sxpr("(alpha (beta (gamma i\nj\nk) (delta y)) (epsilon z))")
......
......@@ -51,7 +51,15 @@ class TestParseXML:
fxml = flatten_xml(xml)
assert fxml == '<a><b>c</b><d><e>f</e><h>i</h></d></a>'
tree2 = parse_xml(fxml)
print(tree2.as_sxpr())
assert fxml == flatten_xml(tree2.as_xml())
def test_plaintext_handling(self):
tree = parse_xml('<a>alpha <b>beta</b> gamma</a>')
assert flatten_sxpr(tree.as_sxpr()) == \
'(a (:PlainText "alpha ") (b "beta") (:PlainText " gamma"))'
tree = parse_xml(' <a> <b>beta</b> </a> ')
assert flatten_xml(tree.as_xml()) == '<a><b>beta</b></a>'
class TestNode:
"""
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment