2.12.2021, 9:00 - 11:00: Due to updates GitLab may be unavailable for some minutes between 09:00 and 11:00.

Commit d8cc42fb authored by di68kap
Browse files

- syntaxtree.py: fixed "parse_xml()"

parent 9f1872d8
...@@ -38,7 +38,7 @@ from DHParser.log import is_logging, HistoryRecord ...@@ -38,7 +38,7 @@ from DHParser.log import is_logging, HistoryRecord
from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME from DHParser.preprocess import BEGIN_TOKEN, END_TOKEN, RX_TOKEN_NAME
from DHParser.stringview import StringView, EMPTY_STRING_VIEW from DHParser.stringview import StringView, EMPTY_STRING_VIEW
from DHParser.syntaxtree import Node, RootNode, ParserBase, WHITESPACE_PTYPE, \ from DHParser.syntaxtree import Node, RootNode, ParserBase, WHITESPACE_PTYPE, \
TOKEN_PTYPE, ZOMBIE_PARSER PLAINTEXT_PTYPE, TOKEN_PTYPE, ZOMBIE_PARSER
from DHParser.toolkit import sane_parser_name, escape_control_characters, re, typing from DHParser.toolkit import sane_parser_name, escape_control_characters, re, typing
from typing import Callable, cast, Dict, DefaultDict, List, Set, Tuple, Union, Optional from typing import Callable, cast, Dict, DefaultDict, List, Set, Tuple, Union, Optional
...@@ -894,6 +894,7 @@ class PlainText(Parser): ...@@ -894,6 +894,7 @@ class PlainText(Parser):
>>> Grammar(while_token)("while").content >>> Grammar(while_token)("while").content
'while' 'while'
""" """
assert PLAINTEXT_PTYPE == ":PlainText"
def __init__(self, text: str, name: str = '') -> None: def __init__(self, text: str, name: str = '') -> None:
super().__init__(name) super().__init__(name)
......
...@@ -118,6 +118,7 @@ class StringView(collections.abc.Sized): ...@@ -118,6 +118,7 @@ class StringView(collections.abc.Sized):
return self.fullstring return self.fullstring
# since the slice is being copied now, anyway, the copy might # since the slice is being copied now, anyway, the copy might
# as well be stored in the string view # as well be stored in the string view
# return self.text[self.begin:self.end] # use this for debugging!
self.fullstring = self.text[self.begin:self.end] self.fullstring = self.text[self.begin:self.end]
return self.fullstring return self.fullstring
......
...@@ -36,6 +36,7 @@ from typing import Callable, cast, Iterator, List, AbstractSet, Set, Union, Tupl ...@@ -36,6 +36,7 @@ from typing import Callable, cast, Iterator, List, AbstractSet, Set, Union, Tupl
__all__ = ('ParserBase', __all__ = ('ParserBase',
'WHITESPACE_PTYPE', 'WHITESPACE_PTYPE',
'PLAINTEXT_PTYPE',
'TOKEN_PTYPE', 'TOKEN_PTYPE',
'MockParser', 'MockParser',
'ZombieParser', 'ZombieParser',
...@@ -109,6 +110,7 @@ class ParserBase: ...@@ -109,6 +110,7 @@ class ParserBase:
WHITESPACE_PTYPE = ':Whitespace' WHITESPACE_PTYPE = ':Whitespace'
PLAINTEXT_PTYPE = ':PlainText'
TOKEN_PTYPE = ':Token' TOKEN_PTYPE = ':Token'
...@@ -873,13 +875,16 @@ def parse_sxpr(sxpr: str) -> Node: ...@@ -873,13 +875,16 @@ def parse_sxpr(sxpr: str) -> Node:
return inner_parser(sxpr) return inner_parser(sxpr)
RX_WHITESPACE_TAIL = re.compile(r'\s*$')
def parse_xml(xml: str) -> Node: def parse_xml(xml: str) -> Node:
""" """
Generates a tree of nodes from a (Pseudo-)XML-source. Generates a tree of nodes from a (Pseudo-)XML-source.
""" """
xml = StringView(xml) xml = StringView(xml)
PlainText = MockParser('', ':PlainText') PlainText = MockParser('', PLAINTEXT_PTYPE)
mock_parsers = {':PlainText': PlainText} mock_parsers = {PLAINTEXT_PTYPE: PlainText}
def parse_attributes(s: StringView) -> Tuple[StringView, OrderedDict]: def parse_attributes(s: StringView) -> Tuple[StringView, OrderedDict]:
"""Parses a sqeuence of XML-Attributes. Returns the string-slice """Parses a sqeuence of XML-Attributes. Returns the string-slice
...@@ -900,7 +905,8 @@ def parse_xml(xml: str) -> Node: ...@@ -900,7 +905,8 @@ def parse_xml(xml: str) -> Node:
match = s.match(re.compile(r'<\s*(?P<tagname>[\w:]+)\s*')) match = s.match(re.compile(r'<\s*(?P<tagname>[\w:]+)\s*'))
assert match assert match
tagname = match.groupdict()['tagname'] tagname = match.groupdict()['tagname']
s, attributes = parse_attributes(s[match.end() - s.begin:]) section = s[match.end() - s.begin:]
s, attributes = parse_attributes(section)
i = s.find('>') i = s.find('>')
assert i >= 0 assert i >= 0
return s[i+1:], tagname, attributes, s[i-1] == "/" return s[i+1:], tagname, attributes, s[i-1] == "/"
...@@ -931,22 +937,24 @@ def parse_xml(xml: str) -> Node: ...@@ -931,22 +937,24 @@ def parse_xml(xml: str) -> Node:
if not solitary: if not solitary:
while s and not s[:2] == "</": while s and not s[:2] == "</":
s, leaf = parse_leaf_content(s) s, leaf = parse_leaf_content(s)
if not s.match(re.compile("\s*$")): if not leaf.match(RX_WHITESPACE_TAIL):
result.append(Node(PlainText, leaf)) result.append(Node(PlainText, leaf))
if s[:1] == "<" and s[:2] != "</": if s[:1] == "<" and s[:2] != "</":
s, child = parse_full_content(s) s, child = parse_full_content(s)
result.append(child) result.append(child)
s, closing_tagname = parse_closing_tag(s) s, closing_tagname = parse_closing_tag(s)
assert tagname == closing_tagname assert tagname == closing_tagname
if len(result) == 1 and isinstance(result[0].parser == PlainText): if len(result) == 1 and result[0].parser.ptype == PLAINTEXT_PTYPE:
result = result[0].result result = result[0].result
else: else:
result = tuple(result) result = tuple(result)
return Node(mock_parsers.setdefault(tagname, MockParser(name, ":" + class_name)), result) return s, Node(mock_parsers.setdefault(tagname, MockParser(name, ":" + class_name)), result)
match_header = xml.search(re.compile(r'<(?!\?)')) match_header = xml.search(re.compile(r'<(?!\?)'))
start = match_header.start() if match_header else 0 start = match_header.start() if match_header else 0
return parse_full_content(xml[start:]) _, tree = parse_full_content(xml[start:])
assert _.match(RX_WHITESPACE_TAIL)
return tree
# if __name__ == "__main__": # if __name__ == "__main__":
# st = parse_sxpr("(alpha (beta (gamma i\nj\nk) (delta y)) (epsilon z))") # st = parse_sxpr("(alpha (beta (gamma i\nj\nk) (delta y)) (epsilon z))")
......
...@@ -51,7 +51,15 @@ class TestParseXML: ...@@ -51,7 +51,15 @@ class TestParseXML:
fxml = flatten_xml(xml) fxml = flatten_xml(xml)
assert fxml == '<a><b>c</b><d><e>f</e><h>i</h></d></a>' assert fxml == '<a><b>c</b><d><e>f</e><h>i</h></d></a>'
tree2 = parse_xml(fxml) tree2 = parse_xml(fxml)
print(tree2.as_sxpr()) assert fxml == flatten_xml(tree2.as_xml())
def test_plaintext_handling(self):
tree = parse_xml('<a>alpha <b>beta</b> gamma</a>')
assert flatten_sxpr(tree.as_sxpr()) == \
'(a (:PlainText "alpha ") (b "beta") (:PlainText " gamma"))'
tree = parse_xml(' <a> <b>beta</b> </a> ')
assert flatten_xml(tree.as_xml()) == '<a><b>beta</b></a>'
class TestNode: class TestNode:
""" """
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment