Skip to content

Commit e41d61c

Browse files
gh-149489: Fix ElementTree serialization to HTML
* The content of comments, processing instructions and elements "xmp", "iframe", "noembed", "noframes", and "plaintext" is no longer escaped. * The "plaintext" element no longer has a closing tag. * Add support for empty attributes (with value None).
1 parent f5c7535 commit e41d61c

3 files changed

Lines changed: 48 additions & 10 deletions

File tree

Lib/test/test_xml_etree.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1278,7 +1278,13 @@ def check(p, expected, namespaces=None):
12781278
{'': 'http://www.w3.org/2001/XMLSchema',
12791279
'ns': 'http://www.w3.org/2001/XMLSchema'})
12801280

1281-
def test_processinginstruction(self):
1281+
def test_comment_serialization(self):
1282+
comm = ET.Comment('<spam> & ham')
1283+
self.assertEqual(ET.tostring(comm), b'<!--<spam> & ham-->')
1284+
self.assertEqual(ET.tostring(comm, method='html'), b'<!--<spam> & ham-->')
1285+
self.assertEqual(ET.tostring(comm, method='text'), b'<spam> & ham')
1286+
1287+
def test_processinginstruction_serialization(self):
12821288
# Test ProcessingInstruction directly
12831289

12841290
self.assertEqual(ET.tostring(ET.ProcessingInstruction('test', 'instruction')),
@@ -1293,6 +1299,21 @@ def test_processinginstruction(self):
12931299
self.assertEqual(ET.tostring(ET.PI('test', '<testing&>\xe3'), 'latin-1'),
12941300
b"<?xml version='1.0' encoding='latin-1'?>\n"
12951301
b"<?test <testing&>\xe3?>")
1302+
self.assertEqual(ET.tostring(ET.PI('test', 'ham & eggs < spam'), method='html'),
1303+
b'<?test ham & eggs < spam?>')
1304+
1305+
def test_empty_attribute_serialization(self):
1306+
elem = ET.Element('tag', attrib={'attr': None})
1307+
self.assertRaises(TypeError, ET.tostring, elem)
1308+
self.assertEqual(ET.tostring(elem, method='html'), b'<tag attr></tag>')
1309+
1310+
@support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes"))
1311+
def test_html_cdata_elems_serialization(self, tag):
1312+
tag = tag.title()
1313+
elem = ET.Element(tag)
1314+
elem.text = '<spam>&ham'
1315+
self.assertEqual(ET.tostring(elem, method='html'),
1316+
('<%s><spam>&ham</%s>' % (tag, tag)).encode())
12961317

12971318
def test_html_empty_elems_serialization(self):
12981319
# issue 15970
@@ -1308,6 +1329,12 @@ def test_html_empty_elems_serialization(self):
13081329
method='html')
13091330
self.assertEqual(serialized, expected)
13101331

1332+
def test_html_plaintext_serialization(self):
1333+
elem = ET.Element('PlainText')
1334+
elem.text = '<spam>&ham'
1335+
self.assertEqual(ET.tostring(elem, method='html'),
1336+
b'<PlainText><spam>&ham')
1337+
13111338
def test_dump_attribute_order(self):
13121339
# See BPO 34160
13131340
e = ET.Element('cirriculum', status='public', company='example')

Lib/xml/etree/ElementTree.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -907,17 +907,20 @@ def _serialize_xml(write, elem, qnames, namespaces,
907907
if elem.tail:
908908
write(_escape_cdata(elem.tail))
909909

910+
_CDATA_CONTENT_ELEMENTS = {"script", "style", "xmp", "iframe", "noembed",
911+
"noframes", "plaintext"}
912+
910913
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "embed", "frame", "hr",
911914
"img", "input", "isindex", "link", "meta", "param", "source",
912-
"track", "wbr"}
915+
"track", "wbr", "plaintext"}
913916

914917
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
915918
tag = elem.tag
916919
text = elem.text
917920
if tag is Comment:
918-
write("<!--%s-->" % _escape_cdata(text))
921+
write("<!--%s-->" % text)
919922
elif tag is ProcessingInstruction:
920-
write("<?%s?>" % _escape_cdata(text))
923+
write("<?%s?>" % text)
921924
else:
922925
tag = qnames[tag]
923926
if tag is None:
@@ -941,16 +944,19 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs):
941944
for k, v in items:
942945
if isinstance(k, QName):
943946
k = k.text
944-
if isinstance(v, QName):
945-
v = qnames[v.text]
947+
k = qnames[k]
948+
if v is None:
949+
write(" %s" % k)
946950
else:
947-
v = _escape_attrib_html(v)
948-
# FIXME: handle boolean attributes
949-
write(" %s=\"%s\"" % (qnames[k], v))
951+
if isinstance(v, QName):
952+
v = qnames[v.text]
953+
else:
954+
v = _escape_attrib_html(v)
955+
write(" %s=\"%s\"" % (k, v))
950956
write(">")
951957
ltag = tag.lower()
952958
if text:
953-
if ltag == "script" or ltag == "style":
959+
if ltag in _CDATA_CONTENT_ELEMENTS:
954960
write(text)
955961
else:
956962
write(_escape_cdata(text))
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix :mod:`~xml.etree.ElementTree` serialization to HTML. The content of
2+
comments, processing instructions and elements "xmp", "iframe", "noembed",
3+
"noframes", and "plaintext" is no longer escaped. The "plaintext" element no
4+
longer has a closing tag. Add support for empty attributes (with value
5+
``None``).

0 commit comments

Comments
 (0)