Skip to content

Commit a134c0b

Browse files
gh-149468: Add option to validate ElementTree during serialization
1 parent e41d61c commit a134c0b

5 files changed

Lines changed: 321 additions & 20 deletions

File tree

Doc/library/xml.etree.elementtree.rst

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -711,14 +711,14 @@ Functions
711711

712712
.. function:: tostring(element, encoding="us-ascii", method="xml", *, \
713713
xml_declaration=None, default_namespace=None, \
714-
short_empty_elements=True)
714+
validate=False, short_empty_elements=True)
715715

716716
Generates a string representation of an XML element, including all
717717
subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is
718718
the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to
719719
generate a Unicode string (otherwise, a bytestring is generated). *method*
720720
is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``).
721-
*xml_declaration*, *default_namespace* and *short_empty_elements* has the same
721+
*xml_declaration*, *default_namespace*, *validate* and *short_empty_elements* have the same
722722
meaning as in :meth:`ElementTree.write`. Returns an (optionally) encoded string
723723
containing the XML data.
724724

@@ -732,17 +732,20 @@ Functions
732732
The :func:`tostring` function now preserves the attribute order
733733
specified by the user.
734734

735+
.. versionchanged:: next
736+
Added the *validate* parameter.
737+
735738

736739
.. function:: tostringlist(element, encoding="us-ascii", method="xml", *, \
737740
xml_declaration=None, default_namespace=None, \
738-
short_empty_elements=True)
741+
validate=False, short_empty_elements=True)
739742

740743
Generates a string representation of an XML element, including all
741744
subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is
742745
the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to
743746
generate a Unicode string (otherwise, a bytestring is generated). *method*
744747
is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``).
745-
*xml_declaration*, *default_namespace* and *short_empty_elements* has the same
748+
*xml_declaration*, *default_namespace*, *validate* and *short_empty_elements* have the same
746749
meaning as in :meth:`ElementTree.write`. Returns a list of (optionally) encoded
747750
strings containing the XML data. It does not guarantee any specific sequence,
748751
except that ``b"".join(tostringlist(element)) == tostring(element)``.
@@ -752,13 +755,19 @@ Functions
752755
.. versionchanged:: 3.4
753756
Added the *short_empty_elements* parameter.
754757

758+
.. versionchanged:: next
759+
Added the *validate* parameter.
760+
755761
.. versionchanged:: 3.8
756762
Added the *xml_declaration* and *default_namespace* parameters.
757763

758764
.. versionchanged:: 3.8
759765
The :func:`tostringlist` function now preserves the attribute order
760766
specified by the user.
761767

768+
.. versionchanged:: next
769+
Added the *validate* parameter.
770+
762771

763772
.. function:: XML(text, parser=None)
764773

@@ -1186,7 +1195,7 @@ ElementTree Objects
11861195

11871196
.. method:: write(file, encoding="us-ascii", xml_declaration=None, \
11881197
default_namespace=None, method="xml", *, \
1189-
short_empty_elements=True)
1198+
validate=False, short_empty_elements=True)
11901199
11911200
Writes the element tree to a file, as XML. *file* is a file name, or a
11921201
:term:`file object` opened for writing. *encoding* [1]_ is the output
@@ -1197,6 +1206,14 @@ ElementTree Objects
11971206
*default_namespace* sets the default XML namespace (for "xmlns").
11981207
*method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is
11991208
``"xml"``).
1209+
1210+
If *validate* is true, check that all characters are legal XML or HTML
1211+
characters, depending on *method*, element and attribute names are
1212+
valid, and the content of comments, processing instructions and
1213+
HTML elements like ``<script>`` does not contain illegal sequences,
1214+
and raise :exc:`ValueError` otherwise.
1215+
By default, no validation is performed.
1216+
12001217
The keyword-only *short_empty_elements* parameter controls the formatting
12011218
of elements that contain no content. If ``True`` (the default), they are
12021219
emitted as a single self-closed tag, otherwise they are emitted as a pair
@@ -1216,6 +1233,9 @@ ElementTree Objects
12161233
The :meth:`write` method now preserves the attribute order specified
12171234
by the user.
12181235

1236+
.. versionchanged:: next
1237+
Added the *validate* parameter.
1238+
12191239

12201240
This is the XML file that is going to be manipulated::
12211241

Doc/whatsnew/3.15.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1813,6 +1813,17 @@ xml
18131813
(Contributed by Serhiy Storchaka in :gh:`139489`.)
18141814

18151815

1816+
xml.etree.ElementTree
1817+
---------------------
1818+
1819+
* Add the *validate* option to functions
1820+
:func:`~xml.etree.ElementTree.tostring`,
1821+
:func:`~xml.etree.ElementTree.tostringlist`, and the
1822+
:meth:`ElementTree.write <xml.etree.ElementTree.ElementTree.write>` method,
1823+
which allows validating the element or element tree before serialization.
1824+
(Contributed by Serhiy Storchaka in :gh:`149468`.)
1825+
1826+
18161827
xml.parsers.expat
18171828
-----------------
18181829

Lib/test/test_xml_etree.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1358,6 +1358,192 @@ def test_attlist_default(self):
13581358
{'{http://www.w3.org/XML/1998/namespace}lang': 'eng'})
13591359

13601360

1361+
class XMLValidationTest(unittest.TestCase):
1362+
1363+
def check(self, elem, expected=None):
1364+
self.assertRaises(ValueError,
1365+
ET.tostring, elem, validate=True)
1366+
ET.tostring(elem) # no exception
1367+
1368+
def test_invalid_comment(self):
1369+
self.check(ET.Comment('a--b'))
1370+
self.check(ET.Comment(' B+, B, or B-'))
1371+
1372+
def test_invalid_processing_instruction(self):
1373+
self.check(ET.PI(''))
1374+
self.check(ET.PI('0'))
1375+
self.check(ET.PI('a/b'))
1376+
self.check(ET.PI('foo\xa0bar'))
1377+
self.check(ET.PI('xml'))
1378+
self.check(ET.PI('xml', 'encoding="UTF-8"'))
1379+
self.check(ET.PI('foo', 'a?>b'))
1380+
self.check(ET.PI('foo', '\x00'))
1381+
self.check(ET.PI('foo', '\ud8ff'))
1382+
self.check(ET.PI('foo', '\ufffe'))
1383+
1384+
def test_invalid_tag(self):
1385+
self.check(ET.Element(''))
1386+
self.check(ET.Element('0'))
1387+
self.check(ET.Element('a/b'))
1388+
self.check(ET.Element(ET.QName('')))
1389+
self.check(ET.Element(ET.QName('0')))
1390+
self.check(ET.Element(ET.QName('a/b')))
1391+
1392+
def test_invalid_attr_name(self):
1393+
self.check(ET.Element('tag', attrib={'': 'value'}))
1394+
self.check(ET.Element('tag', attrib={'0': 'value'}))
1395+
self.check(ET.Element('tag', attrib={'a/b': 'value'}))
1396+
self.check(ET.Element('tag', attrib={ET.QName(''): 'value'}))
1397+
self.check(ET.Element('tag', attrib={ET.QName('0'): 'value'}))
1398+
self.check(ET.Element('tag', attrib={ET.QName('a/b'): 'value'}))
1399+
1400+
def test_invalid_attr_value(self):
1401+
self.check(ET.Element('tag', attrib={'key': '\x00'}))
1402+
self.check(ET.Element('tag', attrib={'key': '\ud8ff'}))
1403+
self.check(ET.Element('tag', attrib={'key': '\ufffe'}))
1404+
self.check(ET.Element('tag', attrib={'key': ET.QName('\x00')}))
1405+
self.check(ET.Element('tag', attrib={'key': ET.QName('\ud8ff')}))
1406+
self.check(ET.Element('tag', attrib={'key': ET.QName('\ufffe')}))
1407+
1408+
def test_invalid_text(self):
1409+
elem = ET.Element('tag')
1410+
elem.text = '\x00'
1411+
self.check(elem)
1412+
elem.text = '\ud8ff'
1413+
self.check(elem)
1414+
elem.text = '\ufffe'
1415+
self.check(elem)
1416+
1417+
def test_invalid_tail(self):
1418+
elem = ET.Element('tag')
1419+
elem.tail = '\x00'
1420+
self.check(elem)
1421+
elem.tail = '\ud8ff'
1422+
self.check(elem)
1423+
elem.tail = '\ufffe'
1424+
self.check(elem)
1425+
1426+
def test_invalid_text_without_tag(self):
1427+
elem = ET.Element(None)
1428+
elem.text = '\x00'
1429+
self.check(elem)
1430+
elem.text = '\ud8ff'
1431+
self.check(elem)
1432+
elem.text = '\ufffe'
1433+
self.check(elem)
1434+
1435+
def test_invalid_subelements(self):
1436+
elem = ET.Element('tag')
1437+
subelem = ET.SubElement(elem, 'subtag')
1438+
ET.SubElement(subelem, '\x00')
1439+
self.check(elem)
1440+
elem.tag = None
1441+
self.check(elem)
1442+
1443+
def test_invalid_namespace_uri(self):
1444+
self.check(ET.Element('{\x00}tag'))
1445+
self.check(ET.Element('{\ud8ff}tag'))
1446+
self.check(ET.Element('{\ufffe}tag'))
1447+
self.check(ET.Element(ET.QName('\x00', 'tag')))
1448+
self.check(ET.Element(ET.QName('\ud8ff', 'tag')))
1449+
self.check(ET.Element(ET.QName('\ufffe', 'tag')))
1450+
1451+
class HTMLValidationTest(unittest.TestCase):
1452+
1453+
def check(self, elem, expected=None):
1454+
self.assertRaises(ValueError,
1455+
ET.tostring, elem, method='html', validate=True)
1456+
ET.tostring(elem, method='html') # no exception
1457+
1458+
def test_invalid_comment(self):
1459+
self.check(ET.Comment('>'))
1460+
self.check(ET.Comment('->'))
1461+
self.check(ET.Comment('a-->b'))
1462+
self.check(ET.Comment('a--!>b'))
1463+
self.check(ET.Comment('a\x00b'))
1464+
1465+
def test_invalid_processing_instruction(self):
1466+
self.check(ET.PI('a>b'))
1467+
self.check(ET.PI('a\x00b'))
1468+
1469+
def test_invalid_tag(self):
1470+
self.check(ET.Element(''))
1471+
self.check(ET.Element('?'))
1472+
self.check(ET.Element('!'))
1473+
self.check(ET.Element('0'))
1474+
self.check(ET.Element(' a'))
1475+
self.check(ET.Element('a b'))
1476+
self.check(ET.Element('a\nb'))
1477+
self.check(ET.Element('a/b'))
1478+
self.check(ET.Element('a>b'))
1479+
self.check(ET.Element('a\x00b'))
1480+
self.check(ET.Element(ET.QName('')))
1481+
self.check(ET.Element(ET.QName('0')))
1482+
self.check(ET.Element(ET.QName('a/b')))
1483+
1484+
def test_invalid_attr_name(self):
1485+
self.check(ET.Element('tag', attrib={'': 'value'}))
1486+
self.check(ET.Element('tag', attrib={'a/b': 'value'}))
1487+
self.check(ET.Element('tag', attrib={'a=b': 'value'}))
1488+
self.check(ET.Element('tag', attrib={ET.QName(''): 'value'}))
1489+
self.check(ET.Element('tag', attrib={ET.QName('a/b'): 'value'}))
1490+
1491+
def test_invalid_attr_value(self):
1492+
self.check(ET.Element('tag', attrib={'key': '\x00'}))
1493+
self.check(ET.Element('tag', attrib={'key': ET.QName('\x00')}))
1494+
self.check(ET.Element('tag', attrib={'key': ET.QName('a"b')}))
1495+
self.check(ET.Element('tag', attrib={'key': ET.QName('a&b')}))
1496+
1497+
def test_invalid_text(self):
1498+
elem = ET.Element('tag')
1499+
elem.text = '\x00'
1500+
self.check(elem)
1501+
1502+
def test_invalid_tail(self):
1503+
elem = ET.Element('tag')
1504+
elem.tail = '\x00'
1505+
self.check(elem)
1506+
1507+
def test_invalid_text_without_tag(self):
1508+
elem = ET.Element(None)
1509+
elem.text = '\x00'
1510+
self.check(elem)
1511+
1512+
def test_invalid_subelements(self):
1513+
elem = ET.Element('tag')
1514+
subelem = ET.SubElement(elem, 'subtag')
1515+
ET.SubElement(subelem, '\x00')
1516+
self.check(elem)
1517+
elem.tag = None
1518+
self.check(elem)
1519+
1520+
def test_invalid_namespace_uri(self):
1521+
self.check(ET.Element('{\x00}tag'))
1522+
self.check(ET.Element(ET.QName('\x00', 'tag')))
1523+
1524+
@support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes"))
1525+
def test_invalid_cdata_content(self, tag):
1526+
elem = ET.Element(tag.upper())
1527+
elem.text = 'a</%s>b' % tag.title()
1528+
self.check(elem)
1529+
elem.text = 'a</%s b' % tag.title()
1530+
self.check(elem)
1531+
elem.text = 'a</%s/b' % tag.title()
1532+
self.check(elem)
1533+
elem.text = 'a\x00b'
1534+
self.check(elem)
1535+
1536+
@support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes"))
1537+
def test_cdata_subelements(self, tag):
1538+
elem = ET.Element(tag)
1539+
ET.SubElement(elem, 'subtag')
1540+
self.check(elem)
1541+
1542+
def test_invalid_plaintext_content(self):
1543+
elem = ET.Element('plaintext')
1544+
elem.text = 'a\x00b'
1545+
self.check(elem)
1546+
13611547
class IterparseTest(unittest.TestCase):
13621548
# Test iterparse interface.
13631549

0 commit comments

Comments
 (0)