Skip to content

Commit a5c7a74

Browse files
gh-139489: Add xml.is_valid_text() (GH-149412)
1 parent 9274d96 commit a5c7a74

5 files changed

Lines changed: 47 additions & 1 deletion

File tree

Doc/library/xml.rst

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,19 @@ This module also defines utility functions.
5454
"!", "?", and "=" are forbidden.
5555
The name cannot start with a digit or a character like "-", ".", and "·".
5656

57-
..versionadded:: next
57+
.. versionadded:: next
58+
59+
60+
.. function:: is_valid_text(data)
61+
62+
Return ``True`` if the string is a sequence of legal XML 1.0 characters,
63+
``False`` otherwise.
64+
65+
Almost all characters are permitted in XML 1.0 documents, except C0 control
66+
characters (excluding TAB, CR and LF), surrogate characters and special
67+
Unicode characters U+FFFE and U+FFFF.
68+
69+
.. versionadded:: next
5870

5971

6072
.. _xml-security:

Doc/whatsnew/3.15.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1714,6 +1714,10 @@ xml
17141714
whether a string can be used as an element or attribute name in XML.
17151715
(Contributed by Serhiy Storchaka in :gh:`139489`.)
17161716

1717+
* Add the :func:`xml.is_valid_text` function, which allows checking
1718+
whether a string can be used in an XML document.
1719+
(Contributed by Serhiy Storchaka in :gh:`139489`.)
1720+
17171721

17181722
xml.parsers.expat
17191723
-----------------

Lib/test/test_xml.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,22 @@ def test_is_valid_name(self):
2222
for c in '<>/!?=\x00\x01\x7f\ud800\udfff\ufffe\uffff\U000F0000':
2323
self.assertFalse(is_valid_name('name' + c))
2424

25+
def test_is_valid_text(self):
26+
is_valid_text = xml.is_valid_text
27+
self.assertTrue(is_valid_text(''))
28+
self.assertTrue(is_valid_text('!0Aa_~ \r\n\t\x85\xa0'))
29+
self.assertTrue(is_valid_text('\ud7ff\ue000\ufffd\U00010000\U0010ffff'))
30+
self.assertFalse(is_valid_text('\x00'))
31+
self.assertFalse(is_valid_text('\x01'))
32+
self.assertFalse(is_valid_text('\x1f'))
33+
self.assertTrue(is_valid_text('\x7f'))
34+
self.assertTrue(is_valid_text('\x80'))
35+
self.assertTrue(is_valid_text('\x9f'))
36+
self.assertFalse(is_valid_text('\ud800'))
37+
self.assertFalse(is_valid_text('\udfff'))
38+
self.assertFalse(is_valid_text('\ufffe'))
39+
self.assertFalse(is_valid_text('\uffff'))
40+
2541

2642
if __name__ == '__main__':
2743
unittest.main()

Lib/xml/utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,15 @@ def is_valid_name(name):
2323
'\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF'
2424
']*+',
2525
name) is not None
26+
27+
# https://www.w3.org/TR/xml/#charsets
28+
_ILLEGAL_XML_CHAR = (
29+
'['
30+
'\x00-\x08\x0B\x0C\x0E-\x1F' # C0 controls except TAB, CR and LF
31+
'\uD800-\uDFFF' # the surrogate blocks
32+
'\uFFFE\uFFFF' # special Unicode characters
33+
']')
34+
35+
def is_valid_text(data):
36+
"""Test whether a string is a sequence of legal XML 1.0 characters."""
37+
return _re.search(_ILLEGAL_XML_CHAR, data) is None
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Add the :func:`xml.is_valid_text` function, which allows checking whether
2+
a string can be used in an XML document.

0 commit comments

Comments
 (0)