Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/sgraph/sgraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from collections.abc import Sequence
import io
import os
import re
import sys
import uuid
import xml.sax.handler
Expand Down Expand Up @@ -155,6 +156,11 @@ def traverse(self, traverser: Callable[[SElement], None]):
for e in self.rootNode.children:
traverser(e)

# Matches every C0 control character that XML 1.0 forbids in any content
# (see https://www.w3.org/TR/xml/#charsets): 0x00-0x08, 0x0B, 0x0C and
# 0x0E-0x1F. TAB (0x09), LF (0x0A) and CR (0x0D) are legal XML characters and
# are deliberately left out of the character class, so they are never stripped.
# NOTE(review): the attribute escaper that uses this pattern only encodes LF
# explicitly (as a numeric character reference); TAB and CR pass through
# unescaped -- confirm that is intended, since XML attribute-value
# normalisation would turn them into spaces on re-parse.
_XML_INVALID_CONTROL_RE = re.compile('[\x00-\x08\x0b\x0c\x0e-\x1f]')

def to_xml(self, fname: str | None, stdout: bool = True) -> str | None:
rootNode = self.rootNode
counter = Counter()
Expand Down Expand Up @@ -218,6 +224,7 @@ def enc_xml_a_v(v: int | float | set[str] | dict[object, object] | list[str] | s
# Forbidden chars are: naked ampersand, left angle bracket, double quote
# single quote is fine as we are using double quotes in XML for attributes
v = v.encode('utf-8', 'replace').decode()
v = SGraph._XML_INVALID_CONTROL_RE.sub('', v)
return v.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace(
'\n', '&' + '#' + '10;').replace('"', '&quot;')
return ''
Expand Down
30 changes: 30 additions & 0 deletions tests/sgraph_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import copy
import io
import os
from typing import Any

Expand Down Expand Up @@ -121,3 +122,32 @@ def test_repr_multi_root_omits_count_when_small():
text = repr(graph)
assert 'nginx' in text and 'bar' in text
assert 'count=' not in text


def test_to_xml_strips_invalid_control_chars_and_roundtrips():
    """Serialise and re-parse an attribute holding XML-1.0-forbidden chars.

    The attribute value mixes every C0 control character except TAB, LF and
    CR with markup-significant characters. After a to_xml / parse round
    trip, none of the forbidden control characters may remain and the
    printable text around them must still be present.
    """
    from sgraph.selement import SElement

    # Build the hostile payload first: all of 0x00-0x1F minus the three
    # whitespace controls that XML 1.0 permits.
    xml_legal_c0 = (0x09, 0x0A, 0x0D)
    forbidden = ''.join(
        chr(code) for code in range(0x20) if code not in xml_legal_c0
    )
    payload = f'before{forbidden}after<&"\'>{chr(0x7F)} tab\there'

    model = SGraph()
    repo_elem = SElement(model.rootNode, 'repo')
    file_elem = SElement(repo_elem, 'file.py')
    file_elem.attrs['description'] = payload

    # Serialise to a string (no file, no stdout) and parse it straight back.
    serialised = model.to_xml(None, stdout=False)
    assert serialised is not None
    reloaded = SGraph.parse_xml_file_or_stream(io.StringIO(serialised))

    reloaded_repo = reloaded.rootNode.children[0]
    reloaded_file = next(
        child for child in reloaded_repo.children if child.name == 'file.py'
    )
    description = reloaded_file.attrs['description']

    # No forbidden control character may have survived the round trip.
    for ch in forbidden:
        assert ch not in description, f'control char {ch!r} leaked into parsed attribute'
    # The printable text on both sides of the stripped run is still there.
    assert description.startswith('before')
    assert 'after<&"\'>' in description
Loading