Skip to content

Commit 22a1260

Browse files
cigrainger and claude committed
ElementTree read-only drop-in compatibility
Element now has find(), findall(), iterfind(), findtext() — the core ElementTree query API. Path syntax (tag, .//tag, *, .., [@attrib], [tag='text']) is translated to XPath and evaluated by the SIMD engine.

Also adds:
- extend(), makeelement() on Element (raise TypeError)
- iter() now includes self (matching stdlib behavior)
- itertext() now yields text/tail in correct stdlib order
- ET module: fromstringlist, tostringlist, dump, iselement
- ET module: SubElement, Comment, PI, indent (raise TypeError)
- ET module: register_namespace (no-op), ElementTree.write (TypeError)
- ET module: ElementTree.iter, findtext, iterfind

131 new tests (322 total):
- 49 ET compat tests (parse, ElementTree class, find/findall, read-only)
- 82 exhaustive drop-in tests cross-validating every operation against stdlib xml.etree.ElementTree on 7 corpus types
- Property tests (Hypothesis) for iter, itertext, find, len, text/tail
- Adversarial: empty docs, deep nesting, entities, mixed content, CDATA

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 39508b5 commit 22a1260

6 files changed

Lines changed: 1143 additions & 113 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ target-version = "py39"
4343

4444
[tool.ruff.lint]
4545
select = ["E", "F", "I", "N", "W", "UP", "RUF", "B", "SIM", "PTH"]
46-
ignore = ["N999"] # ElementTree.py is a stdlib-compat module name
46+
ignore = ["N999", "N802"] # ElementTree.py has stdlib-compat names
4747

4848
[tool.ruff.lint.per-file-ignores]
4949
"tests/*" = ["N817", "PTH123"] # ET alias, open() in test helpers

python/simdxml/_core.pyi

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,33 @@ class Element:
103103
def xpath_text(self, expr: str) -> list[str]:
104104
"""Evaluate an XPath expression and return text content of matches."""
105105
...
106+
def find(
    self,
    path: str,
    namespaces: dict[str, str] | None = None,
) -> Element | None:
    """Return the first subelement matching ``path`` (ElementTree API).

    Recognised path forms: ``tag``, ``{ns}tag``, ``*/tag``, ``.//tag``,
    ``..``, ``[@attrib]``, ``[tag]``, ``[tag='text']``.
    """
    ...
115+
def findall(
    self,
    path: str,
    namespaces: dict[str, str] | None = None,
) -> ElementList:
    """Return every subelement matching ``path`` (ElementTree API)."""
    ...
120+
def iterfind(
    self,
    path: str,
    namespaces: dict[str, str] | None = None,
) -> Iterator[Element]:
    """Yield the subelements matching ``path`` one by one (ElementTree API)."""
    ...
125+
def findtext(
    self,
    path: str,
    default: str | None = None,
    namespaces: dict[str, str] | None = None,
) -> str | None:
    """Return the text of the first subelement matching ``path`` (ElementTree API)."""
    ...
106133
def getparent(self) -> Element | None:
107134
"""Parent element, or None if this is the root."""
108135
...
Lines changed: 108 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Drop-in replacement for xml.etree.ElementTree (read-only).
22
3-
Provides the same API as xml.etree.ElementTree for parsing and querying XML,
4-
backed by simdxml's SIMD-accelerated structural index.
3+
Provides the same read-only API as xml.etree.ElementTree for parsing and
4+
querying XML, backed by simdxml's SIMD-accelerated structural index.
55
66
Usage::
77
@@ -12,26 +12,25 @@
1212
titles = root.findall(".//title")
1313
1414
Note: simdxml Elements are read-only. Mutation operations (append, remove,
15-
set, text assignment) raise TypeError.
15+
set, text assignment, SubElement, etc.) raise TypeError.
1616
"""
1717

1818
from __future__ import annotations
1919

2020
import os
21+
import sys
2122
from collections.abc import Iterator
2223
from typing import IO
2324

2425
import simdxml._core as _core
2526

26-
# Re-export Element and Document
27+
# Re-export Element for isinstance checks and type annotations
2728
Element = _core.Element
29+
ElementList = _core.ElementList
2830

2931

3032
class ElementTree:
31-
"""An XML element hierarchy backed by simdxml.
32-
33-
This is a read-only wrapper matching the stdlib ElementTree API.
34-
"""
33+
"""An XML element hierarchy backed by simdxml (read-only)."""
3534

3635
def __init__(
3736
self,
@@ -40,7 +39,6 @@ def __init__(
4039
) -> None:
4140
if file is not None:
4241
if not isinstance(file, (str, os.PathLike)):
43-
# File-like object
4442
data: bytes = file.read()
4543
else:
4644
from pathlib import Path
@@ -59,33 +57,53 @@ def __init__(
5957
def getroot(self) -> Element:
    """Return the tree's root element.

    Raises:
        ValueError: if no root element is stored on this tree.
    """
    root = self._root
    if root is not None:
        return root
    msg = "ElementTree has no root element"
    raise ValueError(msg)
6463

6564
def find(
    self,
    path: str,
    namespaces: dict[str, str] | None = None,
) -> Element | None:
    """Run ``find`` on the root element and return its result."""
    root = self.getroot()
    return root.find(path, namespaces)
7169

7270
def findall(
    self,
    path: str,
    namespaces: dict[str, str] | None = None,
) -> ElementList:
    """Run ``findall`` on the root element and return its result."""
    root = self.getroot()
    return root.findall(path, namespaces)
75+
76+
def findtext(
    self,
    path: str,
    default: str | None = None,
    namespaces: dict[str, str] | None = None,
) -> str | None:
    """Run ``findtext`` on the root element and return its result."""
    root = self.getroot()
    return root.findtext(path, default, namespaces)
7884

7985
def iterfind(
    self,
    path: str,
    namespaces: dict[str, str] | None = None,
) -> Iterator[Element]:
    """Run ``iterfind`` on the root element and return its iterator."""
    root = self.getroot()
    return root.iterfind(path, namespaces)
90+
91+
def iter(self, tag: str | None = None) -> Iterator[Element]:
    """Run ``iter`` on the root element, passing ``tag`` through unchanged."""
    root = self.getroot()
    return root.iter(tag)
94+
95+
def write(self, *_args: object, **_kwargs: object) -> None:
    """Always raise ``TypeError``: writing is not supported (read-only).

    Any positional or keyword arguments are accepted and ignored.
    """
    reason = "simdxml ElementTree is read-only"
    raise TypeError(reason)
99+
100+
101+
# ---------------------------------------------------------------------------
102+
# Module-level functions matching xml.etree.ElementTree
103+
# ---------------------------------------------------------------------------
84104

85105

86-
def parse(
87-
source: str | os.PathLike[str] | IO[bytes],
88-
) -> ElementTree:
106+
def parse(source: str | os.PathLike[str] | IO[bytes]) -> ElementTree:
    """Build an ElementTree from ``source`` (a path or a binary file object)."""
    tree = ElementTree(file=source)
    return tree
91109

@@ -95,14 +113,24 @@ def fromstring(text: str | bytes) -> Element:
95113
doc = _core.parse(text)
96114
root = doc.root
97115
if root is None:
98-
raise ValueError("no root element found")
116+
msg = "no root element found"
117+
raise ValueError(msg)
99118
return root
100119

101120

121+
def fromstringlist(sequence: list[str | bytes], parser: object = None) -> Element:
    """Parse XML supplied as a sequence of string or bytes fragments.

    ``str`` fragments are encoded with the default ``str.encode()`` codec,
    all fragments are concatenated, and the result is handed to
    ``fromstring``. ``parser`` is accepted but not used.
    """
    chunks = []
    for piece in sequence:
        chunks.append(piece.encode() if isinstance(piece, str) else piece)
    return fromstring(b"".join(chunks))
125+
126+
102127
def tostring(
103128
element: Element,
104129
encoding: str | None = None,
105130
method: str | None = None,
131+
*,
132+
short_empty_elements: bool = True,
133+
xml_declaration: bool | None = None,
106134
) -> bytes | str:
107135
"""Serialize an Element to XML.
108136
@@ -115,58 +143,72 @@ def tostring(
115143
return raw.encode(enc)
116144

117145

146+
def tostringlist(
    element: Element,
    encoding: str | None = None,
    method: str | None = None,
    *,
    short_empty_elements: bool = True,
    xml_declaration: bool | None = None,
) -> list[bytes | str]:
    """Serialize an Element to a one-item list holding its XML text.

    Every argument is forwarded to ``tostring``; its return value (``bytes``
    or ``str``, per that function's signature) is wrapped in a list.

    Args:
        element: element to serialize.
        encoding: passed through to ``tostring``.
        method: passed through to ``tostring``.
        short_empty_elements: passed through to ``tostring``.
        xml_declaration: passed through to ``tostring``.
    """
    # Forward the keyword-only options as well; they were previously accepted
    # here but silently dropped, so callers could not influence the output.
    return [
        tostring(
            element,
            encoding,
            method,
            short_empty_elements=short_empty_elements,
            xml_declaration=xml_declaration,
        )
    ]
156+
157+
158+
def dump(elem: Element | ElementTree) -> None:
    """Write an element, or the root of an ElementTree, to ``sys.stdout``.

    The serialized text from ``tostring()`` is written, followed by a newline.

    Args:
        elem: an Element, or an ElementTree whose root element is dumped.

    Raises:
        ValueError: propagated from ``ElementTree.getroot()`` when an
            ElementTree without a root element is passed.
    """
    # The stdlib dump() takes either an element or a tree. The ElementTree
    # wrapper in this module does not appear to define tostring(), so unwrap
    # it to its root element first.
    if isinstance(elem, ElementTree):
        elem = elem.getroot()
    sys.stdout.write(elem.tostring())
    sys.stdout.write("\n")
162+
163+
164+
def iselement(element: object) -> bool:
    """Return True when ``element`` is an instance of the core ``Element`` type."""
    element_type = _core.Element
    return isinstance(element, element_type)
167+
168+
118169
# ---------------------------------------------------------------------------
119-
# ET path → XPath translation for find/findall
170+
# Read-only stubs for write/construction APIs
120171
# ---------------------------------------------------------------------------
121172

173+
_READONLY_MSG = "simdxml is read-only. Use xml.etree.ElementTree for XML construction."
122174

123-
def _path_to_xpath(path: str) -> str:
124-
"""Convert ET path syntax to XPath.
125175

126-
ET paths are a subset of XPath with some differences:
127-
- {ns}tag → namespace handling (we pass through as-is for now)
128-
- . → self
129-
- .. → parent
130-
- // → descendant-or-self
131-
- * → wildcard
132-
- [tag] → child element predicate
133-
- [@attrib] → attribute exists
134-
- [tag='text'] → child text match
135-
- [@attrib='value'] → attribute value match
136-
"""
137-
# If it already looks like XPath, pass through
138-
if path.startswith("/") or path.startswith("("):
139-
return path
176+
def SubElement(
    parent: Element,
    tag: str,
    attrib: dict[str, str] | None = None,
    **extra: str,
) -> Element:
    """Always raise ``TypeError``: creating subelements is unsupported (read-only).

    The arguments are accepted and ignored.
    """
    error = TypeError(_READONLY_MSG)
    raise error
140184

141-
# Ensure relative paths start with ./ for XPath context
142-
if not path.startswith("."):
143-
path = "./" + path
144185

145-
return path
186+
def Comment(text: str | None = None) -> Element:
    """Always raise ``TypeError``: creating comment nodes is unsupported (read-only)."""
    error = TypeError(_READONLY_MSG)
    raise error
146189

147190

148-
def _find(
149-
element: Element,
150-
path: str,
151-
namespaces: dict[str, str] | None = None,
152-
) -> Element | None:
153-
"""Find first matching subelement."""
154-
xpath = _path_to_xpath(path)
155-
try:
156-
results = element.xpath(xpath)
157-
return results[0] if results else None
158-
except ValueError:
159-
return None
160-
161-
162-
def _findall(
163-
element: Element,
164-
path: str,
165-
namespaces: dict[str, str] | None = None,
166-
) -> list[Element]:
167-
"""Find all matching subelements."""
168-
xpath = _path_to_xpath(path)
169-
try:
170-
return list(element.xpath(xpath))
171-
except ValueError:
172-
return []
191+
def ProcessingInstruction(target: str, text: str | None = None) -> Element:
    """Always raise ``TypeError``: creating processing instructions is unsupported.

    The arguments are accepted and ignored (read-only library).
    """
    error = TypeError(_READONLY_MSG)
    raise error
194+
195+
196+
PI = ProcessingInstruction
197+
198+
199+
def indent(
    tree: Element | ElementTree,
    space: str = " ",
    level: int = 0,
) -> None:
    """Always raise ``TypeError``: indenting would modify the tree (read-only).

    The arguments are accepted and ignored.
    """
    error = TypeError(_READONLY_MSG)
    raise error
206+
207+
208+
# Prefix -> URI registry kept for API compatibility. Entries are stored here,
# but nothing in this part of the module reads them back.
_namespace_map: dict[str, str] = {}


def register_namespace(prefix: str, uri: str) -> None:
    """Record ``uri`` under ``prefix`` in the module-level registry.

    The mapping is only stored; it is not used for queries. Registering the
    same prefix again overwrites the earlier URI.
    """
    _namespace_map.update({prefix: uri})

0 commit comments

Comments
 (0)