Skip to content

Commit be5be28

Browse files
author
Samson Gebre
committed
Enhance FetchXmlQuery and QueryOperations with input validation and paging improvements
1 parent e6a3b16 commit be5be28

3 files changed

Lines changed: 212 additions & 27 deletions

File tree

src/PowerPlatform/Dataverse/models/fetchxml_query.py

Lines changed: 62 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,12 @@
55

66
from __future__ import annotations
77

8+
import warnings
89
import xml.etree.ElementTree as _ET
910
from typing import Iterator, List, TYPE_CHECKING
10-
from urllib.parse import unquote as _url_unquote
11+
from urllib.parse import unquote as _url_unquote, quote as _url_quote
1112

13+
from ..core.errors import ValidationError
1214
from .record import QueryResult, Record
1315

1416
if TYPE_CHECKING:
@@ -21,6 +23,17 @@
2123
"odata.include-annotations=" '"Microsoft.Dynamics.CRM.fetchxmlpagingcookie,' 'Microsoft.Dynamics.CRM.morerecords"'
2224
)
2325

26+
# Documented Dataverse GET request URL limit. See:
27+
# learn.microsoft.com/power-apps/developer/data-platform/webapi/compose-http-requests-handle-errors#maximum-url-length
28+
# FetchXML queries with many attributes or conditions are the most common way to reach it.
29+
# $batch POST doubles this to 64 KB.
30+
_MAX_URL_LENGTH = 32_768
31+
# Guards against infinite paging loops caused by a bug in cookie propagation or an
32+
# unexpected server response. At the default Dataverse page size of 5,000 rows this
33+
# cap allows up to 50 million records before raising; it is not a practical record
34+
# limit but a circuit-breaker against runaway iteration.
35+
_MAX_PAGES = 10_000
36+
2437

2538
class FetchXmlQuery:
2639
"""Inert FetchXML query object. No HTTP request is made until
@@ -81,12 +94,27 @@ def execute_pages(self) -> Iterator[QueryResult]:
8194
"""
8295
current_xml = self._xml
8396
page_num = 1
97+
page_count = 0
8498

8599
with self._client._scoped_odata() as od:
86100
entity_set = od._entity_set_from_schema_name(self._entity_name)
87101
base_url = f"{od.api}/{entity_set}"
88102

89103
while True:
104+
page_count += 1
105+
if page_count > _MAX_PAGES:
106+
raise ValidationError(
107+
f"FetchXML paging exceeded {_MAX_PAGES} pages. "
108+
"This may indicate a runaway query or a bug in paging cookie propagation."
109+
)
110+
111+
encoded_len = len(base_url) + len("?fetchXml=") + len(_url_quote(current_xml, safe=""))
112+
if encoded_len > _MAX_URL_LENGTH:
113+
raise ValidationError(
114+
f"FetchXML request URL exceeds {_MAX_URL_LENGTH} characters after encoding. "
115+
"Simplify the query or reduce attributes/conditions."
116+
)
117+
90118
r = od._request(
91119
"get",
92120
base_url,
@@ -103,29 +131,46 @@ def execute_pages(self) -> Iterator[QueryResult]:
103131

104132
yield QueryResult(page_records)
105133

106-
more = bool(data.get("@Microsoft.Dynamics.CRM.morerecords", False)) if isinstance(data, dict) else False
134+
more_raw = data.get("@Microsoft.Dynamics.CRM.morerecords", False) if isinstance(data, dict) else False
135+
more = more_raw is True or (isinstance(more_raw, str) and more_raw.lower() == "true")
107136
if not more:
108137
break
109138

110139
raw_cookie = (
111140
data.get("@Microsoft.Dynamics.CRM.fetchxmlpagingcookie", "") if isinstance(data, dict) else ""
112141
)
113-
if not raw_cookie:
114-
break
115-
116-
# The annotation is outer XML: <cookie pagenumber="N" pagingcookie="DOUBLE_ENCODED" />
117-
# The pagingcookie attribute is double URL-encoded; decode twice to get raw cookie XML.
118-
try:
119-
cookie_el = _ET.fromstring(raw_cookie)
120-
except _ET.ParseError:
121-
break
122-
inner_encoded = cookie_el.get("pagingcookie", "")
123-
if not inner_encoded:
124-
break
125-
cookie = _url_unquote(_url_unquote(inner_encoded))
126-
page_num = int(cookie_el.get("pagenumber", str(page_num + 1)))
127142

143+
if raw_cookie:
144+
try:
145+
cookie_el = _ET.fromstring(raw_cookie)
146+
inner_encoded = cookie_el.get("pagingcookie", "")
147+
if inner_encoded:
148+
cookie = _url_unquote(_url_unquote(inner_encoded))
149+
page_num = int(cookie_el.get("pagenumber", str(page_num + 1)))
150+
fetch_el = _ET.fromstring(current_xml)
151+
fetch_el.set("paging-cookie", cookie)
152+
fetch_el.set("page", str(page_num))
153+
current_xml = _ET.tostring(fetch_el, encoding="unicode")
154+
continue
155+
except (_ET.ParseError, ValueError):
156+
pass # Fall through to simple paging
157+
158+
# Simple paging fallback: server returned morerecords=true but no paging
159+
# cookie. Dataverse omits the cookie when the query cannot use cookie-based
160+
# paging (e.g. FetchXML ordered by a link-entity column). We continue with
161+
# page-number-only paging rather than truncating, but warn because simple
162+
# paging has a 50,000-record server cap and performance degrades at high page
163+
# numbers. The caller may be able to avoid this by reordering on the root
164+
# entity instead.
165+
warnings.warn(
166+
"Dataverse did not return a paging cookie; falling back to simple paging "
167+
"(page-number increment only). Simple paging is capped at 50,000 records "
168+
"and degrades in performance at high page numbers. Consider reordering on "
169+
"a root-entity column to enable cookie-based paging.",
170+
UserWarning,
171+
stacklevel=2,
172+
)
173+
page_num += 1
128174
fetch_el = _ET.fromstring(current_xml)
129-
fetch_el.set("paging-cookie", cookie)
130175
fetch_el.set("page", str(page_num))
131176
current_xml = _ET.tostring(fetch_el, encoding="unicode")

src/PowerPlatform/Dataverse/operations/query.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@
88
import warnings
99
import xml.etree.ElementTree as _ET
1010
from typing import Any, Dict, List, Optional, TYPE_CHECKING
11+
from urllib.parse import quote as _url_quote
1112

12-
from ..core.errors import MetadataError
13-
from ..models.fetchxml_query import FetchXmlQuery
13+
from ..core.errors import MetadataError, ValidationError
14+
from ..models.fetchxml_query import FetchXmlQuery, _MAX_URL_LENGTH
1415
from ..models.record import Record
1516
from ..models.query_builder import QueryBuilder
1617

@@ -193,15 +194,35 @@ def fetchxml(self, xml: str) -> FetchXmlQuery:
193194
for page in query.execute_pages():
194195
process(page.to_dataframe())
195196
"""
196-
root_el = _ET.fromstring(xml.strip())
197+
if not isinstance(xml, str):
198+
raise ValidationError("xml must be a string")
199+
xml = xml.strip()
200+
if not xml:
201+
raise ValidationError("xml must not be empty")
202+
# Fast-fail before any HTTP is attempted; execute_pages() re-checks the full URL
203+
# (base + encoded XML) on each page.
204+
if len(_url_quote(xml, safe="")) > _MAX_URL_LENGTH:
205+
raise ValidationError(
206+
f"FetchXML exceeds the Dataverse URL length limit ({_MAX_URL_LENGTH:,} characters) when encoded. "
207+
"Use a $batch POST request to send FetchXML in the request body where the limit is 64 KB."
208+
)
209+
# Parse only to verify well-formedness and extract the entity name needed for the
210+
# request URL. Structural and semantic validation is intentionally left to the server
211+
# to avoid duplicating rules that may diverge from Dataverse's own enforcement.
212+
# ElementTree does not resolve external entities or expand recursive internal entity
213+
# references, so pathological inputs of that kind raise ParseError rather than
214+
# consuming resources.
215+
try:
216+
root_el = _ET.fromstring(xml)
217+
except _ET.ParseError as exc:
218+
raise ValidationError(f"xml is not well-formed: {exc}") from exc
197219
entity_el = root_el.find("entity")
198220
if entity_el is None:
199221
raise ValueError("FetchXML must contain an <entity> child element")
200222
entity_name = entity_el.get("name", "")
201223
if not entity_name:
202224
raise ValueError("FetchXML <entity> element must have a 'name' attribute")
203-
204-
return FetchXmlQuery(xml.strip(), entity_name, self._client)
225+
return FetchXmlQuery(xml, entity_name, self._client)
205226

206227
# --------------------------------------------------------------- sql_columns
207228

tests/unit/test_phase4_ga.py

Lines changed: 124 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ def test_empty_result_returns_empty_query_result(self):
8787

8888
def test_pagination_fetches_all_pages(self):
8989
"""execute_pages() drives the HTTP loop; each page yields one QueryResult."""
90-
cookie_raw = "%25253Cpagingcookie%252520pagingcookie%25253D%252522"
90+
# Annotation is outer XML; pagingcookie attribute is double URL-encoded inner cookie.
91+
cookie_raw = '<cookie pagenumber="2" pagingcookie="%253Cc%252F%253E" istracking="False" />'
9192
page1 = self._mock_response([{"name": "A"}], more=True, cookie=cookie_raw)
9293
page2 = self._mock_response([{"name": "B"}], more=False)
9394
self.client._odata._request.side_effect = [page1, page2]
@@ -98,7 +99,8 @@ def test_pagination_fetches_all_pages(self):
9899

99100
def test_pagination_second_request_includes_page_and_cookie(self):
100101
"""execute_pages() injects the decoded paging cookie into the second request."""
101-
cookie_raw = "%25253Cpagingcookie%252520test%25253D%252522"
102+
# pagingcookie="%253Cc%252F%253E": double URL-decode gives "<c/>" (the inner cookie XML).
103+
cookie_raw = '<cookie pagenumber="2" pagingcookie="%253Cc%252F%253E" istracking="False" />'
102104
page1 = self._mock_response([{"name": "A"}], more=True, cookie=cookie_raw)
103105
page2 = self._mock_response([{"name": "B"}], more=False)
104106
self.client._odata._request.side_effect = [page1, page2]
@@ -168,7 +170,7 @@ def test_no_deprecation_warning_emitted(self):
168170

169171
def test_execute_pages_returns_iterator_of_query_result(self):
170172
"""execute_pages() yields QueryResult objects, one per HTTP page."""
171-
cookie_raw = "%25253Cpagingcookie%252520pagingcookie%25253D%252522"
173+
cookie_raw = '<cookie pagenumber="2" pagingcookie="%253Cc%252F%253E" istracking="False" />'
172174
page1 = self._mock_response([{"name": "A"}], more=True, cookie=cookie_raw)
173175
page2 = self._mock_response([{"name": "B"}], more=False)
174176
self.client._odata._request.side_effect = [page1, page2]
@@ -180,7 +182,7 @@ def test_execute_pages_returns_iterator_of_query_result(self):
180182

181183
def test_execute_pages_one_http_call_per_page(self):
182184
"""Each execute_pages() iteration fires exactly one HTTP request."""
183-
cookie_raw = "%25253Cpagingcookie%252520pagingcookie%25253D%252522"
185+
cookie_raw = '<cookie pagenumber="2" pagingcookie="%253Cc%252F%253E" istracking="False" />'
184186
page1 = self._mock_response([{"name": "A"}], more=True, cookie=cookie_raw)
185187
page2 = self._mock_response([{"name": "B"}], more=False)
186188
self.client._odata._request.side_effect = [page1, page2]
@@ -193,7 +195,7 @@ def test_execute_pages_one_http_call_per_page(self):
193195

194196
def test_execute_pages_per_page_records(self):
195197
"""Each page yielded by execute_pages() contains only its own records."""
196-
cookie_raw = "%25253Cpagingcookie%252520pagingcookie%25253D%252522"
198+
cookie_raw = '<cookie pagenumber="2" pagingcookie="%253Cc%252F%253E" istracking="False" />'
197199
page1 = self._mock_response([{"name": "A"}], more=True, cookie=cookie_raw)
198200
page2 = self._mock_response([{"name": "B"}, {"name": "C"}], more=False)
199201
self.client._odata._request.side_effect = [page1, page2]
@@ -204,6 +206,123 @@ def test_execute_pages_per_page_records(self):
204206
self.assertEqual(pages[0].first()["name"], "A")
205207
self.assertEqual(pages[1].first()["name"], "B")
206208

209+
# ------------------------------------------------------------------
210+
# Input validation
211+
# ------------------------------------------------------------------
212+
213+
def test_non_string_input_raises_validation_error(self):
214+
from PowerPlatform.Dataverse.core.errors import ValidationError
215+
216+
with self.assertRaises(ValidationError):
217+
self.client.query.fetchxml(123)
218+
219+
def test_empty_string_raises_validation_error(self):
220+
from PowerPlatform.Dataverse.core.errors import ValidationError
221+
222+
with self.assertRaises(ValidationError):
223+
self.client.query.fetchxml("")
224+
225+
def test_whitespace_only_raises_validation_error(self):
226+
from PowerPlatform.Dataverse.core.errors import ValidationError
227+
228+
with self.assertRaises(ValidationError):
229+
self.client.query.fetchxml(" ")
230+
231+
def test_malformed_xml_raises_validation_error(self):
232+
from PowerPlatform.Dataverse.core.errors import ValidationError
233+
234+
with self.assertRaises(ValidationError):
235+
self.client.query.fetchxml("<fetch><unclosed>")
236+
237+
def test_url_too_long_raises_validation_error(self):
238+
"""XML whose URL-encoded form exceeds 32,768 chars is rejected before any HTTP."""
239+
from PowerPlatform.Dataverse.core.errors import ValidationError
240+
241+
# Alphanumeric chars are URL-safe and don't expand; a 32,769-char name attribute
242+
# value pushes the encoded XML over the limit.
243+
long_name = "a" * 32_769
244+
big_xml = f'<fetch><entity name="{long_name}"><attribute name="x"/></entity></fetch>'
245+
with self.assertRaises(ValidationError):
246+
self.client.query.fetchxml(big_xml)
247+
248+
# ------------------------------------------------------------------
249+
# Paging behaviour
250+
# ------------------------------------------------------------------
251+
252+
def test_morerecords_string_true_continues_paging(self):
253+
"""morerecords annotation as string "true" (not bool) is handled correctly."""
254+
cookie_raw = '<cookie pagenumber="2" pagingcookie="%253Cc%252F%253E" istracking="False" />'
255+
page1_payload = {
256+
"value": [{"name": "A"}],
257+
"@Microsoft.Dynamics.CRM.morerecords": "true",
258+
"@Microsoft.Dynamics.CRM.fetchxmlpagingcookie": cookie_raw,
259+
}
260+
page2_payload = {
261+
"value": [{"name": "B"}],
262+
"@Microsoft.Dynamics.CRM.morerecords": False,
263+
}
264+
r1, r2 = MagicMock(), MagicMock()
265+
r1.json.return_value = page1_payload
266+
r2.json.return_value = page2_payload
267+
self.client._odata._request.side_effect = [r1, r2]
268+
269+
result = self.client.query.fetchxml(self._fetch_xml()).execute()
270+
self.assertEqual(len(result), 2)
271+
self.assertEqual(self.client._odata._request.call_count, 2)
272+
273+
def test_simple_paging_fallback_emits_user_warning(self):
274+
"""No cookie returned with morerecords=True triggers a UserWarning."""
275+
page1 = self._mock_response([{"name": "A"}], more=True, cookie="")
276+
page2 = self._mock_response([{"name": "B"}], more=False)
277+
self.client._odata._request.side_effect = [page1, page2]
278+
279+
with warnings.catch_warnings(record=True) as caught:
280+
warnings.simplefilter("always")
281+
list(self.client.query.fetchxml(self._fetch_xml()).execute_pages())
282+
283+
user_warnings = [w for w in caught if issubclass(w.category, UserWarning)]
284+
self.assertEqual(len(user_warnings), 1)
285+
self.assertIn("simple paging", str(user_warnings[0].message).lower())
286+
287+
def test_simple_paging_fallback_fetches_all_pages(self):
288+
"""Simple paging fallback continues iterating; caller still gets all records."""
289+
page1 = self._mock_response([{"name": "A"}], more=True, cookie="")
290+
page2 = self._mock_response([{"name": "B"}], more=False)
291+
self.client._odata._request.side_effect = [page1, page2]
292+
293+
with warnings.catch_warnings(record=True):
294+
warnings.simplefilter("always")
295+
result = self.client.query.fetchxml(self._fetch_xml()).execute()
296+
297+
self.assertEqual(len(result), 2)
298+
self.assertEqual(self.client._odata._request.call_count, 2)
299+
300+
def test_malformed_cookie_falls_back_to_simple_paging(self):
301+
"""A cookie annotation that is not parseable as XML triggers simple paging fallback."""
302+
page1 = self._mock_response([{"name": "A"}], more=True, cookie="not-valid-xml")
303+
page2 = self._mock_response([{"name": "B"}], more=False)
304+
self.client._odata._request.side_effect = [page1, page2]
305+
306+
with warnings.catch_warnings(record=True) as caught:
307+
warnings.simplefilter("always")
308+
result = self.client.query.fetchxml(self._fetch_xml()).execute()
309+
310+
self.assertEqual(len(result), 2)
311+
user_warnings = [w for w in caught if issubclass(w.category, UserWarning)]
312+
self.assertEqual(len(user_warnings), 1)
313+
314+
def test_max_pages_exceeded_raises(self):
315+
"""Paging loop raises ValidationError after exceeding _MAX_PAGES."""
316+
from PowerPlatform.Dataverse.core.errors import ValidationError
317+
318+
cookie_raw = '<cookie pagenumber="2" pagingcookie="%253Cc%252F%253E" istracking="False" />'
319+
always_more = self._mock_response([{"name": "A"}], more=True, cookie=cookie_raw)
320+
self.client._odata._request.return_value = always_more
321+
322+
with patch("PowerPlatform.Dataverse.models.fetchxml_query._MAX_PAGES", 3):
323+
with self.assertRaises(ValidationError):
324+
list(self.client.query.fetchxml(self._fetch_xml()).execute_pages())
325+
207326

208327
# ---------------------------------------------------------------------------
209328
# Removed SQL helpers — raise AttributeError

0 commit comments

Comments
 (0)