Skip to content

Commit d01181e

Browse files
Abel Milash and claude committed
Restore inline comments in _sql_guardrails and SQL regex patterns
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent cc8e478 commit d01181e

1 file changed

Lines changed: 164 additions & 1 deletion

File tree

src/PowerPlatform/Dataverse/data/_odata_base.py

Lines changed: 164 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ def _lowercase_keys(record: Dict[str, Any]) -> Dict[str, Any]:
181181
def _lowercase_list(items: Optional[List[str]]) -> Optional[List[str]]:
182182
"""Convert all strings in a list to lowercase for case-insensitive column names.
183183
184-
Used for $select, $orderby, $expand parameters where column names must be lowercase.
184+
Used for $select and $orderby parameters where column names must be lowercase.
185185
"""
186186
if not items:
187187
return items
@@ -714,6 +714,169 @@ def _build_get_relationship(self, schema_name: str) -> _RawRequest:
714714
url=f"{self.api}/RelationshipDefinitions?$filter=SchemaName eq '{escaped}'",
715715
)
716716

717+
# ------------------------------------------------------------------
718+
# SQL guardrails
719+
# ------------------------------------------------------------------
720+
721+
# ----------------------- SQL guardrail patterns --------------------
722+
# Statement-leading keywords that modify data, schema, or permissions.
# Anchored at the start of the text, so callers should strip leading
# comments before matching. EXEC and EXECUTE are both covered; MERGE and
# DENY are included alongside the other write/permission verbs.
_SQL_WRITE_RE = re.compile(
    r"^\s*(?:INSERT|UPDATE|DELETE|MERGE|DROP|TRUNCATE|ALTER|CREATE"
    r"|EXEC(?:UTE)?|GRANT|REVOKE|DENY|BULK)\b",
    re.IGNORECASE,
)
# Block comments (/* ... */) and line comments (-- ...).
_SQL_COMMENT_RE = re.compile(r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/|--[^\n]*", re.DOTALL)
# LIKE whose pattern starts with '%' followed by at least one more character.
# An optional N prefix covers Unicode literals (LIKE N'%value').
_SQL_LEADING_WILDCARD_RE = re.compile(r"\bLIKE\s*N?'%[^']", re.IGNORECASE)
# Comma-separated table list: FROM a, b  or  FROM a x, b
_SQL_IMPLICIT_CROSS_JOIN_RE = re.compile(
    r"\bFROM\s+[A-Za-z0-9_]+(?:\s+[A-Za-z0-9_]+)?\s*,\s*[A-Za-z0-9_]+",
    re.IGNORECASE,
)
# Server-blocked SQL patterns (save the round-trip by catching early)
_SQL_UNSUPPORTED_JOIN_RE = re.compile(
    r"\b(?:CROSS\s+JOIN|RIGHT\s+(?:OUTER\s+)?JOIN|FULL\s+(?:OUTER\s+)?JOIN)\b",
    re.IGNORECASE,
)
_SQL_UNION_RE = re.compile(r"\bUNION\b", re.IGNORECASE)
_SQL_HAVING_RE = re.compile(r"\bHAVING\b", re.IGNORECASE)
_SQL_CTE_RE = re.compile(r"^\s*WITH\b", re.IGNORECASE)
# DOTALL lets the "( SELECT ... FROM" alternative span line breaks, so a
# derived table written across several lines is still detected.
_SQL_SUBQUERY_RE = re.compile(
    r"\bIN\s*\(\s*SELECT\b|\bEXISTS\s*\(\s*SELECT\b|\(\s*SELECT\b.*\bFROM\b",
    re.IGNORECASE | re.DOTALL,
)
# SELECT * is intentionally rejected -- not a technical limitation but a
# deliberate design decision. Wide entities (e.g. account has 307 columns)
# make SELECT * extremely expensive on shared database infrastructure.
# COUNT(*) is NOT matched because COUNT appears before the *.
# The * only has to be the first token after SELECT [DISTINCT] [TOP n];
# no trailing whitespace is required, so "SELECT *FROM t" is caught too.
# Both TOP n and the parenthesised TOP (n) forms are recognised.
_SQL_SELECT_STAR_RE = re.compile(
    r"\bSELECT\b\s+(?:DISTINCT\s+)?"
    r"(?:TOP\s*(?:\d+|\(\s*\d+\s*\))(?:\s+PERCENT)?\s*)?\*",
    re.IGNORECASE,
)
752+
753+
def _sql_guardrails(self, sql: str) -> str:
    """Apply safety guardrails to a SQL query before sending to the server.

    All structural checks run against a sanitized copy of the query in
    which comments are removed and string-literal contents are blanked,
    so a keyword that only appears inside a literal or a comment (for
    example ``WHERE name = 'UNION'``) does not trigger a check, and a
    statement hidden behind a leading comment is still detected.

    Checks split into two categories:

    **Blocked** (``ValidationError`` -- saves a server round-trip):

    1. Write statements (INSERT/UPDATE/DELETE/DROP/etc.)
    2. CROSS JOIN, RIGHT JOIN, FULL OUTER JOIN (server rejects these)
    3. UNION / UNION ALL (server rejects)
    4. HAVING clause (server rejects)
    5. CTE / WITH clause (server rejects)
    6. Subqueries -- IN (SELECT ...), EXISTS (SELECT ...) (server rejects)
    7. SELECT * -- intentional design decision, not a technical limitation.
       Wide entities make wildcard selects extremely expensive on shared
       database infrastructure. ``COUNT(*)`` is not affected.

    **Warned** (``UserWarning`` -- query still executes):

    8. Leading-wildcard LIKE (full table scan)
    9. Implicit cross join FROM a, b (cartesian product)

    All blocked patterns are also blocked by the server, but catching
    them here saves the network round-trip and provides clearer error
    messages. To bypass a specific check (e.g., if the server adds
    support in the future), all checks are in this single method.

    :param sql: The SQL string (already stripped).
    :return: The SQL string (unchanged).
    :raises ValidationError: If the SQL contains a blocked pattern.
    """
    # One left-to-right scan over string literals and comments. Matching
    # both in a single alternation keeps "--" or "/*" inside a literal
    # from being treated as a comment, and a quote inside a comment from
    # being treated as the start of a literal. ('' is an escaped quote.)
    # The re module caches compiled patterns, so this is cheap per call.
    token_re = re.compile(r"'(?:[^']|'')*'|/\*.*?\*/|--[^\n]*", re.DOTALL)

    def _keep_literals(match):
        # Comments become a single space; literals are left untouched.
        text = match.group(0)
        return text if text.startswith("'") else " "

    def _blank_literals(match):
        # Comments become a single space; literals collapse to ''.
        return "''" if match.group(0).startswith("'") else " "

    # Literals preserved: needed by the LIKE check, which inspects them.
    sql_no_comments = token_re.sub(_keep_literals, sql).strip()
    # Literals blanked: used by every keyword/structure check.
    sql_structure = token_re.sub(_blank_literals, sql).strip()

    # --- BLOCKED (save server round-trip) ---

    # 1. Block writes (comments already stripped, so comment-prefixed
    #    writes are caught by the start-anchored pattern)
    if self._SQL_WRITE_RE.search(sql_structure):
        raise ValidationError(
            "SQL endpoint is read-only. Use client.records or "
            "client.dataframe for write operations "
            "(INSERT/UPDATE/DELETE are not supported).",
            subcode=VALIDATION_SQL_WRITE_BLOCKED,
        )

    # 2. Block unsupported JOIN types
    m = self._SQL_UNSUPPORTED_JOIN_RE.search(sql_structure)
    if m:
        raise ValidationError(
            f"Unsupported JOIN type: '{m.group(0).strip()}'. "
            "Only INNER JOIN and LEFT JOIN are supported by the "
            "Dataverse SQL endpoint.",
            subcode=VALIDATION_SQL_UNSUPPORTED_SYNTAX,
        )

    # 3. Block UNION
    if self._SQL_UNION_RE.search(sql_structure):
        raise ValidationError(
            "UNION is not supported by the Dataverse SQL endpoint. "
            "Execute separate queries and combine results in Python "
            "(e.g. pd.concat([df1, df2])).",
            subcode=VALIDATION_SQL_UNSUPPORTED_SYNTAX,
        )

    # 4. Block HAVING
    if self._SQL_HAVING_RE.search(sql_structure):
        raise ValidationError(
            "HAVING is not supported by the Dataverse SQL endpoint. "
            "Use WHERE to filter before GROUP BY instead.",
            subcode=VALIDATION_SQL_UNSUPPORTED_SYNTAX,
        )

    # 5. Block CTE / WITH (start-anchored; works because leading
    #    comments were removed above)
    if self._SQL_CTE_RE.search(sql_structure):
        raise ValidationError(
            "CTE (WITH ... AS) is not supported by the Dataverse SQL "
            "endpoint. Use separate queries and combine in Python.",
            subcode=VALIDATION_SQL_UNSUPPORTED_SYNTAX,
        )

    # 6. Block subqueries
    if self._SQL_SUBQUERY_RE.search(sql_structure):
        raise ValidationError(
            "Subqueries are not supported by the Dataverse SQL "
            "endpoint. Use separate SQL calls and combine results "
            "in Python (e.g. step 1: get IDs, step 2: WHERE IN).",
            subcode=VALIDATION_SQL_UNSUPPORTED_SYNTAX,
        )

    # 7. Block SELECT * -- intentional design decision.
    #    Wide entities (e.g. account has 307 columns) make wildcard selects
    #    extremely expensive on shared database infrastructure.
    #    COUNT(*) is NOT matched: the pattern requires * to be the first
    #    token after SELECT/DISTINCT/TOP N, and COUNT appears before *.
    if self._SQL_SELECT_STAR_RE.search(sql_structure):
        raise ValidationError(
            "SELECT * is not supported. Specify column names explicitly "
            "(e.g. SELECT name, revenue FROM account). "
            "Use client.query.sql_columns('account') to discover available columns.",
            subcode=VALIDATION_SQL_UNSUPPORTED_SYNTAX,
        )

    # --- WARNED (query still executes) ---

    # 8. Warn on leading-wildcard LIKE. This check needs the literal
    #    contents, so it uses the comment-stripped text only.
    if self._SQL_LEADING_WILDCARD_RE.search(sql_no_comments):
        warnings.warn(
            "Query contains a leading-wildcard LIKE pattern "
            "(e.g. LIKE '%value'). This forces a full table scan "
            "and may degrade performance on large tables. "
            "Prefer trailing wildcards (LIKE 'value%') when possible.",
            UserWarning,
            stacklevel=4,
        )

    # 9. Warn on implicit cross joins (server allows but risky)
    if self._SQL_IMPLICIT_CROSS_JOIN_RE.search(sql_structure):
        warnings.warn(
            "Query uses an implicit cross join (FROM table1, table2). "
            "This produces a cartesian product that can generate "
            "millions of intermediate rows and degrade shared database "
            "performance. Use explicit JOIN...ON syntax instead: "
            "FROM table1 a JOIN table2 b ON a.column = b.column",
            UserWarning,
            stacklevel=4,
        )

    # The caller receives the original text; sanitized copies are only
    # used for inspection.
    return sql
879+
717880
# ------------------------------------------------------------------
718881
# Cache maintenance
719882
# ------------------------------------------------------------------

0 commit comments

Comments
 (0)