@@ -181,7 +181,7 @@ def _lowercase_keys(record: Dict[str, Any]) -> Dict[str, Any]:
181181 def _lowercase_list (items : Optional [List [str ]]) -> Optional [List [str ]]:
182182 """Convert all strings in a list to lowercase for case-insensitive column names.
183183
184- Used for $select, $orderby, $expand parameters where column names must be lowercase.
184+ Used for $select and $orderby parameters where column names must be lowercase.
185185 """
186186 if not items :
187187 return items
@@ -714,6 +714,169 @@ def _build_get_relationship(self, schema_name: str) -> _RawRequest:
714714 url = f"{ self .api } /RelationshipDefinitions?$filter=SchemaName eq '{ escaped } '" ,
715715 )
716716
717+ # ------------------------------------------------------------------
718+ # SQL guardrails
719+ # ------------------------------------------------------------------
720+
721+ # ----------------------- SQL guardrail patterns --------------------
722+ _SQL_WRITE_RE = re .compile (
723+ r"^\s*(?:INSERT|UPDATE|DELETE|DROP|TRUNCATE|ALTER|CREATE|EXEC|GRANT|REVOKE|BULK)\b" ,
724+ re .IGNORECASE ,
725+ )
726+ _SQL_COMMENT_RE = re .compile (r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/|--[^\n]*" , re .DOTALL )
727+ _SQL_LEADING_WILDCARD_RE = re .compile (r"\bLIKE\s+'%[^']" , re .IGNORECASE )
728+ _SQL_IMPLICIT_CROSS_JOIN_RE = re .compile (
729+ r"\bFROM\s+[A-Za-z0-9_]+(?:\s+[A-Za-z0-9_]+)?\s*,\s*[A-Za-z0-9_]+" ,
730+ re .IGNORECASE ,
731+ )
732+ # Server-blocked SQL patterns (save the round-trip by catching early)
733+ _SQL_UNSUPPORTED_JOIN_RE = re .compile (
734+ r"\b(?:CROSS\s+JOIN|RIGHT\s+(?:OUTER\s+)?JOIN|FULL\s+(?:OUTER\s+)?JOIN)\b" ,
735+ re .IGNORECASE ,
736+ )
737+ _SQL_UNION_RE = re .compile (r"\bUNION\b" , re .IGNORECASE )
738+ _SQL_HAVING_RE = re .compile (r"\bHAVING\b" , re .IGNORECASE )
739+ _SQL_CTE_RE = re .compile (r"^\s*WITH\b" , re .IGNORECASE )
740+ _SQL_SUBQUERY_RE = re .compile (
741+ r"\bIN\s*\(\s*SELECT\b|\bEXISTS\s*\(\s*SELECT\b|\(\s*SELECT\b.*\bFROM\b" ,
742+ re .IGNORECASE ,
743+ )
744+ # SELECT * is intentionally rejected -- not a technical limitation but a
745+ # deliberate design decision. Wide entities (e.g. account has 307 columns)
746+ # make SELECT * extremely expensive on shared database infrastructure.
747+ # COUNT(*) is NOT matched because COUNT appears before the *.
748+ _SQL_SELECT_STAR_RE = re .compile (
749+ r"\bSELECT\b\s+(?:DISTINCT\s+)?(?:TOP\s+\d+(?:\s+PERCENT)?\s+)?\*\s" ,
750+ re .IGNORECASE ,
751+ )
752+
753+ def _sql_guardrails (self , sql : str ) -> str :
754+ """Apply safety guardrails to a SQL query before sending to the server.
755+
756+ Checks split into two categories:
757+
758+ **Blocked** (``ValidationError`` -- saves a server round-trip):
759+
760+ 1. Write statements (INSERT/UPDATE/DELETE/DROP/etc.)
761+ 2. CROSS JOIN, RIGHT JOIN, FULL OUTER JOIN (server rejects these)
762+ 3. UNION / UNION ALL (server rejects)
763+ 4. HAVING clause (server rejects)
764+ 5. CTE / WITH clause (server rejects)
765+ 6. Subqueries -- IN (SELECT ...), EXISTS (SELECT ...) (server rejects)
766+ 7. SELECT * -- intentional design decision, not a technical limitation.
767+ Wide entities make wildcard selects extremely expensive on shared
768+ database infrastructure. ``COUNT(*)`` is not affected.
769+
770+ **Warned** (``UserWarning`` -- query still executes):
771+
772+ 8. Leading-wildcard LIKE (full table scan)
773+ 9. Implicit cross join FROM a, b (cartesian product)
774+
775+ All blocked patterns are also blocked by the server, but catching
776+ them here saves the network round-trip and provides clearer error
777+ messages. To bypass a specific check (e.g., if the server adds
778+ support in the future), all checks are in this single method.
779+
780+ :param sql: The SQL string (already stripped).
781+ :return: The SQL string (unchanged).
782+ :raises ValidationError: If the SQL contains a blocked pattern.
783+ """
784+ # --- BLOCKED (save server round-trip) ---
785+
786+ # 1. Block writes (strip SQL comments first to catch comment-prefixed writes)
787+ sql_no_comments = self ._SQL_COMMENT_RE .sub (" " , sql ).strip ()
788+ if self ._SQL_WRITE_RE .search (sql_no_comments ):
789+ raise ValidationError (
790+ "SQL endpoint is read-only. Use client.records or "
791+ "client.dataframe for write operations "
792+ "(INSERT/UPDATE/DELETE are not supported)." ,
793+ subcode = VALIDATION_SQL_WRITE_BLOCKED ,
794+ )
795+
796+ # 2. Block unsupported JOIN types
797+ m = self ._SQL_UNSUPPORTED_JOIN_RE .search (sql )
798+ if m :
799+ raise ValidationError (
800+ f"Unsupported JOIN type: '{ m .group (0 ).strip ()} '. "
801+ "Only INNER JOIN and LEFT JOIN are supported by the "
802+ "Dataverse SQL endpoint." ,
803+ subcode = VALIDATION_SQL_UNSUPPORTED_SYNTAX ,
804+ )
805+
806+ # 3. Block UNION
807+ if self ._SQL_UNION_RE .search (sql ):
808+ raise ValidationError (
809+ "UNION is not supported by the Dataverse SQL endpoint. "
810+ "Execute separate queries and combine results in Python "
811+ "(e.g. pd.concat([df1, df2]))." ,
812+ subcode = VALIDATION_SQL_UNSUPPORTED_SYNTAX ,
813+ )
814+
815+ # 4. Block HAVING
816+ if self ._SQL_HAVING_RE .search (sql ):
817+ raise ValidationError (
818+ "HAVING is not supported by the Dataverse SQL endpoint. "
819+ "Use WHERE to filter before GROUP BY instead." ,
820+ subcode = VALIDATION_SQL_UNSUPPORTED_SYNTAX ,
821+ )
822+
823+ # 5. Block CTE / WITH
824+ if self ._SQL_CTE_RE .search (sql ):
825+ raise ValidationError (
826+ "CTE (WITH ... AS) is not supported by the Dataverse SQL "
827+ "endpoint. Use separate queries and combine in Python." ,
828+ subcode = VALIDATION_SQL_UNSUPPORTED_SYNTAX ,
829+ )
830+
831+ # 6. Block subqueries
832+ if self ._SQL_SUBQUERY_RE .search (sql ):
833+ raise ValidationError (
834+ "Subqueries are not supported by the Dataverse SQL "
835+ "endpoint. Use separate SQL calls and combine results "
836+ "in Python (e.g. step 1: get IDs, step 2: WHERE IN)." ,
837+ subcode = VALIDATION_SQL_UNSUPPORTED_SYNTAX ,
838+ )
839+
840+ # 7. Block SELECT * -- intentional design decision.
841+ # Wide entities (e.g. account has 307 columns) make wildcard selects
842+ # extremely expensive on shared database infrastructure.
843+ # COUNT(*) is NOT matched: _SQL_SELECT_STAR_RE requires * to be the
844+ # first token after SELECT/DISTINCT/TOP N, so COUNT appears before *.
845+ if self ._SQL_SELECT_STAR_RE .search (sql ):
846+ raise ValidationError (
847+ "SELECT * is not supported. Specify column names explicitly "
848+ "(e.g. SELECT name, revenue FROM account). "
849+ "Use client.query.sql_columns('account') to discover available columns." ,
850+ subcode = VALIDATION_SQL_UNSUPPORTED_SYNTAX ,
851+ )
852+
853+ # --- WARNED (query still executes) ---
854+
855+ # 8. Warn on leading-wildcard LIKE
856+ if self ._SQL_LEADING_WILDCARD_RE .search (sql ):
857+ warnings .warn (
858+ "Query contains a leading-wildcard LIKE pattern "
859+ "(e.g. LIKE '%value'). This forces a full table scan "
860+ "and may degrade performance on large tables. "
861+ "Prefer trailing wildcards (LIKE 'value%') when possible." ,
862+ UserWarning ,
863+ stacklevel = 4 ,
864+ )
865+
866+ # 9. Warn on implicit cross joins (server allows but risky)
867+ if self ._SQL_IMPLICIT_CROSS_JOIN_RE .search (sql ):
868+ warnings .warn (
869+ "Query uses an implicit cross join (FROM table1, table2). "
870+ "This produces a cartesian product that can generate "
871+ "millions of intermediate rows and degrade shared database "
872+ "performance. Use explicit JOIN...ON syntax instead: "
873+ "FROM table1 a JOIN table2 b ON a.column = b.column" ,
874+ UserWarning ,
875+ stacklevel = 4 ,
876+ )
877+
878+ return sql
879+
717880 # ------------------------------------------------------------------
718881 # Cache maintenance
719882 # ------------------------------------------------------------------
0 commit comments