Skip to content

Commit 6cea061

Browse files
authored
* Move example to doctestable examples for context.py * Add more standard datafusion namespaces to reduce clutter * Update project to use ruff compatible with pre-commit version * Resolve ruff errors for newer version but just ignore them * Convert dataframe examples to doctestable. Found bug in dropping A * Move expr.py to doctestable examples * Move user_defined.py to doctestable examples
* Move example to doctestable examples for context.py * Add more standard dafusion namespaces to reduce clutter * Update project to use ruff compatible with pre-commit version * Resolve ruff errors for newer version but just ignore them * Convert dataframe examples to doctestable. Found bug in dropping A * Move expr.py to doctestable examples * Move user_defined.py to doctestable examples
1 parent 876646d commit 6cea061

File tree

10 files changed

+238
-167
lines changed

10 files changed

+238
-167
lines changed

conftest.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,15 @@
2020
import datafusion as dfn
2121
import numpy as np
2222
import pytest
23+
from datafusion import col, lit
24+
from datafusion import functions as F
2325

2426

2527
@pytest.fixture(autouse=True)
2628
def _doctest_namespace(doctest_namespace: dict) -> None:
2729
"""Add common imports to the doctest namespace."""
2830
doctest_namespace["dfn"] = dfn
2931
doctest_namespace["np"] = np
32+
doctest_namespace["col"] = col
33+
doctest_namespace["lit"] = lit
34+
doctest_namespace["F"] = F

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@ ignore = [
8989
"FIX002", # Allow TODO lines - consider removing at some point
9090
"ISC001", # Recommended to ignore these rules when using with ruff-format
9191
"N812", # Allow importing functions as `F`
92-
"PD901", # Allow variable name df
9392
"PLR0913", # Allow many arguments in function definition
9493
"SLF001", # Allow accessing private members
9594
"TD002", # Do not require author names in TODO statements
@@ -195,7 +194,7 @@ dev = [
195194
"pytest-asyncio>=0.23.3",
196195
"pytest>=7.4.4",
197196
"pyyaml>=6.0.3",
198-
"ruff>=0.9.1",
197+
"ruff>=0.15.1",
199198
"toml>=0.10.2",
200199
]
201200
docs = [

python/datafusion/context.py

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -371,9 +371,8 @@ def with_fair_spill_pool(self, size: int) -> RuntimeEnvBuilder:
371371
Returns:
372372
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.
373373
374-
Examples usage::
375-
376-
config = RuntimeEnvBuilder().with_fair_spill_pool(1024)
374+
Examples:
375+
>>> config = dfn.RuntimeEnvBuilder().with_fair_spill_pool(1024)
377376
"""
378377
self.config_internal = self.config_internal.with_fair_spill_pool(size)
379378
return self
@@ -391,9 +390,8 @@ def with_greedy_memory_pool(self, size: int) -> RuntimeEnvBuilder:
391390
Returns:
392391
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.
393392
394-
Example usage::
395-
396-
config = RuntimeEnvBuilder().with_greedy_memory_pool(1024)
393+
Examples:
394+
>>> config = dfn.RuntimeEnvBuilder().with_greedy_memory_pool(1024)
397395
"""
398396
self.config_internal = self.config_internal.with_greedy_memory_pool(size)
399397
return self
@@ -407,9 +405,8 @@ def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeEnvBuilder:
407405
Returns:
408406
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.
409407
410-
Example usage::
411-
412-
config = RuntimeEnvBuilder().with_temp_file_path("/tmp")
408+
Examples:
409+
>>> config = dfn.RuntimeEnvBuilder().with_temp_file_path("/tmp")
413410
"""
414411
self.config_internal = self.config_internal.with_temp_file_path(str(path))
415412
return self
@@ -444,9 +441,8 @@ def with_allow_ddl(self, allow: bool = True) -> SQLOptions:
444441
Returns:
445442
A new :py:class:`SQLOptions` object with the updated setting.
446443
447-
Example usage::
448-
449-
options = SQLOptions().with_allow_ddl(True)
444+
Examples:
445+
>>> options = dfn.SQLOptions().with_allow_ddl(True)
450446
"""
451447
self.options_internal = self.options_internal.with_allow_ddl(allow)
452448
return self
@@ -462,9 +458,8 @@ def with_allow_dml(self, allow: bool = True) -> SQLOptions:
462458
Returns:
463459
A new :py:class:`SQLOptions` object with the updated setting.
464460
465-
Example usage::
466-
467-
options = SQLOptions().with_allow_dml(True)
461+
Examples:
462+
>>> options = dfn.SQLOptions().with_allow_dml(True)
468463
"""
469464
self.options_internal = self.options_internal.with_allow_dml(allow)
470465
return self
@@ -478,9 +473,8 @@ def with_allow_statements(self, allow: bool = True) -> SQLOptions:
478473
Returns:
479474
A new :py:class:`SQLOptions` object with the updated setting.
480475
481-
Example usage::
482-
483-
options = SQLOptions().with_allow_statements(True)
476+
Examples:
477+
>>> options = dfn.SQLOptions().with_allow_statements(True)
484478
"""
485479
self.options_internal = self.options_internal.with_allow_statements(allow)
486480
return self

python/datafusion/dataframe.py

Lines changed: 51 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ def into_view(self, temporary: bool = False) -> Table:
331331
>>> result[0].column("value").to_pylist()
332332
[1]
333333
"""
334-
from datafusion.catalog import Table as _Table
334+
from datafusion.catalog import Table as _Table # noqa: PLC0415
335335

336336
return _Table(self.df.into_view(temporary))
337337

@@ -451,9 +451,20 @@ def drop(self, *columns: str) -> DataFrame:
451451
Returns:
452452
DataFrame with those columns removed in the projection.
453453
454-
Example Usage::
455-
df.drop('a') # To drop a lower-cased column 'a'
456-
df.drop('"a"') # To drop an upper-cased column 'A'
454+
Examples:
455+
To drop a lower-cased column 'a'
456+
457+
>>> ctx = dfn.SessionContext()
458+
>>> df = ctx.from_pydict({"a": [1, 2], "b": [3, 4]})
459+
>>> df.drop("a").schema().names
460+
['b']
461+
462+
Or to drop an upper-cased column 'A'
463+
464+
>>> ctx = dfn.SessionContext()
465+
>>> df = ctx.from_pydict({"A": [1, 2], "b": [3, 4]})
466+
>>> df.drop('"A"').schema().names
467+
['b']
457468
"""
458469
return DataFrame(self.df.drop(*columns))
459470

@@ -468,11 +479,13 @@ def filter(self, *predicates: Expr | str) -> DataFrame:
468479
that will be parsed against the DataFrame schema. If more complex logic is
469480
required, see the logical operations in :py:mod:`~datafusion.functions`.
470481
471-
Example::
472-
473-
from datafusion import col, lit
474-
df.filter(col("a") > lit(1))
475-
df.filter("a > 1")
482+
Examples:
483+
>>> ctx = dfn.SessionContext()
484+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
485+
>>> df.filter(col("a") > lit(1)).to_pydict()
486+
{'a': [2, 3]}
487+
>>> df.filter("a > 1").to_pydict()
488+
{'a': [2, 3]}
476489
477490
Args:
478491
predicates: Predicate expression(s) or SQL strings to filter the DataFrame.
@@ -495,14 +508,12 @@ def parse_sql_expr(self, expr: str) -> Expr:
495508
496509
The expression is created and processed against the current schema.
497510
498-
Example::
499-
500-
from datafusion import col, lit
501-
df.parse_sql_expr("a > 1")
502-
503-
should produce:
504-
505-
col("a") > lit(1)
511+
Examples:
512+
>>> ctx = dfn.SessionContext()
513+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
514+
>>> expr = df.parse_sql_expr("a > 1")
515+
>>> df.filter(expr).to_pydict()
516+
{'a': [2, 3]}
506517
507518
Args:
508519
expr: Expression string to be converted to datafusion expression
@@ -519,10 +530,11 @@ def with_column(self, name: str, expr: Expr | str) -> DataFrame:
519530
:func:`datafusion.col` or :func:`datafusion.lit`, or a SQL expression
520531
string that will be parsed against the DataFrame schema.
521532
522-
Example::
523-
524-
from datafusion import col, lit
525-
df.with_column("b", col("a") + lit(1))
533+
Examples:
534+
>>> ctx = dfn.SessionContext()
535+
>>> df = ctx.from_pydict({"a": [1, 2]})
536+
>>> df.with_column("b", col("a") + lit(10)).to_pydict()
537+
{'a': [1, 2], 'b': [11, 12]}
526538
527539
Args:
528540
name: Name of the column to add.
@@ -885,10 +897,14 @@ def join_on(
885897
built with :func:`datafusion.col`. On expressions are used to support
886898
in-equality predicates. Equality predicates are correctly optimized.
887899
888-
Example::
889-
890-
from datafusion import col
891-
df.join_on(other_df, col("id") == col("other_id"))
900+
Examples:
901+
>>> ctx = dfn.SessionContext()
902+
>>> left = ctx.from_pydict({"a": [1, 2], "x": ["a", "b"]})
903+
>>> right = ctx.from_pydict({"b": [1, 2], "y": ["c", "d"]})
904+
>>> left.join_on(
905+
... right, col("a") == col("b")
906+
... ).sort(col("x")).to_pydict()
907+
{'a': [1, 2], 'x': ['a', 'b'], 'b': [1, 2], 'y': ['c', 'd']}
892908
893909
Args:
894910
right: Other DataFrame to join with.
@@ -1350,15 +1366,17 @@ def __aiter__(self) -> AsyncIterator[RecordBatch]:
13501366
def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:
13511367
"""Apply a function to the current DataFrame which returns another DataFrame.
13521368
1353-
This is useful for chaining together multiple functions. For example::
1354-
1355-
def add_3(df: DataFrame) -> DataFrame:
1356-
return df.with_column("modified", lit(3))
1369+
This is useful for chaining together multiple functions.
13571370
1358-
def within_limit(df: DataFrame, limit: int) -> DataFrame:
1359-
return df.filter(col("a") < lit(limit)).distinct()
1360-
1361-
df = df.transform(modify_df).transform(within_limit, 4)
1371+
Examples:
1372+
>>> ctx = dfn.SessionContext()
1373+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
1374+
>>> def add_3(df):
1375+
... return df.with_column("modified", dfn.lit(3))
1376+
>>> def within_limit(df: DataFrame, limit: int) -> DataFrame:
1377+
... return df.filter(col("a") < lit(limit)).distinct()
1378+
>>> df.transform(add_3).transform(within_limit, 4).sort("a").to_pydict()
1379+
{'a': [1, 2, 3], 'modified': [3, 3, 3]}
13621380
13631381
Args:
13641382
func: A callable function that takes a DataFrame as its first argument

python/datafusion/expr.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ def sort_list_to_raw_sort_list(
342342
return raw_sort_list
343343

344344

345-
class Expr:
345+
class Expr: # noqa: PLW1641
346346
"""Expression object.
347347
348348
Expressions are one of the core concepts in DataFusion. See
@@ -1367,16 +1367,18 @@ def is_unbounded(self) -> bool:
13671367
class CaseBuilder:
13681368
"""Builder class for constructing case statements.
13691369
1370-
An example usage would be as follows::
1371-
1372-
import datafusion.functions as f
1373-
from datafusion import lit, col
1374-
df.select(
1375-
f.case(col("column_a"))
1376-
.when(lit(1), lit("One"))
1377-
.when(lit(2), lit("Two"))
1378-
.otherwise(lit("Unknown"))
1379-
)
1370+
Examples:
1371+
>>> ctx = dfn.SessionContext()
1372+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
1373+
>>> result = df.select(
1374+
... dfn.functions.case(dfn.col("a"))
1375+
... .when(dfn.lit(1), dfn.lit("One"))
1376+
... .when(dfn.lit(2), dfn.lit("Two"))
1377+
... .otherwise(dfn.lit("Other"))
1378+
... .alias("label")
1379+
... )
1380+
>>> result.to_pydict()
1381+
{'label': ['One', 'Two', 'Other']}
13801382
"""
13811383

13821384
def __init__(self, case_builder: expr_internal.CaseBuilder) -> None:

python/datafusion/input/location.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def build_table(
4646
num_rows = 0 # Total number of rows in the file. Used for statistics
4747
columns = []
4848
if file_format == "parquet":
49-
import pyarrow.parquet as pq
49+
import pyarrow.parquet as pq # noqa: PLC0415
5050

5151
# Read the Parquet metadata
5252
metadata = pq.read_metadata(input_item)
@@ -61,7 +61,7 @@ def build_table(
6161
]
6262

6363
elif format == "csv":
64-
import csv
64+
import csv # noqa: PLC0415
6565

6666
# Consume header row and count number of rows for statistics.
6767
# TODO: Possibly makes sense to have the eager number of rows

python/datafusion/plan.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
]
3333

3434

35-
class LogicalPlan:
35+
class LogicalPlan: # noqa: PLW1641
3636
"""Logical Plan.
3737
3838
A `LogicalPlan` is a node in a tree of relational operators (such as

0 commit comments

Comments
 (0)