feat: SessionContext.read_batches / read_batch

timsaucer · claude · timsaucer · commit 51fc2bccd9d4 · 2026-05-21T14:55:32.000-04:00
Wrap upstream `SessionContext::read_batches`, which materializes a
DataFrame directly from a sequence of `RecordBatch`es without
registering a named table. The single-batch convenience
`SessionContext.read_batch` is implemented in pure Python by calling
`read_batches([batch])`, so the Rust side only needs the one binding.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/crates/core/src/context.rs b/crates/core/src/context.rs
@@ -847,6 +847,13 @@ impl PySessionContext {
         Ok(())
     }
 
+    pub fn read_batches(
+        &self,
+        batches: PyArrowType<Vec<RecordBatch>>,
+    ) -> PyDataFusionResult<PyDataFrame> {
+        Ok(self.ctx.read_batches(batches.0).map(PyDataFrame::new)?)
+    }
+
     #[allow(clippy::too_many_arguments)]
     #[pyo3(signature = (name, path, table_partition_cols=vec![],
                         parquet_pruning=true,
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
@@ -962,6 +962,45 @@ def register_record_batches(
         """
         self.ctx.register_record_batches(name, partitions)
 
+    def read_batch(self, batch: pa.RecordBatch) -> DataFrame:
+        """Return a :py:class:`~datafusion.DataFrame` reading a single batch.
+
+        Convenience wrapper around :py:meth:`read_batches` for the single-batch
+        case. Unlike :py:meth:`register_batch`, this does not register the
+        batch as a named table; it returns an anonymous
+        :py:class:`~datafusion.DataFrame` directly.
+
+        Args:
+            batch: Record batch to wrap as a DataFrame.
+
+        Examples:
+            >>> ctx = dfn.SessionContext()
+            >>> batch = pa.RecordBatch.from_pydict({"a": [1, 2, 3]})
+            >>> ctx.read_batch(batch).to_pydict()
+            {'a': [1, 2, 3]}
+        """
+        return self.read_batches([batch])
+
+    def read_batches(self, batches: list[pa.RecordBatch]) -> DataFrame:
+        """Return a :py:class:`~datafusion.DataFrame` reading the given batches.
+
+        All batches must share the same schema. Unlike
+        :py:meth:`register_record_batches`, this does not register the batches
+        as a named table; it returns an anonymous
+        :py:class:`~datafusion.DataFrame` directly.
+
+        Args:
+            batches: Record batches to wrap as a DataFrame.
+
+        Examples:
+            >>> ctx = dfn.SessionContext()
+            >>> b1 = pa.RecordBatch.from_pydict({"a": [1, 2]})
+            >>> b2 = pa.RecordBatch.from_pydict({"a": [3, 4]})
+            >>> ctx.read_batches([b1, b2]).to_pydict()
+            {'a': [1, 2, 3, 4]}
+        """
+        return DataFrame(self.ctx.read_batches(batches))
+
     def register_parquet(
         self,
         name: str,
diff --git a/python/tests/test_context.py b/python/tests/test_context.py
@@ -905,6 +905,21 @@ def test_register_batch_empty(ctx):
     assert result[0].num_rows == 0
 
 
+def test_read_batch_returns_dataframe(ctx):
+    batch = pa.RecordBatch.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df = ctx.read_batch(batch)
+    assert df.to_pydict() == {"a": [1, 2, 3], "b": [4, 5, 6]}
+    # read_batch should not register a named table.
+    assert ctx.catalog().schema().names() == set()
+
+
+def test_read_batches_concatenates(ctx):
+    b1 = pa.RecordBatch.from_pydict({"a": [1, 2]})
+    b2 = pa.RecordBatch.from_pydict({"a": [3, 4]})
+    df = ctx.read_batches([b1, b2])
+    assert df.to_pydict() == {"a": [1, 2, 3, 4]}
+
+
 def test_create_sql_options():
     SQLOptions()