Commit d6836d1

ccciudatu authored and adragomir committed
[HSTACK] Building blocks for Ray DataFusionDatasource
1 parent 1a0896d commit d6836d1

4 files changed: +146, -7 lines


python/datafusion/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,7 @@
 from . import functions, object_store, substrait

 # The following imports are okay to remain as opaque to the user.
-from ._internal import Config
+from ._internal import Config, partition_stream
 from .catalog import Catalog, Database, Table
 from .common import (
     DFSchema,
@@ -86,6 +86,7 @@
     "read_avro",
     "read_csv",
     "read_json",
+    "partition_stream",
 ]

python/datafusion/dataframe.py

Lines changed: 3 additions & 0 deletions
@@ -805,6 +805,9 @@ def count(self) -> int:
         """
         return self.df.count()

+    def distributed_plan(self):
+        return self.df.distributed_plan()
+
     @deprecated("Use :py:func:`unnest_columns` instead.")
     def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame:
         """See :py:func:`unnest_columns`."""

src/dataframe.rs

Lines changed: 139 additions & 6 deletions
@@ -27,30 +27,38 @@ use arrow::util::display::{ArrayFormatter, FormatOptions};
 use datafusion::arrow::datatypes::Schema;
 use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow};
 use datafusion::arrow::util::pretty;
-use datafusion::common::UnnestOptions;
-use datafusion::config::{CsvOptions, TableParquetOptions};
+use datafusion::common::stats::Precision;
+use datafusion::common::{DFSchema, DataFusionError, Statistics, UnnestOptions};
+use datafusion::common::tree_node::{Transformed, TreeNode};
+use datafusion::config::{ConfigOptions, CsvOptions, TableParquetOptions};
 use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
 use datafusion::datasource::TableProvider;
+use datafusion::execution::runtime_env::RuntimeEnvBuilder;
+use datafusion::datasource::physical_plan::ParquetExec;
 use datafusion::execution::SendableRecordBatchStream;
 use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel};
+use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties};
 use datafusion::prelude::*;
+
+use datafusion_proto::physical_plan::{AsExecutionPlan, PhysicalExtensionCodec};
+use datafusion_proto::protobuf::PhysicalPlanNode;
+use deltalake::delta_datafusion::DeltaPhysicalCodec;
+use prost::Message;
 use pyo3::exceptions::PyValueError;
 use pyo3::prelude::*;
 use pyo3::pybacked::PyBackedStr;
 use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods};
 use tokio::task::JoinHandle;

 use crate::catalog::PyTable;
+use crate::common::df_schema::PyDFSchema;
 use crate::errors::{py_datafusion_err, PyDataFusionError};
 use crate::expr::sort_expr::to_sort_expressions;
 use crate::physical_plan::PyExecutionPlan;
 use crate::record_batch::PyRecordBatchStream;
 use crate::sql::logical::PyLogicalPlan;
 use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future};
-use crate::{
-    errors::PyDataFusionResult,
-    expr::{sort_expr::PySortExpr, PyExpr},
-};
+use crate::{errors::PyDataFusionResult, expr::{sort_expr::PySortExpr, PyExpr}};

 // https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
 // - we have not decided on the table_provider approach yet
@@ -697,6 +705,131 @@ impl PyDataFrame {
     fn count(&self, py: Python) -> PyDataFusionResult<usize> {
         Ok(wait_for_future(py, self.df.as_ref().clone().count())?)
     }
+
+    fn distributed_plan(&self, py: Python<'_>) -> PyResult<DistributedPlan> {
+        let future_plan = self.df.as_ref().clone().create_physical_plan();
+        wait_for_future(py, future_plan)
+            .map(DistributedPlan::new)
+            .map_err(py_datafusion_err)
+    }
+
+}
+
+#[pyclass(get_all)]
+#[derive(Debug, Clone)]
+pub struct DistributedPlan {
+    physical_plan: PyExecutionPlan,
+}
+
+#[pymethods]
+impl DistributedPlan {
+
+    fn serialize(&self) -> PyResult<Vec<u8>> {
+        PhysicalPlanNode::try_from_physical_plan(self.plan().clone(), codec())
+            .map(|node| node.encode_to_vec())
+            .map_err(py_datafusion_err)
+    }
+
+    fn partition_count(&self) -> usize {
+        self.plan().output_partitioning().partition_count()
+    }
+
+    fn num_bytes(&self) -> Option<usize> {
+        self.stats_field(|stats| stats.total_byte_size)
+    }
+
+    fn num_rows(&self) -> Option<usize> {
+        self.stats_field(|stats| stats.num_rows)
+    }
+
+    fn schema(&self) -> PyResult<PyDFSchema> {
+        DFSchema::try_from(self.plan().schema())
+            .map(PyDFSchema::from)
+            .map_err(py_datafusion_err)
+    }
+
+    fn set_desired_parallelism(&mut self, desired_parallelism: usize) -> PyResult<()> {
+        if self.plan().output_partitioning().partition_count() == desired_parallelism {
+            return Ok(())
+        }
+        let updated_plan = self.plan().clone().transform_up(|node| {
+            if let Some(parquet) = node.as_any().downcast_ref::<ParquetExec>() {
+                // Remove redundant ranges from partition files because ParquetExec refuses to repartition
+                // if any file has a range defined (even when the range actually covers the entire file).
+                // The EnforceDistribution optimizer rule adds ranges for both full and partial files,
+                // so this tries to revert that to trigger a repartition when no files are actually split.
+                let mut file_groups = parquet.base_config().file_groups.clone();
+                for group in file_groups.iter_mut() {
+                    for file in group.iter_mut() {
+                        if let Some(range) = &file.range {
+                            if range.start == 0 && range.end == file.object_meta.size as i64 {
+                                file.range = None; // remove redundant range
+                            }
+                        }
+                    }
+                }
+                if let Some(repartitioned) = parquet.clone().into_builder().with_file_groups(file_groups)
+                    .build_arc()
+                    .repartitioned(desired_parallelism, &ConfigOptions::default())? {
+                    Ok(Transformed::yes(repartitioned))
+                } else {
+                    Ok(Transformed::no(node))
+                }
+            } else {
+                Ok(Transformed::no(node))
+            }
+        }).map_err(py_datafusion_err)?.data;
+        self.physical_plan = PyExecutionPlan::new(updated_plan);
+        Ok(())
+    }
+}
+
+impl DistributedPlan {
+
+    fn new(plan: Arc<dyn ExecutionPlan>) -> Self {
+        Self {
+            physical_plan: PyExecutionPlan::new(plan)
+        }
+    }
+
+    fn plan(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.physical_plan.plan
+    }
+
+    fn stats_field(&self, field: fn(Statistics) -> Precision<usize>) -> Option<usize> {
+        if let Ok(stats) = self.physical_plan.plan.statistics() {
+            match field(stats) {
+                Precision::Exact(n) => Some(n),
+                _ => None,
+            }
+        } else {
+            None
+        }
+    }
+
+}
+
+#[pyfunction]
+pub fn partition_stream(serialized_plan: &[u8], partition: usize, py: Python) -> PyResult<PyRecordBatchStream> {
+    deltalake::ensure_initialized();
+    let node = PhysicalPlanNode::decode(serialized_plan)
+        .map_err(|e| DataFusionError::External(Box::new(e)))
+        .map_err(py_datafusion_err)?;
+    let ctx = SessionContext::new();
+    let plan = node.try_into_physical_plan(&ctx, ctx.runtime_env().as_ref(), codec())
+        .map_err(py_datafusion_err)?;
+    let stream_with_runtime = get_tokio_runtime().0.spawn(async move {
+        plan.execute(partition, ctx.task_ctx())
+    });
+    wait_for_future(py, stream_with_runtime)
+        .map_err(py_datafusion_err)?
+        .map(PyRecordBatchStream::new)
+        .map_err(py_datafusion_err)
+}
+
+fn codec() -> &'static dyn PhysicalExtensionCodec {
+    static CODEC: DeltaPhysicalCodec = DeltaPhysicalCodec {};
+    &CODEC
 }

 /// Print DataFrame
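
To make the serialize/partition_stream pair concrete, here is a rough local roundtrip sketch. It assumes the returned stream can be iterated like the streams produced by DataFrame.execute_stream(), with each batch convertible to PyArrow via to_pyarrow(); the table name and path are again hypothetical.

from datafusion import SessionContext, partition_stream

ctx = SessionContext()
ctx.register_parquet("events", "data/events.parquet")  # hypothetical dataset

plan = ctx.sql("SELECT * FROM events").distributed_plan()
plan_bytes = plan.serialize()  # protobuf-encoded physical plan

# Replay each partition from the serialized bytes; partition_stream builds its
# own SessionContext internally, so only the bytes and an index are needed.
for i in range(plan.partition_count()):
    for batch in partition_stream(plan_bytes, i):
        print(batch.to_pyarrow().num_rows)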

src/lib.rs

Lines changed: 2 additions & 0 deletions
@@ -116,6 +116,8 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> {
     #[cfg(feature = "substrait")]
     setup_substrait_module(py, &m)?;

+    m.add_class::<dataframe::DistributedPlan>()?;
+    m.add_wrapped(wrap_pyfunction!(dataframe::partition_stream))?;
     Ok(())
 }
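
Since the commit title frames these exports as building blocks for a Ray DataFusionDatasource, a sketch of the intended driver/worker split may help. It is purely illustrative: the actual Ray Datasource wiring is not part of this commit, and the dataset is hypothetical.

import ray
from datafusion import SessionContext, partition_stream

@ray.remote
def read_partition(plan_bytes: bytes, partition: int) -> list:
    # Worker side: decode the protobuf plan and stream one partition to PyArrow.
    return [batch.to_pyarrow() for batch in partition_stream(plan_bytes, partition)]

# Driver side: plan once, serialize, and fan partitions out as Ray tasks.
ctx = SessionContext()
ctx.register_parquet("events", "data/events.parquet")  # hypothetical dataset
plan = ctx.sql("SELECT * FROM events").distributed_plan()
plan.set_desired_parallelism(4)
plan_bytes = plan.serialize()

futures = [read_partition.remote(plan_bytes, i) for i in range(plan.partition_count())]
batches_per_partition = ray.get(futures)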
