Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions datafusion/common/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,14 @@ impl Statistics {
col_stats.max_value = col_stats.max_value.max(&item_col_stats.max_value);
col_stats.min_value = col_stats.min_value.min(&item_col_stats.min_value);
col_stats.sum_value = col_stats.sum_value.add(&item_col_stats.sum_value);
col_stats.distinct_count = Precision::Absent;
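// Use the larger of the two distinct counts as a conservative lower bound:
// NDV cannot be merged exactly because the same value may appear in more
// than one partition. If only one side has a distinct count, that value is
// kept (`None` orders below `Some`); the result is `Absent` only when both
// sides are absent. The merged value is always reported as `Inexact`.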
col_stats.distinct_count = col_stats
.distinct_count
.get_value()
.max(item_col_stats.distinct_count.get_value())
.map(|&v| Precision::Inexact(v))
.unwrap_or(Precision::Absent);
col_stats.byte_size = col_stats.byte_size.add(&item_col_stats.byte_size);
}

Expand Down Expand Up @@ -1352,8 +1359,8 @@ mod tests {
col_stats.max_value,
Precision::Exact(ScalarValue::Int32(Some(20)))
);
// Distinct count should be Absent after merge
assert_eq!(col_stats.distinct_count, Precision::Absent);
// Distinct count should be Inexact(max) after merge as a conservative lower bound
assert_eq!(col_stats.distinct_count, Precision::Inexact(7));
}

#[test]
Expand Down
12 changes: 7 additions & 5 deletions datafusion/core/tests/physical_optimizer/partition_statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,13 +150,15 @@ mod test {
// - null_count = 0 (partition values from paths are never null)
// - min/max are the merged partition values across files in the group
// - byte_size = num_rows * 4 (Date32 is 4 bytes per row)
// - distinct_count = Inexact(1) per partition file (single partition value per file),
// preserved via max() when merging stats across partitions
let date32_byte_size = num_rows * 4;
column_stats.push(ColumnStatistics {
null_count: Precision::Exact(0),
max_value: Precision::Exact(ScalarValue::Date32(Some(max_date))),
min_value: Precision::Exact(ScalarValue::Date32(Some(min_date))),
sum_value: Precision::Absent,
distinct_count: Precision::Absent,
distinct_count: Precision::Inexact(1),
byte_size: Precision::Exact(date32_byte_size),
});
}
Expand Down Expand Up @@ -577,7 +579,7 @@ mod test {
max_value: Precision::Exact(ScalarValue::Date32(Some(20151))),
min_value: Precision::Exact(ScalarValue::Date32(Some(20148))),
sum_value: Precision::Absent,
distinct_count: Precision::Absent,
distinct_count: Precision::Inexact(1),
byte_size: Precision::Absent,
},
// column 2: right.id (Int32, file column from t2) - right partition 0: ids [3,4]
Expand Down Expand Up @@ -611,7 +613,7 @@ mod test {
max_value: Precision::Exact(ScalarValue::Date32(Some(20151))),
min_value: Precision::Exact(ScalarValue::Date32(Some(20148))),
sum_value: Precision::Absent,
distinct_count: Precision::Absent,
distinct_count: Precision::Inexact(1),
byte_size: Precision::Absent,
},
// column 2: right.id (Int32, file column from t2) - right partition 1: ids [1,2]
Expand Down Expand Up @@ -1247,7 +1249,7 @@ mod test {
DATE_2025_03_01,
))),
sum_value: Precision::Absent,
distinct_count: Precision::Absent,
distinct_count: Precision::Inexact(1),
byte_size: Precision::Exact(8),
},
ColumnStatistics::new_unknown(), // window column
Expand Down Expand Up @@ -1275,7 +1277,7 @@ mod test {
DATE_2025_03_03,
))),
sum_value: Precision::Absent,
distinct_count: Precision::Absent,
distinct_count: Precision::Inexact(1),
byte_size: Precision::Exact(8),
},
ColumnStatistics::new_unknown(), // window column
Expand Down
Loading