Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
Original file line number Diff line number Diff line change
Expand Up @@ -345,9 +345,30 @@ public Statistics scaleToRowCount(long newRowCount, boolean downScaleOnly) {
if (downScaleOnly && newRowCount >= numRows) {
return ret;
}
// FIXME: using real scaling by new/old ratio might yield better results?
ret.numRows = newRowCount;
ret.dataSize = StatsUtils.safeMult(getAvgRowSize(), newRowCount);

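// Adjust column stats so they cannot contradict the new row count after scaling:
// the distinct count is capped at newRowCount; non-zero null/true/false counts are
// set to unknown (-1) while zero counts are preserved; bit vectors and histograms
// are cleared.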
if (ret.columnStats != null) {
for (ColStatistics cs : ret.columnStats.values()) {
if (cs.getCountDistint() > newRowCount) {
cs.setCountDistint(newRowCount);
}
if (cs.getNumNulls() > 0) {
cs.setNumNulls(-1);
}
if (cs.getNumTrues() > 0) {
cs.setNumTrues(-1);
}
if (cs.getNumFalses() > 0) {
cs.setNumFalses(-1);
}
cs.setBitVectors(null);
cs.setHistogram(null);
}
}

return ret;
}

Expand Down
156 changes: 156 additions & 0 deletions ql/src/test/org/apache/hadoop/hive/ql/plan/TestStatistics.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.plan;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;

import java.util.Arrays;

import org.junit.Test;

/**
 * Unit tests for the column-statistics adjustments made by
 * {@link Statistics#scaleToRowCount(long, boolean)}.
 *
 * <p>Every test builds a {@link Statistics} object with one or more
 * {@link ColStatistics}, scales it to a smaller row count with
 * {@code downScaleOnly = false}, and inspects the column statistics of the result.
 */
public class TestStatistics {

  /** Data size given to every input {@link Statistics}; no assertion here depends on it. */
  private static final int DATA_SIZE = 1000;

  /**
   * Builds a {@link Statistics} with the given row count and columns and scales it.
   *
   * @param originalRows row count of the statistics before scaling
   * @param targetRows row count passed to {@code scaleToRowCount}
   * @param columns column statistics attached before scaling
   * @return the statistics returned by {@code scaleToRowCount(targetRows, false)}
   */
  private static Statistics scaleTo(int originalRows, int targetRows, ColStatistics... columns) {
    Statistics input = new Statistics(originalRows, DATA_SIZE, 0, 0);
    input.setColumnStats(Arrays.asList(columns));
    return input.scaleToRowCount(targetRows, false);
  }

  /**
   * A null count of 9 is larger than the scaled row count of 1, so it must be
   * reported as unknown (-1) rather than kept.
   */
  @Test
  public void testScaleToRowCountPreventsNegativeNonNullCount() {
    ColStatistics strCol = new ColStatistics("str_col", "string");
    strCol.setNumNulls(9);
    strCol.setCountDistint(2);
    strCol.setAvgColLen(10.0);

    Statistics result = scaleTo(10, 1, strCol);

    assertEquals(1, result.getNumRows());
    assertEquals(-1, result.getColumnStatisticsFromColName("str_col").getNumNulls());
  }

  /** A distinct count above the new row count is capped at the new row count. */
  @Test
  public void testScaleToRowCountCapsCountDistinct() {
    ColStatistics intCol = new ColStatistics("col1", "int");
    intCol.setCountDistint(100);
    intCol.setNumNulls(0);

    Statistics result = scaleTo(100, 10, intCol);

    assertEquals(10, result.getColumnStatisticsFromColName("col1").getCountDistint());
  }

  /** A positive null count becomes unknown (-1) after scaling. */
  @Test
  public void testScaleToRowCountSetsNumNullsToUnknown() {
    ColStatistics strCol = new ColStatistics("col1", "string");
    strCol.setNumNulls(50);

    Statistics result = scaleTo(100, 10, strCol);

    assertEquals(-1, result.getColumnStatisticsFromColName("col1").getNumNulls());
  }

  /** Positive true/false counts both become unknown (-1) after scaling. */
  @Test
  public void testScaleToRowCountSetsBooleanStatsToUnknown() {
    ColStatistics boolCol = new ColStatistics("bool_col", "boolean");
    boolCol.setNumTrues(30);
    boolCol.setNumFalses(70);
    boolCol.setNumNulls(0);

    Statistics result = scaleTo(100, 10, boolCol);

    ColStatistics after = result.getColumnStatisticsFromColName("bool_col");
    assertEquals(-1, after.getNumTrues());
    assertEquals(-1, after.getNumFalses());
  }

  /** A true count of zero is kept as zero, while the positive false count becomes -1. */
  @Test
  public void testScaleToRowCountPreservesZeroBooleanStats() {
    ColStatistics boolCol = new ColStatistics("bool_col", "boolean");
    boolCol.setNumTrues(0);
    boolCol.setNumFalses(100);
    boolCol.setNumNulls(0);

    Statistics result = scaleTo(100, 10, boolCol);

    ColStatistics after = result.getColumnStatisticsFromColName("bool_col");
    assertEquals(0, after.getNumTrues());
    assertEquals(-1, after.getNumFalses());
  }

  /** Bit vectors and histograms are dropped from the scaled column statistics. */
  @Test
  public void testScaleToRowCountClearsDistributionData() {
    ColStatistics intCol = new ColStatistics("col1", "int");
    intCol.setNumNulls(0);
    intCol.setBitVectors(new byte[]{1, 2, 3});
    intCol.setHistogram(new byte[]{4, 5, 6});

    Statistics result = scaleTo(100, 10, intCol);

    ColStatistics after = result.getColumnStatisticsFromColName("col1");
    assertNull(after.getBitVectors());
    assertNull(after.getHistogram());
  }

  /** The adjustments are applied to each column independently when several are present. */
  @Test
  public void testScaleToRowCountMultipleColumns() {
    ColStatistics intCol = new ColStatistics("int_col", "int");
    intCol.setNumNulls(20);
    intCol.setCountDistint(80);

    ColStatistics strCol = new ColStatistics("str_col", "string");
    strCol.setNumNulls(0);
    strCol.setCountDistint(50);

    ColStatistics boolCol = new ColStatistics("bool_col", "boolean");
    boolCol.setNumNulls(10);
    boolCol.setNumTrues(40);
    boolCol.setNumFalses(50);

    Statistics result = scaleTo(100, 5, intCol, strCol, boolCol);

    assertEquals(5, result.getNumRows());

    // int_col: positive null count -> unknown, distinct count capped at 5.
    ColStatistics intAfter = result.getColumnStatisticsFromColName("int_col");
    assertEquals(-1, intAfter.getNumNulls());
    assertEquals(5, intAfter.getCountDistint());

    // str_col: zero null count preserved, distinct count capped at 5.
    ColStatistics strAfter = result.getColumnStatisticsFromColName("str_col");
    assertEquals(0, strAfter.getNumNulls());
    assertEquals(5, strAfter.getCountDistint());

    // bool_col: every positive count becomes unknown.
    ColStatistics boolAfter = result.getColumnStatisticsFromColName("bool_col");
    assertEquals(-1, boolAfter.getNumNulls());
    assertEquals(-1, boolAfter.getNumTrues());
    assertEquals(-1, boolAfter.getNumFalses());
  }
}
45 changes: 45 additions & 0 deletions ql/src/test/queries/clientpositive/runtime_stats_scaling.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
-- Test runtime statistics scaling: column stats adjustment when row count changes.
-- When runtime row count is smaller than compile-time estimate, count-based stats
-- (numNulls, numTrues, numFalses) must be adjusted to prevent invalid values.
--
-- Each test below runs the same query twice: once under plain "explain" and once
-- under "explain reoptimization", so the two plans can be compared.

-- NOTE(review): presumably forces the selects below to run as real tasks instead of
-- being converted to a simple fetch, so that runtime row counts are collected - confirm.
set hive.fetch.task.conversion=none;

create table t_runtime_scaling (
id int,
str_col string,
bool_col boolean
);

-- 10 rows: skewed id values create selectivity mismatch
-- (ids 1..9 plus a single outlier 100; only the outlier satisfies "id > 50")
-- str_col: 9 rows NULL, 1 non-null (tests numNulls for string)
-- bool_col: 2 true, 1 false, 7 NULL (tests numNulls for boolean)
insert into t_runtime_scaling values
(1, NULL, NULL), (2, NULL, NULL), (3, NULL, NULL), (4, NULL, NULL),
(5, NULL, NULL), (6, NULL, true), (7, NULL, true), (8, NULL, NULL),
(9, NULL, NULL), (100, 'only_non_null', false);

-- Gather table-level statistics first, then per-column statistics.
analyze table t_runtime_scaling compute statistics;
analyze table t_runtime_scaling compute statistics for columns;

-- Compile-time: estimates ~50% selectivity (5 rows). Runtime: 1 row passes.

-- Test 1: numNulls scaling for string (str_col has 9 nulls, scaled to 1 row)
explain
select str_col from t_runtime_scaling where id > 50;

explain reoptimization
select str_col from t_runtime_scaling where id > 50;

-- Test 2: numNulls scaling for boolean (bool_col has 7 nulls, scaled to 1 row)
explain
select bool_col from t_runtime_scaling where id > 50;

explain reoptimization
select bool_col from t_runtime_scaling where id > 50;

-- Test 3: combined (both columns)
explain
select str_col, bool_col from t_runtime_scaling where id > 50;

explain reoptimization
select str_col, bool_col from t_runtime_scaling where id > 50;
Loading