apache · konstantinb · Feb 5, 2026 · Feb 5, 2026 · Feb 6, 2026
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
@@ -345,9 +345,30 @@ public Statistics scaleToRowCount(long newRowCount, boolean downScaleOnly) {
     if (downScaleOnly && newRowCount >= numRows) {
       return ret;
     }
-    // FIXME: using real scaling by new/old ration might yield better results?
     ret.numRows = newRowCount;
     ret.dataSize = StatsUtils.safeMult(getAvgRowSize(), newRowCount);
+
+    // Keep column stats valid after scaling: cap the distinct count at newRowCount, set positive
+    // null/true/false counts to unknown (-1) while preserving zeros, and clear distribution data.
+    if (ret.columnStats != null) {
+      for (ColStatistics cs : ret.columnStats.values()) {
+        if (cs.getCountDistint() > newRowCount) {
+          cs.setCountDistint(newRowCount);
+        }
+        if (cs.getNumNulls() > 0) {
+          cs.setNumNulls(-1);
+        }
+        if (cs.getNumTrues() > 0) {
+          cs.setNumTrues(-1);
+        }
+        if (cs.getNumFalses() > 0) {
+          cs.setNumFalses(-1);
+        }
+        cs.setBitVectors(null);
+        cs.setHistogram(null);
+      }
+    }
+
     return ret;
   }
 

diff --git a/ql/src/test/org/apache/hadoop/hive/ql/plan/TestStatistics.java b/ql/src/test/org/apache/hadoop/hive/ql/plan/TestStatistics.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.plan;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+
+public class TestStatistics {
+  // Exercises the column-stat adjustments of Statistics.scaleToRowCount; every case passes downScaleOnly=false.
+  @Test
+  public void testScaleToRowCountPreventsNegativeNonNullCount() {
+    Statistics stats = new Statistics(10, 1000, 0, 0);
+    ColStatistics colStats = new ColStatistics("str_col", "string");
+    colStats.setNumNulls(9);
+    colStats.setCountDistint(2);
+    colStats.setAvgColLen(10.0);
+    stats.setColumnStats(Arrays.asList(colStats));
+    // Scale to 1 row: the 9 recorded nulls exceed that, so rows minus nulls would go negative if kept.
+    Statistics scaled = stats.scaleToRowCount(1, false);
+    // Row count follows the target; the positive null count is reported as -1 (unknown).
+    assertEquals(1, scaled.getNumRows());
+    ColStatistics scaledCol = scaled.getColumnStatisticsFromColName("str_col");
+    assertEquals(-1, scaledCol.getNumNulls());
+  }
+
+  @Test
+  public void testScaleToRowCountCapsCountDistinct() {
+    Statistics stats = new Statistics(100, 1000, 0, 0);
+    ColStatistics colStats = new ColStatistics("col1", "int");
+    colStats.setCountDistint(100);
+    colStats.setNumNulls(0);
+    stats.setColumnStats(Arrays.asList(colStats));
+    // Scale to 10 rows, fewer than the 100 distinct values recorded for col1.
+    Statistics scaled = stats.scaleToRowCount(10, false);
+    // The distinct count is expected to be capped at the new row count.
+    ColStatistics scaledCol = scaled.getColumnStatisticsFromColName("col1");
+    assertEquals(10, scaledCol.getCountDistint());
+  }
+
+  @Test
+  public void testScaleToRowCountSetsNumNullsToUnknown() {
+    Statistics stats = new Statistics(100, 1000, 0, 0);
+    ColStatistics colStats = new ColStatistics("col1", "string");
+    colStats.setNumNulls(50);
+    stats.setColumnStats(Arrays.asList(colStats));
+    // Scale to 10 rows with 50 nulls recorded for col1.
+    Statistics scaled = stats.scaleToRowCount(10, false);
+    // A positive null count is expected to become -1 (unknown) rather than a rescaled value.
+    ColStatistics scaledCol = scaled.getColumnStatisticsFromColName("col1");
+    assertEquals(-1, scaledCol.getNumNulls());
+  }
+
+  @Test
+  public void testScaleToRowCountSetsBooleanStatsToUnknown() {
+    Statistics stats = new Statistics(100, 1000, 0, 0);
+    ColStatistics colStats = new ColStatistics("bool_col", "boolean");
+    colStats.setNumTrues(30);
+    colStats.setNumFalses(70);
+    colStats.setNumNulls(0);
+    stats.setColumnStats(Arrays.asList(colStats));
+    // Scale to 10 rows with positive true and false counts (30 and 70).
+    Statistics scaled = stats.scaleToRowCount(10, false);
+    // Both positive boolean counts are expected to be reported as -1 (unknown).
+    ColStatistics scaledCol = scaled.getColumnStatisticsFromColName("bool_col");
+    assertEquals(-1, scaledCol.getNumTrues());
+    assertEquals(-1, scaledCol.getNumFalses());
+  }
+
+  @Test
+  public void testScaleToRowCountPreservesZeroBooleanStats() {
+    Statistics stats = new Statistics(100, 1000, 0, 0);
+    ColStatistics colStats = new ColStatistics("bool_col", "boolean");
+    colStats.setNumTrues(0);
+    colStats.setNumFalses(100);
+    colStats.setNumNulls(0);
+    stats.setColumnStats(Arrays.asList(colStats));
+    // Scale to 10 rows with numTrues = 0 and numFalses = 100.
+    Statistics scaled = stats.scaleToRowCount(10, false);
+    // The zero count is expected to stay 0; only the positive count becomes -1.
+    ColStatistics scaledCol = scaled.getColumnStatisticsFromColName("bool_col");
+    assertEquals(0, scaledCol.getNumTrues());
+    assertEquals(-1, scaledCol.getNumFalses());
+  }
+
+  @Test
+  public void testScaleToRowCountClearsDistributionData() {
+    Statistics stats = new Statistics(100, 1000, 0, 0);
+    ColStatistics colStats = new ColStatistics("col1", "int");
+    colStats.setNumNulls(0);
+    colStats.setBitVectors(new byte[]{1, 2, 3});
+    colStats.setHistogram(new byte[]{4, 5, 6});
+    stats.setColumnStats(Arrays.asList(colStats));
+    // Scale to 10 rows with bit vectors and a histogram set on col1.
+    Statistics scaled = stats.scaleToRowCount(10, false);
+    // Both distribution payloads are expected to be null after scaling.
+    ColStatistics scaledCol = scaled.getColumnStatisticsFromColName("col1");
+    assertNull(scaledCol.getBitVectors());
+    assertNull(scaledCol.getHistogram());
+  }
+
+  @Test
+  public void testScaleToRowCountMultipleColumns() {
+    Statistics stats = new Statistics(100, 1000, 0, 0);
+    // int_col: positive null count, distinct count above the target row count.
+    ColStatistics col1 = new ColStatistics("int_col", "int");
+    col1.setNumNulls(20);
+    col1.setCountDistint(80);
+    // str_col: zero null count, distinct count above the target row count.
+    ColStatistics col2 = new ColStatistics("str_col", "string");
+    col2.setNumNulls(0);
+    col2.setCountDistint(50);
+    // bool_col: positive null, true and false counts.
+    ColStatistics col3 = new ColStatistics("bool_col", "boolean");
+    col3.setNumNulls(10);
+    col3.setNumTrues(40);
+    col3.setNumFalses(50);
+
+    stats.setColumnStats(Arrays.asList(col1, col2, col3));
+    // Scale to 5 rows; each column is checked separately below.
+    Statistics scaled = stats.scaleToRowCount(5, false);
+
+    assertEquals(5, scaled.getNumRows());
+    // int_col: nulls become -1, distinct count capped at 5.
+    ColStatistics scaledCol1 = scaled.getColumnStatisticsFromColName("int_col");
+    assertEquals(-1, scaledCol1.getNumNulls());
+    assertEquals(5, scaledCol1.getCountDistint());
+    // str_col: zero nulls stay 0, distinct count capped at 5.
+    ColStatistics scaledCol2 = scaled.getColumnStatisticsFromColName("str_col");
+    assertEquals(0, scaledCol2.getNumNulls());
+    assertEquals(5, scaledCol2.getCountDistint());
+    // bool_col: all three positive counts become -1.
+    ColStatistics scaledCol3 = scaled.getColumnStatisticsFromColName("bool_col");
+    assertEquals(-1, scaledCol3.getNumNulls());
+    assertEquals(-1, scaledCol3.getNumTrues());
+    assertEquals(-1, scaledCol3.getNumFalses());
+  }
+}
diff --git a/ql/src/test/queries/clientpositive/runtime_stats_scaling.q b/ql/src/test/queries/clientpositive/runtime_stats_scaling.q
@@ -0,0 +1,45 @@
+-- Test runtime statistics scaling: column stats adjustment when row count changes.
+-- When runtime row count is smaller than compile-time estimate, count-based stats
+-- (numNulls, numTrues, numFalses) must be adjusted to prevent invalid values.
+-- Each case below prints the plan twice: plain explain, then explain reoptimization.
+set hive.fetch.task.conversion=none;
+
+create table t_runtime_scaling (
+  id int,
+  str_col string,
+  bool_col boolean
+);
+
+-- 10 rows: ids 1-9 plus one outlier (100), so the filter id > 50 matches a single row
+-- str_col: 9 rows NULL, 1 non-null (tests numNulls for string)
+-- bool_col: 2 true, 1 false, 7 NULL (tests numNulls for boolean)
+insert into t_runtime_scaling values
+  (1, NULL, NULL), (2, NULL, NULL), (3, NULL, NULL), (4, NULL, NULL),
+  (5, NULL, NULL), (6, NULL, true), (7, NULL, true), (8, NULL, NULL),
+  (9, NULL, NULL), (100, 'only_non_null', false);
+
+analyze table t_runtime_scaling compute statistics;
+analyze table t_runtime_scaling compute statistics for columns;
+
+-- For id > 50: compile-time estimate is ~50% selectivity (5 rows); at runtime only id = 100 passes (1 row).
+
+-- Test 1: numNulls scaling for string (str_col has 9 nulls, scaled to 1 row)
+explain
+select str_col from t_runtime_scaling where id > 50;
+
+explain reoptimization
+select str_col from t_runtime_scaling where id > 50;
+
+-- Test 2: numNulls scaling for boolean (bool_col has 7 nulls, scaled to 1 row)
+explain
+select bool_col from t_runtime_scaling where id > 50;
+
+explain reoptimization
+select bool_col from t_runtime_scaling where id > 50;
+
+-- Test 3: combined (str_col and bool_col projected together)
+explain
+select str_col, bool_col from t_runtime_scaling where id > 50;
+
+explain reoptimization
+select str_col, bool_col from t_runtime_scaling where id > 50;