diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 55f9d0c1e158..979dd11227bd 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -239,22 +239,33 @@ public static long getNumRows(HiveConf conf, List schema, Table tabl return aggregateStat.getNumRows(); } - private static void estimateStatsForMissingCols(List neededColumns, List columnStats, - HiveConf conf, long nr, List schema) { + /** + * Estimates column statistics for columns specified in {@code neededColumnNames} + * that do not already have statistics in the {@code existingColStats} list. + * + * @return A {@link List} of {@link ColStatistics} objects containing + * both the provided existing statistics and the newly estimated ones. + */ + static List estimateStatsForMissingCols( + List neededColumnNames, List existingColStats, HiveConf conf, long nr, + List schema) { - Set neededCols = new HashSet<>(neededColumns); - Set colsWithStats = new HashSet<>(); + Set neededCols = new HashSet<>(neededColumnNames); + Set columnNamesWithStats = new HashSet<>(existingColStats.size()); - for (ColStatistics cstats : columnStats) { - colsWithStats.add(cstats.getColumnName()); + for (ColStatistics cstats : existingColStats) { + columnNamesWithStats.add(cstats.getColumnName()); } - List missingColStats = new ArrayList<>(Sets.difference(neededCols, colsWithStats)); + List missingColumnNames = new ArrayList<>(Sets.difference(neededCols, columnNamesWithStats)); + ArrayList combined = new ArrayList<>(existingColStats.size() + missingColumnNames.size()); + combined.addAll(existingColStats); - if (!missingColStats.isEmpty()) { - columnStats.addAll( - estimateStats(schema, missingColStats, conf, nr)); + if (!missingColumnNames.isEmpty()) { + combined.addAll(estimateStats(schema, missingColumnNames, conf, nr)); } + + return combined; } public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, @@ -300,7 +311,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p if (needColStats && !metaTable) { colStats = getTableColumnStats(table, neededColumns, colStatsCache, fetchColStats); if (estimateStats) { - estimateStatsForMissingCols(neededColumns, colStats, conf, nr, schema); + colStats = estimateStatsForMissingCols(neededColumns, colStats, conf, nr, schema); } // we should have stats for all columns (estimated or actual) if (neededColumns.size() == colStats.size()) { @@ -386,7 +397,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p boolean statsRetrieved = aggrStats != null && aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0; if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) { - estimateStatsForMissingCols(neededColsToRetrieve, columnStats, conf, nr, schema); + columnStats = estimateStatsForMissingCols(neededColsToRetrieve, columnStats, conf, nr, schema); // There are some partitions with no state (or we didn't fetch any state). // Update the stats with empty list to reflect that in the // state/initialize structures. diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index c009472fed0a..dfa4f8e02af7 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hive.ql.stats; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -41,10 +43,12 @@ import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; import org.apache.hadoop.hive.metastore.api.Timestamp; import org.apache.hadoop.hive.metastore.api.TimestampColumnStatsData; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; import org.apache.hadoop.hive.ql.plan.ColStatistics; import org.apache.hadoop.hive.ql.plan.ColStatistics.Range; import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; @@ -565,4 +569,44 @@ void testGetColStatisticsTimestampType() { assertEquals(1700000000L, range.maxValue.longValue(), "maxValue mismatch for TIMESTAMP"); } + @Test + void testEstimateStatsForMissingColsHandlesEmptyList() { + HiveConf conf = new HiveConf(); + + ColumnInfo columnInfoA = new ColumnInfo("a", TypeInfoFactory.intTypeInfo, "t", false); + + List allColumnStats = StatsUtils.estimateStatsForMissingCols( + List.of("a"), Collections.emptyList(), conf, 0, List.of(columnInfoA)); + + assertEquals(1, allColumnStats.size()); + } + + @Test + void testEstimateStatsForMissingColsCombinesExistingStatsAndEstimations() { + HiveConf conf = new HiveConf(); + + ColumnInfo colNeededButNotExists = new ColumnInfo("neededButNotExists", TypeInfoFactory.intTypeInfo, "t", false); + ColumnInfo colNeededAndExists = new ColumnInfo("neededAndExists", TypeInfoFactory.intTypeInfo, "t", false); + ColumnInfo colNotNeededButExists = new ColumnInfo("notNeededButExists", TypeInfoFactory.intTypeInfo, "t", false); + ColumnInfo colNotNeededNotExists = new ColumnInfo("notNeededNotExists", TypeInfoFactory.intTypeInfo, "t", false); + + ColStatistics colStatNeededAndExists = new ColStatistics(); + colStatNeededAndExists.setColumnName(colNeededAndExists.getInternalName()); + ColStatistics colStatNotNeededButExists = new ColStatistics(); + colStatNotNeededButExists.setColumnName(colNotNeededButExists.getInternalName()); + + List allColumnStats = StatsUtils.estimateStatsForMissingCols( + List.of(colNeededAndExists.getInternalName(), colNeededButNotExists.getInternalName()), + List.of(colStatNeededAndExists, colStatNotNeededButExists), + conf, + 0, + List.of(colNeededButNotExists, colNeededAndExists, colNotNeededButExists, colNotNeededNotExists)); + + assertEquals(3, allColumnStats.size()); + assertEquals(allColumnStats.get(0), colStatNeededAndExists); + assertEquals(allColumnStats.get(1), colStatNotNeededButExists); + assertTrue(allColumnStats.get(2).isEstimated()); + assertEquals(allColumnStats.get(2).getColumnName(), colNeededButNotExists.getInternalName()); + } + }