From b067c9ee234892582c222ecfda97cc164fcf3c14 Mon Sep 17 00:00:00 2001 From: Ryu Kobayashi Date: Fri, 15 May 2026 15:47:23 +0900 Subject: [PATCH 1/5] HIVE-29598: Fix vectorized outer join wrong results due to stale scratch column values --- ...torMapJoinOuterGenerateResultOperator.java | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java index e83b178e4dc7..0a2447eb52a8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java @@ -27,7 +27,10 @@ import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.IntervalDayTimeColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; @@ -590,6 +593,7 @@ protected void generateOuterNulls(VectorizedRowBatch batch, int[] noMatchs, ColumnVector colVector = batch.cols[column]; colVector.noNulls = false; colVector.isNull[batchIndex] = true; + clearVectorValue(colVector, batchIndex); } // Small table values are set to null. @@ -597,6 +601,7 @@ protected void generateOuterNulls(VectorizedRowBatch batch, int[] noMatchs, ColumnVector colVector = batch.cols[column]; colVector.noNulls = false; colVector.isNull[batchIndex] = true; + clearVectorValue(colVector, batchIndex); } } } @@ -749,6 +754,7 @@ protected void generateOuterNullsRepeatedAll(VectorizedRowBatch batch) throws Hi colVector.noNulls = false; colVector.isNull[0] = true; colVector.isRepeating = true; + clearVectorValue(colVector, 0); } for (int column : smallTableValueColumnMap) { @@ -756,6 +762,24 @@ protected void generateOuterNullsRepeatedAll(VectorizedRowBatch batch) throws Hi colVector.noNulls = false; colVector.isNull[0] = true; colVector.isRepeating = true; + clearVectorValue(colVector, 0); + } + } + + private static void clearVectorValue(ColumnVector colVector, int index) { + if (colVector instanceof LongColumnVector) { + ((LongColumnVector) colVector).vector[index] = 0L; + } else if (colVector instanceof DoubleColumnVector) { + ((DoubleColumnVector) colVector).vector[index] = 0.0; + } else if (colVector instanceof BytesColumnVector) { + BytesColumnVector bcv = (BytesColumnVector) colVector; + bcv.vector[index] = null; + bcv.start[index] = 0; + bcv.length[index] = 0; + } else if (colVector instanceof TimestampColumnVector) { + ((TimestampColumnVector) colVector).setNullValue(index); + } else if (colVector instanceof IntervalDayTimeColumnVector) { + ((IntervalDayTimeColumnVector) colVector).setNullValue(index); } } From 6b31424d46e6440d816ea93be0ceefb82aa497fe Mon Sep 17 00:00:00 2001 From: Ryu Kobayashi Date: Mon, 18 May 2026 17:39:53 +0900 Subject: [PATCH 2/5] HIVE-29598: Add regression test for vectorized outer join stale scratch column values --- .../clientpositive/vector_outer_join7.q | 48 ++++++++++ .../llap/vector_outer_join7.q.out | 88 +++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 ql/src/test/queries/clientpositive/vector_outer_join7.q create mode 100644 ql/src/test/results/clientpositive/llap/vector_outer_join7.q.out diff --git a/ql/src/test/queries/clientpositive/vector_outer_join7.q b/ql/src/test/queries/clientpositive/vector_outer_join7.q new file mode 100644 index 000000000000..9dbe313d43a9 --- /dev/null +++ b/ql/src/test/queries/clientpositive/vector_outer_join7.q @@ -0,0 +1,48 @@ +set hive.mapred.mode=nonstrict; +set hive.explain.user=false; +SET hive.vectorized.execution.enabled=true; +SET hive.vectorized.execution.mapjoin.native.enabled=true; +SET hive.auto.convert.join=true; +SET hive.vectorized.reuse.scratch.columns=true; +set hive.fetch.task.conversion=none; + +-- SORT_QUERY_RESULTS + +-- Regression test for HIVE-29598: vectorized outer join wrong results due to +-- stale scratch column values when hive.vectorized.reuse.scratch.columns=true. +-- A CAST expression and the outer join null-marking column share the same scratch +-- slot; downstream ColOrCol reads vector[i] without checking isNull, causing +-- rows that should be NULL to be misclassified. + +CREATE TABLE src_hive29598 (k STRING, v STRING) STORED AS ORC; + +INSERT INTO src_hive29598 VALUES + ('p','1'),('p','2'),('p','3'), + ('q','2'),('q','3'), + ('r','3'), + ('s','3'); + +WITH base AS ( + SELECT k, v FROM src_hive29598 GROUP BY k, v +), +classified AS ( + SELECT t1.k, t1.v, + CASE WHEN t2.k IS NULL THEN 'new' + WHEN t3.k IS NULL THEN 'two_step' + ELSE 'three_step' END AS status + FROM base t1 + LEFT JOIN base t2 + ON t1.k = t2.k + AND CAST(t1.v AS INT) - 1 = CAST(t2.v AS INT) + LEFT JOIN base t3 + ON t1.k = t3.k + AND CAST(t1.v AS INT) - 2 = CAST(t3.v AS INT) + WHERE CAST(t1.v AS INT) >= 3 + GROUP BY t1.k, t1.v, + CASE WHEN t2.k IS NULL THEN 'new' + WHEN t3.k IS NULL THEN 'two_step' + ELSE 'three_step' END +) +SELECT * FROM classified WHERE status = 'new'; + +DROP TABLE src_hive29598; diff --git a/ql/src/test/results/clientpositive/llap/vector_outer_join7.q.out b/ql/src/test/results/clientpositive/llap/vector_outer_join7.q.out new file mode 100644 index 000000000000..0a2585f8f25a --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/vector_outer_join7.q.out @@ -0,0 +1,88 @@ +PREHOOK: query: CREATE TABLE src_hive29598 (k STRING, v STRING) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@src_hive29598 +POSTHOOK: query: CREATE TABLE src_hive29598 (k STRING, v STRING) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@src_hive29598 +PREHOOK: query: INSERT INTO src_hive29598 VALUES + ('p','1'),('p','2'),('p','3'), + ('q','2'),('q','3'), + ('r','3'), + ('s','3') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@src_hive29598 +POSTHOOK: query: INSERT INTO src_hive29598 VALUES + ('p','1'),('p','2'),('p','3'), + ('q','2'),('q','3'), + ('r','3'), + ('s','3') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@src_hive29598 +POSTHOOK: Lineage: src_hive29598.k SCRIPT [] +POSTHOOK: Lineage: src_hive29598.v SCRIPT [] +PREHOOK: query: WITH base AS ( + SELECT k, v FROM src_hive29598 GROUP BY k, v +), +classified AS ( + SELECT t1.k, t1.v, + CASE WHEN t2.k IS NULL THEN 'new' + WHEN t3.k IS NULL THEN 'two_step' + ELSE 'three_step' END AS status + FROM base t1 + LEFT JOIN base t2 + ON t1.k = t2.k + AND CAST(t1.v AS INT) - 1 = CAST(t2.v AS INT) + LEFT JOIN base t3 + ON t1.k = t3.k + AND CAST(t1.v AS INT) - 2 = CAST(t3.v AS INT) + WHERE CAST(t1.v AS INT) >= 3 + GROUP BY t1.k, t1.v, + CASE WHEN t2.k IS NULL THEN 'new' + WHEN t3.k IS NULL THEN 'two_step' + ELSE 'three_step' END +) +SELECT * FROM classified WHERE status = 'new' +PREHOOK: type: QUERY +PREHOOK: Input: default@src_hive29598 +#### A masked pattern was here #### +POSTHOOK: query: WITH base AS ( + SELECT k, v FROM src_hive29598 GROUP BY k, v +), +classified AS ( + SELECT t1.k, t1.v, + CASE WHEN t2.k IS NULL THEN 'new' + WHEN t3.k IS NULL THEN 'two_step' + ELSE 'three_step' END AS status + FROM base t1 + LEFT JOIN base t2 + ON t1.k = t2.k + AND CAST(t1.v AS INT) - 1 = CAST(t2.v AS INT) + LEFT JOIN base t3 + ON t1.k = t3.k + AND CAST(t1.v AS INT) - 2 = CAST(t3.v AS INT) + WHERE CAST(t1.v AS INT) >= 3 + GROUP BY t1.k, t1.v, + CASE WHEN t2.k IS NULL THEN 'new' + WHEN t3.k IS NULL THEN 'two_step' + ELSE 'three_step' END +) +SELECT * FROM classified WHERE status = 'new' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_hive29598 +#### A masked pattern was here #### +r 3 new +s 3 new +PREHOOK: query: DROP TABLE src_hive29598 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@src_hive29598 +PREHOOK: Output: database:default +PREHOOK: Output: default@src_hive29598 +POSTHOOK: query: DROP TABLE src_hive29598 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@src_hive29598 +POSTHOOK: Output: database:default +POSTHOOK: Output: default@src_hive29598 From 18801de6adb94846ea7b802dfa7bfaafaa5b88f8 Mon Sep 17 00:00:00 2001 From: Ryu Kobayashi Date: Mon, 18 May 2026 18:10:45 +0900 Subject: [PATCH 3/5] HIVE-29598: Add unit tests and handle DecimalColumnVector in clearVectorValue --- ...torMapJoinOuterGenerateResultOperator.java | 3 + ...torMapJoinOuterGenerateResultOperator.java | 124 ++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOuterGenerateResultOperator.java diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java index 0a2447eb52a8..647b33ece049 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.IntervalDayTimeColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; @@ -776,6 +777,8 @@ private static void clearVectorValue(ColumnVector colVector, int index) { bcv.vector[index] = null; bcv.start[index] = 0; bcv.length[index] = 0; + } else if (colVector instanceof DecimalColumnVector) { + ((DecimalColumnVector) colVector).vector[index].setFromLong(0L); } else if (colVector instanceof TimestampColumnVector) { ((TimestampColumnVector) colVector).setNullValue(index); } else if (colVector instanceof IntervalDayTimeColumnVector) { diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOuterGenerateResultOperator.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOuterGenerateResultOperator.java new file mode 100644 index 000000000000..7bd5422df2e8 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOuterGenerateResultOperator.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.IntervalDayTimeColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.junit.Test; + +import java.lang.reflect.Method; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +/** + * Unit tests for VectorMapJoinOuterGenerateResultOperator.clearVectorValue(). + * + * clearVectorValue() zeroes the raw vector slot whenever isNull[i] is set to + * true in the outer-join null-marking paths, preventing stale non-zero values + * from being misread by downstream operators (e.g. ColOrCol) that use + * vector[i] == 0 to distinguish "false" from "null". + */ +public class TestVectorMapJoinOuterGenerateResultOperator { + + private static void callClearVectorValue(ColumnVector colVector, int index) throws Exception { + Method m = VectorMapJoinOuterGenerateResultOperator.class + .getDeclaredMethod("clearVectorValue", ColumnVector.class, int.class); + m.setAccessible(true); + m.invoke(null, colVector, index); + } + + @Test + public void testClearLongColumnVector() throws Exception { + LongColumnVector cv = new LongColumnVector(4); + cv.vector[2] = 2025L; + + callClearVectorValue(cv, 2); + + assertEquals(0L, cv.vector[2]); + // neighbours must be untouched + assertEquals(0L, cv.vector[1]); + assertEquals(0L, cv.vector[3]); + } + + @Test + public void testClearDoubleColumnVector() throws Exception { + DoubleColumnVector cv = new DoubleColumnVector(4); + cv.vector[1] = 3.14; + + callClearVectorValue(cv, 1); + + assertEquals(0.0, cv.vector[1], 0.0); + } + + @Test + public void testClearBytesColumnVector() throws Exception { + BytesColumnVector cv = new BytesColumnVector(4); + byte[] data = "hello".getBytes(); + cv.vector[0] = data; + cv.start[0] = 1; + cv.length[0] = 3; + + callClearVectorValue(cv, 0); + + assertNull(cv.vector[0]); + assertEquals(0, cv.start[0]); + assertEquals(0, cv.length[0]); + } + + @Test + public void testClearTimestampColumnVector() throws Exception { + TimestampColumnVector cv = new TimestampColumnVector(4); + cv.time[2] = 1234567890000L; + cv.nanos[2] = 999; + + callClearVectorValue(cv, 2); + + // TimestampColumnVector.setNullValue sets time=0, nanos=1 + assertEquals(0L, cv.time[2]); + assertEquals(1, cv.nanos[2]); + } + + @Test + public void testClearDecimalColumnVector() throws Exception { + DecimalColumnVector cv = new DecimalColumnVector(4, 18, 4); + cv.vector[1].setFromLong(12345L); + + callClearVectorValue(cv, 1); + + assertEquals(0L, cv.vector[1].serialize64(4)); + } + + @Test + public void testClearIntervalDayTimeColumnVector() throws Exception { + IntervalDayTimeColumnVector cv = new IntervalDayTimeColumnVector(4); + cv.set(3, new org.apache.hadoop.hive.common.type.HiveIntervalDayTime(5, 0)); + + callClearVectorValue(cv, 3); + + // IntervalDayTimeColumnVector.setNullValue sets totalSeconds=0, nanos=1 + assertEquals(0L, cv.getTotalSeconds(3)); + assertEquals(1, cv.getNanos(3)); + } +} From 937c6d3cb1f3525b8fe5d90336d2b8b661f3c641 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Mon, 18 May 2026 14:29:35 -0700 Subject: [PATCH 4/5] HIVE-29598: addressing PR feedback + using correct test case + unit tests --- ...torMapJoinOuterGenerateResultOperator.java | 41 +-- ...torMapJoinOuterGenerateResultOperator.java | 289 ++++++++++++++---- .../clientpositive/vector_outer_join7.q | 91 +++--- .../llap/vector_outer_join7.q.out | 148 +++++---- .../ql/exec/vector/BytesColumnVector.java | 7 + .../hive/ql/exec/vector/ColumnVector.java | 26 ++ .../ql/exec/vector/DecimalColumnVector.java | 5 + .../ql/exec/vector/DoubleColumnVector.java | 5 + .../vector/IntervalDayTimeColumnVector.java | 5 + .../hive/ql/exec/vector/LongColumnVector.java | 5 + .../ql/exec/vector/TimestampColumnVector.java | 5 + .../ql/exec/vector/TestBytesColumnVector.java | 19 ++ .../exec/vector/TestDecimalColumnVector.java | 44 +++ .../exec/vector/TestDoubleColumnVector.java | 46 +++ .../TestIntervalDayTimeColumnVector.java | 43 +++ .../ql/exec/vector/TestLongColumnVector.java | 46 +++ .../vector/TestTimestampColumnVector.java | 16 + 17 files changed, 624 insertions(+), 217 deletions(-) create mode 100644 storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDecimalColumnVector.java create mode 100644 storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDoubleColumnVector.java create mode 100644 storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestIntervalDayTimeColumnVector.java create mode 100644 storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestLongColumnVector.java diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java index 647b33ece049..903517c10774 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinOuterGenerateResultOperator.java @@ -27,11 +27,7 @@ import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.IntervalDayTimeColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; @@ -591,18 +587,12 @@ protected void generateOuterNulls(VectorizedRowBatch batch, int[] noMatchs, // key as null, too. // for (int column : outerSmallTableKeyColumnMap) { - ColumnVector colVector = batch.cols[column]; - colVector.noNulls = false; - colVector.isNull[batchIndex] = true; - clearVectorValue(colVector, batchIndex); + batch.cols[column].clearValue(batchIndex); } // Small table values are set to null. for (int column : smallTableValueColumnMap) { - ColumnVector colVector = batch.cols[column]; - colVector.noNulls = false; - colVector.isNull[batchIndex] = true; - clearVectorValue(colVector, batchIndex); + batch.cols[column].clearValue(batchIndex); } } } @@ -752,37 +742,14 @@ protected void generateOuterNullsRepeatedAll(VectorizedRowBatch batch) throws Hi // for (int column : outerSmallTableKeyColumnMap) { ColumnVector colVector = batch.cols[column]; - colVector.noNulls = false; - colVector.isNull[0] = true; + colVector.clearValue(0); colVector.isRepeating = true; - clearVectorValue(colVector, 0); } for (int column : smallTableValueColumnMap) { ColumnVector colVector = batch.cols[column]; - colVector.noNulls = false; - colVector.isNull[0] = true; + colVector.clearValue(0); colVector.isRepeating = true; - clearVectorValue(colVector, 0); - } - } - - private static void clearVectorValue(ColumnVector colVector, int index) { - if (colVector instanceof LongColumnVector) { - ((LongColumnVector) colVector).vector[index] = 0L; - } else if (colVector instanceof DoubleColumnVector) { - ((DoubleColumnVector) colVector).vector[index] = 0.0; - } else if (colVector instanceof BytesColumnVector) { - BytesColumnVector bcv = (BytesColumnVector) colVector; - bcv.vector[index] = null; - bcv.start[index] = 0; - bcv.length[index] = 0; - } else if (colVector instanceof DecimalColumnVector) { - ((DecimalColumnVector) colVector).vector[index].setFromLong(0L); - } else if (colVector instanceof TimestampColumnVector) { - ((TimestampColumnVector) colVector).setNullValue(index); - } else if (colVector instanceof IntervalDayTimeColumnVector) { - ((IntervalDayTimeColumnVector) colVector).setNullValue(index); } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOuterGenerateResultOperator.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOuterGenerateResultOperator.java index 7bd5422df2e8..7a7b035a3c50 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOuterGenerateResultOperator.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOuterGenerateResultOperator.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec.vector.mapjoin; +import org.apache.hadoop.hive.common.type.HiveIntervalDayTime; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; @@ -25,100 +26,258 @@ import org.apache.hadoop.hive.ql.exec.vector.IntervalDayTimeColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; -import org.junit.Test; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.VoidColumnVector; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; -import java.lang.reflect.Method; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Stream; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; /** - * Unit tests for VectorMapJoinOuterGenerateResultOperator.clearVectorValue(). - * - * clearVectorValue() zeroes the raw vector slot whenever isNull[i] is set to - * true in the outer-join null-marking paths, preventing stale non-zero values - * from being misread by downstream operators (e.g. ColOrCol) that use - * vector[i] == 0 to distinguish "false" from "null". + * Tests that {@link VectorMapJoinOuterGenerateResultOperator} invokes + * {@code clearValue} on every small-table key and value column for each + * unmatched big-table row. The HIVE-29598 bug is that without that call, + * a stale {@code vector[i]} survives the null marking and leaks into + * downstream operators that read the slot without checking {@code isNull[i]}. */ -public class TestVectorMapJoinOuterGenerateResultOperator { +class TestVectorMapJoinOuterGenerateResultOperator { + + /** + * A concrete subclass of the abstract operator with no-op stubs for the + * abstract methods, just enough to invoke {@code generateOuterNulls} and + * {@code generateOuterNullsRepeatedAll} directly from tests. + */ + private static class TestableOuterOp extends VectorMapJoinOuterGenerateResultOperator { + @Override + protected String getLoggingPrefix() { + return "test"; + } - private static void callClearVectorValue(ColumnVector colVector, int index) throws Exception { - Method m = VectorMapJoinOuterGenerateResultOperator.class - .getDeclaredMethod("clearVectorValue", ColumnVector.class, int.class); - m.setAccessible(true); - m.invoke(null, colVector, index); + @Override + public void processBatch(VectorizedRowBatch batch) { + // No-op: tests invoke the generateOuterNulls* methods directly. + } } - @Test - public void testClearLongColumnVector() throws Exception { - LongColumnVector cv = new LongColumnVector(4); - cv.vector[2] = 2025L; + /** + * LongColumnVector that records every {@code clearSlotValue} invocation. + * Used to assert that the operator dispatched through the new + * {@link org.apache.hadoop.hive.ql.exec.vector.ColumnVector#clearValue(int)} + * contract, rather than just observing the slot-clearing side effect + * (which could also be produced by another mechanism). + */ + private static class TrackingLongColumnVector extends LongColumnVector { + final List clearedIndices = new ArrayList<>(); - callClearVectorValue(cv, 2); + TrackingLongColumnVector(int size) { + super(size); + } - assertEquals(0L, cv.vector[2]); - // neighbours must be untouched - assertEquals(0L, cv.vector[1]); - assertEquals(0L, cv.vector[3]); + @Override + protected void clearSlotValue(int elementNum) { + super.clearSlotValue(elementNum); + clearedIndices.add(elementNum); + } } @Test - public void testClearDoubleColumnVector() throws Exception { - DoubleColumnVector cv = new DoubleColumnVector(4); - cv.vector[1] = 3.14; + void generateOuterNullsCallsClearValueOnEachMappedColumnForEachUnmatchedRow() throws HiveException, IOException { + TestableOuterOp op = new TestableOuterOp(); + op.outerSmallTableKeyColumnMap = new int[] {0}; + op.smallTableValueColumnMap = new int[] {1, 2}; - callClearVectorValue(cv, 1); + VectorizedRowBatch batch = new VectorizedRowBatch(3, 4); + TrackingLongColumnVector keyCol = new TrackingLongColumnVector(4); + TrackingLongColumnVector valCol1 = new TrackingLongColumnVector(4); + TrackingLongColumnVector valCol2 = new TrackingLongColumnVector(4); + keyCol.vector[1] = 99L; // stale values that should be cleared + valCol1.vector[1] = 88L; + valCol2.vector[3] = 77L; + batch.cols[0] = keyCol; + batch.cols[1] = valCol1; + batch.cols[2] = valCol2; - assertEquals(0.0, cv.vector[1], 0.0); - } + int[] noMatchs = new int[] {1, 3}; + op.generateOuterNulls(batch, noMatchs, noMatchs.length); - @Test - public void testClearBytesColumnVector() throws Exception { - BytesColumnVector cv = new BytesColumnVector(4); - byte[] data = "hello".getBytes(); - cv.vector[0] = data; - cv.start[0] = 1; - cv.length[0] = 3; - - callClearVectorValue(cv, 0); - - assertNull(cv.vector[0]); - assertEquals(0, cv.start[0]); - assertEquals(0, cv.length[0]); + // Each tracked column had clearSlotValue invoked at indices 1 and 3. + assertEquals(Arrays.asList(1, 3), keyCol.clearedIndices); + assertEquals(Arrays.asList(1, 3), valCol1.clearedIndices); + assertEquals(Arrays.asList(1, 3), valCol2.clearedIndices); + + // Bookkeeping side effect of clearValue (final base-class method). + assertFalse(keyCol.noNulls); + assertTrue(keyCol.isNull[1]); + assertTrue(keyCol.isNull[3]); + assertFalse(keyCol.isNull[0]); + assertFalse(keyCol.isNull[2]); + + // Stale slot values cleared to 0L. + assertEquals(0L, keyCol.vector[1]); + assertEquals(0L, valCol1.vector[1]); + assertEquals(0L, valCol2.vector[3]); } @Test - public void testClearTimestampColumnVector() throws Exception { - TimestampColumnVector cv = new TimestampColumnVector(4); - cv.time[2] = 1234567890000L; - cv.nanos[2] = 999; + void generateOuterNullsRepeatedAllCallsClearValueAtIndexZeroForEachMappedColumn() throws HiveException, IOException { + TestableOuterOp op = new TestableOuterOp(); + op.outerSmallTableKeyColumnMap = new int[] {0}; + op.smallTableValueColumnMap = new int[] {1}; + + VectorizedRowBatch batch = new VectorizedRowBatch(2, 4); + TrackingLongColumnVector keyCol = new TrackingLongColumnVector(4); + TrackingLongColumnVector valCol = new TrackingLongColumnVector(4); + keyCol.vector[0] = 42L; + valCol.vector[0] = 84L; + batch.cols[0] = keyCol; + batch.cols[1] = valCol; + + op.generateOuterNullsRepeatedAll(batch); - callClearVectorValue(cv, 2); + // Each tracked column had clearSlotValue invoked exactly once at index 0. + assertEquals(Arrays.asList(0), keyCol.clearedIndices); + assertEquals(Arrays.asList(0), valCol.clearedIndices); - // TimestampColumnVector.setNullValue sets time=0, nanos=1 - assertEquals(0L, cv.time[2]); - assertEquals(1, cv.nanos[2]); + // Bookkeeping plus isRepeating set by the operator after clearValue. + assertFalse(keyCol.noNulls); + assertTrue(keyCol.isNull[0]); + assertTrue(keyCol.isRepeating); + assertFalse(valCol.noNulls); + assertTrue(valCol.isNull[0]); + assertTrue(valCol.isRepeating); + + // Stale slot values cleared to 0L. + assertEquals(0L, keyCol.vector[0]); + assertEquals(0L, valCol.vector[0]); } @Test - public void testClearDecimalColumnVector() throws Exception { - DecimalColumnVector cv = new DecimalColumnVector(4, 18, 4); - cv.vector[1].setFromLong(12345L); + void generateOuterNullsSetsBookkeepingOnTypeWithNoClearSlotValueOverride() throws HiveException, IOException { + // VoidColumnVector inherits the base ColumnVector.clearSlotValue() no-op + // — no per-slot value to zero. This verifies the operator still drives + // the bookkeeping (isNull[i], noNulls) through the final clearValue() + // contract on a type that doesn't override the hook. + TestableOuterOp op = new TestableOuterOp(); + op.outerSmallTableKeyColumnMap = new int[] {}; + op.smallTableValueColumnMap = new int[] {0}; + + VectorizedRowBatch batch = new VectorizedRowBatch(1, 4); + VoidColumnVector voidCol = new VoidColumnVector(4); + batch.cols[0] = voidCol; - callClearVectorValue(cv, 1); + int[] noMatchs = new int[] {1, 3}; + op.generateOuterNulls(batch, noMatchs, noMatchs.length); - assertEquals(0L, cv.vector[1].serialize64(4)); + assertFalse(voidCol.noNulls); + assertTrue(voidCol.isNull[1]); + assertTrue(voidCol.isNull[3]); + assertFalse(voidCol.isNull[0]); + assertFalse(voidCol.isNull[2]); } - @Test - public void testClearIntervalDayTimeColumnVector() throws Exception { - IntervalDayTimeColumnVector cv = new IntervalDayTimeColumnVector(4); - cv.set(3, new org.apache.hadoop.hive.common.type.HiveIntervalDayTime(5, 0)); + /** + * Verifies that for every {@link ColumnVector} subclass whose + * {@code clearSlotValue} the PR overrides, the operator's call through + * {@code clearValue} ultimately reaches that override and zeroes the slot + * to the type's cleared state. Complements + * {@link #generateOuterNullsCallsClearValueOnEachMappedColumnForEachUnmatchedRow} + * which proves the dispatch chain on Long alone. + */ + @ParameterizedTest(name = "{0}") + @MethodSource("modifiedColumnVectorTypes") + void generateOuterNullsClearsSlotForEachModifiedType( + String typeName, + ColumnVector cv, + Runnable preLoad, + Runnable assertSlotCleared) throws HiveException, IOException { + + TestableOuterOp op = new TestableOuterOp(); + op.outerSmallTableKeyColumnMap = new int[] {}; + op.smallTableValueColumnMap = new int[] {0}; + + VectorizedRowBatch batch = new VectorizedRowBatch(1, 4); + preLoad.run(); + batch.cols[0] = cv; + + int[] noMatchs = new int[] {2}; + op.generateOuterNulls(batch, noMatchs, noMatchs.length); + + assertTrue(cv.isNull[2]); + assertFalse(cv.noNulls); + assertSlotCleared.run(); + } - callClearVectorValue(cv, 3); + static Stream modifiedColumnVectorTypes() { + final LongColumnVector longCv = new LongColumnVector(4); + final DoubleColumnVector doubleCv = new DoubleColumnVector(4); + final BytesColumnVector bytesCv = new BytesColumnVector(4); + final DecimalColumnVector decCv = new DecimalColumnVector(4, 18, 4); + final TimestampColumnVector tsCv = new TimestampColumnVector(4); + final IntervalDayTimeColumnVector ivCv = new IntervalDayTimeColumnVector(4); - // IntervalDayTimeColumnVector.setNullValue sets totalSeconds=0, nanos=1 - assertEquals(0L, cv.getTotalSeconds(3)); - assertEquals(1, cv.getNanos(3)); + return Stream.of( + Arguments.of( + "LongColumnVector", + longCv, + (Runnable) () -> longCv.vector[2] = 999L, + (Runnable) () -> assertEquals(0L, longCv.vector[2])), + Arguments.of( + "DoubleColumnVector", + doubleCv, + (Runnable) () -> doubleCv.vector[2] = 3.14, + (Runnable) () -> assertEquals(0.0, doubleCv.vector[2])), + Arguments.of( + "BytesColumnVector", + bytesCv, + (Runnable) () -> { + bytesCv.vector[2] = "stale".getBytes(StandardCharsets.UTF_8); + bytesCv.start[2] = 1; + bytesCv.length[2] = 3; + }, + (Runnable) () -> { + assertNull(bytesCv.vector[2]); + assertEquals(0, bytesCv.start[2]); + assertEquals(0, bytesCv.length[2]); + }), + Arguments.of( + "DecimalColumnVector", + decCv, + (Runnable) () -> decCv.vector[2].setFromLong(999L), + (Runnable) () -> assertEquals(0L, decCv.vector[2].serialize64(decCv.scale))), + Arguments.of( + "TimestampColumnVector", + tsCv, + (Runnable) () -> { + tsCv.time[2] = 1234567890000L; + tsCv.nanos[2] = 999; + }, + (Runnable) () -> { + // setNullValue convention: time = 0, nanos = 1 + assertEquals(0L, tsCv.time[2]); + assertEquals(1, tsCv.nanos[2]); + }), + Arguments.of( + "IntervalDayTimeColumnVector", + ivCv, + (Runnable) () -> ivCv.set(2, new HiveIntervalDayTime(5, 0)), + (Runnable) () -> { + // setNullValue convention: totalSeconds = 0, nanos = 1 + assertEquals(0L, ivCv.getTotalSeconds(2)); + assertEquals(1, ivCv.getNanos(2)); + }) + ); } } diff --git a/ql/src/test/queries/clientpositive/vector_outer_join7.q b/ql/src/test/queries/clientpositive/vector_outer_join7.q index 9dbe313d43a9..041601265441 100644 --- a/ql/src/test/queries/clientpositive/vector_outer_join7.q +++ b/ql/src/test/queries/clientpositive/vector_outer_join7.q @@ -1,48 +1,61 @@ -set hive.mapred.mode=nonstrict; set hive.explain.user=false; -SET hive.vectorized.execution.enabled=true; -SET hive.vectorized.execution.mapjoin.native.enabled=true; SET hive.auto.convert.join=true; -SET hive.vectorized.reuse.scratch.columns=true; +SET hive.auto.convert.join.noconditionaltask=true; set hive.fetch.task.conversion=none; -- SORT_QUERY_RESULTS --- Regression test for HIVE-29598: vectorized outer join wrong results due to --- stale scratch column values when hive.vectorized.reuse.scratch.columns=true. --- A CAST expression and the outer join null-marking column share the same scratch --- slot; downstream ColOrCol reads vector[i] without checking isNull, causing --- rows that should be NULL to be misclassified. +-- Regression test for HIVE-29598: vectorized outer-join MapJoin can leave a +-- stale typed value in a scratch slot that the vectorizer has aliased to a +-- smallTableValueMapping target. For unmatched rows, generateOuterNulls() +-- flips isNull[i] = true but does not clear vector[i], so a downstream +-- ColOrCol -> IfExprLongScalarLongScalar chain that reads vector[i] without +-- consulting isNull[i] propagates the stale value into the result. +-- +-- Repro shape: a LEFT OUTER MapJoin whose ON predicate uses a CAST scratch +-- column that is then reused as the small-table boolean-value column; the +-- projection computes CAST((s.s_bool OR p.p_bool) AS INT) over the null-padded +-- rows. The MAX() aggregate barrier prevents Calcite from inlining and +-- simplifying the bug surface away. +-- +-- Expected: zero rows. Every probe row's (s_bool OR p_bool) is TRUE per SQL +-- three-valued logic (matched: FALSE OR TRUE; unmatched: NULL OR TRUE), so +-- CAST(... AS INT) is always 1 and WHERE observed_value = 0 matches nothing. +-- Without the fix: 'C 3 0 1' and 'D 3 0 1' leak through, since the stale int +-- from CastStringToLong ORed with 1 fails the strict == 1 check in +-- IfExprLongScalarLongScalar and stores 0. -CREATE TABLE src_hive29598 (k STRING, v STRING) STORED AS ORC; +CREATE TABLE t (k STRING, v STRING) STORED AS ORC; -INSERT INTO src_hive29598 VALUES - ('p','1'),('p','2'),('p','3'), - ('q','2'),('q','3'), - ('r','3'), - ('s','3'); +INSERT INTO t VALUES + ('A','1'),('A','2'),('A','3'), + ('B','2'),('B','3'), + ('C','3'), + ('D','1'),('D','3'); -WITH base AS ( - SELECT k, v FROM src_hive29598 GROUP BY k, v -), -classified AS ( - SELECT t1.k, t1.v, - CASE WHEN t2.k IS NULL THEN 'new' - WHEN t3.k IS NULL THEN 'two_step' - ELSE 'three_step' END AS status - FROM base t1 - LEFT JOIN base t2 - ON t1.k = t2.k - AND CAST(t1.v AS INT) - 1 = CAST(t2.v AS INT) - LEFT JOIN base t3 - ON t1.k = t3.k - AND CAST(t1.v AS INT) - 2 = CAST(t3.v AS INT) - WHERE CAST(t1.v AS INT) >= 3 - GROUP BY t1.k, t1.v, - CASE WHEN t2.k IS NULL THEN 'new' - WHEN t3.k IS NULL THEN 'two_step' - ELSE 'three_step' END -) -SELECT * FROM classified WHERE status = 'new'; - -DROP TABLE src_hive29598; +WITH + probe AS ( + SELECT k, v, (CAST(v AS INT) > 0) AS p_bool + FROM t WHERE CAST(v AS INT) >= 3 + ), + small_side AS ( + SELECT k, v, (CAST(v AS INT) > 9999) AS s_bool + FROM t + ), + classified AS ( + SELECT p.k, p.v, CAST((s.s_bool OR p.p_bool) AS INT) AS observed_value + FROM probe p + LEFT JOIN small_side s + ON p.k = s.k + AND CAST(p.v AS INT) - 1 = CAST(s.v AS INT) + ), + diagnosed AS ( + SELECT k, v, MAX(observed_value) AS observed_value + FROM classified + GROUP BY k, v + ) +SELECT k, v, + observed_value AS observed_value_returned_by_select, + 1 AS required_value_per_sql_semantics +FROM diagnosed +WHERE observed_value = 0; diff --git a/ql/src/test/results/clientpositive/llap/vector_outer_join7.q.out b/ql/src/test/results/clientpositive/llap/vector_outer_join7.q.out index 0a2585f8f25a..df755cfe4732 100644 --- a/ql/src/test/results/clientpositive/llap/vector_outer_join7.q.out +++ b/ql/src/test/results/clientpositive/llap/vector_outer_join7.q.out @@ -1,88 +1,84 @@ -PREHOOK: query: CREATE TABLE src_hive29598 (k STRING, v STRING) STORED AS ORC +PREHOOK: query: CREATE TABLE t (k STRING, v STRING) STORED AS ORC PREHOOK: type: CREATETABLE PREHOOK: Output: database:default -PREHOOK: Output: default@src_hive29598 -POSTHOOK: query: CREATE TABLE src_hive29598 (k STRING, v STRING) STORED AS ORC +PREHOOK: Output: default@t +POSTHOOK: query: CREATE TABLE t (k STRING, v STRING) STORED AS ORC POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default -POSTHOOK: Output: default@src_hive29598 -PREHOOK: query: INSERT INTO src_hive29598 VALUES - ('p','1'),('p','2'),('p','3'), - ('q','2'),('q','3'), - ('r','3'), - ('s','3') +POSTHOOK: Output: default@t +PREHOOK: query: INSERT INTO t VALUES + ('A','1'),('A','2'),('A','3'), + ('B','2'),('B','3'), + ('C','3'), + ('D','1'),('D','3') PREHOOK: type: QUERY PREHOOK: Input: _dummy_database@_dummy_table -PREHOOK: Output: default@src_hive29598 -POSTHOOK: query: INSERT INTO src_hive29598 VALUES - ('p','1'),('p','2'),('p','3'), - ('q','2'),('q','3'), - ('r','3'), - ('s','3') +PREHOOK: Output: default@t +POSTHOOK: query: INSERT INTO t VALUES + ('A','1'),('A','2'),('A','3'), + ('B','2'),('B','3'), + ('C','3'), + ('D','1'),('D','3') POSTHOOK: type: QUERY POSTHOOK: Input: _dummy_database@_dummy_table -POSTHOOK: Output: default@src_hive29598 -POSTHOOK: Lineage: src_hive29598.k SCRIPT [] -POSTHOOK: Lineage: src_hive29598.v SCRIPT [] -PREHOOK: query: WITH base AS ( - SELECT k, v FROM src_hive29598 GROUP BY k, v -), -classified AS ( - SELECT t1.k, t1.v, - CASE WHEN t2.k IS NULL THEN 'new' - WHEN t3.k IS NULL THEN 'two_step' - ELSE 'three_step' END AS status - FROM base t1 - LEFT JOIN base t2 - ON t1.k = t2.k - AND CAST(t1.v AS INT) - 1 = CAST(t2.v AS INT) - LEFT JOIN base t3 - ON t1.k = t3.k - AND CAST(t1.v AS INT) - 2 = CAST(t3.v AS INT) - WHERE CAST(t1.v AS INT) >= 3 - GROUP BY t1.k, t1.v, - CASE WHEN t2.k IS NULL THEN 'new' - WHEN t3.k IS NULL THEN 'two_step' - ELSE 'three_step' END -) -SELECT * FROM classified WHERE status = 'new' +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.k SCRIPT [] +POSTHOOK: Lineage: t.v SCRIPT [] +PREHOOK: query: WITH + probe AS ( + SELECT k, v, (CAST(v AS INT) > 0) AS p_bool + FROM t WHERE CAST(v AS INT) >= 3 + ), + small_side AS ( + SELECT k, v, (CAST(v AS INT) > 9999) AS s_bool + FROM t + ), + classified AS ( + SELECT p.k, p.v, CAST((s.s_bool OR p.p_bool) AS INT) AS observed_value + FROM probe p + LEFT JOIN small_side s + ON p.k = s.k + AND CAST(p.v AS INT) - 1 = CAST(s.v AS INT) + ), + diagnosed AS ( + SELECT k, v, MAX(observed_value) AS observed_value + FROM classified + GROUP BY k, v + ) +SELECT k, v, + observed_value AS observed_value_returned_by_select, + 1 AS required_value_per_sql_semantics +FROM diagnosed +WHERE observed_value = 0 PREHOOK: type: QUERY -PREHOOK: Input: default@src_hive29598 +PREHOOK: Input: default@t #### A masked pattern was here #### -POSTHOOK: query: WITH base AS ( - SELECT k, v FROM src_hive29598 GROUP BY k, v -), -classified AS ( - SELECT t1.k, t1.v, - CASE WHEN t2.k IS NULL THEN 'new' - WHEN t3.k IS NULL THEN 'two_step' - ELSE 'three_step' END AS status - FROM base t1 - LEFT JOIN base t2 - ON t1.k = t2.k - AND CAST(t1.v AS INT) - 1 = CAST(t2.v AS INT) - LEFT JOIN base t3 - ON t1.k = t3.k - AND CAST(t1.v AS INT) - 2 = CAST(t3.v AS INT) - WHERE CAST(t1.v AS INT) >= 3 - GROUP BY t1.k, t1.v, - CASE WHEN t2.k IS NULL THEN 'new' - WHEN t3.k IS NULL THEN 'two_step' - ELSE 'three_step' END -) -SELECT * FROM classified WHERE status = 'new' +POSTHOOK: query: WITH + probe AS ( + SELECT k, v, (CAST(v AS INT) > 0) AS p_bool + FROM t WHERE CAST(v AS INT) >= 3 + ), + small_side AS ( + SELECT k, v, (CAST(v AS INT) > 9999) AS s_bool + FROM t + ), + classified AS ( + SELECT p.k, p.v, CAST((s.s_bool OR p.p_bool) AS INT) AS observed_value + FROM probe p + LEFT JOIN small_side s + ON p.k = s.k + AND CAST(p.v AS INT) - 1 = CAST(s.v AS INT) + ), + diagnosed AS ( + SELECT k, v, MAX(observed_value) AS observed_value + FROM classified + GROUP BY k, v + ) +SELECT k, v, + observed_value AS observed_value_returned_by_select, + 1 AS required_value_per_sql_semantics +FROM diagnosed +WHERE observed_value = 0 POSTHOOK: type: QUERY -POSTHOOK: Input: default@src_hive29598 +POSTHOOK: Input: default@t #### A masked pattern was here #### -r 3 new -s 3 new -PREHOOK: query: DROP TABLE src_hive29598 -PREHOOK: type: DROPTABLE -PREHOOK: Input: default@src_hive29598 -PREHOOK: Output: database:default -PREHOOK: Output: default@src_hive29598 -POSTHOOK: query: DROP TABLE src_hive29598 -POSTHOOK: type: DROPTABLE -POSTHOOK: Input: default@src_hive29598 -POSTHOOK: Output: database:default -POSTHOOK: Output: default@src_hive29598 diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java index ec98d2ab5b8c..cce0cb9ad949 100644 --- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java @@ -504,6 +504,13 @@ public void setElement(int outputElementNum, int inputElementNum, ColumnVector i } } + @Override + protected void clearSlotValue(int elementNum) { + vector[elementNum] = null; + start[elementNum] = 0; + length[elementNum] = 0; + } + @Override public void init() { initBuffer(0); diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java index 9f611dfd313b..ee5e3f3885ee 100644 --- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java @@ -231,6 +231,32 @@ public abstract void setElement(int outputElementNum, int inputElementNum, public abstract void copySelected( boolean selectedInUse, int[] sel, int size, ColumnVector outputColVector); + /** + * Mark the slot null and put its underlying value into a defined cleared + * state. Sets {@code isNull[elementNum] = true} and {@code noNulls = false}, + * then dispatches to {@link #clearSlotValue(int)} for per-type clearing. + * + *

Defends against consumers that read {@code vector[i]} without first + * checking {@code isNull[i]}. Distinct from per-type {@code NULL_VALUE} + * sentinels (e.g. {@link LongColumnVector#NULL_VALUE}), which assume the + * isNull[]-first contract. Final by design — subclasses customize behavior + * by overriding {@link #clearSlotValue(int)}, never this method. + */ + public final void clearValue(int elementNum) { + noNulls = false; + isNull[elementNum] = true; + clearSlotValue(elementNum); + } + + /** + * Per-type slot-clearing hook invoked by {@link #clearValue(int)}. + * Subclasses override to zero out their value array at {@code elementNum}. + * Container and void types inherit the no-op default. + */ + protected void clearSlotValue(int elementNum) { + // Default no-op. + } + /** * Initialize the column vector. This method can be overridden by specific column vector types. * Use this method only if the individual type of the column vector is not known, otherwise its diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DecimalColumnVector.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DecimalColumnVector.java index 5defd27623b2..e0cdd76de156 100644 --- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DecimalColumnVector.java +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DecimalColumnVector.java @@ -144,6 +144,11 @@ public void setElement(int outputElementNum, int inputElementNum, ColumnVector i } } + @Override + protected void clearSlotValue(int elementNum) { + vector[elementNum].setFromLong(0L); + } + @Override public void stringifyValue(StringBuilder buffer, int row) { if (isRepeating) { diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java index f833bde03f6e..fcf297585c63 100644 --- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java @@ -220,6 +220,11 @@ public void setElement(int outputElementNum, int inputElementNum, ColumnVector i } } + @Override + protected void clearSlotValue(int elementNum) { + vector[elementNum] = 0.0; + } + @Override public void stringifyValue(StringBuilder buffer, int row) { if (isRepeating) { diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/IntervalDayTimeColumnVector.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/IntervalDayTimeColumnVector.java index 9324bc0c610d..2b61b09e38cb 100644 --- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/IntervalDayTimeColumnVector.java +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/IntervalDayTimeColumnVector.java @@ -311,6 +311,11 @@ public void setNullValue(int elementNum) { nanos[elementNum] = 1; } + @Override + protected void clearSlotValue(int elementNum) { + setNullValue(elementNum); + } + // Copy the current object contents into the output. Only copy selected entries, // as indicated by selectedInUse and the sel array. @Override diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java index bf423674b2aa..dc727b462a74 100644 --- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java @@ -294,6 +294,11 @@ public void setElement(int outputElementNum, int inputElementNum, ColumnVector i } } + @Override + protected void clearSlotValue(int elementNum) { + vector[elementNum] = 0L; + } + @Override public void stringifyValue(StringBuilder buffer, int row) { if (isRepeating) { diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java index f97156c40381..c49149cdb281 100644 --- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java @@ -378,6 +378,11 @@ public void setNullValue(int elementNum) { nanos[elementNum] = 1; } + @Override + protected void clearSlotValue(int elementNum) { + setNullValue(elementNum); + } + // Copy the current object contents into the output. Only copy selected entries, // as indicated by selectedInUse and the sel array. @Override diff --git a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestBytesColumnVector.java b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestBytesColumnVector.java index be4ff70935a7..2de07b4223e6 100644 --- a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestBytesColumnVector.java +++ b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestBytesColumnVector.java @@ -25,7 +25,9 @@ import org.junit.Test; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotSame; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; @@ -220,4 +222,21 @@ private static byte[] writeToBytesColumnVector(int rowIdx, BytesColumnVector col col.setValPreallocated(rowIdx, writeSize); return bytes; } + + @Test + public void testClearValue() { + BytesColumnVector cv = new BytesColumnVector(4); + byte[] data = "hello".getBytes(StandardCharsets.UTF_8); + cv.vector[0] = data; + cv.start[0] = 1; + cv.length[0] = 3; + + cv.clearValue(0); + + assertTrue(cv.isNull[0]); + assertFalse(cv.noNulls); + assertNull(cv.vector[0]); + assertEquals(0, cv.start[0]); + assertEquals(0, cv.length[0]); + } } diff --git a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDecimalColumnVector.java b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDecimalColumnVector.java new file mode 100644 index 000000000000..2644ff8f1bcb --- /dev/null +++ b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDecimalColumnVector.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class TestDecimalColumnVector { + + @Test + void clearValueZeroesSlotAndMarksNull() { + DecimalColumnVector cv = new DecimalColumnVector(4, 18, 4); + cv.vector[1].setFromLong(12345L); + cv.vector[2].setFromLong(67890L); + + cv.clearValue(1); + + assertTrue(cv.isNull[1]); + assertFalse(cv.noNulls); + assertEquals(0L, cv.vector[1].serialize64(cv.scale)); + // Neighbour slot untouched: still represents 67890. + assertEquals(67890L, cv.vector[2].serialize64((short) 0)); + assertFalse(cv.isNull[2]); + } +} diff --git a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDoubleColumnVector.java b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDoubleColumnVector.java new file mode 100644 index 000000000000..a67ff94ef327 --- /dev/null +++ b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDoubleColumnVector.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class TestDoubleColumnVector { + + @Test + void clearValueZeroesSlotAndMarksNull() { + DoubleColumnVector cv = new DoubleColumnVector(4); + cv.vector[1] = 3.14; + cv.vector[0] = 1.5; + cv.vector[3] = -2.5; + + cv.clearValue(1); + + assertTrue(cv.isNull[1]); + assertFalse(cv.noNulls); + assertEquals(0.0, cv.vector[1]); + assertEquals(1.5, cv.vector[0]); + assertEquals(-2.5, cv.vector[3]); + assertFalse(cv.isNull[0]); + assertFalse(cv.isNull[3]); + } +} diff --git a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestIntervalDayTimeColumnVector.java b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestIntervalDayTimeColumnVector.java new file mode 100644 index 000000000000..d715508e1484 --- /dev/null +++ b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestIntervalDayTimeColumnVector.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.apache.hadoop.hive.common.type.HiveIntervalDayTime; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class TestIntervalDayTimeColumnVector { + + @Test + void clearValueZeroesSlotAndMarksNull() { + IntervalDayTimeColumnVector cv = new IntervalDayTimeColumnVector(4); + cv.set(3, new HiveIntervalDayTime(5, 0)); + + cv.clearValue(3); + + assertTrue(cv.isNull[3]); + assertFalse(cv.noNulls); + // setNullValue convention: totalSeconds = 0, nanos = 1 + assertEquals(0L, cv.getTotalSeconds(3)); + assertEquals(1, cv.getNanos(3)); + } +} diff --git a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestLongColumnVector.java b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestLongColumnVector.java new file mode 100644 index 000000000000..c1c8acc25e97 --- /dev/null +++ b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestLongColumnVector.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class TestLongColumnVector { + + @Test + void clearValueZeroesSlotAndMarksNull() { + LongColumnVector cv = new LongColumnVector(4); + cv.vector[2] = 2025L; + cv.vector[1] = 7L; + cv.vector[3] = 9L; + + cv.clearValue(2); + + assertTrue(cv.isNull[2]); + assertFalse(cv.noNulls); + assertEquals(0L, cv.vector[2]); + assertEquals(7L, cv.vector[1]); + assertEquals(9L, cv.vector[3]); + assertFalse(cv.isNull[1]); + assertFalse(cv.isNull[3]); + } +} diff --git a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java index 2d85b115d244..dda52797246a 100644 --- a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java +++ b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hive.ql.exec.vector; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.sql.Timestamp; @@ -246,4 +248,18 @@ private Thread startVectorManipulationThread(final int vectorLength, final long return thread; } + @Test + public void testClearValue() { + TimestampColumnVector cv = new TimestampColumnVector(4); + cv.time[2] = 1234567890000L; + cv.nanos[2] = 999; + + cv.clearValue(2); + + assertTrue(cv.isNull[2]); + assertFalse(cv.noNulls); + // setNullValue convention: time = 0, nanos = 1 + assertEquals(0L, cv.time[2]); + assertEquals(1, cv.nanos[2]); + } } From 4009d83653c64fb00294601bea345d73bfdd8b10 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Tue, 19 May 2026 07:52:26 -0700 Subject: [PATCH 5/5] HIVE-29598: SQ feedback + comments' cleanup --- ...torMapJoinOuterGenerateResultOperator.java | 51 ++++++------------- .../clientpositive/vector_outer_join7.q | 24 ++------- 2 files changed, 19 insertions(+), 56 deletions(-) diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOuterGenerateResultOperator.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOuterGenerateResultOperator.java index 7a7b035a3c50..de5d49ba688b 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOuterGenerateResultOperator.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/mapjoin/TestVectorMapJoinOuterGenerateResultOperator.java @@ -47,20 +47,14 @@ import static org.junit.jupiter.api.Assertions.assertTrue; /** - * Tests that {@link VectorMapJoinOuterGenerateResultOperator} invokes - * {@code clearValue} on every small-table key and value column for each - * unmatched big-table row. The HIVE-29598 bug is that without that call, - * a stale {@code vector[i]} survives the null marking and leaks into - * downstream operators that read the slot without checking {@code isNull[i]}. + * HIVE-29598: verifies {@link VectorMapJoinOuterGenerateResultOperator} clears + * every small-table slot for unmatched rows, so stale values cannot carry over + * past the null marking. */ class TestVectorMapJoinOuterGenerateResultOperator { - /** - * A concrete subclass of the abstract operator with no-op stubs for the - * abstract methods, just enough to invoke {@code generateOuterNulls} and - * {@code generateOuterNullsRepeatedAll} directly from tests. - */ - private static class TestableOuterOp extends VectorMapJoinOuterGenerateResultOperator { + /** Concrete subclass that exposes the generateOuterNulls* methods to tests. */ + private static final class TestableOuterOp extends VectorMapJoinOuterGenerateResultOperator { @Override protected String getLoggingPrefix() { return "test"; @@ -68,16 +62,12 @@ protected String getLoggingPrefix() { @Override public void processBatch(VectorizedRowBatch batch) { - // No-op: tests invoke the generateOuterNulls* methods directly. } } /** - * LongColumnVector that records every {@code clearSlotValue} invocation. - * Used to assert that the operator dispatched through the new - * {@link org.apache.hadoop.hive.ql.exec.vector.ColumnVector#clearValue(int)} - * contract, rather than just observing the slot-clearing side effect - * (which could also be produced by another mechanism). + * Records {@code clearSlotValue} invocations to verify the operator dispatches + * through {@code clearValue}, not just produces the slot-clearing side effect. */ private static class TrackingLongColumnVector extends LongColumnVector { final List clearedIndices = new ArrayList<>(); @@ -103,7 +93,7 @@ void generateOuterNullsCallsClearValueOnEachMappedColumnForEachUnmatchedRow() th TrackingLongColumnVector keyCol = new TrackingLongColumnVector(4); TrackingLongColumnVector valCol1 = new TrackingLongColumnVector(4); TrackingLongColumnVector valCol2 = new TrackingLongColumnVector(4); - keyCol.vector[1] = 99L; // stale values that should be cleared + keyCol.vector[1] = 99L; valCol1.vector[1] = 88L; valCol2.vector[3] = 77L; batch.cols[0] = keyCol; @@ -113,26 +103,23 @@ void generateOuterNullsCallsClearValueOnEachMappedColumnForEachUnmatchedRow() th int[] noMatchs = new int[] {1, 3}; op.generateOuterNulls(batch, noMatchs, noMatchs.length); - // Each tracked column had clearSlotValue invoked at indices 1 and 3. assertEquals(Arrays.asList(1, 3), keyCol.clearedIndices); assertEquals(Arrays.asList(1, 3), valCol1.clearedIndices); assertEquals(Arrays.asList(1, 3), valCol2.clearedIndices); - // Bookkeeping side effect of clearValue (final base-class method). assertFalse(keyCol.noNulls); assertTrue(keyCol.isNull[1]); assertTrue(keyCol.isNull[3]); assertFalse(keyCol.isNull[0]); assertFalse(keyCol.isNull[2]); - // Stale slot values cleared to 0L. assertEquals(0L, keyCol.vector[1]); assertEquals(0L, valCol1.vector[1]); assertEquals(0L, valCol2.vector[3]); } @Test - void generateOuterNullsRepeatedAllCallsClearValueAtIndexZeroForEachMappedColumn() throws HiveException, IOException { + void generateOuterNullsRepeatedAllCallsClearValueAtIndexZeroForEachMappedColumn() throws HiveException { TestableOuterOp op = new TestableOuterOp(); op.outerSmallTableKeyColumnMap = new int[] {0}; op.smallTableValueColumnMap = new int[] {1}; @@ -147,11 +134,10 @@ void generateOuterNullsRepeatedAllCallsClearValueAtIndexZeroForEachMappedColumn( op.generateOuterNullsRepeatedAll(batch); - // Each tracked column had clearSlotValue invoked exactly once at index 0. assertEquals(Arrays.asList(0), keyCol.clearedIndices); assertEquals(Arrays.asList(0), valCol.clearedIndices); - // Bookkeeping plus isRepeating set by the operator after clearValue. + // isRepeating is set by the operator, not by clearValue. assertFalse(keyCol.noNulls); assertTrue(keyCol.isNull[0]); assertTrue(keyCol.isRepeating); @@ -159,17 +145,15 @@ void generateOuterNullsRepeatedAllCallsClearValueAtIndexZeroForEachMappedColumn( assertTrue(valCol.isNull[0]); assertTrue(valCol.isRepeating); - // Stale slot values cleared to 0L. assertEquals(0L, keyCol.vector[0]); assertEquals(0L, valCol.vector[0]); } @Test void generateOuterNullsSetsBookkeepingOnTypeWithNoClearSlotValueOverride() throws HiveException, IOException { - // VoidColumnVector inherits the base ColumnVector.clearSlotValue() no-op - // — no per-slot value to zero. This verifies the operator still drives - // the bookkeeping (isNull[i], noNulls) through the final clearValue() - // contract on a type that doesn't override the hook. + // VoidColumnVector inherits the base no-op clearSlotValue — verifies the + // operator still drives the null-marking through clearValue() on a type + // without a per-slot value to zero. TestableOuterOp op = new TestableOuterOp(); op.outerSmallTableKeyColumnMap = new int[] {}; op.smallTableValueColumnMap = new int[] {0}; @@ -189,12 +173,9 @@ void generateOuterNullsSetsBookkeepingOnTypeWithNoClearSlotValueOverride() throw } /** - * Verifies that for every {@link ColumnVector} subclass whose - * {@code clearSlotValue} the PR overrides, the operator's call through - * {@code clearValue} ultimately reaches that override and zeroes the slot - * to the type's cleared state. Complements - * {@link #generateOuterNullsCallsClearValueOnEachMappedColumnForEachUnmatchedRow} - * which proves the dispatch chain on Long alone. + * For each {@link ColumnVector} subclass whose {@code clearSlotValue} is + * overridden, verifies the operator's call through {@code clearValue} reaches + * the override and clears the slot to the type's cleared state. */ @ParameterizedTest(name = "{0}") @MethodSource("modifiedColumnVectorTypes") diff --git a/ql/src/test/queries/clientpositive/vector_outer_join7.q b/ql/src/test/queries/clientpositive/vector_outer_join7.q index 041601265441..141d8c3c68bd 100644 --- a/ql/src/test/queries/clientpositive/vector_outer_join7.q +++ b/ql/src/test/queries/clientpositive/vector_outer_join7.q @@ -1,29 +1,11 @@ -set hive.explain.user=false; SET hive.auto.convert.join=true; SET hive.auto.convert.join.noconditionaltask=true; -set hive.fetch.task.conversion=none; -- SORT_QUERY_RESULTS --- Regression test for HIVE-29598: vectorized outer-join MapJoin can leave a --- stale typed value in a scratch slot that the vectorizer has aliased to a --- smallTableValueMapping target. For unmatched rows, generateOuterNulls() --- flips isNull[i] = true but does not clear vector[i], so a downstream --- ColOrCol -> IfExprLongScalarLongScalar chain that reads vector[i] without --- consulting isNull[i] propagates the stale value into the result. --- --- Repro shape: a LEFT OUTER MapJoin whose ON predicate uses a CAST scratch --- column that is then reused as the small-table boolean-value column; the --- projection computes CAST((s.s_bool OR p.p_bool) AS INT) over the null-padded --- rows. The MAX() aggregate barrier prevents Calcite from inlining and --- simplifying the bug surface away. --- --- Expected: zero rows. Every probe row's (s_bool OR p_bool) is TRUE per SQL --- three-valued logic (matched: FALSE OR TRUE; unmatched: NULL OR TRUE), so --- CAST(... AS INT) is always 1 and WHERE observed_value = 0 matches nothing. --- Without the fix: 'C 3 0 1' and 'D 3 0 1' leak through, since the stale int --- from CastStringToLong ORed with 1 fails the strict == 1 check in --- IfExprLongScalarLongScalar and stores 0. +-- HIVE-29598: regression test for stale scratch-slot values in vectorized +-- outer-join MapJoin. MAX() acts as an aggregation barrier so Calcite cannot +-- inline the inner expression and simplify the bug surface away. CREATE TABLE t (k STRING, v STRING) STORED AS ORC;