From a6a9ea872a9826912d6d78e2ab101a72f8e2df9e Mon Sep 17 00:00:00 2001 From: Ryu Kobayashi Date: Tue, 28 Jan 2025 14:41:21 +0900 Subject: [PATCH 1/9] HIVE-27370: support 4 bytes characters --- .../apache/hadoop/hive/ql/udf/UDFSubstr.java | 173 ++++++++++++++---- .../test/queries/clientpositive/udf_substr.q | 8 + .../clientpositive/llap/udf_substr.q.out | 17 ++ 3 files changed, 165 insertions(+), 33 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java index 7c6de37c8073..12e51dcbae3c 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java @@ -96,53 +96,131 @@ private Text evaluateInternal(Text t, int pos, int len) { return r; } - String s = t.toString(); - int[] index = makeIndex(pos, len, s.length()); - if (index == null) { + byte[] utf8String = t.toString().getBytes(); + populateSubstrOffsets(utf8String, pos, len); + if (index[0] == -1) { return r; } - r.set(s.substring(index[0], index[1])); + r.set(new String(utf8String, index[0], index[1])); return r; } - private int[] makeIndex(int pos, int len, int inputLen) { - if ((Math.abs(pos) > inputLen)) { - return null; + private Text evaluateInternal(Text t, int pos) { + r.clear(); + + byte[] utf8String = t.toString().getBytes(); + int offset = getSubstrStartOffset(utf8String, pos); + if (offset == -1) { + return r; } - int start, end; + r.set(new String(utf8String, offset, utf8String.length - offset)); + return r; + } + + private void populateSubstrOffsets(byte[] utf8String, int start, int len) { + int curIdx = -1; + index[0] = -1; + index[1] = -1; + int end = utf8String.length; + + if (start > 0) { + start = start - 1; + } else if (start < 0) { + int length = 0; + for (int i = 0; i != end; ++i) { + if ((utf8String[i] & 0xc0) != 0x80) { + ++length; + } + } + + if (-start > length) { + return; + } + + start = length + start; + } + + if (len == 0) { + return; + } else if (len > end) { + len = end; + } - if (pos > 0) { - start = pos - 1; - } else if (pos < 0) { - start = inputLen + pos; - } else { - start = 0; + int endIdx = start + len - 1; + for (int i = 0; i != end; ++i) { + if ((utf8String[i] & 0xc0) != 0x80) { + ++curIdx; + if (curIdx == start) { + index[0] = i; + } else if (curIdx - 1 == endIdx) { + index[1] = i - index[0]; + } + } } - if ((inputLen - start) < len) { - end = inputLen; - } else { - end = start + len; + if (index[1] == -1) { + index[1] = end - index[0]; } - index[0] = start; - index[1] = end; - return index; } - private final IntWritable maxValue = new IntWritable(Integer.MAX_VALUE); + private int getSubstrStartOffset(byte[] utf8String, int start) { + int end = utf8String.length; + + if (start >= 1) { + start = start - 1; + } + if (start < 0) { + int length = 0; + for (int i = 0; i != end; ++i) { + if ((utf8String[i] & 0xc0) != 0x80) { + ++length; + } + } + + if (-start > length) { + return -1; + } - // Even though we are using longs, substr can only deal with ints, so we use - // the maximum int value as the maxValue - private final LongWritable maxLongValue = new LongWritable(Integer.MAX_VALUE); + start = length + start; + } + + int curIdx = -1; + for (int i = 0; i != end; ++i) { + if ((utf8String[i] & 0xc0) != 0x80) { + ++curIdx; + if (curIdx == start) { + return i; + } + } + } + + return -1; + } public Text evaluate(Text s, IntWritable pos) { - return evaluate(s, pos, maxValue); + if ((s == null) || (pos == null)) { + return null; + } + + return evaluateInternal(s, pos.get()); } public Text evaluate(Text s, LongWritable pos) { - return evaluate(s, pos, maxLongValue); + if ((s == null) || (pos == null)) { + return null; + } + + long longPos = pos.get(); + // If an unsupported value is seen, we don't want to return a string + // that doesn't match what the user expects, so we return NULL (still + // unexpected, of course, but probably better than a bad string). + if (longPos > Integer.MAX_VALUE || longPos < Integer.MIN_VALUE) { + return null; + } + + return evaluateInternal(s, (int) pos.get()); } public BytesWritable evaluate(BytesWritable bw, LongWritable pos, LongWritable len) { @@ -172,25 +250,42 @@ public BytesWritable evaluate(BytesWritable bw, IntWritable pos, IntWritable len } private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) { - if (len <= 0) { return new BytesWritable(); } - int[] index = makeIndex(pos, len, bw.getLength()); - if (index == null) { + byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength()); + populateSubstrOffsets(b, pos, len); + if (index[0] == -1) { return new BytesWritable(); } - return new BytesWritable(Arrays.copyOfRange(bw.getBytes(), index[0], index[1])); + return new BytesWritable(arrayCopy(b, index[0], index[1])); + } + + private BytesWritable evaluateInternal(BytesWritable bw, int pos) { + byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength()); + int offset = getSubstrStartOffset(b, pos); + if (offset == -1) { + return new BytesWritable(); + } + + return new BytesWritable(arrayCopy(b, offset, bw.getLength() - offset)); } public BytesWritable evaluate(BytesWritable bw, IntWritable pos){ - return evaluate(bw, pos, maxValue); + if ((bw == null) || (pos == null)) { + return null; + } + return evaluateInternal(bw, pos.get()); } public BytesWritable evaluate(BytesWritable bw, LongWritable pos){ - return evaluate(bw, pos, maxLongValue); + if ((bw == null) || (pos == null)) { + return null; + } + + return evaluateInternal(bw, (int) pos.get()); } @Override @@ -198,6 +293,18 @@ public StatEstimator getStatEstimator() { return new SubStrStatEstimator(); } + private byte[] arrayCopy(byte[] src, int pos, int len) { + byte[] b = new byte[len]; + + int copyIdx = 0; + for (int srcIdx = pos; copyIdx < len; srcIdx++) { + b[copyIdx] = src[srcIdx]; + copyIdx++; + } + + return b; + } + private static class SubStrStatEstimator implements StatEstimator { @Override diff --git a/ql/src/test/queries/clientpositive/udf_substr.q b/ql/src/test/queries/clientpositive/udf_substr.q index a609536f37e5..f1ea68d99c5b 100644 --- a/ql/src/test/queries/clientpositive/udf_substr.q +++ b/ql/src/test/queries/clientpositive/udf_substr.q @@ -89,3 +89,11 @@ FROM src tablesample (1 rows); SELECT substr('ABC', cast(2147483649 as bigint)) FROM src tablesample (1 rows); + +--test 4-byte charactor +set hive.vectorized.execution.enabled=false; +SELECT + substr('あa🤎いiうu', 1, 3) as b1, + substr('あa🤎いiうu', 3) as b2, + substr('あa🤎いiうu', -5) as b3 +FROM src tablesample (1 rows); diff --git a/ql/src/test/results/clientpositive/llap/udf_substr.q.out b/ql/src/test/results/clientpositive/llap/udf_substr.q.out index 9ffa39b03356..c21fa8881726 100644 --- a/ql/src/test/results/clientpositive/llap/udf_substr.q.out +++ b/ql/src/test/results/clientpositive/llap/udf_substr.q.out @@ -240,3 +240,20 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### NULL +PREHOOK: query: SELECT + substr('あa🤎いiうu', 1, 3) as b1, + substr('あa🤎いiうu', 3) as b2, + substr('あa🤎いiうu', -5) as b3 +FROM src tablesample (1 rows) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT + substr('あa🤎いiうu', 1, 3) as b1, + substr('あa🤎いiうu', 3) as b2, + substr('あa🤎いiうu', -5) as b3 +FROM src tablesample (1 rows) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +あa? 🤎いiうu ?いiうu From dcdf3e73f8ea93d2c16badd9eeeb84a487689dfd Mon Sep 17 00:00:00 2001 From: Ryu Kobayashi Date: Tue, 28 Jan 2025 18:03:19 +0900 Subject: [PATCH 2/9] fixed test results --- ql/src/test/results/clientpositive/llap/udf_substr.q.out | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ql/src/test/results/clientpositive/llap/udf_substr.q.out b/ql/src/test/results/clientpositive/llap/udf_substr.q.out index c21fa8881726..ebf541a31e6d 100644 --- a/ql/src/test/results/clientpositive/llap/udf_substr.q.out +++ b/ql/src/test/results/clientpositive/llap/udf_substr.q.out @@ -256,4 +256,4 @@ FROM src tablesample (1 rows) POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### -あa? 🤎いiうu ?いiうu +あa🤎 🤎いiうu 🤎いiうu From a17e45b10544a89688fdd648300fd5adf5243bf2 Mon Sep 17 00:00:00 2001 From: Ryu Kobayashi Date: Thu, 10 Apr 2025 19:16:09 +0900 Subject: [PATCH 3/9] fixed --- .../expressions/StringSubstrColStart.java | 2 +- .../expressions/StringSubstrColStartLen.java | 2 +- .../apache/hadoop/hive/ql/udf/UDFSubstr.java | 97 +++---------------- 3 files changed, 15 insertions(+), 86 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java index 3d7742bf1407..146cb0709567 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java @@ -75,7 +75,7 @@ public StringSubstrColStart() { * @param len length of the bytes the string holds in the byte array * @param substrStart the Start index for the substring operation */ - static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) { + public static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) { int end = start + len; if (substrStart < 0) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java index 7ef8552123f6..bce8bef59421 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java @@ -86,7 +86,7 @@ public StringSubstrColStartLen() { * @param substrLen the length of the substring * @param offsetArray the array that indexes are populated to. Assume its length >= 2. */ - static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart, + public static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart, int substrLength, int[] offsetArray) { int curIdx = -1; offsetArray[0] = -1; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java index 12e51dcbae3c..cd7810fc62ac 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java @@ -97,7 +97,7 @@ private Text evaluateInternal(Text t, int pos, int len) { } byte[] utf8String = t.toString().getBytes(); - populateSubstrOffsets(utf8String, pos, len); + StringSubstrColStartLen.populateSubstrOffsets(utf8String, 0, utf8String.length, craetePos(pos), len, index); if (index[0] == -1) { return r; } @@ -110,7 +110,7 @@ private Text evaluateInternal(Text t, int pos) { r.clear(); byte[] utf8String = t.toString().getBytes(); - int offset = getSubstrStartOffset(utf8String, pos); + int offset = StringSubstrColStart.getSubstrStartOffset(utf8String, 0, utf8String.length, craetePos(pos)); if (offset == -1) { return r; } @@ -119,86 +119,6 @@ private Text evaluateInternal(Text t, int pos) { return r; } - private void populateSubstrOffsets(byte[] utf8String, int start, int len) { - int curIdx = -1; - index[0] = -1; - index[1] = -1; - int end = utf8String.length; - - if (start > 0) { - start = start - 1; - } else if (start < 0) { - int length = 0; - for (int i = 0; i != end; ++i) { - if ((utf8String[i] & 0xc0) != 0x80) { - ++length; - } - } - - if (-start > length) { - return; - } - - start = length + start; - } - - if (len == 0) { - return; - } else if (len > end) { - len = end; - } - - int endIdx = start + len - 1; - for (int i = 0; i != end; ++i) { - if ((utf8String[i] & 0xc0) != 0x80) { - ++curIdx; - if (curIdx == start) { - index[0] = i; - } else if (curIdx - 1 == endIdx) { - index[1] = i - index[0]; - } - } - } - - if (index[1] == -1) { - index[1] = end - index[0]; - } - } - - private int getSubstrStartOffset(byte[] utf8String, int start) { - int end = utf8String.length; - - if (start >= 1) { - start = start - 1; - } - if (start < 0) { - int length = 0; - for (int i = 0; i != end; ++i) { - if ((utf8String[i] & 0xc0) != 0x80) { - ++length; - } - } - - if (-start > length) { - return -1; - } - - start = length + start; - } - - int curIdx = -1; - for (int i = 0; i != end; ++i) { - if ((utf8String[i] & 0xc0) != 0x80) { - ++curIdx; - if (curIdx == start) { - return i; - } - } - } - - return -1; - } - public Text evaluate(Text s, IntWritable pos) { if ((s == null) || (pos == null)) { return null; @@ -255,7 +175,7 @@ private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) { } byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength()); - populateSubstrOffsets(b, pos, len); + StringSubstrColStartLen.populateSubstrOffsets(b, 0, b.length, craetePos(pos), len, index); if (index[0] == -1) { return new BytesWritable(); } @@ -265,7 +185,8 @@ private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) { private BytesWritable evaluateInternal(BytesWritable bw, int pos) { byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength()); - int offset = getSubstrStartOffset(b, pos); + int offset = StringSubstrColStart.getSubstrStartOffset(b, 0, b.length, craetePos(pos)); + if (offset == -1) { return new BytesWritable(); } @@ -305,6 +226,14 @@ private byte[] arrayCopy(byte[] src, int pos, int len) { return b; } + private int craetePos(int pos) { + if (pos <= 0) { + return pos; + } + + return pos - 1; + } + private static class SubStrStatEstimator implements StatEstimator { @Override From c5292c818cd8fed65f84833d8975a088254468e7 Mon Sep 17 00:00:00 2001 From: Ryu Kobayashi Date: Tue, 15 Apr 2025 17:10:48 +0900 Subject: [PATCH 4/9] fixed javadoc --- .../ql/exec/vector/expressions/StringSubstrColStartLen.java | 4 ++-- ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java index bce8bef59421..957c46ea6b1b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java @@ -83,8 +83,8 @@ public StringSubstrColStartLen() { * @param start start offset of the byte array the string starts at * @param len length of the bytes the string holds in the byte array * @param substrStart the Start index for the substring operation - * @param substrLen the length of the substring - * @param offsetArray the array that indexes are populated to. Assume its length >= 2. + * @param substrLength the length of the substring + * @param offsetArray the array that indexes are populated to. Assume its {@literal length >= 2}. */ public static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart, int substrLength, int[] offsetArray) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java index cd7810fc62ac..419f65778a5c 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java @@ -186,7 +186,6 @@ private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) { private BytesWritable evaluateInternal(BytesWritable bw, int pos) { byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength()); int offset = StringSubstrColStart.getSubstrStartOffset(b, 0, b.length, craetePos(pos)); - if (offset == -1) { return new BytesWritable(); } From 8127a2b6ca08db72c229442ebd55e9ba437b3326 Mon Sep 17 00:00:00 2001 From: Ryu Kobayashi Date: Tue, 24 Jun 2025 13:58:13 +0900 Subject: [PATCH 5/9] fixed --- .../apache/hadoop/hive/ql/udf/UDFSubstr.java | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java index 419f65778a5c..2f95f11d53dd 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java @@ -97,7 +97,7 @@ private Text evaluateInternal(Text t, int pos, int len) { } byte[] utf8String = t.toString().getBytes(); - StringSubstrColStartLen.populateSubstrOffsets(utf8String, 0, utf8String.length, craetePos(pos), len, index); + StringSubstrColStartLen.populateSubstrOffsets(utf8String, 0, utf8String.length, adjustStartPos(pos), len, index); if (index[0] == -1) { return r; } @@ -110,7 +110,7 @@ private Text evaluateInternal(Text t, int pos) { r.clear(); byte[] utf8String = t.toString().getBytes(); - int offset = StringSubstrColStart.getSubstrStartOffset(utf8String, 0, utf8String.length, craetePos(pos)); + int offset = StringSubstrColStart.getSubstrStartOffset(utf8String, 0, utf8String.length, adjustStartPos(pos)); if (offset == -1) { return r; } @@ -174,23 +174,25 @@ private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) { return new BytesWritable(); } - byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength()); - StringSubstrColStartLen.populateSubstrOffsets(b, 0, b.length, craetePos(pos), len, index); + // Even though we are using longs, substr can only deal with ints, so we use + // the maximum int value as the maxValue + StringSubstrColStartLen.populateSubstrOffsets(bw.getBytes(), 0, bw.getLength(), adjustStartPos(pos), len, index); if (index[0] == -1) { return new BytesWritable(); } - return new BytesWritable(arrayCopy(b, index[0], index[1])); + return new BytesWritable(arrayCopy(bw.getBytes(), index[0], index[1])); } private BytesWritable evaluateInternal(BytesWritable bw, int pos) { - byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength()); - int offset = StringSubstrColStart.getSubstrStartOffset(b, 0, b.length, craetePos(pos)); + // Even though we are using longs, substr can only deal with ints, so we use + // the maximum int value as the maxValue + int offset = StringSubstrColStart.getSubstrStartOffset(bw.getBytes(), 0, bw.getLength(), adjustStartPos(pos)); if (offset == -1) { return new BytesWritable(); } - return new BytesWritable(arrayCopy(b, offset, bw.getLength() - offset)); + return new BytesWritable(arrayCopy(bw.getBytes(), offset, bw.getLength() - offset)); } public BytesWritable evaluate(BytesWritable bw, IntWritable pos){ @@ -221,11 +223,10 @@ private byte[] arrayCopy(byte[] src, int pos, int len) { b[copyIdx] = src[srcIdx]; copyIdx++; } - return b; } - private int craetePos(int pos) { + private int adjustStartPos(int pos) { if (pos <= 0) { return pos; } From 82a49b3b33eddafbea7ff552dc29a0a1c4137b62 Mon Sep 17 00:00:00 2001 From: Ryu Kobayashi Date: Thu, 26 Jun 2025 14:59:22 +0900 Subject: [PATCH 6/9] fixed --- .../apache/hadoop/hive/ql/udf/UDFSubstr.java | 33 ++++--------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java index 2f95f11d53dd..676706e9fb8d 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.udf; -import java.util.Arrays; import java.util.List; import java.util.Optional; @@ -36,6 +35,8 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; +import static java.util.Arrays.copyOfRange; + /** * UDFSubstr. * @@ -181,7 +182,7 @@ private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) { return new BytesWritable(); } - return new BytesWritable(arrayCopy(bw.getBytes(), index[0], index[1])); + return new BytesWritable(copyOfRange(bw.getBytes(), index[0], index[0] + index[1])); } private BytesWritable evaluateInternal(BytesWritable bw, int pos) { @@ -192,7 +193,7 @@ private BytesWritable evaluateInternal(BytesWritable bw, int pos) { return new BytesWritable(); } - return new BytesWritable(arrayCopy(bw.getBytes(), offset, bw.getLength() - offset)); + return new BytesWritable(copyOfRange(bw.getBytes(), offset, bw.getLength())); } public BytesWritable evaluate(BytesWritable bw, IntWritable pos){ @@ -215,17 +216,6 @@ public StatEstimator getStatEstimator() { return new SubStrStatEstimator(); } - private byte[] arrayCopy(byte[] src, int pos, int len) { - byte[] b = new byte[len]; - - int copyIdx = 0; - for (int srcIdx = pos; copyIdx < len; srcIdx++) { - b[copyIdx] = src[srcIdx]; - copyIdx++; - } - return b; - } - private int adjustStartPos(int pos) { if (pos <= 0) { return pos; @@ -244,7 +234,6 @@ public Optional estimate(List csList) { // 99 rows with 0 length // orig avg is 10 // new avg is 5 (if substr(5)) ; but in reality it will stay ~10 - Optional start = getRangeWidth(csList.get(1).getRange()); Range startRange = csList.get(1).getRange(); if (startRange != null && startRange.minValue != null) { double newAvgColLen = cs.getAvgColLen() - startRange.minValue.doubleValue(); @@ -255,7 +244,7 @@ public Optional estimate(List csList) { if (csList.size() > 2) { Range lengthRange = csList.get(2).getRange(); if (lengthRange != null && lengthRange.maxValue != null) { - Double w = lengthRange.maxValue.doubleValue(); + double w = lengthRange.maxValue.doubleValue(); if (cs.getAvgColLen() > w) { cs.setAvgColLen(w); } @@ -263,15 +252,5 @@ public Optional estimate(List csList) { } return Optional.of(cs); } - - private Optional getRangeWidth(Range range) { - if (range != null) { - if (range.minValue != null && range.maxValue != null) { - return Optional.of(range.maxValue.doubleValue() - range.minValue.doubleValue()); - } - } - return Optional.empty(); - } - - } + } } From e75df14aa47c3c0124a2619c0518df31fe4406e2 Mon Sep 17 00:00:00 2001 From: Ryu Kobayashi Date: Wed, 2 Jul 2025 12:55:54 +0900 Subject: [PATCH 7/9] fixed --- .../apache/hadoop/hive/ql/udf/UDFSubstr.java | 75 +++++++++++-------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java index 676706e9fb8d..912ffd775cd9 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.udf; +import java.util.Arrays; import java.util.List; import java.util.Optional; @@ -35,7 +36,7 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; -import static java.util.Arrays.copyOfRange; +import java.nio.charset.StandardCharsets; /** * UDFSubstr. @@ -97,26 +98,55 @@ private Text evaluateInternal(Text t, int pos, int len) { return r; } - byte[] utf8String = t.toString().getBytes(); - StringSubstrColStartLen.populateSubstrOffsets(utf8String, 0, utf8String.length, adjustStartPos(pos), len, index); + StringSubstrColStartLen.populateSubstrOffsets(t.getBytes(), 0, t.getLength(), adjustStartPos(pos), len, index); if (index[0] == -1) { return r; } - r.set(new String(utf8String, index[0], index[1])); + r.set(new String(t.getBytes(), index[0], index[1], StandardCharsets.UTF_8)); return r; } + private int[] makeIndex(int pos, int len, int inputLen) { + if ((Math.abs(pos) > inputLen)) { + return null; + } + + int start, end; + + if (pos > 0) { + start = pos - 1; + } else if (pos < 0) { + start = inputLen + pos; + } else { + start = 0; + } + + if ((inputLen - start) < len) { + end = inputLen; + } else { + end = start + len; + } + index[0] = start; + index[1] = end; + return index; + } + + private final IntWritable maxValue = new IntWritable(Integer.MAX_VALUE); + + //Even though we are using longs, substr can only deal with ints, so we use + // the maximum int value as the maxValue + private final LongWritable maxLongValue = new LongWritable(Integer.MAX_VALUE); + private Text evaluateInternal(Text t, int pos) { r.clear(); - byte[] utf8String = t.toString().getBytes(); - int offset = StringSubstrColStart.getSubstrStartOffset(utf8String, 0, utf8String.length, adjustStartPos(pos)); + int offset = StringSubstrColStart.getSubstrStartOffset(t.getBytes(), 0, t.getLength(), adjustStartPos(pos)); if (offset == -1) { return r; } - r.set(new String(utf8String, offset, utf8String.length - offset)); + r.set(new String(t.getBytes(), offset, t.getLength() - offset, StandardCharsets.UTF_8)); return r; } @@ -171,44 +201,25 @@ public BytesWritable evaluate(BytesWritable bw, IntWritable pos, IntWritable len } private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) { - if (len <= 0) { - return new BytesWritable(); - } - // Even though we are using longs, substr can only deal with ints, so we use - // the maximum int value as the maxValue - StringSubstrColStartLen.populateSubstrOffsets(bw.getBytes(), 0, bw.getLength(), adjustStartPos(pos), len, index); - if (index[0] == -1) { + if (len <= 0) { return new BytesWritable(); } - return new BytesWritable(copyOfRange(bw.getBytes(), index[0], index[0] + index[1])); - } - - private BytesWritable evaluateInternal(BytesWritable bw, int pos) { - // Even though we are using longs, substr can only deal with ints, so we use - // the maximum int value as the maxValue - int offset = StringSubstrColStart.getSubstrStartOffset(bw.getBytes(), 0, bw.getLength(), adjustStartPos(pos)); - if (offset == -1) { + int[] index = makeIndex(pos, len, bw.getLength()); + if (index == null) { return new BytesWritable(); } - return new BytesWritable(copyOfRange(bw.getBytes(), offset, bw.getLength())); + return new BytesWritable(Arrays.copyOfRange(bw.getBytes(), index[0], index[1])); } public BytesWritable evaluate(BytesWritable bw, IntWritable pos){ - if ((bw == null) || (pos == null)) { - return null; - } - return evaluateInternal(bw, pos.get()); + return evaluate(bw, pos, maxValue); } public BytesWritable evaluate(BytesWritable bw, LongWritable pos){ - if ((bw == null) || (pos == null)) { - return null; - } - - return evaluateInternal(bw, (int) pos.get()); + return evaluate(bw, pos, maxLongValue); } @Override From 371b02bdfbf3f221be151b5bfff283414544306e Mon Sep 17 00:00:00 2001 From: Ryu Kobayashi Date: Thu, 3 Jul 2025 15:41:59 +0900 Subject: [PATCH 8/9] fixed --- ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java index 912ffd775cd9..260a7aa972f0 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java @@ -103,7 +103,7 @@ private Text evaluateInternal(Text t, int pos, int len) { return r; } - r.set(new String(t.getBytes(), index[0], index[1], StandardCharsets.UTF_8)); + r.set(t.getBytes(), index[0], index[1]); return r; } @@ -146,7 +146,7 @@ private Text evaluateInternal(Text t, int pos) { return r; } - r.set(new String(t.getBytes(), offset, t.getLength() - offset, StandardCharsets.UTF_8)); + r.set(t.getBytes(), offset, t.getLength() - offset); return r; } From 7e57af6cea11367d20e04f63ef7c66d79df04b39 Mon Sep 17 00:00:00 2001 From: Ryu Kobayashi Date: Fri, 4 Jul 2025 11:04:48 +0900 Subject: [PATCH 9/9] Update ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java Co-authored-by: Shohei Okumiya --- ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java index 260a7aa972f0..18e5f9077265 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java @@ -36,8 +36,6 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; -import java.nio.charset.StandardCharsets; - /** * UDFSubstr. *