diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java index 3d7742bf1407..146cb0709567 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java @@ -75,7 +75,7 @@ public StringSubstrColStart() { * @param len length of the bytes the string holds in the byte array * @param substrStart the Start index for the substring operation */ - static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) { + public static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) { int end = start + len; if (substrStart < 0) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java index 7ef8552123f6..957c46ea6b1b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java @@ -83,10 +83,10 @@ public StringSubstrColStartLen() { * @param start start offset of the byte array the string starts at * @param len length of the bytes the string holds in the byte array * @param substrStart the Start index for the substring operation - * @param substrLen the length of the substring - * @param offsetArray the array that indexes are populated to. Assume its length >= 2. + * @param substrLength the length of the substring + * @param offsetArray the array that indexes are populated to. Assume its {@literal length >= 2}. */ - static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart, + public static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart, int substrLength, int[] offsetArray) { int curIdx = -1; offsetArray[0] = -1; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java index 7c6de37c8073..18e5f9077265 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java @@ -96,13 +96,12 @@ private Text evaluateInternal(Text t, int pos, int len) { return r; } - String s = t.toString(); - int[] index = makeIndex(pos, len, s.length()); - if (index == null) { + StringSubstrColStartLen.populateSubstrOffsets(t.getBytes(), 0, t.getLength(), adjustStartPos(pos), len, index); + if (index[0] == -1) { return r; } - r.set(s.substring(index[0], index[1])); + r.set(t.getBytes(), index[0], index[1]); return r; } @@ -133,16 +132,44 @@ private int[] makeIndex(int pos, int len, int inputLen) { private final IntWritable maxValue = new IntWritable(Integer.MAX_VALUE); - // Even though we are using longs, substr can only deal with ints, so we use + //Even though we are using longs, substr can only deal with ints, so we use // the maximum int value as the maxValue private final LongWritable maxLongValue = new LongWritable(Integer.MAX_VALUE); + private Text evaluateInternal(Text t, int pos) { + r.clear(); + + int offset = StringSubstrColStart.getSubstrStartOffset(t.getBytes(), 0, t.getLength(), adjustStartPos(pos)); + if (offset == -1) { + return r; + } + + r.set(t.getBytes(), offset, t.getLength() - offset); + return r; + } + public Text evaluate(Text s, IntWritable pos) { - return evaluate(s, pos, maxValue); + if ((s == null) || (pos == null)) { + return null; + } + + return evaluateInternal(s, pos.get()); } public Text evaluate(Text s, LongWritable pos) { - return evaluate(s, pos, maxLongValue); + if ((s == null) || (pos == null)) { + return null; + } + + long longPos = pos.get(); + // If an unsupported value is seen, we don't want to return a string + // that doesn't match what the user expects, so we return NULL (still + // unexpected, of course, but probably better than a bad string). + if (longPos > Integer.MAX_VALUE || longPos < Integer.MIN_VALUE) { + return null; + } + + return evaluateInternal(s, (int) pos.get()); } public BytesWritable evaluate(BytesWritable bw, LongWritable pos, LongWritable len) { @@ -198,6 +225,14 @@ public StatEstimator getStatEstimator() { return new SubStrStatEstimator(); } + private int adjustStartPos(int pos) { + if (pos <= 0) { + return pos; + } + + return pos - 1; + } + private static class SubStrStatEstimator implements StatEstimator { @Override @@ -208,7 +243,6 @@ public Optional estimate(List csList) { // 99 rows with 0 length // orig avg is 10 // new avg is 5 (if substr(5)) ; but in reality it will stay ~10 - Optional start = getRangeWidth(csList.get(1).getRange()); Range startRange = csList.get(1).getRange(); if (startRange != null && startRange.minValue != null) { double newAvgColLen = cs.getAvgColLen() - startRange.minValue.doubleValue(); @@ -219,7 +253,7 @@ public Optional estimate(List csList) { if (csList.size() > 2) { Range lengthRange = csList.get(2).getRange(); if (lengthRange != null && lengthRange.maxValue != null) { - Double w = lengthRange.maxValue.doubleValue(); + double w = lengthRange.maxValue.doubleValue(); if (cs.getAvgColLen() > w) { cs.setAvgColLen(w); } @@ -227,15 +261,5 @@ public Optional estimate(List csList) { } return Optional.of(cs); } - - private Optional getRangeWidth(Range range) { - if (range != null) { - if (range.minValue != null && range.maxValue != null) { - return Optional.of(range.maxValue.doubleValue() - range.minValue.doubleValue()); - } - } - return Optional.empty(); - } - - } + } } diff --git a/ql/src/test/queries/clientpositive/udf_substr.q b/ql/src/test/queries/clientpositive/udf_substr.q index a609536f37e5..f1ea68d99c5b 100644 --- a/ql/src/test/queries/clientpositive/udf_substr.q +++ b/ql/src/test/queries/clientpositive/udf_substr.q @@ -89,3 +89,11 @@ FROM src tablesample (1 rows); SELECT substr('ABC', cast(2147483649 as bigint)) FROM src tablesample (1 rows); + +--test 4-byte charactor +set hive.vectorized.execution.enabled=false; +SELECT + substr('あa🤎いiうu', 1, 3) as b1, + substr('あa🤎いiうu', 3) as b2, + substr('あa🤎いiうu', -5) as b3 +FROM src tablesample (1 rows); diff --git a/ql/src/test/results/clientpositive/llap/udf_substr.q.out b/ql/src/test/results/clientpositive/llap/udf_substr.q.out index 9ffa39b03356..ebf541a31e6d 100644 --- a/ql/src/test/results/clientpositive/llap/udf_substr.q.out +++ b/ql/src/test/results/clientpositive/llap/udf_substr.q.out @@ -240,3 +240,20 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### NULL +PREHOOK: query: SELECT + substr('あa🤎いiうu', 1, 3) as b1, + substr('あa🤎いiうu', 3) as b2, + substr('あa🤎いiうu', -5) as b3 +FROM src tablesample (1 rows) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT + substr('あa🤎いiうu', 1, 3) as b1, + substr('あa🤎いiうu', 3) as b2, + substr('あa🤎いiうu', -5) as b3 +FROM src tablesample (1 rows) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +あa🤎 🤎いiうu 🤎いiうu