From a6a9ea872a9826912d6d78e2ab101a72f8e2df9e Mon Sep 17 00:00:00 2001
From: Ryu Kobayashi <beter.max@gmail.com>
Date: Tue, 28 Jan 2025 14:41:21 +0900
Subject: [PATCH 1/9] HIVE-27370: support 4-byte characters in substr

---
 .../apache/hadoop/hive/ql/udf/UDFSubstr.java  | 173 ++++++++++++++----
 .../test/queries/clientpositive/udf_substr.q  |   8 +
 .../clientpositive/llap/udf_substr.q.out      |  17 ++
 3 files changed, 165 insertions(+), 33 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
index 7c6de37c8073..12e51dcbae3c 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
@@ -96,53 +96,131 @@ private Text evaluateInternal(Text t, int pos, int len) {
       return r;
     }
 
-    String s = t.toString();
-    int[] index = makeIndex(pos, len, s.length());
-    if (index == null) {
+    byte[] utf8String = t.toString().getBytes();
+    populateSubstrOffsets(utf8String, pos, len);
+    if (index[0] == -1) {
       return r;
     }
 
-    r.set(s.substring(index[0], index[1]));
+    r.set(new String(utf8String, index[0], index[1]));
     return r;
   }
 
-  private int[] makeIndex(int pos, int len, int inputLen) {
-    if ((Math.abs(pos) > inputLen)) {
-      return null;
+  private Text evaluateInternal(Text t, int pos) {
+    r.clear();
+
+    byte[] utf8String = t.toString().getBytes();
+    int offset = getSubstrStartOffset(utf8String, pos);
+    if (offset == -1) {
+      return r;
     }
 
-    int start, end;
+    r.set(new String(utf8String, offset, utf8String.length - offset));
+    return r;
+  }
+
+  private void populateSubstrOffsets(byte[] utf8String, int start, int len) {
+    int curIdx = -1;
+    index[0] = -1;
+    index[1] = -1;
+    int end = utf8String.length;
+
+    if (start > 0) {
+      start = start - 1;
+    } else if (start < 0) {
+      int length = 0;
+      for (int i = 0; i != end; ++i) {
+        if ((utf8String[i] & 0xc0) != 0x80) {
+          ++length;
+        }
+      }
+
+      if (-start > length) {
+        return;
+      }
+
+      start = length + start;
+    }
+
+    if (len == 0) {
+      return;
+    } else if (len > end) {
+      len = end;
+    }
 
-    if (pos > 0) {
-      start = pos - 1;
-    } else if (pos < 0) {
-      start = inputLen + pos;
-    } else {
-      start = 0;
+    int endIdx = start + len - 1;
+    for (int i = 0; i != end; ++i) {
+      if ((utf8String[i] & 0xc0) != 0x80) {
+        ++curIdx;
+        if (curIdx == start) {
+          index[0] = i;
+        } else if (curIdx - 1 == endIdx) {
+          index[1] = i - index[0];
+        }
+      }
     }
 
-    if ((inputLen - start) < len) {
-      end = inputLen;
-    } else {
-      end = start + len;
+    if (index[1] == -1) {
+      index[1] = end - index[0];
     }
-    index[0] = start;
-    index[1] = end;
-    return index;
   }
 
-  private final IntWritable maxValue = new IntWritable(Integer.MAX_VALUE);
+  private int getSubstrStartOffset(byte[] utf8String, int start) {
+    int end = utf8String.length;
+
+    if (start >= 1) {
+      start = start - 1;
+    }
+    if (start < 0) {
+      int length = 0;
+      for (int i = 0; i != end; ++i) {
+        if ((utf8String[i] & 0xc0) != 0x80) {
+          ++length;
+        }
+      }
+
+      if (-start > length) {
+        return -1;
+      }
 
-  // Even though we are using longs, substr can only deal with ints, so we use
-  // the maximum int value as the maxValue
-  private final LongWritable maxLongValue = new LongWritable(Integer.MAX_VALUE);
+      start = length + start;
+    }
+
+    int curIdx = -1;
+    for (int i = 0; i != end; ++i) {
+      if ((utf8String[i] & 0xc0) != 0x80) {
+        ++curIdx;
+        if (curIdx == start) {
+          return i;
+        }
+      }
+    }
+
+    return -1;
+  }
 
   public Text evaluate(Text s, IntWritable pos) {
-    return evaluate(s, pos, maxValue);
+    if ((s == null) || (pos == null)) {
+      return null;
+    }
+
+    return evaluateInternal(s, pos.get());
   }
 
   public Text evaluate(Text s, LongWritable pos) {
-    return evaluate(s, pos, maxLongValue);
+    if ((s == null) || (pos == null)) {
+      return null;
+    }
+
+    long longPos = pos.get();
+    // If an unsupported value is seen, we don't want to return a string
+    // that doesn't match what the user expects, so we return NULL (still
+    // unexpected, of course, but probably better than a bad string).
+    if (longPos > Integer.MAX_VALUE || longPos < Integer.MIN_VALUE) {
+      return null;
+    }
+
+    return evaluateInternal(s, (int) pos.get());
   }
 
   public BytesWritable evaluate(BytesWritable bw, LongWritable pos, LongWritable len) {
@@ -172,25 +250,42 @@ public BytesWritable evaluate(BytesWritable bw, IntWritable pos, IntWritable len
   }
 
   private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) {
-
     if (len <= 0) {
       return new BytesWritable();
     }
 
-    int[] index = makeIndex(pos, len, bw.getLength());
-    if (index == null) {
+    byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength());
+    populateSubstrOffsets(b, pos, len);
+    if (index[0] == -1) {
       return new BytesWritable();
     }
 
-    return new BytesWritable(Arrays.copyOfRange(bw.getBytes(), index[0], index[1]));
+    return new BytesWritable(arrayCopy(b, index[0], index[1]));
+  }
+
+  private BytesWritable evaluateInternal(BytesWritable bw, int pos) {
+    byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength());
+    int offset = getSubstrStartOffset(b, pos);
+    if (offset == -1) {
+      return new BytesWritable();
+    }
+
+    return new BytesWritable(arrayCopy(b, offset, bw.getLength() - offset));
   }
 
   public BytesWritable evaluate(BytesWritable bw, IntWritable pos){
-    return evaluate(bw, pos, maxValue);
+    if ((bw == null) || (pos == null)) {
+      return null;
+    }
+    return evaluateInternal(bw, pos.get());
   }
 
   public BytesWritable evaluate(BytesWritable bw, LongWritable pos){
-    return evaluate(bw, pos, maxLongValue);
+    if ((bw == null) || (pos == null)) {
+      return null;
+    }
+
+    return evaluateInternal(bw, (int) pos.get());
   }
 
   @Override
@@ -198,6 +293,18 @@ public StatEstimator getStatEstimator() {
     return new SubStrStatEstimator();
   }
 
+  private byte[] arrayCopy(byte[] src, int pos, int len) {
+    byte[] b = new byte[len];
+
+    int copyIdx = 0;
+    for (int srcIdx = pos; copyIdx < len; srcIdx++) {
+      b[copyIdx] = src[srcIdx];
+      copyIdx++;
+    }
+
+    return b;
+  }
+
   private static class SubStrStatEstimator implements StatEstimator {
 
     @Override
diff --git a/ql/src/test/queries/clientpositive/udf_substr.q b/ql/src/test/queries/clientpositive/udf_substr.q
index a609536f37e5..f1ea68d99c5b 100644
--- a/ql/src/test/queries/clientpositive/udf_substr.q
+++ b/ql/src/test/queries/clientpositive/udf_substr.q
@@ -89,3 +89,11 @@ FROM src tablesample (1 rows);
 SELECT
   substr('ABC', cast(2147483649 as bigint))
 FROM src tablesample (1 rows);
+
+--test 4-byte character
+set hive.vectorized.execution.enabled=false;
+SELECT
+  substr('あa🤎いiうu', 1, 3) as b1,
+  substr('あa🤎いiうu', 3) as b2,
+  substr('あa🤎いiうu', -5) as b3
+FROM src tablesample (1 rows);
diff --git a/ql/src/test/results/clientpositive/llap/udf_substr.q.out b/ql/src/test/results/clientpositive/llap/udf_substr.q.out
index 9ffa39b03356..c21fa8881726 100644
--- a/ql/src/test/results/clientpositive/llap/udf_substr.q.out
+++ b/ql/src/test/results/clientpositive/llap/udf_substr.q.out
@@ -240,3 +240,20 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@src
 #### A masked pattern was here ####
 NULL
+PREHOOK: query: SELECT
+  substr('あa🤎いiうu', 1, 3) as b1,
+  substr('あa🤎いiうu', 3) as b2,
+  substr('あa🤎いiうu', -5) as b3
+FROM src tablesample (1 rows)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT
+  substr('あa🤎いiうu', 1, 3) as b1,
+  substr('あa🤎いiうu', 3) as b2,
+  substr('あa🤎いiうu', -5) as b3
+FROM src tablesample (1 rows)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+あa?	🤎いiうu	?いiうu

From dcdf3e73f8ea93d2c16badd9eeeb84a487689dfd Mon Sep 17 00:00:00 2001
From: Ryu Kobayashi <beter.max@gmail.com>
Date: Tue, 28 Jan 2025 18:03:19 +0900
Subject: [PATCH 2/9] fixed test results

---
 ql/src/test/results/clientpositive/llap/udf_substr.q.out | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ql/src/test/results/clientpositive/llap/udf_substr.q.out b/ql/src/test/results/clientpositive/llap/udf_substr.q.out
index c21fa8881726..ebf541a31e6d 100644
--- a/ql/src/test/results/clientpositive/llap/udf_substr.q.out
+++ b/ql/src/test/results/clientpositive/llap/udf_substr.q.out
@@ -256,4 +256,4 @@ FROM src tablesample (1 rows)
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@src
 #### A masked pattern was here ####
-あa?	🤎いiうu	?いiうu
+あa🤎	🤎いiうu	🤎いiうu

From a17e45b10544a89688fdd648300fd5adf5243bf2 Mon Sep 17 00:00:00 2001
From: Ryu Kobayashi <beter.max@gmail.com>
Date: Thu, 10 Apr 2025 19:16:09 +0900
Subject: [PATCH 3/9] reuse the vectorized substr offset helpers in UDFSubstr

---
 .../expressions/StringSubstrColStart.java     |  2 +-
 .../expressions/StringSubstrColStartLen.java  |  2 +-
 .../apache/hadoop/hive/ql/udf/UDFSubstr.java  | 97 +++----------------
 3 files changed, 15 insertions(+), 86 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java
index 3d7742bf1407..146cb0709567 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java
@@ -75,7 +75,7 @@ public StringSubstrColStart() {
    * @param len length of the bytes the string holds in the byte array
    * @param substrStart the Start index for the substring operation
    */
-  static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {
+  public static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {
     int end = start + len;
 
     if (substrStart < 0) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java
index 7ef8552123f6..bce8bef59421 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java
@@ -86,7 +86,7 @@ public StringSubstrColStartLen() {
    * @param substrLen the length of the substring
    * @param offsetArray the array that indexes are populated to. Assume its length >= 2.
    */
-  static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart,
+  public static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart,
       int substrLength, int[] offsetArray) {
     int curIdx = -1;
     offsetArray[0] = -1;
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
index 12e51dcbae3c..cd7810fc62ac 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
@@ -97,7 +97,7 @@ private Text evaluateInternal(Text t, int pos, int len) {
     }
 
     byte[] utf8String = t.toString().getBytes();
-    populateSubstrOffsets(utf8String, pos, len);
+    StringSubstrColStartLen.populateSubstrOffsets(utf8String, 0, utf8String.length, craetePos(pos), len, index);
     if (index[0] == -1) {
       return r;
     }
@@ -110,7 +110,7 @@ private Text evaluateInternal(Text t, int pos) {
     r.clear();
 
     byte[] utf8String = t.toString().getBytes();
-    int offset = getSubstrStartOffset(utf8String, pos);
+    int offset = StringSubstrColStart.getSubstrStartOffset(utf8String, 0, utf8String.length, craetePos(pos));
     if (offset == -1) {
       return r;
     }
@@ -119,86 +119,6 @@ private Text evaluateInternal(Text t, int pos) {
     return r;
   }
 
-  private void populateSubstrOffsets(byte[] utf8String, int start, int len) {
-    int curIdx = -1;
-    index[0] = -1;
-    index[1] = -1;
-    int end = utf8String.length;
-
-    if (start > 0) {
-      start = start - 1;
-    } else if (start < 0) {
-      int length = 0;
-      for (int i = 0; i != end; ++i) {
-        if ((utf8String[i] & 0xc0) != 0x80) {
-          ++length;
-        }
-      }
-
-      if (-start > length) {
-        return;
-      }
-
-      start = length + start;
-    }
-
-    if (len == 0) {
-      return;
-    } else if (len > end) {
-      len = end;
-    }
-
-    int endIdx = start + len - 1;
-    for (int i = 0; i != end; ++i) {
-      if ((utf8String[i] & 0xc0) != 0x80) {
-        ++curIdx;
-        if (curIdx == start) {
-          index[0] = i;
-        } else if (curIdx - 1 == endIdx) {
-          index[1] = i - index[0];
-        }
-      }
-    }
-
-    if (index[1] == -1) {
-      index[1] = end - index[0];
-    }
-  }
-
-  private int getSubstrStartOffset(byte[] utf8String, int start) {
-    int end = utf8String.length;
-
-    if (start >= 1) {
-      start = start - 1;
-    }
-    if (start < 0) {
-      int length = 0;
-      for (int i = 0; i != end; ++i) {
-        if ((utf8String[i] & 0xc0) != 0x80) {
-          ++length;
-        }
-      }
-
-      if (-start > length) {
-        return -1;
-      }
-
-      start = length + start;
-    }
-
-    int curIdx = -1;
-    for (int i = 0; i != end; ++i) {
-      if ((utf8String[i] & 0xc0) != 0x80) {
-        ++curIdx;
-        if (curIdx == start) {
-          return i;
-        }
-      }
-    }
-
-    return -1;
-  }
-
   public Text evaluate(Text s, IntWritable pos) {
     if ((s == null) || (pos == null)) {
       return null;
@@ -255,7 +175,7 @@ private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) {
     }
 
     byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength());
-    populateSubstrOffsets(b, pos, len);
+    StringSubstrColStartLen.populateSubstrOffsets(b, 0, b.length, craetePos(pos), len, index);
     if (index[0] == -1) {
       return new BytesWritable();
     }
@@ -265,7 +185,8 @@ private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) {
 
   private BytesWritable evaluateInternal(BytesWritable bw, int pos) {
     byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength());
-    int offset = getSubstrStartOffset(b, pos);
+    int offset = StringSubstrColStart.getSubstrStartOffset(b, 0, b.length, craetePos(pos));
+
     if (offset == -1) {
       return new BytesWritable();
     }
@@ -305,6 +226,14 @@ private byte[] arrayCopy(byte[] src, int pos, int len) {
     return b;
   }
 
+  private int craetePos(int pos) {
+    if (pos <= 0) {
+      return pos;
+    }
+
+    return pos - 1;
+  }
+
   private static class SubStrStatEstimator implements StatEstimator {
 
     @Override

From c5292c818cd8fed65f84833d8975a088254468e7 Mon Sep 17 00:00:00 2001
From: Ryu Kobayashi <beter.max@gmail.com>
Date: Tue, 15 Apr 2025 17:10:48 +0900
Subject: [PATCH 4/9] fixed javadoc

---
 .../ql/exec/vector/expressions/StringSubstrColStartLen.java   | 4 ++--
 ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java      | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java
index bce8bef59421..957c46ea6b1b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java
@@ -83,8 +83,8 @@ public StringSubstrColStartLen() {
    * @param start start offset of the byte array the string starts at
    * @param len length of the bytes the string holds in the byte array
    * @param substrStart the Start index for the substring operation
-   * @param substrLen the length of the substring
-   * @param offsetArray the array that indexes are populated to. Assume its length >= 2.
+   * @param substrLength the length of the substring
+   * @param offsetArray the array that indexes are populated to. Assume its {@literal length >= 2}.
    */
   public static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart,
       int substrLength, int[] offsetArray) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
index cd7810fc62ac..419f65778a5c 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
@@ -186,7 +186,6 @@ private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) {
   private BytesWritable evaluateInternal(BytesWritable bw, int pos) {
     byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength());
     int offset = StringSubstrColStart.getSubstrStartOffset(b, 0, b.length, craetePos(pos));
-
     if (offset == -1) {
       return new BytesWritable();
     }

From 8127a2b6ca08db72c229442ebd55e9ba437b3326 Mon Sep 17 00:00:00 2001
From: Ryu Kobayashi <beter.max@gmail.com>
Date: Tue, 24 Jun 2025 13:58:13 +0900
Subject: [PATCH 5/9] rename craetePos to adjustStartPos and stop copying BytesWritable input

---
 .../apache/hadoop/hive/ql/udf/UDFSubstr.java  | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
index 419f65778a5c..2f95f11d53dd 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
@@ -97,7 +97,7 @@ private Text evaluateInternal(Text t, int pos, int len) {
     }
 
     byte[] utf8String = t.toString().getBytes();
-    StringSubstrColStartLen.populateSubstrOffsets(utf8String, 0, utf8String.length, craetePos(pos), len, index);
+    StringSubstrColStartLen.populateSubstrOffsets(utf8String, 0, utf8String.length, adjustStartPos(pos), len, index);
     if (index[0] == -1) {
       return r;
     }
@@ -110,7 +110,7 @@ private Text evaluateInternal(Text t, int pos) {
     r.clear();
 
     byte[] utf8String = t.toString().getBytes();
-    int offset = StringSubstrColStart.getSubstrStartOffset(utf8String, 0, utf8String.length, craetePos(pos));
+    int offset = StringSubstrColStart.getSubstrStartOffset(utf8String, 0, utf8String.length, adjustStartPos(pos));
     if (offset == -1) {
       return r;
     }
@@ -174,23 +174,25 @@ private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) {
       return new BytesWritable();
     }
 
-    byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength());
-    StringSubstrColStartLen.populateSubstrOffsets(b, 0, b.length, craetePos(pos), len, index);
+    // Even though we are using longs, substr can only deal with ints, so we use
+    // the maximum int value as the maxValue
+    StringSubstrColStartLen.populateSubstrOffsets(bw.getBytes(), 0, bw.getLength(), adjustStartPos(pos), len, index);
     if (index[0] == -1) {
       return new BytesWritable();
     }
 
-    return new BytesWritable(arrayCopy(b, index[0], index[1]));
+    return new BytesWritable(arrayCopy(bw.getBytes(), index[0], index[1]));
   }
 
   private BytesWritable evaluateInternal(BytesWritable bw, int pos) {
-    byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength());
-    int offset = StringSubstrColStart.getSubstrStartOffset(b, 0, b.length, craetePos(pos));
+    // Even though we are using longs, substr can only deal with ints, so we use
+    // the maximum int value as the maxValue
+    int offset = StringSubstrColStart.getSubstrStartOffset(bw.getBytes(), 0, bw.getLength(), adjustStartPos(pos));
     if (offset == -1) {
       return new BytesWritable();
     }
 
-    return new BytesWritable(arrayCopy(b, offset, bw.getLength() - offset));
+    return new BytesWritable(arrayCopy(bw.getBytes(), offset, bw.getLength() - offset));
   }
 
   public BytesWritable evaluate(BytesWritable bw, IntWritable pos){
@@ -221,11 +223,10 @@ private byte[] arrayCopy(byte[] src, int pos, int len) {
       b[copyIdx] = src[srcIdx];
       copyIdx++;
     }
-
     return b;
   }
 
-  private int craetePos(int pos) {
+  private int adjustStartPos(int pos) {
     if (pos <= 0) {
       return pos;
     }

From 82a49b3b33eddafbea7ff552dc29a0a1c4137b62 Mon Sep 17 00:00:00 2001
From: Ryu Kobayashi <beter.max@gmail.com>
Date: Thu, 26 Jun 2025 14:59:22 +0900
Subject: [PATCH 6/9] replace arrayCopy with Arrays.copyOfRange and drop unused getRangeWidth

---
 .../apache/hadoop/hive/ql/udf/UDFSubstr.java  | 33 ++++---------------
 1 file changed, 6 insertions(+), 27 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
index 2f95f11d53dd..676706e9fb8d 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
@@ -18,7 +18,6 @@
 
 package org.apache.hadoop.hive.ql.udf;
 
-import java.util.Arrays;
 import java.util.List;
 import java.util.Optional;
 
@@ -36,6 +35,8 @@
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 
+import static java.util.Arrays.copyOfRange;
+
 /**
  * UDFSubstr.
  *
@@ -181,7 +182,7 @@ private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) {
       return new BytesWritable();
     }
 
-    return new BytesWritable(arrayCopy(bw.getBytes(), index[0], index[1]));
+    return new BytesWritable(copyOfRange(bw.getBytes(), index[0], index[0] + index[1]));
   }
 
   private BytesWritable evaluateInternal(BytesWritable bw, int pos) {
@@ -192,7 +193,7 @@ private BytesWritable evaluateInternal(BytesWritable bw, int pos) {
       return new BytesWritable();
     }
 
-    return new BytesWritable(arrayCopy(bw.getBytes(), offset, bw.getLength() - offset));
+    return new BytesWritable(copyOfRange(bw.getBytes(), offset, bw.getLength()));
   }
 
   public BytesWritable evaluate(BytesWritable bw, IntWritable pos){
@@ -215,17 +216,6 @@ public StatEstimator getStatEstimator() {
     return new SubStrStatEstimator();
   }
 
-  private byte[] arrayCopy(byte[] src, int pos, int len) {
-    byte[] b = new byte[len];
-
-    int copyIdx = 0;
-    for (int srcIdx = pos; copyIdx < len; srcIdx++) {
-      b[copyIdx] = src[srcIdx];
-      copyIdx++;
-    }
-    return b;
-  }
-
   private int adjustStartPos(int pos) {
     if (pos <= 0) {
       return pos;
@@ -244,7 +234,6 @@ public Optional<ColStatistics> estimate(List<ColStatistics> csList) {
       // 99 rows with 0 length
       // orig avg is 10
       // new avg is 5 (if substr(5)) ; but in reality it will stay ~10
-      Optional<Double> start = getRangeWidth(csList.get(1).getRange());
       Range startRange = csList.get(1).getRange();
       if (startRange != null && startRange.minValue != null) {
         double newAvgColLen = cs.getAvgColLen() - startRange.minValue.doubleValue();
@@ -255,7 +244,7 @@ public Optional<ColStatistics> estimate(List<ColStatistics> csList) {
       if (csList.size() > 2) {
         Range lengthRange = csList.get(2).getRange();
         if (lengthRange != null && lengthRange.maxValue != null) {
-          Double w = lengthRange.maxValue.doubleValue();
+          double w = lengthRange.maxValue.doubleValue();
           if (cs.getAvgColLen() > w) {
             cs.setAvgColLen(w);
           }
@@ -263,15 +252,5 @@ public Optional<ColStatistics> estimate(List<ColStatistics> csList) {
       }
       return Optional.of(cs);
     }
-
-    private Optional<Double> getRangeWidth(Range range) {
-      if (range != null) {
-        if (range.minValue != null && range.maxValue != null) {
-          return Optional.of(range.maxValue.doubleValue() - range.minValue.doubleValue());
-        }
-      }
-      return Optional.empty();
-    }
-
-  }
+  }
 }

From e75df14aa47c3c0124a2619c0518df31fe4406e2 Mon Sep 17 00:00:00 2001
From: Ryu Kobayashi <beter.max@gmail.com>
Date: Wed, 2 Jul 2025 12:55:54 +0900
Subject: [PATCH 7/9] read Text bytes directly and restore byte-offset substr for BytesWritable

---
 .../apache/hadoop/hive/ql/udf/UDFSubstr.java  | 75 +++++++++++--------
 1 file changed, 43 insertions(+), 32 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
index 676706e9fb8d..912ffd775cd9 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.hive.ql.udf;
 
+import java.util.Arrays;
 import java.util.List;
 import java.util.Optional;
 
@@ -35,7 +36,7 @@
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 
-import static java.util.Arrays.copyOfRange;
+import java.nio.charset.StandardCharsets;
 
 /**
  * UDFSubstr.
@@ -97,26 +98,55 @@ private Text evaluateInternal(Text t, int pos, int len) {
       return r;
     }
 
-    byte[] utf8String = t.toString().getBytes();
-    StringSubstrColStartLen.populateSubstrOffsets(utf8String, 0, utf8String.length, adjustStartPos(pos), len, index);
+    StringSubstrColStartLen.populateSubstrOffsets(t.getBytes(), 0, t.getLength(), adjustStartPos(pos), len, index);
     if (index[0] == -1) {
       return r;
     }
 
-    r.set(new String(utf8String, index[0], index[1]));
+    r.set(new String(t.getBytes(), index[0], index[1], StandardCharsets.UTF_8));
     return r;
   }
 
+  private int[] makeIndex(int pos, int len, int inputLen) {
+    if ((Math.abs(pos) > inputLen)) {
+      return null;
+    }
+
+    int start, end;
+
+    if (pos > 0) {
+      start = pos - 1;
+    } else if (pos < 0) {
+      start = inputLen + pos;
+    } else {
+      start = 0;
+    }
+
+    if ((inputLen - start) < len) {
+      end = inputLen;
+    } else {
+      end = start + len;
+    }
+    index[0] = start;
+    index[1] = end;
+    return index;
+  }
+
+  private final IntWritable maxValue = new IntWritable(Integer.MAX_VALUE);
+
+  // Even though we are using longs, substr can only deal with ints, so we use
+  // the maximum int value as the maxValue
+  private final LongWritable maxLongValue = new LongWritable(Integer.MAX_VALUE);
+
   private Text evaluateInternal(Text t, int pos) {
     r.clear();
 
-    byte[] utf8String = t.toString().getBytes();
-    int offset = StringSubstrColStart.getSubstrStartOffset(utf8String, 0, utf8String.length, adjustStartPos(pos));
+    int offset = StringSubstrColStart.getSubstrStartOffset(t.getBytes(), 0, t.getLength(), adjustStartPos(pos));
     if (offset == -1) {
       return r;
     }
 
-    r.set(new String(utf8String, offset, utf8String.length - offset));
+    r.set(new String(t.getBytes(), offset, t.getLength() - offset, StandardCharsets.UTF_8));
     return r;
   }
 
@@ -171,44 +201,25 @@ public BytesWritable evaluate(BytesWritable bw, IntWritable pos, IntWritable len
   }
 
   private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) {
-    if (len <= 0) {
-      return new BytesWritable();
-    }
 
-    // Even though we are using longs, substr can only deal with ints, so we use
-    // the maximum int value as the maxValue
-    StringSubstrColStartLen.populateSubstrOffsets(bw.getBytes(), 0, bw.getLength(), adjustStartPos(pos), len, index);
-    if (index[0] == -1) {
+    if (len <= 0) {
       return new BytesWritable();
     }
 
-    return new BytesWritable(copyOfRange(bw.getBytes(), index[0], index[0] + index[1]));
-  }
-
-  private BytesWritable evaluateInternal(BytesWritable bw, int pos) {
-    // Even though we are using longs, substr can only deal with ints, so we use
-    // the maximum int value as the maxValue
-    int offset = StringSubstrColStart.getSubstrStartOffset(bw.getBytes(), 0, bw.getLength(), adjustStartPos(pos));
-    if (offset == -1) {
+    int[] index = makeIndex(pos, len, bw.getLength());
+    if (index == null) {
       return new BytesWritable();
     }
 
-    return new BytesWritable(copyOfRange(bw.getBytes(), offset, bw.getLength()));
+    return new BytesWritable(Arrays.copyOfRange(bw.getBytes(), index[0], index[1]));
   }
 
   public BytesWritable evaluate(BytesWritable bw, IntWritable pos){
-    if ((bw == null) || (pos == null)) {
-      return null;
-    }
-    return evaluateInternal(bw, pos.get());
+    return evaluate(bw, pos, maxValue);
   }
 
   public BytesWritable evaluate(BytesWritable bw, LongWritable pos){
-    if ((bw == null) || (pos == null)) {
-      return null;
-    }
-
-    return evaluateInternal(bw, (int) pos.get());
+    return evaluate(bw, pos, maxLongValue);
   }
 
   @Override

From 371b02bdfbf3f221be151b5bfff283414544306e Mon Sep 17 00:00:00 2001
From: Ryu Kobayashi <beter.max@gmail.com>
Date: Thu, 3 Jul 2025 15:41:59 +0900
Subject: [PATCH 8/9] set the result Text from the byte range instead of an intermediate String

---
 ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
index 912ffd775cd9..260a7aa972f0 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
@@ -103,7 +103,7 @@ private Text evaluateInternal(Text t, int pos, int len) {
       return r;
     }
 
-    r.set(new String(t.getBytes(), index[0], index[1], StandardCharsets.UTF_8));
+    r.set(t.getBytes(), index[0], index[1]);
     return r;
   }
 
@@ -146,7 +146,7 @@ private Text evaluateInternal(Text t, int pos) {
       return r;
     }
 
-    r.set(new String(t.getBytes(), offset, t.getLength() - offset, StandardCharsets.UTF_8));
+    r.set(t.getBytes(), offset, t.getLength() - offset);
     return r;
   }
 

From 7e57af6cea11367d20e04f63ef7c66d79df04b39 Mon Sep 17 00:00:00 2001
From: Ryu Kobayashi <beter.max@gmail.com>
Date: Fri, 4 Jul 2025 11:04:48 +0900
Subject: [PATCH 9/9] remove unused StandardCharsets import from UDFSubstr

Co-authored-by: Shohei Okumiya <git@okumin.com>
---
 ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
index 260a7aa972f0..18e5f9077265 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
@@ -36,8 +36,6 @@
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 
-import java.nio.charset.StandardCharsets;
-
 /**
  * UDFSubstr.
  *