From 7bf8c7b7acf19536819f39c4ddaecfddc6548b98 Mon Sep 17 00:00:00 2001 From: Ron Ladin Date: Sat, 14 Mar 2026 22:30:22 +0200 Subject: [PATCH 1/3] [TEXT-103] Add support for weighted Levenshtein distance --- .../text/similarity/LevenshteinDistance.java | 470 ++++++++++++------ .../similarity/LevenshteinDistanceTest.java | 58 ++- 2 files changed, 373 insertions(+), 155 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java index 479b3fadea..a7cf2c4f00 100644 --- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java @@ -26,6 +26,12 @@ * This is the number of changes needed to change one sequence into another, where each change is a single character modification (deletion, insertion or * substitution). *

+ * + *

+ * This implementation supports configurable costs for insertion, deletion, and substitution operations. By default, all costs are set to 1 for + * backward compatibility. + *

+ * *

* This code has been adapted from Apache Commons Lang 3.3. *

@@ -37,7 +43,22 @@ public class LevenshteinDistance implements EditDistance { /** - * The singleton instance. + * Default cost for an insertion operation. + */ + private static final int DEFAULT_INSERT_COST = 1; + + /** + * Default cost for a deletion operation. + */ + private static final int DEFAULT_DELETE_COST = 1; + + /** + * Default cost for a substitution (replace) operation. + */ + private static final int DEFAULT_REPLACE_COST = 1; + + /** + * The singleton instance (uses default costs and no threshold). */ private static final LevenshteinDistance INSTANCE = new LevenshteinDistance(); @@ -51,78 +72,83 @@ public static LevenshteinDistance getDefaultInstance() { } /** - * Finds the Levenshtein distance between two CharSequences if it's less than or equal to a given threshold. + * Finds the Levenshtein distance between two CharSequences if it's less than or equal to a given + * threshold, using configurable costs for insert, delete, and replace operations. + * + *

+ * This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield and + * Chas Emerick's implementation of the Levenshtein distance algorithm. + *

* *

- * This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield and Chas Emerick's implementation of the Levenshtein distance - * algorithm. + * Note: The stripe-width optimisation used in the default (all-costs-1) case relies on the + * assumption that each operation costs exactly 1. When custom costs are supplied the stripe + * cannot be reliably bounded to {@code 2*threshold+1}, so the full O(nm) DP table is used + * instead, returning -1 only when the final distance exceeds the threshold. *

* *
-     * limitedCompare(null, *, *)             = Throws {@link IllegalArgumentException}
-     * limitedCompare(*, null, *)             = Throws {@link IllegalArgumentException}
-     * limitedCompare(*, *, -1)               = Throws {@link IllegalArgumentException}
-     * limitedCompare("","", 0)               = 0
-     * limitedCompare("aaapppp", "", 8)       = 7
-     * limitedCompare("aaapppp", "", 7)       = 7
-     * limitedCompare("aaapppp", "", 6))      = -1
-     * limitedCompare("elephant", "hippo", 7) = 7
-     * limitedCompare("elephant", "hippo", 6) = -1
-     * limitedCompare("hippo", "elephant", 7) = 7
-     * limitedCompare("hippo", "elephant", 6) = -1
+     * limitedCompare(null, *, *, *, *, *)             = Throws {@link IllegalArgumentException}
+     * limitedCompare(*, null, *, *, *, *)             = Throws {@link IllegalArgumentException}
+     * limitedCompare(*, *, -1, *, *, *)               = Throws {@link IllegalArgumentException}
+     * limitedCompare("","", 0, 1, 1, 1)               = 0
+     * limitedCompare("aaapppp", "", 8, 1, 1, 1)       = 7
+     * limitedCompare("aaapppp", "", 7, 1, 1, 1)       = 7
+     * limitedCompare("aaapppp", "", 6, 1, 1, 1))      = -1
+     * limitedCompare("elephant", "hippo", 7, 1, 1, 1) = 7
+     * limitedCompare("elephant", "hippo", 6, 1, 1, 1) = -1
+     * limitedCompare("hippo", "elephant", 7, 1, 1, 1) = 7
+     * limitedCompare("hippo", "elephant", 6, 1, 1, 1) = -1
      * 
* - * @param left the first SimilarityInput, must not be null. - * @param right the second SimilarityInput, must not be null. - * @param threshold the target threshold, must not be negative. - * @return result distance, or -1 + * @param left the first SimilarityInput, must not be null. + * @param right the second SimilarityInput, must not be null. + * @param threshold the target threshold, must not be negative. + * @param insertCost the cost of an insertion operation, must not be negative. + * @param deleteCost the cost of a deletion operation, must not be negative. + * @param replaceCost the cost of a substitution operation, must not be negative. + * @return result distance, or -1 if the distance exceeds the threshold. */ - private static int limitedCompare(SimilarityInput left, SimilarityInput right, final int threshold) { // NOPMD + private static int limitedCompare(SimilarityInput left, SimilarityInput right, // NOPMD + final int threshold, final int insertCost, final int deleteCost, final int replaceCost) { if (left == null || right == null) { throw new IllegalArgumentException("CharSequences must not be null"); } - - /* - * This implementation only computes the distance if it's less than or equal to the threshold value, returning -1 if it's greater. The advantage is - * performance: unbounded distance is O(nm), but a bound of k allows us to reduce it to O(km) time by only computing a diagonal stripe of width 2k + 1 - * of the cost table. It is also possible to use this to compute the unbounded Levenshtein distance by starting the threshold at 1 and doubling each - * time until the distance is found; this is O(dm), where d is the distance. - * - * One subtlety comes from needing to ignore entries on the border of our stripe, for example, - * p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry to the left - * of the leftmost member We must ignore the entry above the rightmost member - * - * Another subtlety comes from our stripe running off the matrix if the strings aren't of the same size. Since string s is always swapped to be the - * shorter of the two, the stripe will always run off to the upper right instead of the lower left of the matrix. - * - * As a concrete example, suppose s is of length 5, t is of length 7, and our threshold is 1. In this case we're going to walk a stripe of length 3. The - * matrix would look like so: - * - *
 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 | | | |#|#| 6 | | | | |#| 7 | | | | | | 
- * - * Note how the stripe leads off the table as there is no possible way to turn a string of length 5 into one of length 7 in edit distance of 1. - * - * Additionally, this implementation decreases memory usage by using two single-dimensional arrays and swapping them back and forth instead of - * allocating an entire n by m matrix. This requires a few minor changes, such as immediately returning when it's detected that the stripe has run off - * the matrix and initially filling the arrays with large values so that entries we don't compute are ignored. - * - * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for some discussion. - */ + if (threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } int n = left.length(); // length of left int m = right.length(); // length of right - // if one string is empty, the edit distance is necessarily the length - // of the other + // If one string is empty, the edit distance is the cost of inserting/deleting + // all characters of the other string. if (n == 0) { - return m <= threshold ? m : -1; + final int dist = m * insertCost; + return dist <= threshold ? dist : -1; } if (m == 0) { - return n <= threshold ? n : -1; + final int dist = n * deleteCost; + return dist <= threshold ? dist : -1; + } + + // When all costs equal 1, use the classic diagonal-stripe optimisation. + // For asymmetric costs the stripe width is not reliably bounded, so fall + // back to the full O(nm) table and threshold-check only at the end. + if (insertCost == 1 && deleteCost == 1 && replaceCost == 1) { + return limitedCompareUniformCost(left, right, threshold, n, m); } + return limitedCompareCustomCost(left, right, threshold, insertCost, deleteCost, replaceCost, n, m); + } + + /** + * Classic stripe-optimised O(km) limited compare for uniform unit costs. + * This preserves the original algorithm exactly. + */ + private static int limitedCompareUniformCost(SimilarityInput left, SimilarityInput right, + final int threshold, int n, int m) { if (n > m) { - // swap the two strings to consume less memory final SimilarityInput tmp = left; left = right; right = tmp; @@ -130,66 +156,50 @@ private static int limitedCompare(SimilarityInput left, SimilarityInput threshold) { return -1; } - int[] p = new int[n + 1]; // 'previous' cost array, horizontally - int[] d = new int[n + 1]; // cost array, horizontally - int[] tempD; // placeholder to assist in swapping p and d + int[] p = new int[n + 1]; + int[] d = new int[n + 1]; + int[] tempD; - // fill in starting table values final int boundary = Math.min(n, threshold) + 1; for (int i = 0; i < boundary; i++) { p[i] = i; } - // these fills ensure that the value above the rightmost entry of our - // stripe will be ignored in following loop iterations Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); Arrays.fill(d, Integer.MAX_VALUE); - // iterates through t for (int j = 1; j <= m; j++) { - final E rightJ = right.at(j - 1); // jth character of right + final E rightJ = right.at(j - 1); d[0] = j; - // compute stripe indices, constrain to array size final int min = Math.max(1, j - threshold); final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min(n, j + threshold); - // ignore entry left of leftmost if (min > 1) { d[min - 1] = Integer.MAX_VALUE; } int lowerBound = Integer.MAX_VALUE; - // iterates through [min, max] in s for (int i = min; i <= max; i++) { if (left.at(i - 1).equals(rightJ)) { - // diagonally left and up d[i] = p[i - 1]; } else { - // 1 + minimum of cell to the left, to the top, diagonally - // left and up d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); } lowerBound = Math.min(lowerBound, d[i]); } - // if the lower bound is greater than the threshold, then exit early if (lowerBound > threshold) { return -1; } - // copy current distance counts to 'previous row' distance counts tempD = p; p = d; d = tempD; } - // if p[n] is greater than the threshold, there's no guarantee on it - // being the correct - // distance if (p[n] <= threshold) { return p[n]; } @@ -197,93 +207,186 @@ private static int limitedCompare(SimilarityInput left, SimilarityInput - * A higher score indicates a greater distance. + * When {@code deleteCost != insertCost} swapping the strings would change the + * semantics (delete on the original becomes insert on the swapped copy), so + * we always keep left as-is and pay the correct directional cost. *

+ */ + private static int limitedCompareCustomCost(final SimilarityInput left, final SimilarityInput right, + final int threshold, final int insertCost, final int deleteCost, final int replaceCost, + final int n, final int m) { + + // p[i] = cost to convert left[0..i-1] to right[0..j-1] (previous row) + int[] p = new int[n + 1]; + int[] d = new int[n + 1]; + + // Base case: convert left[0..i-1] to empty string via i deletions. + for (int i = 0; i <= n; i++) { + p[i] = i * deleteCost; + } + + for (int j = 1; j <= m; j++) { + final E rightJ = right.at(j - 1); + // Base case: convert empty string to right[0..j-1] via j insertions. + d[0] = j * insertCost; + + for (int i = 1; i <= n; i++) { + if (left.at(i - 1).equals(rightJ)) { + // Characters match ? no operation needed (cost 0). + d[i] = p[i - 1]; + } else { + // Minimum of: delete left[i-1], insert right[j-1], or replace. + d[i] = Math.min( + Math.min(d[i - 1] + insertCost, // insert right[j-1] + p[i] + deleteCost), // delete left[i-1] + p[i - 1] + replaceCost // replace + ); + } + } + + // Swap rows. + final int[] tempD = p; + p = d; + d = tempD; + } + + if (p[n] <= threshold) { + return p[n]; + } + return -1; + } + + /** + * Finds the Levenshtein distance between two Strings using configurable + * insert, delete, and replace costs. * *

- * This implementation only need one single-dimensional arrays of length s.length() + 1 + * A higher score indicates a greater distance. *

* *
-     * unlimitedCompare(null, *)             = Throws {@link IllegalArgumentException}
-     * unlimitedCompare(*, null)             = Throws {@link IllegalArgumentException}
-     * unlimitedCompare("","")               = 0
-     * unlimitedCompare("","a")              = 1
-     * unlimitedCompare("aaapppp", "")       = 7
-     * unlimitedCompare("frog", "fog")       = 1
-     * unlimitedCompare("fly", "ant")        = 3
-     * unlimitedCompare("elephant", "hippo") = 7
-     * unlimitedCompare("hippo", "elephant") = 7
-     * unlimitedCompare("hippo", "zzzzzzzz") = 8
-     * unlimitedCompare("hello", "hallo")    = 1
+     * unlimitedCompare(null, *, *, *, *)             = Throws {@link IllegalArgumentException}
+     * unlimitedCompare(*, null, *, *, *)             = Throws {@link IllegalArgumentException}
+     * unlimitedCompare("","", 1, 1, 1)               = 0
+     * unlimitedCompare("","a", 1, 1, 1)              = 1
+     * unlimitedCompare("aaapppp", "", 1, 1, 1)       = 7
+     * unlimitedCompare("frog", "fog", 1, 1, 1)       = 1
+     * unlimitedCompare("fly", "ant", 1, 1, 1)        = 3
+     * unlimitedCompare("elephant", "hippo", 1, 1, 1) = 7
+     * unlimitedCompare("hippo", "elephant", 1, 1, 1) = 7
+     * unlimitedCompare("hippo", "zzzzzzzz", 1, 1, 1) = 8
+     * unlimitedCompare("hello", "hallo", 1, 1, 1)    = 1
      * 
* - * @param left the first CharSequence, must not be null. - * @param right the second CharSequence, must not be null. - * @return result distance, or -1. + * @param left the first CharSequence, must not be null. + * @param right the second CharSequence, must not be null. + * @param insertCost the cost of an insertion operation, must not be negative. + * @param deleteCost the cost of a deletion operation, must not be negative. + * @param replaceCost the cost of a substitution operation, must not be negative. + * @return result distance. * @throws IllegalArgumentException if either CharSequence input is {@code null}. */ - private static int unlimitedCompare(SimilarityInput left, SimilarityInput right) { + private static int unlimitedCompare(SimilarityInput left, SimilarityInput right, + final int insertCost, final int deleteCost, final int replaceCost) { if (left == null || right == null) { throw new IllegalArgumentException("CharSequences must not be null"); } - /* - * This implementation use two variable to record the previous cost counts, So this implementation use less memory than previous impl. - */ + int n = left.length(); // length of left int m = right.length(); // length of right if (n == 0) { - return m; + return m * insertCost; } if (m == 0) { - return n; + return n * deleteCost; } - if (n > m) { - // swap the input strings to consume less memory + + // When costs are symmetric (insert == delete) we can safely swap the + // shorter string into 'left' to minimise the working array size. + // When insert != delete, swapping reverses the semantics of those two + // operations, so we must keep the original orientation. + final boolean canSwap = insertCost == deleteCost; + if (canSwap && n > m) { final SimilarityInput tmp = left; left = right; right = tmp; n = m; m = right.length(); } + + // Single rolling array of length n+1. final int[] p = new int[n + 1]; - // indexes into strings left and right - int i; // iterates through left - int j; // iterates through right + + // Base case: converting left[0..i-1] ? "" costs i deletions. + for (int i = 0; i <= n; i++) { + p[i] = i * deleteCost; + } + int upperLeft; int upper; - E rightJ; // jth character of right - int cost; // cost - for (i = 0; i <= n; i++) { - p[i] = i; - } - for (j = 1; j <= m; j++) { + + for (int j = 1; j <= m; j++) { upperLeft = p[0]; - rightJ = right.at(j - 1); - p[0] = j; + final E rightJ = right.at(j - 1); + // Base case: converting "" ? right[0..j-1] costs j insertions. + p[0] = j * insertCost; - for (i = 1; i <= n; i++) { + for (int i = 1; i <= n; i++) { upper = p[i]; - cost = left.at(i - 1).equals(rightJ) ? 0 : 1; - // minimum of cell to the left+1, to the top+1, diagonally left and up +cost - p[i] = Math.min(Math.min(p[i - 1] + 1, p[i] + 1), upperLeft + cost); + if (left.at(i - 1).equals(rightJ)) { + // Characters match ? carry diagonal (no cost). + p[i] = upperLeft; + } else { + // Minimum of insert, delete, or replace. + p[i] = Math.min( + Math.min(p[i - 1] + insertCost, // insert right[j-1] + p[i] + deleteCost), // delete left[i-1] + upperLeft + replaceCost // replace + ); + } upperLeft = upper; } } return p[n]; } + // ------------------------------------------------------------------------- + // Instance state + // ------------------------------------------------------------------------- + /** - * Threshold. + * Threshold (nullable). When non-null, {@link #limitedCompare} is used + * instead of {@link #unlimitedCompare}. */ private final Integer threshold; /** - * Constructs a default instance that uses a version of the algorithm that does not use a threshold parameter. + * Cost of inserting a character into the left sequence. + */ + private final int insertCost; + + /** + * Cost of deleting a character from the left sequence. + */ + private final int deleteCost; + + /** + * Cost of substituting one character for another. + */ + private final int replaceCost; + + // ------------------------------------------------------------------------- + // Constructors + // ------------------------------------------------------------------------- + + /** + * Constructs a default instance that uses a version of the algorithm that does not use a + * threshold parameter, with all operation costs set to 1. * * @see LevenshteinDistance#getDefaultInstance() * @deprecated Use {@link #getDefaultInstance()}. @@ -294,18 +397,66 @@ public LevenshteinDistance() { } /** - * Constructs a new instance. If the threshold is not null, distance calculations will be limited to a maximum length. If the threshold is null, the - * unlimited version of the algorithm will be used. + * Constructs a new instance with the given threshold and all operation costs set to 1. * - * @param threshold If this is null then distances calculations will not be limited. This may not be negative. + *

+ * If the threshold is not null, distance calculations will be limited to a maximum length. If + * the threshold is null, the unlimited version of the algorithm will be used. + *

+ * + * @param threshold If this is null then distances calculations will not be limited. + * This may not be negative. */ public LevenshteinDistance(final Integer threshold) { + this(threshold, DEFAULT_INSERT_COST, DEFAULT_DELETE_COST, DEFAULT_REPLACE_COST); + } + + /** + * Constructs a new instance with the given threshold and custom operation costs. + * + *

+ * If the threshold is not null, distance calculations will be limited to a maximum value. + * If the threshold is null, the unlimited version of the algorithm will be used. + *

+ * + *

+ * All cost parameters must be non-negative integers. Passing 0 for a cost makes that + * operation free; passing values greater than 1 makes it more expensive relative to + * the other operations. + *

+ * + * @param threshold If this is null then distance calculations will not be limited. + * This may not be negative. + * @param insertCost the cost of inserting a character, must not be negative. + * @param deleteCost the cost of deleting a character, must not be negative. + * @param replaceCost the cost of replacing (substituting) a character, must not be negative. + * @throws IllegalArgumentException if threshold is negative, or any cost is negative. + * @since 1.13.0 + */ + public LevenshteinDistance(final Integer threshold, final int insertCost, final int deleteCost, + final int replaceCost) { if (threshold != null && threshold < 0) { throw new IllegalArgumentException("Threshold must not be negative"); } - this.threshold = threshold; + if (insertCost < 0) { + throw new IllegalArgumentException("Insert cost must not be negative"); + } + if (deleteCost < 0) { + throw new IllegalArgumentException("Delete cost must not be negative"); + } + if (replaceCost < 0) { + throw new IllegalArgumentException("Replace cost must not be negative"); + } + this.threshold = threshold; + this.insertCost = insertCost; + this.deleteCost = deleteCost; + this.replaceCost = replaceCost; } + // ------------------------------------------------------------------------- + // Public API + // ------------------------------------------------------------------------- + /** * Computes the Levenshtein distance between two Strings. * @@ -313,29 +464,23 @@ public LevenshteinDistance(final Integer threshold) { * A higher score indicates a greater distance. *

* - *

- * Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError which can occur when my Java implementation is used with very large - * strings. - *

- * *
      * distance.apply(null, *)             = Throws {@link IllegalArgumentException}
      * distance.apply(*, null)             = Throws {@link IllegalArgumentException}
      * distance.apply("","")               = 0
-     * distance.apply("","a")              = 1
-     * distance.apply("aaapppp", "")       = 7
-     * distance.apply("frog", "fog")       = 1
-     * distance.apply("fly", "ant")        = 3
-     * distance.apply("elephant", "hippo") = 7
-     * distance.apply("hippo", "elephant") = 7
-     * distance.apply("hippo", "zzzzzzzz") = 8
-     * distance.apply("hello", "hallo")    = 1
+     * distance.apply("","a")              = insertCost
+     * distance.apply("aaapppp", "")       = 7 * deleteCost
+     * distance.apply("frog", "fog")       = 1 * deleteCost (one deletion)
+     * distance.apply("fly", "ant")        = replaceCost + replaceCost + replaceCost
+     * distance.apply("elephant", "hippo") = 7  (with default costs)
+     * distance.apply("hippo", "elephant") = 7  (with default costs)
+     * distance.apply("hello", "hallo")    = 1  (with default costs)
      * 
* * @param left the first input, must not be null. * @param right the second input, must not be null. - * @return result distance, or -1. - * @throws IllegalArgumentException if either String input {@code null}. + * @return result distance, or -1 if a threshold is set and the distance exceeds it. + * @throws IllegalArgumentException if either String input is {@code null}. */ @Override public Integer apply(final CharSequence left, final CharSequence right) { @@ -349,41 +494,60 @@ public Integer apply(final CharSequence left, final CharSequence right) { * A higher score indicates a greater distance. *

* - *
-     * distance.apply(null, *)             = Throws {@link IllegalArgumentException}
-     * distance.apply(*, null)             = Throws {@link IllegalArgumentException}
-     * distance.apply("","")               = 0
-     * distance.apply("","a")              = 1
-     * distance.apply("aaapppp", "")       = 7
-     * distance.apply("frog", "fog")       = 1
-     * distance.apply("fly", "ant")        = 3
-     * distance.apply("elephant", "hippo") = 7
-     * distance.apply("hippo", "elephant") = 7
-     * distance.apply("hippo", "zzzzzzzz") = 8
-     * distance.apply("hello", "hallo")    = 1
-     * 
- * * @param The type of similarity score unit. * @param left the first input, must not be null. * @param right the second input, must not be null. - * @return result distance, or -1. - * @throws IllegalArgumentException if either String input {@code null}. + * @return result distance, or -1 if a threshold is set and the distance exceeds it. + * @throws IllegalArgumentException if either input is {@code null}. * @since 1.13.0 */ public Integer apply(final SimilarityInput left, final SimilarityInput right) { if (threshold != null) { - return limitedCompare(left, right, threshold); + return limitedCompare(left, right, threshold, insertCost, deleteCost, replaceCost); } - return unlimitedCompare(left, right); + return unlimitedCompare(left, right, insertCost, deleteCost, replaceCost); } + // ------------------------------------------------------------------------- + // Accessors + // ------------------------------------------------------------------------- + /** * Gets the distance threshold. * - * @return The distance threshold. + * @return The distance threshold, or {@code null} if no threshold is set. */ public Integer getThreshold() { return threshold; } -} + /** + * Gets the cost of an insertion operation. + * + * @return The insertion cost. + * @since 1.13.0 + */ + public int getInsertCost() { + return insertCost; + } + + /** + * Gets the cost of a deletion operation. + * + * @return The deletion cost. + * @since 1.13.0 + */ + public int getDeleteCost() { + return deleteCost; + } + + /** + * Gets the cost of a substitution (replace) operation. + * + * @return The replacement cost. + * @since 1.13.0 + */ + public int getReplaceCost() { + return replaceCost; + } +} \ No newline at end of file diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java index e7a88ca498..ecf24ceea8 100644 --- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * - * https://www.apache.org/licenses/LICENSE-2.0 + * https://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -181,4 +181,58 @@ void testGetThresholdDirectlyAfterObjectInstantiation() { assertNull(LevenshteinDistance.getDefaultInstance().getThreshold()); } -} + // ------------------------------------------------------------------------- + // New Weighted Levenshtein Distance Tests + // ------------------------------------------------------------------------- + + @Test + void testConstructorWithNegativeCosts() { + assertThrows(IllegalArgumentException.class, () -> new LevenshteinDistance(null, -1, 1, 1)); + assertThrows(IllegalArgumentException.class, () -> new LevenshteinDistance(null, 1, -1, 1)); + assertThrows(IllegalArgumentException.class, () -> new LevenshteinDistance(null, 1, 1, -1)); + } + + @Test + void testGetLevenshteinDistance_WeightedUnlimited() { + // Substitution is very expensive (10) vs Insert/Delete (1 each) + final LevenshteinDistance dist = new LevenshteinDistance(null, 1, 1, 10); + // 'a' -> 'b' should choose delete 'a' (1) and insert 'b' (1) = distance 2, + // instead of replace (10). + assertEquals(2, dist.apply("a", "b")); + + // All operations are free (0) + final LevenshteinDistance freeDist = new LevenshteinDistance(null, 0, 0, 0); + assertEquals(0, freeDist.apply("abc", "def")); + + // Asymmetric costs: Insert=10, Delete=1, Replace=100 + final LevenshteinDistance asymmetric = new LevenshteinDistance(null, 10, 1, 100); + assertEquals(1, asymmetric.apply("a", "")); // Delete 'a' = 1 + assertEquals(10, asymmetric.apply("", "a")); // Insert 'a' = 10 + } + + @Test + void testGetLevenshteinDistance_WeightedThreshold() { + // Distance is 2 (via delete/insert), threshold is 5 -> result 2 + final LevenshteinDistance weighted = new LevenshteinDistance(5, 1, 1, 10); + assertEquals(2, weighted.apply("a", "b")); + + // Distance is 2, threshold is 1 -> result -1 + final LevenshteinDistance strict = new LevenshteinDistance(1, 1, 1, 10); + assertEquals(-1, strict.apply("a", "b")); + + // Empty strings with weighted threshold + assertEquals(0, new LevenshteinDistance(5, 2, 2, 2).apply("", "")); + assertEquals(4, new LevenshteinDistance(5, 2, 2, 2).apply("aa", "")); + assertEquals(-1, new LevenshteinDistance(1, 2, 2, 2).apply("aa", "")); + } + + @Test + void testWeightedAccessors() { + final LevenshteinDistance dist = new LevenshteinDistance(10, 2, 3, 4); + assertEquals(10, dist.getThreshold()); + assertEquals(2, dist.getInsertCost()); + assertEquals(3, dist.getDeleteCost()); + assertEquals(4, dist.getReplaceCost()); + } + +} \ No newline at end of file From 3242454c5e65ed0a7f508a879e8bcaa9e33a66c0 Mon Sep 17 00:00:00 2001 From: Ron Ladin Date: Sun, 15 Mar 2026 19:58:12 +0200 Subject: [PATCH 2/3] Refactor LevenshteinDistance to use Builder pattern; apply Checkstyle and formatting fixes per PR review --- .../text/similarity/LevenshteinDistance.java | 496 ++++++++++-------- .../similarity/LevenshteinDistanceTest.java | 37 +- 2 files changed, 303 insertions(+), 230 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java index a7cf2c4f00..ea7f5aa890 100644 --- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java @@ -19,20 +19,33 @@ import java.util.Arrays; /** - * An algorithm for measuring the difference between two character sequences using the Levenshtein - * Distance. + * An algorithm for measuring the difference between two character sequences using the + * Levenshtein Distance. * *

- * This is the number of changes needed to change one sequence into another, where each change is a single character modification (deletion, insertion or - * substitution). + * This is the number of changes needed to change one sequence into another, where each change is a + * single character modification (deletion, insertion or substitution). *

* *

- * This implementation supports configurable costs for insertion, deletion, and substitution operations. By default, all costs are set to 1 for - * backward compatibility. + * This implementation supports configurable costs for insertion, deletion, and substitution + * operations. By default, all costs are set to 1 for backward compatibility. *

* *

+ * Use {@link Builder} to construct instances with custom thresholds and operation costs: + *

+ * + *
+ * LevenshteinDistance dist = LevenshteinDistance.builder()
+ *     .threshold(10)
+ *     .insertCost(1)
+ *     .deleteCost(2)
+ *     .replaceCost(3)
+ *     .build();
+ * 
+ * + *

* This code has been adapted from Apache Commons Lang 3.3. *

* @@ -43,19 +56,109 @@ public class LevenshteinDistance implements EditDistance { /** - * Default cost for an insertion operation. + * Builds {@link LevenshteinDistance} instances. + * + *

+ * All costs default to 1. The threshold defaults to {@code null} (unlimited). + *

+ * + *
+     * LevenshteinDistance dist = LevenshteinDistance.builder()
+     *     .threshold(5)
+     *     .insertCost(1)
+     *     .deleteCost(1)
+     *     .replaceCost(2)
+     *     .build();
+     * 
+ * + * @since 1.13.0 */ - private static final int DEFAULT_INSERT_COST = 1; + public static final class Builder { - /** - * Default cost for a deletion operation. - */ - private static final int DEFAULT_DELETE_COST = 1; + /** + * Default cost for any single edit operation. + */ + private static final int DEFAULT_COST = 1; - /** - * Default cost for a substitution (replace) operation. - */ - private static final int DEFAULT_REPLACE_COST = 1; + /** Threshold for limited compare, or {@code null} for unlimited. */ + private Integer threshold; + + /** Cost of inserting a character. */ + private int insertCost = DEFAULT_COST; + + /** Cost of deleting a character. */ + private int deleteCost = DEFAULT_COST; + + /** Cost of substituting one character for another. */ + private int replaceCost = DEFAULT_COST; + + /** + * Constructs a new builder with default values. + */ + private Builder() { + // use LevenshteinDistance.builder() factory method + } + + /** + * Builds a new {@link LevenshteinDistance} from the current state of this builder. + * + * @return a new {@link LevenshteinDistance}. + * @throws IllegalArgumentException if the threshold is negative, or any cost is negative. + */ + public LevenshteinDistance build() { + return new LevenshteinDistance(this); + } + + /** + * Sets the cost of a deletion operation. + * + * @param deleteCost the cost of deleting a character; must not be negative. + * @return {@code this} builder. + */ + public Builder deleteCost(final int deleteCost) { + this.deleteCost = deleteCost; + return this; + } + + /** + * Sets the cost of an insertion operation. + * + * @param insertCost the cost of inserting a character; must not be negative. + * @return {@code this} builder. + */ + public Builder insertCost(final int insertCost) { + this.insertCost = insertCost; + return this; + } + + /** + * Sets the cost of a substitution (replace) operation. + * + * @param replaceCost the cost of replacing a character; must not be negative. + * @return {@code this} builder. + */ + public Builder replaceCost(final int replaceCost) { + this.replaceCost = replaceCost; + return this; + } + + /** + * Sets the threshold for limited distance calculation. + * + *

+ * When set, {@link LevenshteinDistance#apply} returns {@code -1} if the computed + * distance exceeds this value. When {@code null}, the unlimited algorithm is used. + *

+ * + * @param threshold the maximum distance to report; must not be negative, or {@code null} + * for no limit. + * @return {@code this} builder. + */ + public Builder threshold(final Integer threshold) { + this.threshold = threshold; + return this; + } + } /** * The singleton instance (uses default costs and no threshold). @@ -63,51 +166,62 @@ public class LevenshteinDistance implements EditDistance { private static final LevenshteinDistance INSTANCE = new LevenshteinDistance(); /** - * Gets the default instance. + * Returns a new {@link Builder} for constructing {@link LevenshteinDistance} instances. + * + * @return a new {@link Builder}. + * @since 1.13.0 + */ + public static Builder builder() { + return new Builder(); + } + + /** + * Gets the default instance, which uses no threshold and all operation costs set to 1. * - * @return The default instance. + * @return the default instance. */ public static LevenshteinDistance getDefaultInstance() { return INSTANCE; } /** - * Finds the Levenshtein distance between two CharSequences if it's less than or equal to a given - * threshold, using configurable costs for insert, delete, and replace operations. + * Finds the Levenshtein distance between two CharSequences if it is less than or equal to a + * given threshold, using configurable costs for insert, delete, and replace operations. * *

- * This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield and - * Chas Emerick's implementation of the Levenshtein distance algorithm. + * This implementation follows from Algorithms on Strings, Trees and Sequences by + * Dan Gusfield and Chas Emerick's implementation of the Levenshtein distance algorithm. *

* *

- * Note: The stripe-width optimisation used in the default (all-costs-1) case relies on the - * assumption that each operation costs exactly 1. When custom costs are supplied the stripe - * cannot be reliably bounded to {@code 2*threshold+1}, so the full O(nm) DP table is used - * instead, returning -1 only when the final distance exceeds the threshold. + * Note: The stripe-width optimisation used in the unit-cost case relies on the assumption that + * each operation costs exactly 1. When custom costs are supplied the stripe cannot be reliably + * bounded to {@code 2*threshold+1}, so the full O(nm) DP table is used instead, returning + * {@code -1} only when the final distance exceeds the threshold. *

* *
-     * limitedCompare(null, *, *, *, *, *)             = Throws {@link IllegalArgumentException}
-     * limitedCompare(*, null, *, *, *, *)             = Throws {@link IllegalArgumentException}
-     * limitedCompare(*, *, -1, *, *, *)               = Throws {@link IllegalArgumentException}
+     * limitedCompare(null, *, *, *, *, *)             = throws {@link IllegalArgumentException}
+     * limitedCompare(*, null, *, *, *, *)             = throws {@link IllegalArgumentException}
+     * limitedCompare(*, *, -1, *, *, *)               = throws {@link IllegalArgumentException}
      * limitedCompare("","", 0, 1, 1, 1)               = 0
      * limitedCompare("aaapppp", "", 8, 1, 1, 1)       = 7
      * limitedCompare("aaapppp", "", 7, 1, 1, 1)       = 7
-     * limitedCompare("aaapppp", "", 6, 1, 1, 1))      = -1
+     * limitedCompare("aaapppp", "", 6, 1, 1, 1)       = -1
      * limitedCompare("elephant", "hippo", 7, 1, 1, 1) = 7
      * limitedCompare("elephant", "hippo", 6, 1, 1, 1) = -1
      * limitedCompare("hippo", "elephant", 7, 1, 1, 1) = 7
      * limitedCompare("hippo", "elephant", 6, 1, 1, 1) = -1
      * 
* + * @param the element type of the {@link SimilarityInput}. * @param left the first SimilarityInput, must not be null. * @param right the second SimilarityInput, must not be null. * @param threshold the target threshold, must not be negative. * @param insertCost the cost of an insertion operation, must not be negative. * @param deleteCost the cost of a deletion operation, must not be negative. * @param replaceCost the cost of a substitution operation, must not be negative. - * @return result distance, or -1 if the distance exceeds the threshold. + * @return result distance, or {@code -1} if the distance exceeds the threshold. */ private static int limitedCompare(SimilarityInput left, SimilarityInput right, // NOPMD final int threshold, final int insertCost, final int deleteCost, final int replaceCost) { @@ -118,11 +232,9 @@ private static int limitedCompare(SimilarityInput left, SimilarityInput int limitedCompare(SimilarityInput left, SimilarityInput + * Uses two rolling arrays to keep memory at O(min(n, m)). + *

+ * + *

+ * When {@code deleteCost != insertCost} swapping the strings would change the semantics + * (delete on the original becomes insert on the swapped copy), so the orientation is always + * kept as-is and the correct directional cost is applied. + *

+ * + * @param the element type of the {@link SimilarityInput}. + * @param left the first SimilarityInput, must not be null. + * @param right the second SimilarityInput, must not be null. + * @param threshold the target threshold. + * @param n the length of {@code left}. + * @param m the length of {@code right}. + * @param costs int array of length 3: {@code {insertCost, deleteCost, replaceCost}}. + * @return result distance, or {@code -1} if the distance exceeds the threshold. + */ + private static int limitedCompareCustomCost(final SimilarityInput left, + final SimilarityInput right, final int threshold, final int n, final int m, + final int[] costs) { + final int insertCost = costs[0]; + final int deleteCost = costs[1]; + final int replaceCost = costs[2]; + + int[] p = new int[n + 1]; + int[] d = new int[n + 1]; + + for (int i = 0; i <= n; i++) { + p[i] = i * deleteCost; + } + + for (int j = 1; j <= m; j++) { + final E rightJ = right.at(j - 1); + d[0] = j * insertCost; + + for (int i = 1; i <= n; i++) { + if (left.at(i - 1).equals(rightJ)) { + d[i] = p[i - 1]; + } else { + d[i] = Math.min( + Math.min(d[i - 1] + insertCost, p[i] + deleteCost), + p[i - 1] + replaceCost); + } + } + + final int[] tempD = p; + p = d; + d = tempD; + } + + return p[n] <= threshold ? p[n] : -1; } /** * Classic stripe-optimised O(km) limited compare for uniform unit costs. + * + *

* This preserves the original algorithm exactly. + *

+ * + * @param the element type of the {@link SimilarityInput}. + * @param left the first SimilarityInput, must not be null. + * @param right the second SimilarityInput, must not be null. + * @param threshold the target threshold. + * @param n the length of {@code left} (after optional swap). + * @param m the length of {@code right} (after optional swap). + * @return result distance, or {@code -1} if the distance exceeds the threshold. */ - private static int limitedCompareUniformCost(SimilarityInput left, SimilarityInput right, - final int threshold, int n, int m) { + private static int limitedCompareUniformCost(SimilarityInput left, + SimilarityInput right, final int threshold, int n, int m) { if (n > m) { final SimilarityInput tmp = left; @@ -200,77 +379,20 @@ private static int limitedCompareUniformCost(SimilarityInput left, Simila d = tempD; } - if (p[n] <= threshold) { - return p[n]; - } - return -1; + return p[n] <= threshold ? p[n] : -1; } /** - * Full O(nm) limited compare for custom (non-uniform) operation costs. - * Uses two rolling arrays to keep memory at O(min(n,m)). - * - *

- * When {@code deleteCost != insertCost} swapping the strings would change the - * semantics (delete on the original becomes insert on the swapped copy), so - * we always keep left as-is and pay the correct directional cost. - *

- */ - private static int limitedCompareCustomCost(final SimilarityInput left, final SimilarityInput right, - final int threshold, final int insertCost, final int deleteCost, final int replaceCost, - final int n, final int m) { - - // p[i] = cost to convert left[0..i-1] to right[0..j-1] (previous row) - int[] p = new int[n + 1]; - int[] d = new int[n + 1]; - - // Base case: convert left[0..i-1] to empty string via i deletions. - for (int i = 0; i <= n; i++) { - p[i] = i * deleteCost; - } - - for (int j = 1; j <= m; j++) { - final E rightJ = right.at(j - 1); - // Base case: convert empty string to right[0..j-1] via j insertions. - d[0] = j * insertCost; - - for (int i = 1; i <= n; i++) { - if (left.at(i - 1).equals(rightJ)) { - // Characters match ? no operation needed (cost 0). - d[i] = p[i - 1]; - } else { - // Minimum of: delete left[i-1], insert right[j-1], or replace. - d[i] = Math.min( - Math.min(d[i - 1] + insertCost, // insert right[j-1] - p[i] + deleteCost), // delete left[i-1] - p[i - 1] + replaceCost // replace - ); - } - } - - // Swap rows. - final int[] tempD = p; - p = d; - d = tempD; - } - - if (p[n] <= threshold) { - return p[n]; - } - return -1; - } - - /** - * Finds the Levenshtein distance between two Strings using configurable - * insert, delete, and replace costs. + * Finds the Levenshtein distance between two Strings using configurable insert, delete, and + * replace costs. * *

* A higher score indicates a greater distance. *

* *
-     * unlimitedCompare(null, *, *, *, *)             = Throws {@link IllegalArgumentException}
-     * unlimitedCompare(*, null, *, *, *)             = Throws {@link IllegalArgumentException}
+     * unlimitedCompare(null, *, *, *, *)             = throws {@link IllegalArgumentException}
+     * unlimitedCompare(*, null, *, *, *)             = throws {@link IllegalArgumentException}
      * unlimitedCompare("","", 1, 1, 1)               = 0
      * unlimitedCompare("","a", 1, 1, 1)              = 1
      * unlimitedCompare("aaapppp", "", 1, 1, 1)       = 7
@@ -282,13 +404,14 @@ private static  int limitedCompareCustomCost(final SimilarityInput left, f
      * unlimitedCompare("hello", "hallo", 1, 1, 1)    = 1
      * 
* - * @param left the first CharSequence, must not be null. - * @param right the second CharSequence, must not be null. + * @param the element type of the {@link SimilarityInput}. + * @param left the first SimilarityInput, must not be null. + * @param right the second SimilarityInput, must not be null. * @param insertCost the cost of an insertion operation, must not be negative. * @param deleteCost the cost of a deletion operation, must not be negative. * @param replaceCost the cost of a substitution operation, must not be negative. * @return result distance. - * @throws IllegalArgumentException if either CharSequence input is {@code null}. + * @throws IllegalArgumentException if either input is {@code null}. */ private static int unlimitedCompare(SimilarityInput left, SimilarityInput right, final int insertCost, final int deleteCost, final int replaceCost) { @@ -296,8 +419,8 @@ private static int unlimitedCompare(SimilarityInput left, SimilarityInput throw new IllegalArgumentException("CharSequences must not be null"); } - int n = left.length(); // length of left - int m = right.length(); // length of right + int n = left.length(); + int m = right.length(); if (n == 0) { return m * insertCost; @@ -306,10 +429,9 @@ private static int unlimitedCompare(SimilarityInput left, SimilarityInput return n * deleteCost; } - // When costs are symmetric (insert == delete) we can safely swap the - // shorter string into 'left' to minimise the working array size. - // When insert != delete, swapping reverses the semantics of those two - // operations, so we must keep the original orientation. + // When insert == delete costs are symmetric; swapping the shorter string into + // 'left' minimises working-array size without changing semantics. + // When insert != delete, swapping reverses their roles, so we keep the original order. final boolean canSwap = insertCost == deleteCost; if (canSwap && n > m) { final SimilarityInput tmp = left; @@ -319,10 +441,8 @@ private static int unlimitedCompare(SimilarityInput left, SimilarityInput m = right.length(); } - // Single rolling array of length n+1. final int[] p = new int[n + 1]; - // Base case: converting left[0..i-1] ? "" costs i deletions. for (int i = 0; i <= n; i++) { p[i] = i * deleteCost; } @@ -333,21 +453,16 @@ private static int unlimitedCompare(SimilarityInput left, SimilarityInput for (int j = 1; j <= m; j++) { upperLeft = p[0]; final E rightJ = right.at(j - 1); - // Base case: converting "" ? right[0..j-1] costs j insertions. p[0] = j * insertCost; for (int i = 1; i <= n; i++) { upper = p[i]; if (left.at(i - 1).equals(rightJ)) { - // Characters match ? carry diagonal (no cost). p[i] = upperLeft; } else { - // Minimum of insert, delete, or replace. p[i] = Math.min( - Math.min(p[i - 1] + insertCost, // insert right[j-1] - p[i] + deleteCost), // delete left[i-1] - upperLeft + replaceCost // replace - ); + Math.min(p[i - 1] + insertCost, p[i] + deleteCost), + upperLeft + replaceCost); } upperLeft = upper; } @@ -355,108 +470,75 @@ private static int unlimitedCompare(SimilarityInput left, SimilarityInput return p[n]; } - // ------------------------------------------------------------------------- - // Instance state - // ------------------------------------------------------------------------- - /** - * Threshold (nullable). When non-null, {@link #limitedCompare} is used - * instead of {@link #unlimitedCompare}. + * Threshold (nullable). When non-null, {@link #limitedCompare} is used instead of + * {@link #unlimitedCompare}. */ private final Integer threshold; - /** - * Cost of inserting a character into the left sequence. - */ + /** Cost of inserting a character into the left sequence. */ private final int insertCost; - /** - * Cost of deleting a character from the left sequence. - */ + /** Cost of deleting a character from the left sequence. */ private final int deleteCost; - /** - * Cost of substituting one character for another. - */ + /** Cost of substituting one character for another. */ private final int replaceCost; - // ------------------------------------------------------------------------- - // Constructors - // ------------------------------------------------------------------------- - /** - * Constructs a default instance that uses a version of the algorithm that does not use a - * threshold parameter, with all operation costs set to 1. + * Constructs a default instance that uses the unlimited algorithm with all operation costs + * set to 1. * * @see LevenshteinDistance#getDefaultInstance() - * @deprecated Use {@link #getDefaultInstance()}. + * @deprecated Use {@link #getDefaultInstance()} or {@link #builder()}. */ @Deprecated public LevenshteinDistance() { - this(null); + this(builder()); } /** * Constructs a new instance with the given threshold and all operation costs set to 1. * *

- * If the threshold is not null, distance calculations will be limited to a maximum length. If - * the threshold is null, the unlimited version of the algorithm will be used. + * If the threshold is not null, distance calculations will be limited to that maximum value. + * If the threshold is null, the unlimited version of the algorithm will be used. *

* - * @param threshold If this is null then distances calculations will not be limited. - * This may not be negative. + * @param threshold if this is null then distance calculations will not be limited; + * otherwise it must not be negative. + * @deprecated Use {@link #builder()}. */ + @Deprecated public LevenshteinDistance(final Integer threshold) { - this(threshold, DEFAULT_INSERT_COST, DEFAULT_DELETE_COST, DEFAULT_REPLACE_COST); + this(builder().threshold(threshold)); } /** - * Constructs a new instance with the given threshold and custom operation costs. - * - *

- * If the threshold is not null, distance calculations will be limited to a maximum value. - * If the threshold is null, the unlimited version of the algorithm will be used. - *

- * - *

- * All cost parameters must be non-negative integers. Passing 0 for a cost makes that - * operation free; passing values greater than 1 makes it more expensive relative to - * the other operations. - *

+ * Constructs a new {@link LevenshteinDistance} from a {@link Builder}. * - * @param threshold If this is null then distance calculations will not be limited. - * This may not be negative. - * @param insertCost the cost of inserting a character, must not be negative. - * @param deleteCost the cost of deleting a character, must not be negative. - * @param replaceCost the cost of replacing (substituting) a character, must not be negative. - * @throws IllegalArgumentException if threshold is negative, or any cost is negative. - * @since 1.13.0 + * @param builder the builder; must not be null. + * @throws IllegalArgumentException if the threshold is negative, or any cost is negative. */ - public LevenshteinDistance(final Integer threshold, final int insertCost, final int deleteCost, - final int replaceCost) { - if (threshold != null && threshold < 0) { + private LevenshteinDistance(final Builder builder) { + if (builder.threshold != null && builder.threshold < 0) { throw new IllegalArgumentException("Threshold must not be negative"); } - if (insertCost < 0) { + if (builder.insertCost < 0) { throw new IllegalArgumentException("Insert cost must not be negative"); } - if (deleteCost < 0) { + if (builder.deleteCost < 0) { throw new IllegalArgumentException("Delete cost must not be negative"); } - if (replaceCost < 0) { + if (builder.replaceCost < 0) { throw new IllegalArgumentException("Replace cost must not be negative"); } - this.threshold = threshold; - this.insertCost = insertCost; - this.deleteCost = deleteCost; - this.replaceCost = replaceCost; + this.threshold = builder.threshold; + this.insertCost = builder.insertCost; + this.deleteCost = builder.deleteCost; + this.replaceCost = builder.replaceCost; } - // ------------------------------------------------------------------------- - // Public API - // ------------------------------------------------------------------------- - /** * Computes the Levenshtein distance between two Strings. * @@ -465,13 +547,13 @@ public LevenshteinDistance(final Integer threshold, final int insertCost, final *

* *
-     * distance.apply(null, *)             = Throws {@link IllegalArgumentException}
-     * distance.apply(*, null)             = Throws {@link IllegalArgumentException}
+     * distance.apply(null, *)             = throws {@link IllegalArgumentException}
+     * distance.apply(*, null)             = throws {@link IllegalArgumentException}
      * distance.apply("","")               = 0
      * distance.apply("","a")              = insertCost
      * distance.apply("aaapppp", "")       = 7 * deleteCost
-     * distance.apply("frog", "fog")       = 1 * deleteCost (one deletion)
-     * distance.apply("fly", "ant")        = replaceCost + replaceCost + replaceCost
+     * distance.apply("frog", "fog")       = 1 * deleteCost
+     * distance.apply("fly", "ant")        = 3 * replaceCost
      * distance.apply("elephant", "hippo") = 7  (with default costs)
      * distance.apply("hippo", "elephant") = 7  (with default costs)
      * distance.apply("hello", "hallo")    = 1  (with default costs)
@@ -479,7 +561,7 @@ public LevenshteinDistance(final Integer threshold, final int insertCost, final
      *
      * @param left  the first input, must not be null.
      * @param right the second input, must not be null.
-     * @return result distance, or -1 if a threshold is set and the distance exceeds it.
+     * @return result distance, or {@code -1} if a threshold is set and the distance exceeds it.
      * @throws IllegalArgumentException if either String input is {@code null}.
      */
     @Override
@@ -488,16 +570,16 @@ public Integer apply(final CharSequence left, final CharSequence right) {
     }
 
     /**
-     * Computes the Levenshtein distance between two inputs.
+     * Computes the Levenshtein distance between two {@link SimilarityInput} instances.
      *
      * 

* A higher score indicates a greater distance. *

* - * @param The type of similarity score unit. + * @param the type of element compared by the similarity score. * @param left the first input, must not be null. * @param right the second input, must not be null. - * @return result distance, or -1 if a threshold is set and the distance exceeds it. + * @return result distance, or {@code -1} if a threshold is set and the distance exceeds it. * @throws IllegalArgumentException if either input is {@code null}. * @since 1.13.0 */ @@ -508,23 +590,20 @@ public Integer apply(final SimilarityInput left, final SimilarityInput return unlimitedCompare(left, right, insertCost, deleteCost, replaceCost); } - // ------------------------------------------------------------------------- - // Accessors - // ------------------------------------------------------------------------- - /** - * Gets the distance threshold. + * Gets the cost of a deletion operation. * - * @return The distance threshold, or {@code null} if no threshold is set. + * @return the deletion cost. + * @since 1.13.0 */ - public Integer getThreshold() { - return threshold; + public int getDeleteCost() { + return deleteCost; } /** * Gets the cost of an insertion operation. * - * @return The insertion cost. + * @return the insertion cost. * @since 1.13.0 */ public int getInsertCost() { @@ -532,22 +611,21 @@ public int getInsertCost() { } /** - * Gets the cost of a deletion operation. + * Gets the cost of a substitution (replace) operation. * - * @return The deletion cost. + * @return the replacement cost. * @since 1.13.0 */ - public int getDeleteCost() { - return deleteCost; + public int getReplaceCost() { + return replaceCost; } /** - * Gets the cost of a substitution (replace) operation. + * Gets the distance threshold. * - * @return The replacement cost. - * @since 1.13.0 + * @return the distance threshold, or {@code null} if no threshold is set. */ - public int getReplaceCost() { - return replaceCost; + public Integer getThreshold() { + return threshold; } -} \ No newline at end of file +} diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java index ecf24ceea8..bdddd6ac52 100644 --- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * - * https://www.apache.org/licenses/LICENSE-2.0 + * https://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -181,58 +181,53 @@ void testGetThresholdDirectlyAfterObjectInstantiation() { assertNull(LevenshteinDistance.getDefaultInstance().getThreshold()); } - // ------------------------------------------------------------------------- - // New Weighted Levenshtein Distance Tests - // ------------------------------------------------------------------------- - @Test void testConstructorWithNegativeCosts() { - assertThrows(IllegalArgumentException.class, () -> new LevenshteinDistance(null, -1, 1, 1)); - assertThrows(IllegalArgumentException.class, () -> new LevenshteinDistance(null, 1, -1, 1)); - assertThrows(IllegalArgumentException.class, () -> new LevenshteinDistance(null, 1, 1, -1)); + assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().insertCost(-1).build()); + assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().deleteCost(-1).build()); + assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().replaceCost(-1).build()); } @Test void testGetLevenshteinDistance_WeightedUnlimited() { // Substitution is very expensive (10) vs Insert/Delete (1 each) - final LevenshteinDistance dist = new LevenshteinDistance(null, 1, 1, 10); + final LevenshteinDistance dist = LevenshteinDistance.builder().insertCost(1).deleteCost(1).replaceCost(10).build(); // 'a' -> 'b' should choose delete 'a' (1) and insert 'b' (1) = distance 2, // instead of replace (10). assertEquals(2, dist.apply("a", "b")); // All operations are free (0) - final LevenshteinDistance freeDist = new LevenshteinDistance(null, 0, 0, 0); + final LevenshteinDistance freeDist = LevenshteinDistance.builder().insertCost(0).deleteCost(0).replaceCost(0).build(); assertEquals(0, freeDist.apply("abc", "def")); // Asymmetric costs: Insert=10, Delete=1, Replace=100 - final LevenshteinDistance asymmetric = new LevenshteinDistance(null, 10, 1, 100); - assertEquals(1, asymmetric.apply("a", "")); // Delete 'a' = 1 - assertEquals(10, asymmetric.apply("", "a")); // Insert 'a' = 10 + final LevenshteinDistance asymmetric = LevenshteinDistance.builder().insertCost(10).deleteCost(1).replaceCost(100).build(); + assertEquals(1, asymmetric.apply("a", "")); // Delete 'a' = 1 + assertEquals(10, asymmetric.apply("", "a")); // Insert 'a' = 10 } @Test void testGetLevenshteinDistance_WeightedThreshold() { // Distance is 2 (via delete/insert), threshold is 5 -> result 2 - final LevenshteinDistance weighted = new LevenshteinDistance(5, 1, 1, 10); + final LevenshteinDistance weighted = LevenshteinDistance.builder().threshold(5).insertCost(1).deleteCost(1).replaceCost(10).build(); assertEquals(2, weighted.apply("a", "b")); // Distance is 2, threshold is 1 -> result -1 - final LevenshteinDistance strict = new LevenshteinDistance(1, 1, 1, 10); + final LevenshteinDistance strict = LevenshteinDistance.builder().threshold(1).insertCost(1).deleteCost(1).replaceCost(10).build(); assertEquals(-1, strict.apply("a", "b")); // Empty strings with weighted threshold - assertEquals(0, new LevenshteinDistance(5, 2, 2, 2).apply("", "")); - assertEquals(4, new LevenshteinDistance(5, 2, 2, 2).apply("aa", "")); - assertEquals(-1, new LevenshteinDistance(1, 2, 2, 2).apply("aa", "")); + assertEquals(0, LevenshteinDistance.builder().threshold(5).insertCost(2).deleteCost(2).replaceCost(2).build().apply("", "")); + assertEquals(4, LevenshteinDistance.builder().threshold(5).insertCost(2).deleteCost(2).replaceCost(2).build().apply("aa", "")); + assertEquals(-1, LevenshteinDistance.builder().threshold(1).insertCost(2).deleteCost(2).replaceCost(2).build().apply("aa", "")); } @Test void testWeightedAccessors() { - final LevenshteinDistance dist = new LevenshteinDistance(10, 2, 3, 4); + final LevenshteinDistance dist = LevenshteinDistance.builder().threshold(10).insertCost(2).deleteCost(3).replaceCost(4).build(); assertEquals(10, dist.getThreshold()); assertEquals(2, dist.getInsertCost()); assertEquals(3, dist.getDeleteCost()); assertEquals(4, dist.getReplaceCost()); } - -} \ No newline at end of file +} From 39c918d50e79fcaa28bc2d676614604ae3273063 Mon Sep 17 00:00:00 2001 From: Ron Ladin Date: Sun, 15 Mar 2026 21:13:49 +0200 Subject: [PATCH 3/3] Apply PR feedback: Update @since tags to 1.16.0 and use 'set' prefix for builder methods --- .../text/similarity/LevenshteinDistance.java | 38 +++++++++---------- .../similarity/LevenshteinDistanceTest.java | 24 ++++++------ 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java index ea7f5aa890..7e05e68cc4 100644 --- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java @@ -38,10 +38,10 @@ * *
  * LevenshteinDistance dist = LevenshteinDistance.builder()
- *     .threshold(10)
- *     .insertCost(1)
- *     .deleteCost(2)
- *     .replaceCost(3)
+ *     .setThreshold(10)
+ *     .setInsertCost(1)
+ *     .setDeleteCost(2)
+ *     .setReplaceCost(3)
  *     .build();
  * 
* @@ -64,14 +64,14 @@ public class LevenshteinDistance implements EditDistance { * *
      * LevenshteinDistance dist = LevenshteinDistance.builder()
-     *     .threshold(5)
-     *     .insertCost(1)
-     *     .deleteCost(1)
-     *     .replaceCost(2)
+     *     .setThreshold(5)
+     *     .setInsertCost(1)
+     *     .setDeleteCost(1)
+     *     .setReplaceCost(2)
      *     .build();
      * 
* - * @since 1.13.0 + * @since 1.16.0 */ public static final class Builder { @@ -115,7 +115,7 @@ public LevenshteinDistance build() { * @param deleteCost the cost of deleting a character; must not be negative. * @return {@code this} builder. */ - public Builder deleteCost(final int deleteCost) { + public Builder setDeleteCost(final int deleteCost) { this.deleteCost = deleteCost; return this; } @@ -126,7 +126,7 @@ public Builder deleteCost(final int deleteCost) { * @param insertCost the cost of inserting a character; must not be negative. * @return {@code this} builder. */ - public Builder insertCost(final int insertCost) { + public Builder setInsertCost(final int insertCost) { this.insertCost = insertCost; return this; } @@ -137,7 +137,7 @@ public Builder insertCost(final int insertCost) { * @param replaceCost the cost of replacing a character; must not be negative. * @return {@code this} builder. */ - public Builder replaceCost(final int replaceCost) { + public Builder setReplaceCost(final int replaceCost) { this.replaceCost = replaceCost; return this; } @@ -154,7 +154,7 @@ public Builder replaceCost(final int replaceCost) { * for no limit. * @return {@code this} builder. */ - public Builder threshold(final Integer threshold) { + public Builder setThreshold(final Integer threshold) { this.threshold = threshold; return this; } @@ -169,7 +169,7 @@ public Builder threshold(final Integer threshold) { * Returns a new {@link Builder} for constructing {@link LevenshteinDistance} instances. * * @return a new {@link Builder}. - * @since 1.13.0 + * @since 1.16.0 */ public static Builder builder() { return new Builder(); @@ -511,7 +511,7 @@ public LevenshteinDistance() { */ @Deprecated public LevenshteinDistance(final Integer threshold) { - this(builder().threshold(threshold)); + this(builder().setThreshold(threshold)); } /** @@ -581,7 +581,7 @@ public Integer apply(final CharSequence left, final CharSequence right) { * @param right the second input, must not be null. * @return result distance, or {@code -1} if a threshold is set and the distance exceeds it. * @throws IllegalArgumentException if either input is {@code null}. - * @since 1.13.0 + * @since 1.16.0 */ public Integer apply(final SimilarityInput left, final SimilarityInput right) { if (threshold != null) { @@ -594,7 +594,7 @@ public Integer apply(final SimilarityInput left, final SimilarityInput * Gets the cost of a deletion operation. * * @return the deletion cost. - * @since 1.13.0 + * @since 1.16.0 */ public int getDeleteCost() { return deleteCost; @@ -604,7 +604,7 @@ public int getDeleteCost() { * Gets the cost of an insertion operation. * * @return the insertion cost. - * @since 1.13.0 + * @since 1.16.0 */ public int getInsertCost() { return insertCost; @@ -614,7 +614,7 @@ public int getInsertCost() { * Gets the cost of a substitution (replace) operation. * * @return the replacement cost. - * @since 1.13.0 + * @since 1.16.0 */ public int getReplaceCost() { return replaceCost; diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java index bdddd6ac52..ed8b38454a 100644 --- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java @@ -183,25 +183,25 @@ void testGetThresholdDirectlyAfterObjectInstantiation() { @Test void testConstructorWithNegativeCosts() { - assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().insertCost(-1).build()); - assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().deleteCost(-1).build()); - assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().replaceCost(-1).build()); + assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().setInsertCost(-1).build()); + assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().setDeleteCost(-1).build()); + assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().setReplaceCost(-1).build()); } @Test void testGetLevenshteinDistance_WeightedUnlimited() { // Substitution is very expensive (10) vs Insert/Delete (1 each) - final LevenshteinDistance dist = LevenshteinDistance.builder().insertCost(1).deleteCost(1).replaceCost(10).build(); + final LevenshteinDistance dist = LevenshteinDistance.builder().setInsertCost(1).setDeleteCost(1).setReplaceCost(10).build(); // 'a' -> 'b' should choose delete 'a' (1) and insert 'b' (1) = distance 2, // instead of replace (10). assertEquals(2, dist.apply("a", "b")); // All operations are free (0) - final LevenshteinDistance freeDist = LevenshteinDistance.builder().insertCost(0).deleteCost(0).replaceCost(0).build(); + final LevenshteinDistance freeDist = LevenshteinDistance.builder().setInsertCost(0).setDeleteCost(0).setReplaceCost(0).build(); assertEquals(0, freeDist.apply("abc", "def")); // Asymmetric costs: Insert=10, Delete=1, Replace=100 - final LevenshteinDistance asymmetric = LevenshteinDistance.builder().insertCost(10).deleteCost(1).replaceCost(100).build(); + final LevenshteinDistance asymmetric = LevenshteinDistance.builder().setInsertCost(10).setDeleteCost(1).setReplaceCost(100).build(); assertEquals(1, asymmetric.apply("a", "")); // Delete 'a' = 1 assertEquals(10, asymmetric.apply("", "a")); // Insert 'a' = 10 } @@ -209,22 +209,22 @@ void testGetLevenshteinDistance_WeightedUnlimited() { @Test void testGetLevenshteinDistance_WeightedThreshold() { // Distance is 2 (via delete/insert), threshold is 5 -> result 2 - final LevenshteinDistance weighted = LevenshteinDistance.builder().threshold(5).insertCost(1).deleteCost(1).replaceCost(10).build(); + final LevenshteinDistance weighted = LevenshteinDistance.builder().setThreshold(5).setInsertCost(1).setDeleteCost(1).setReplaceCost(10).build(); assertEquals(2, weighted.apply("a", "b")); // Distance is 2, threshold is 1 -> result -1 - final LevenshteinDistance strict = LevenshteinDistance.builder().threshold(1).insertCost(1).deleteCost(1).replaceCost(10).build(); + final LevenshteinDistance strict = LevenshteinDistance.builder().setThreshold(1).setInsertCost(1).setDeleteCost(1).setReplaceCost(10).build(); assertEquals(-1, strict.apply("a", "b")); // Empty strings with weighted threshold - assertEquals(0, LevenshteinDistance.builder().threshold(5).insertCost(2).deleteCost(2).replaceCost(2).build().apply("", "")); - assertEquals(4, LevenshteinDistance.builder().threshold(5).insertCost(2).deleteCost(2).replaceCost(2).build().apply("aa", "")); - assertEquals(-1, LevenshteinDistance.builder().threshold(1).insertCost(2).deleteCost(2).replaceCost(2).build().apply("aa", "")); + assertEquals(0, LevenshteinDistance.builder().setThreshold(5).setInsertCost(2).setDeleteCost(2).setReplaceCost(2).build().apply("", "")); + assertEquals(4, LevenshteinDistance.builder().setThreshold(5).setInsertCost(2).setDeleteCost(2).setReplaceCost(2).build().apply("aa", "")); + assertEquals(-1, LevenshteinDistance.builder().setThreshold(1).setInsertCost(2).setDeleteCost(2).setReplaceCost(2).build().apply("aa", "")); } @Test void testWeightedAccessors() { - final LevenshteinDistance dist = LevenshteinDistance.builder().threshold(10).insertCost(2).deleteCost(3).replaceCost(4).build(); + final LevenshteinDistance dist = LevenshteinDistance.builder().setThreshold(10).setInsertCost(2).setDeleteCost(3).setReplaceCost(4).build(); assertEquals(10, dist.getThreshold()); assertEquals(2, dist.getInsertCost()); assertEquals(3, dist.getDeleteCost());