diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java index 479b3fadea..7e05e68cc4 100644 --- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java @@ -19,13 +19,32 @@ import java.util.Arrays; /** - * An algorithm for measuring the difference between two character sequences using the Levenshtein - * Distance. + * An algorithm for measuring the difference between two character sequences using the + * Levenshtein Distance. * *

- * This is the number of changes needed to change one sequence into another, where each change is a single character modification (deletion, insertion or - * substitution). + * This is the number of changes needed to change one sequence into another, where each change is a + * single character modification (deletion, insertion or substitution). *

+ * + *

+ * This implementation supports configurable costs for insertion, deletion, and substitution + * operations. By default, all costs are set to 1 for backward compatibility. + *

+ * + *

+ * Use {@link Builder} to construct instances with custom thresholds and operation costs: + *

+ * + *
+ * LevenshteinDistance dist = LevenshteinDistance.builder()
+ *     .setThreshold(10)
+ *     .setInsertCost(1)
+ *     .setDeleteCost(2)
+ *     .setReplaceCost(3)
+ *     .build();
+ * 
+ * *

* This code has been adapted from Apache Commons Lang 3.3. *

@@ -37,92 +56,278 @@ public class LevenshteinDistance implements EditDistance { /** - * The singleton instance. + * Builds {@link LevenshteinDistance} instances. + * + *

+ * All costs default to 1. The threshold defaults to {@code null} (unlimited). + *

+ * + *
+     * LevenshteinDistance dist = LevenshteinDistance.builder()
+     *     .setThreshold(5)
+     *     .setInsertCost(1)
+     *     .setDeleteCost(1)
+     *     .setReplaceCost(2)
+     *     .build();
+     * 
+ * + * @since 1.16.0 + */ + public static final class Builder { + + /** + * Default cost for any single edit operation. + */ + private static final int DEFAULT_COST = 1; + + /** Threshold for limited compare, or {@code null} for unlimited. */ + private Integer threshold; + + /** Cost of inserting a character. */ + private int insertCost = DEFAULT_COST; + + /** Cost of deleting a character. */ + private int deleteCost = DEFAULT_COST; + + /** Cost of substituting one character for another. */ + private int replaceCost = DEFAULT_COST; + + /** + * Constructs a new builder with default values. + */ + private Builder() { + // use LevenshteinDistance.builder() factory method + } + + /** + * Builds a new {@link LevenshteinDistance} from the current state of this builder. + * + * @return a new {@link LevenshteinDistance}. + * @throws IllegalArgumentException if the threshold is negative, or any cost is negative. + */ + public LevenshteinDistance build() { + return new LevenshteinDistance(this); + } + + /** + * Sets the cost of a deletion operation. + * + * @param deleteCost the cost of deleting a character; must not be negative. + * @return {@code this} builder. + */ + public Builder setDeleteCost(final int deleteCost) { + this.deleteCost = deleteCost; + return this; + } + + /** + * Sets the cost of an insertion operation. + * + * @param insertCost the cost of inserting a character; must not be negative. + * @return {@code this} builder. + */ + public Builder setInsertCost(final int insertCost) { + this.insertCost = insertCost; + return this; + } + + /** + * Sets the cost of a substitution (replace) operation. + * + * @param replaceCost the cost of replacing a character; must not be negative. + * @return {@code this} builder. + */ + public Builder setReplaceCost(final int replaceCost) { + this.replaceCost = replaceCost; + return this; + } + + /** + * Sets the threshold for limited distance calculation. + * + *

+ * When set, {@link LevenshteinDistance#apply} returns {@code -1} if the computed + * distance exceeds this value. When {@code null}, the unlimited algorithm is used. + *

+ * + * @param threshold the maximum distance to report; must not be negative, or {@code null} + * for no limit. + * @return {@code this} builder. + */ + public Builder setThreshold(final Integer threshold) { + this.threshold = threshold; + return this; + } + } + + /** + * The singleton instance (uses default costs and no threshold). */ private static final LevenshteinDistance INSTANCE = new LevenshteinDistance(); /** - * Gets the default instance. + * Returns a new {@link Builder} for constructing {@link LevenshteinDistance} instances. * - * @return The default instance. + * @return a new {@link Builder}. + * @since 1.16.0 + */ + public static Builder builder() { + return new Builder(); + } + + /** + * Gets the default instance, which uses no threshold and all operation costs set to 1. + * + * @return the default instance. */ public static LevenshteinDistance getDefaultInstance() { return INSTANCE; } /** - * Finds the Levenshtein distance between two CharSequences if it's less than or equal to a given threshold. + * Finds the Levenshtein distance between two CharSequences if it is less than or equal to a + * given threshold, using configurable costs for insert, delete, and replace operations. * *

- * This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield and Chas Emerick's implementation of the Levenshtein distance - * algorithm. + * This implementation follows from Algorithms on Strings, Trees and Sequences by + * Dan Gusfield and Chas Emerick's implementation of the Levenshtein distance algorithm. + *

+ * + *

+ * Note: The stripe-width optimisation used in the unit-cost case relies on the assumption that + * each operation costs exactly 1. When custom costs are supplied the stripe cannot be reliably + * bounded to {@code 2*threshold+1}, so the full O(nm) DP table is used instead, returning + * {@code -1} only when the final distance exceeds the threshold. *

* *
-     * limitedCompare(null, *, *)             = Throws {@link IllegalArgumentException}
-     * limitedCompare(*, null, *)             = Throws {@link IllegalArgumentException}
-     * limitedCompare(*, *, -1)               = Throws {@link IllegalArgumentException}
-     * limitedCompare("","", 0)               = 0
-     * limitedCompare("aaapppp", "", 8)       = 7
-     * limitedCompare("aaapppp", "", 7)       = 7
-     * limitedCompare("aaapppp", "", 6))      = -1
-     * limitedCompare("elephant", "hippo", 7) = 7
-     * limitedCompare("elephant", "hippo", 6) = -1
-     * limitedCompare("hippo", "elephant", 7) = 7
-     * limitedCompare("hippo", "elephant", 6) = -1
+     * limitedCompare(null, *, *, *, *, *)             = throws {@link IllegalArgumentException}
+     * limitedCompare(*, null, *, *, *, *)             = throws {@link IllegalArgumentException}
+     * limitedCompare(*, *, -1, *, *, *)               = throws {@link IllegalArgumentException}
+     * limitedCompare("","", 0, 1, 1, 1)               = 0
+     * limitedCompare("aaapppp", "", 8, 1, 1, 1)       = 7
+     * limitedCompare("aaapppp", "", 7, 1, 1, 1)       = 7
+     * limitedCompare("aaapppp", "", 6, 1, 1, 1)       = -1
+     * limitedCompare("elephant", "hippo", 7, 1, 1, 1) = 7
+     * limitedCompare("elephant", "hippo", 6, 1, 1, 1) = -1
+     * limitedCompare("hippo", "elephant", 7, 1, 1, 1) = 7
+     * limitedCompare("hippo", "elephant", 6, 1, 1, 1) = -1
      * 
* - * @param left the first SimilarityInput, must not be null. - * @param right the second SimilarityInput, must not be null. - * @param threshold the target threshold, must not be negative. - * @return result distance, or -1 + * @param the element type of the {@link SimilarityInput}. + * @param left the first SimilarityInput, must not be null. + * @param right the second SimilarityInput, must not be null. + * @param threshold the target threshold, must not be negative. + * @param insertCost the cost of an insertion operation, must not be negative. + * @param deleteCost the cost of a deletion operation, must not be negative. + * @param replaceCost the cost of a substitution operation, must not be negative. + * @return result distance, or {@code -1} if the distance exceeds the threshold. */ - private static int limitedCompare(SimilarityInput left, SimilarityInput right, final int threshold) { // NOPMD + private static int limitedCompare(SimilarityInput left, SimilarityInput right, // NOPMD + final int threshold, final int insertCost, final int deleteCost, final int replaceCost) { if (left == null || right == null) { throw new IllegalArgumentException("CharSequences must not be null"); } + if (threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } - /* - * This implementation only computes the distance if it's less than or equal to the threshold value, returning -1 if it's greater. The advantage is - * performance: unbounded distance is O(nm), but a bound of k allows us to reduce it to O(km) time by only computing a diagonal stripe of width 2k + 1 - * of the cost table. It is also possible to use this to compute the unbounded Levenshtein distance by starting the threshold at 1 and doubling each - * time until the distance is found; this is O(dm), where d is the distance. 
- * - * One subtlety comes from needing to ignore entries on the border of our stripe, for example, - * p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry to the left - * of the leftmost member We must ignore the entry above the rightmost member - * - * Another subtlety comes from our stripe running off the matrix if the strings aren't of the same size. Since string s is always swapped to be the - * shorter of the two, the stripe will always run off to the upper right instead of the lower left of the matrix. - * - * As a concrete example, suppose s is of length 5, t is of length 7, and our threshold is 1. In this case we're going to walk a stripe of length 3. The - * matrix would look like so: - * - *
 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 | | | |#|#| 6 | | | | |#| 7 | | | | | | 
- * - * Note how the stripe leads off the table as there is no possible way to turn a string of length 5 into one of length 7 in edit distance of 1. - * - * Additionally, this implementation decreases memory usage by using two single-dimensional arrays and swapping them back and forth instead of - * allocating an entire n by m matrix. This requires a few minor changes, such as immediately returning when it's detected that the stripe has run off - * the matrix and initially filling the arrays with large values so that entries we don't compute are ignored. - * - * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for some discussion. - */ - - int n = left.length(); // length of left - int m = right.length(); // length of right + final int n = left.length(); + final int m = right.length(); - // if one string is empty, the edit distance is necessarily the length - // of the other if (n == 0) { - return m <= threshold ? m : -1; + final int dist = m * insertCost; + return dist <= threshold ? dist : -1; } if (m == 0) { - return n <= threshold ? n : -1; + final int dist = n * deleteCost; + return dist <= threshold ? dist : -1; + } + + if (insertCost == 1 && deleteCost == 1 && replaceCost == 1) { + return limitedCompareUniformCost(left, right, threshold, n, m); + } + return limitedCompareCustomCost(left, right, threshold, n, m, + new int[] {insertCost, deleteCost, replaceCost}); + } + + /** + * Full O(nm) limited compare for custom (non-uniform) operation costs. + * + *

+ * Uses two rolling arrays of length {@code n + 1}, keeping memory at O(n). + *

+ * + *

+ * When {@code deleteCost != insertCost} swapping the strings would change the semantics + * (delete on the original becomes insert on the swapped copy), so the orientation is always + * kept as-is and the correct directional cost is applied. + *

+ * + * @param the element type of the {@link SimilarityInput}. + * @param left the first SimilarityInput, must not be null. + * @param right the second SimilarityInput, must not be null. + * @param threshold the target threshold. + * @param n the length of {@code left}. + * @param m the length of {@code right}. + * @param costs int array of length 3: {@code {insertCost, deleteCost, replaceCost}}. + * @return result distance, or {@code -1} if the distance exceeds the threshold. + */ + private static int limitedCompareCustomCost(final SimilarityInput left, + final SimilarityInput right, final int threshold, final int n, final int m, + final int[] costs) { + final int insertCost = costs[0]; + final int deleteCost = costs[1]; + final int replaceCost = costs[2]; + + int[] p = new int[n + 1]; + int[] d = new int[n + 1]; + + for (int i = 0; i <= n; i++) { + p[i] = i * deleteCost; } + for (int j = 1; j <= m; j++) { + final E rightJ = right.at(j - 1); + d[0] = j * insertCost; + + for (int i = 1; i <= n; i++) { + if (left.at(i - 1).equals(rightJ)) { + d[i] = p[i - 1]; + } else { + // d[i - 1] is cell (i-1, j): reaching (i, j) drops left[i-1], a deletion. + // p[i] is cell (i, j-1): reaching (i, j) emits right[j-1], an insertion. + d[i] = Math.min( + Math.min(d[i - 1] + deleteCost, p[i] + insertCost), + p[i - 1] + replaceCost); + } + } + + final int[] tempD = p; + p = d; + d = tempD; + } + + return p[n] <= threshold ? p[n] : -1; + } + + /** + * Classic stripe-optimised O(km) limited compare for uniform unit costs. + * + *

+ * This preserves the original algorithm exactly. + *

+ * + * @param the element type of the {@link SimilarityInput}. + * @param left the first SimilarityInput, must not be null. + * @param right the second SimilarityInput, must not be null. + * @param threshold the target threshold. + * @param n the length of {@code left} (after optional swap). + * @param m the length of {@code right} (after optional swap). + * @return result distance, or {@code -1} if the distance exceeds the threshold. + */ + private static int limitedCompareUniformCost(SimilarityInput left, + SimilarityInput right, final int threshold, int n, int m) { + if (n > m) { - // swap the two strings to consume less memory final SimilarityInput tmp = left; left = right; right = tmp; @@ -130,147 +335,135 @@ private static int limitedCompare(SimilarityInput left, SimilarityInput threshold) { return -1; } - int[] p = new int[n + 1]; // 'previous' cost array, horizontally - int[] d = new int[n + 1]; // cost array, horizontally - int[] tempD; // placeholder to assist in swapping p and d + int[] p = new int[n + 1]; + int[] d = new int[n + 1]; + int[] tempD; - // fill in starting table values final int boundary = Math.min(n, threshold) + 1; for (int i = 0; i < boundary; i++) { p[i] = i; } - // these fills ensure that the value above the rightmost entry of our - // stripe will be ignored in following loop iterations Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); Arrays.fill(d, Integer.MAX_VALUE); - // iterates through t for (int j = 1; j <= m; j++) { - final E rightJ = right.at(j - 1); // jth character of right + final E rightJ = right.at(j - 1); d[0] = j; - // compute stripe indices, constrain to array size final int min = Math.max(1, j - threshold); final int max = j > Integer.MAX_VALUE - threshold ? 
n : Math.min(n, j + threshold); - // ignore entry left of leftmost if (min > 1) { d[min - 1] = Integer.MAX_VALUE; } int lowerBound = Integer.MAX_VALUE; - // iterates through [min, max] in s for (int i = min; i <= max; i++) { if (left.at(i - 1).equals(rightJ)) { - // diagonally left and up d[i] = p[i - 1]; } else { - // 1 + minimum of cell to the left, to the top, diagonally - // left and up d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); } lowerBound = Math.min(lowerBound, d[i]); } - // if the lower bound is greater than the threshold, then exit early if (lowerBound > threshold) { return -1; } - // copy current distance counts to 'previous row' distance counts tempD = p; p = d; d = tempD; } - // if p[n] is greater than the threshold, there's no guarantee on it - // being the correct - // distance - if (p[n] <= threshold) { - return p[n]; - } - return -1; + return p[n] <= threshold ? p[n] : -1; } /** - * Finds the Levenshtein distance between two Strings. + * Finds the Levenshtein distance between two Strings using configurable insert, delete, and + * replace costs. * *

* A higher score indicates a greater distance. *

* - *

- * This implementation only need one single-dimensional arrays of length s.length() + 1 - *

- * *
-     * unlimitedCompare(null, *)             = Throws {@link IllegalArgumentException}
-     * unlimitedCompare(*, null)             = Throws {@link IllegalArgumentException}
-     * unlimitedCompare("","")               = 0
-     * unlimitedCompare("","a")              = 1
-     * unlimitedCompare("aaapppp", "")       = 7
-     * unlimitedCompare("frog", "fog")       = 1
-     * unlimitedCompare("fly", "ant")        = 3
-     * unlimitedCompare("elephant", "hippo") = 7
-     * unlimitedCompare("hippo", "elephant") = 7
-     * unlimitedCompare("hippo", "zzzzzzzz") = 8
-     * unlimitedCompare("hello", "hallo")    = 1
+     * unlimitedCompare(null, *, *, *, *)             = throws {@link IllegalArgumentException}
+     * unlimitedCompare(*, null, *, *, *)             = throws {@link IllegalArgumentException}
+     * unlimitedCompare("","", 1, 1, 1)               = 0
+     * unlimitedCompare("","a", 1, 1, 1)              = 1
+     * unlimitedCompare("aaapppp", "", 1, 1, 1)       = 7
+     * unlimitedCompare("frog", "fog", 1, 1, 1)       = 1
+     * unlimitedCompare("fly", "ant", 1, 1, 1)        = 3
+     * unlimitedCompare("elephant", "hippo", 1, 1, 1) = 7
+     * unlimitedCompare("hippo", "elephant", 1, 1, 1) = 7
+     * unlimitedCompare("hippo", "zzzzzzzz", 1, 1, 1) = 8
+     * unlimitedCompare("hello", "hallo", 1, 1, 1)    = 1
      * 
* - * @param left the first CharSequence, must not be null. - * @param right the second CharSequence, must not be null. - * @return result distance, or -1. - * @throws IllegalArgumentException if either CharSequence input is {@code null}. + * @param the element type of the {@link SimilarityInput}. + * @param left the first SimilarityInput, must not be null. + * @param right the second SimilarityInput, must not be null. + * @param insertCost the cost of an insertion operation, must not be negative. + * @param deleteCost the cost of a deletion operation, must not be negative. + * @param replaceCost the cost of a substitution operation, must not be negative. + * @return result distance. + * @throws IllegalArgumentException if either input is {@code null}. */ - private static int unlimitedCompare(SimilarityInput left, SimilarityInput right) { + private static int unlimitedCompare(SimilarityInput left, SimilarityInput right, + final int insertCost, final int deleteCost, final int replaceCost) { if (left == null || right == null) { throw new IllegalArgumentException("CharSequences must not be null"); } - /* - * This implementation use two variable to record the previous cost counts, So this implementation use less memory than previous impl. - */ - int n = left.length(); // length of left - int m = right.length(); // length of right + + int n = left.length(); + int m = right.length(); if (n == 0) { - return m; + return m * insertCost; } if (m == 0) { - return n; + return n * deleteCost; } - if (n > m) { - // swap the input strings to consume less memory + + // When insert == delete costs are symmetric; swapping the shorter string into + // 'left' minimises working-array size without changing semantics. + // When insert != delete, swapping reverses their roles, so we keep the original order. 
+ final boolean canSwap = insertCost == deleteCost; + if (canSwap && n > m) { final SimilarityInput tmp = left; left = right; right = tmp; n = m; m = right.length(); } + final int[] p = new int[n + 1]; - // indexes into strings left and right - int i; // iterates through left - int j; // iterates through right + + for (int i = 0; i <= n; i++) { + p[i] = i * deleteCost; + } + int upperLeft; int upper; - E rightJ; // jth character of right - int cost; // cost - for (i = 0; i <= n; i++) { - p[i] = i; - } - for (j = 1; j <= m; j++) { + + for (int j = 1; j <= m; j++) { upperLeft = p[0]; - rightJ = right.at(j - 1); - p[0] = j; + final E rightJ = right.at(j - 1); + p[0] = j * insertCost; - for (i = 1; i <= n; i++) { + for (int i = 1; i <= n; i++) { upper = p[i]; - cost = left.at(i - 1).equals(rightJ) ? 0 : 1; - // minimum of cell to the left+1, to the top+1, diagonally left and up +cost - p[i] = Math.min(Math.min(p[i - 1] + 1, p[i] + 1), upperLeft + cost); + if (left.at(i - 1).equals(rightJ)) { + p[i] = upperLeft; + } else { + // p[i - 1] already holds row j, i.e. cell (i-1, j): dropping left[i-1] is a deletion. + // p[i] still holds row j-1, i.e. cell (i, j-1): emitting right[j-1] is an insertion. + p[i] = Math.min( + Math.min(p[i - 1] + deleteCost, p[i] + insertCost), + upperLeft + replaceCost); + } upperLeft = upper; } } @@ -278,32 +471,72 @@ private static int unlimitedCompare(SimilarityInput left, SimilarityInput } /** - * Threshold. + * Threshold (nullable). When non-null, {@link #limitedCompare} is used instead of + * {@link #unlimitedCompare}. */ private final Integer threshold; + /** Cost of inserting a character into the left sequence. */ + private final int insertCost; + + /** Cost of deleting a character from the left sequence. */ + private final int deleteCost; + + /** Cost of substituting one character for another. */ + private final int replaceCost; + /** - * Constructs a default instance that uses a version of the algorithm that does not use a threshold parameter. + * Constructs a default instance that uses the unlimited algorithm with all operation costs + * set to 1. 
* * @see LevenshteinDistance#getDefaultInstance() - * @deprecated Use {@link #getDefaultInstance()}. + * @deprecated Use {@link #getDefaultInstance()} or {@link #builder()}. */ @Deprecated public LevenshteinDistance() { - this(null); + this(builder()); } /** - * Constructs a new instance. If the threshold is not null, distance calculations will be limited to a maximum length. If the threshold is null, the - * unlimited version of the algorithm will be used. + * Constructs a new instance with the given threshold and all operation costs set to 1. * - * @param threshold If this is null then distances calculations will not be limited. This may not be negative. + *

+ * If the threshold is not null, distance calculations will be limited to that maximum value. + * If the threshold is null, the unlimited version of the algorithm will be used. + *

+ * + * @param threshold if this is null then distance calculations will not be limited; + * otherwise it must not be negative. + * @deprecated Use {@link #builder()}. */ + @Deprecated public LevenshteinDistance(final Integer threshold) { - if (threshold != null && threshold < 0) { + this(builder().setThreshold(threshold)); + } + + /** + * Constructs a new {@link LevenshteinDistance} from a {@link Builder}. + * + * @param builder the builder; must not be null. + * @throws IllegalArgumentException if the threshold is negative, or any cost is negative. + */ + private LevenshteinDistance(final Builder builder) { + if (builder.threshold != null && builder.threshold < 0) { throw new IllegalArgumentException("Threshold must not be negative"); } - this.threshold = threshold; + if (builder.insertCost < 0) { + throw new IllegalArgumentException("Insert cost must not be negative"); + } + if (builder.deleteCost < 0) { + throw new IllegalArgumentException("Delete cost must not be negative"); + } + if (builder.replaceCost < 0) { + throw new IllegalArgumentException("Replace cost must not be negative"); + } + this.threshold = builder.threshold; + this.insertCost = builder.insertCost; + this.deleteCost = builder.deleteCost; + this.replaceCost = builder.replaceCost; } /** @@ -313,29 +546,23 @@ public LevenshteinDistance(final Integer threshold) { * A higher score indicates a greater distance. *

* - *

- * Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError which can occur when my Java implementation is used with very large - * strings. - *

- * *
-     * distance.apply(null, *)             = Throws {@link IllegalArgumentException}
-     * distance.apply(*, null)             = Throws {@link IllegalArgumentException}
+     * distance.apply(null, *)             = throws {@link IllegalArgumentException}
+     * distance.apply(*, null)             = throws {@link IllegalArgumentException}
      * distance.apply("","")               = 0
-     * distance.apply("","a")              = 1
-     * distance.apply("aaapppp", "")       = 7
-     * distance.apply("frog", "fog")       = 1
-     * distance.apply("fly", "ant")        = 3
-     * distance.apply("elephant", "hippo") = 7
-     * distance.apply("hippo", "elephant") = 7
-     * distance.apply("hippo", "zzzzzzzz") = 8
-     * distance.apply("hello", "hallo")    = 1
+     * distance.apply("","a")              = insertCost
+     * distance.apply("aaapppp", "")       = 7 * deleteCost
+     * distance.apply("frog", "fog")       = 1 * deleteCost
+     * distance.apply("fly", "ant")        = 3 * min(replaceCost, insertCost + deleteCost)
+     * distance.apply("elephant", "hippo") = 7  (with default costs)
+     * distance.apply("hippo", "elephant") = 7  (with default costs)
+     * distance.apply("hello", "hallo")    = 1  (with default costs)
      * 
* * @param left the first input, must not be null. * @param right the second input, must not be null. - * @return result distance, or -1. - * @throws IllegalArgumentException if either String input {@code null}. + * @return result distance, or {@code -1} if a threshold is set and the distance exceeds it. + * @throws IllegalArgumentException if either String input is {@code null}. */ @Override public Integer apply(final CharSequence left, final CharSequence right) { @@ -343,47 +570,62 @@ public Integer apply(final CharSequence left, final CharSequence right) { } /** - * Computes the Levenshtein distance between two inputs. + * Computes the Levenshtein distance between two {@link SimilarityInput} instances. * *

* A higher score indicates a greater distance. *

* - *
-     * distance.apply(null, *)             = Throws {@link IllegalArgumentException}
-     * distance.apply(*, null)             = Throws {@link IllegalArgumentException}
-     * distance.apply("","")               = 0
-     * distance.apply("","a")              = 1
-     * distance.apply("aaapppp", "")       = 7
-     * distance.apply("frog", "fog")       = 1
-     * distance.apply("fly", "ant")        = 3
-     * distance.apply("elephant", "hippo") = 7
-     * distance.apply("hippo", "elephant") = 7
-     * distance.apply("hippo", "zzzzzzzz") = 8
-     * distance.apply("hello", "hallo")    = 1
-     * 
- * - * @param The type of similarity score unit. + * @param the type of element compared by the similarity score. * @param left the first input, must not be null. * @param right the second input, must not be null. - * @return result distance, or -1. - * @throws IllegalArgumentException if either String input {@code null}. - * @since 1.13.0 + * @return result distance, or {@code -1} if a threshold is set and the distance exceeds it. + * @throws IllegalArgumentException if either input is {@code null}. + * @since 1.13.0 */ public Integer apply(final SimilarityInput left, final SimilarityInput right) { if (threshold != null) { - return limitedCompare(left, right, threshold); + return limitedCompare(left, right, threshold, insertCost, deleteCost, replaceCost); } - return unlimitedCompare(left, right); + return unlimitedCompare(left, right, insertCost, deleteCost, replaceCost); + } + + /** + * Gets the cost of a deletion operation. + * + * @return the deletion cost. + * @since 1.16.0 + */ + public int getDeleteCost() { + return deleteCost; + } + + /** + * Gets the cost of an insertion operation. + * + * @return the insertion cost. + * @since 1.16.0 + */ + public int getInsertCost() { + return insertCost; + } + + /** + * Gets the cost of a substitution (replace) operation. + * + * @return the replacement cost. + * @since 1.16.0 + */ + public int getReplaceCost() { + return replaceCost; } /** * Gets the distance threshold. * - * @return The distance threshold. + * @return the distance threshold, or {@code null} if no threshold is set. 
*/ public Integer getThreshold() { return threshold; } - } diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java index e7a88ca498..ed8b38454a 100644 --- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java @@ -181,4 +181,53 @@ void testGetThresholdDirectlyAfterObjectInstantiation() { assertNull(LevenshteinDistance.getDefaultInstance().getThreshold()); } + @Test + void testConstructorWithNegativeCosts() { + assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().setInsertCost(-1).build()); + assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().setDeleteCost(-1).build()); + assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().setReplaceCost(-1).build()); + } + + @Test + void testGetLevenshteinDistance_WeightedUnlimited() { + // Substitution is very expensive (10) vs Insert/Delete (1 each) + final LevenshteinDistance dist = LevenshteinDistance.builder().setInsertCost(1).setDeleteCost(1).setReplaceCost(10).build(); + // 'a' -> 'b' should choose delete 'a' (1) and insert 'b' (1) = distance 2, + // instead of replace (10). 
+ assertEquals(2, dist.apply("a", "b")); + + // All operations are free (0) + final LevenshteinDistance freeDist = LevenshteinDistance.builder().setInsertCost(0).setDeleteCost(0).setReplaceCost(0).build(); + assertEquals(0, freeDist.apply("abc", "def")); + + // Asymmetric costs: Insert=10, Delete=1, Replace=100 + final LevenshteinDistance asymmetric = LevenshteinDistance.builder().setInsertCost(10).setDeleteCost(1).setReplaceCost(100).build(); + assertEquals(1, asymmetric.apply("a", "")); // Delete 'a' = 1 + assertEquals(10, asymmetric.apply("", "a")); // Insert 'a' = 10 + } + + @Test + void testGetLevenshteinDistance_WeightedThreshold() { + // Distance is 2 (via delete/insert), threshold is 5 -> result 2 + final LevenshteinDistance weighted = LevenshteinDistance.builder().setThreshold(5).setInsertCost(1).setDeleteCost(1).setReplaceCost(10).build(); + assertEquals(2, weighted.apply("a", "b")); + + // Distance is 2, threshold is 1 -> result -1 + final LevenshteinDistance strict = LevenshteinDistance.builder().setThreshold(1).setInsertCost(1).setDeleteCost(1).setReplaceCost(10).build(); + assertEquals(-1, strict.apply("a", "b")); + + // Empty strings with weighted threshold + assertEquals(0, LevenshteinDistance.builder().setThreshold(5).setInsertCost(2).setDeleteCost(2).setReplaceCost(2).build().apply("", "")); + assertEquals(4, LevenshteinDistance.builder().setThreshold(5).setInsertCost(2).setDeleteCost(2).setReplaceCost(2).build().apply("aa", "")); + assertEquals(-1, LevenshteinDistance.builder().setThreshold(1).setInsertCost(2).setDeleteCost(2).setReplaceCost(2).build().apply("aa", "")); + } + + @Test + void testWeightedAccessors() { + final LevenshteinDistance dist = LevenshteinDistance.builder().setThreshold(10).setInsertCost(2).setDeleteCost(3).setReplaceCost(4).build(); + assertEquals(10, dist.getThreshold()); + assertEquals(2, dist.getInsertCost()); + assertEquals(3, dist.getDeleteCost()); + assertEquals(4, dist.getReplaceCost()); + } }