diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java index 479b3fadea..7e05e68cc4 100644 --- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java @@ -19,13 +19,32 @@ import java.util.Arrays; /** - * An algorithm for measuring the difference between two character sequences using the Levenshtein - * Distance. + * An algorithm for measuring the difference between two character sequences using the + * Levenshtein Distance. * *
- * This is the number of changes needed to change one sequence into another, where each change is a single character modification (deletion, insertion or - * substitution). + * This is the number of changes needed to change one sequence into another, where each change is a + * single character modification (deletion, insertion or substitution). *
+ * + *+ * This implementation supports configurable costs for insertion, deletion, and substitution + * operations. By default, all costs are set to 1 for backward compatibility. + *
+ * + *+ * Use {@link Builder} to construct instances with custom thresholds and operation costs: + *
+ * + *+ * LevenshteinDistance dist = LevenshteinDistance.builder() + * .setThreshold(10) + * .setInsertCost(1) + * .setDeleteCost(2) + * .setReplaceCost(3) + * .build(); + *+ * *
* This code has been adapted from Apache Commons Lang 3.3. *
@@ -37,92 +56,278 @@ public class LevenshteinDistance implements EditDistance+ * All costs default to 1. The threshold defaults to {@code null} (unlimited). + *
+ * + *+ * LevenshteinDistance dist = LevenshteinDistance.builder() + * .setThreshold(5) + * .setInsertCost(1) + * .setDeleteCost(1) + * .setReplaceCost(2) + * .build(); + *+ * + * @since 1.16.0 + */ + public static final class Builder { + + /** + * Default cost for any single edit operation. + */ + private static final int DEFAULT_COST = 1; + + /** Threshold for limited compare, or {@code null} for unlimited. */ + private Integer threshold; + + /** Cost of inserting a character. */ + private int insertCost = DEFAULT_COST; + + /** Cost of deleting a character. */ + private int deleteCost = DEFAULT_COST; + + /** Cost of substituting one character for another. */ + private int replaceCost = DEFAULT_COST; + + /** + * Constructs a new builder with default values. + */ + private Builder() { + // use LevenshteinDistance.builder() factory method + } + + /** + * Builds a new {@link LevenshteinDistance} from the current state of this builder. + * + * @return a new {@link LevenshteinDistance}. + * @throws IllegalArgumentException if the threshold is negative, or any cost is negative. + */ + public LevenshteinDistance build() { + return new LevenshteinDistance(this); + } + + /** + * Sets the cost of a deletion operation. + * + * @param deleteCost the cost of deleting a character; must not be negative. + * @return {@code this} builder. + */ + public Builder setDeleteCost(final int deleteCost) { + this.deleteCost = deleteCost; + return this; + } + + /** + * Sets the cost of an insertion operation. + * + * @param insertCost the cost of inserting a character; must not be negative. + * @return {@code this} builder. + */ + public Builder setInsertCost(final int insertCost) { + this.insertCost = insertCost; + return this; + } + + /** + * Sets the cost of a substitution (replace) operation. + * + * @param replaceCost the cost of replacing a character; must not be negative. + * @return {@code this} builder. 
+ */ + public Builder setReplaceCost(final int replaceCost) { + this.replaceCost = replaceCost; + return this; + } + + /** + * Sets the threshold for limited distance calculation. + * + *
+ * When set, {@link LevenshteinDistance#apply} returns {@code -1} if the computed + * distance exceeds this value. When {@code null}, the unlimited algorithm is used. + *
+ * + * @param threshold the maximum distance to report; must not be negative, or {@code null} + * for no limit. + * @return {@code this} builder. + */ + public Builder setThreshold(final Integer threshold) { + this.threshold = threshold; + return this; + } + } + + /** + * The singleton instance (uses default costs and no threshold). */ private static final LevenshteinDistance INSTANCE = new LevenshteinDistance(); /** - * Gets the default instance. + * Returns a new {@link Builder} for constructing {@link LevenshteinDistance} instances. * - * @return The default instance. + * @return a new {@link Builder}. + * @since 1.16.0 + */ + public static Builder builder() { + return new Builder(); + } + + /** + * Gets the default instance, which uses no threshold and all operation costs set to 1. + * + * @return the default instance. */ public static LevenshteinDistance getDefaultInstance() { return INSTANCE; } /** - * Finds the Levenshtein distance between two CharSequences if it's less than or equal to a given threshold. + * Finds the Levenshtein distance between two CharSequences if it is less than or equal to a + * given threshold, using configurable costs for insert, delete, and replace operations. * *- * This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield and Chas Emerick's implementation of the Levenshtein distance - * algorithm. + * This implementation follows from Algorithms on Strings, Trees and Sequences by + * Dan Gusfield and Chas Emerick's implementation of the Levenshtein distance algorithm. + *
+ * + *+ * Note: The stripe-width optimisation used in the unit-cost case relies on the assumption that + * each operation costs exactly 1. When custom costs are supplied the stripe cannot be reliably + * bounded to {@code 2*threshold+1}, so the full O(nm) dynamic-programming computation is performed instead, returning + * {@code -1} only when the final distance exceeds the threshold. *
* *
- * limitedCompare(null, *, *) = Throws {@link IllegalArgumentException}
- * limitedCompare(*, null, *) = Throws {@link IllegalArgumentException}
- * limitedCompare(*, *, -1) = Throws {@link IllegalArgumentException}
- * limitedCompare("","", 0) = 0
- * limitedCompare("aaapppp", "", 8) = 7
- * limitedCompare("aaapppp", "", 7) = 7
- * limitedCompare("aaapppp", "", 6)) = -1
- * limitedCompare("elephant", "hippo", 7) = 7
- * limitedCompare("elephant", "hippo", 6) = -1
- * limitedCompare("hippo", "elephant", 7) = 7
- * limitedCompare("hippo", "elephant", 6) = -1
+ * limitedCompare(null, *, *, *, *, *) = throws {@link IllegalArgumentException}
+ * limitedCompare(*, null, *, *, *, *) = throws {@link IllegalArgumentException}
+ * limitedCompare(*, *, -1, *, *, *) = throws {@link IllegalArgumentException}
+ * limitedCompare("","", 0, 1, 1, 1) = 0
+ * limitedCompare("aaapppp", "", 8, 1, 1, 1) = 7
+ * limitedCompare("aaapppp", "", 7, 1, 1, 1) = 7
+ * limitedCompare("aaapppp", "", 6, 1, 1, 1) = -1
+ * limitedCompare("elephant", "hippo", 7, 1, 1, 1) = 7
+ * limitedCompare("elephant", "hippo", 6, 1, 1, 1) = -1
+ * limitedCompare("hippo", "elephant", 7, 1, 1, 1) = 7
+ * limitedCompare("hippo", "elephant", 6, 1, 1, 1) = -1
*
*
- * @param left the first SimilarityInput, must not be null.
- * @param right the second SimilarityInput, must not be null.
- * @param threshold the target threshold, must not be negative.
- * @return result distance, or -1
+ * @param 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 | | | |#|#| 6 | | | | |#| 7 | | | | | |- * - * Note how the stripe leads off the table as there is no possible way to turn a string of length 5 into one of length 7 in edit distance of 1. - * - * Additionally, this implementation decreases memory usage by using two single-dimensional arrays and swapping them back and forth instead of - * allocating an entire n by m matrix. This requires a few minor changes, such as immediately returning when it's detected that the stripe has run off - * the matrix and initially filling the arrays with large values so that entries we don't compute are ignored. - * - * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for some discussion. - */ - - int n = left.length(); // length of left - int m = right.length(); // length of right + final int n = left.length(); + final int m = right.length(); - // if one string is empty, the edit distance is necessarily the length - // of the other if (n == 0) { - return m <= threshold ? m : -1; + final int dist = m * insertCost; + return dist <= threshold ? dist : -1; } if (m == 0) { - return n <= threshold ? n : -1; + final int dist = n * deleteCost; + return dist <= threshold ? dist : -1; + } + + if (insertCost == 1 && deleteCost == 1 && replaceCost == 1) { + return limitedCompareUniformCost(left, right, threshold, n, m); + } + return limitedCompareCustomCost(left, right, threshold, n, m, + new int[] {insertCost, deleteCost, replaceCost}); + } + + /** + * Full O(nm) limited compare for custom (non-uniform) operation costs. + * + *
+ * Uses two rolling arrays to keep memory at O(min(n, m)). + *
+ * + *+ * When {@code deleteCost != insertCost} swapping the strings would change the semantics + * (delete on the original becomes insert on the swapped copy), so the orientation is always + * kept as-is and the correct directional cost is applied. + *
+ * + * @param+ * This preserves the original algorithm exactly. + *
+ * + * @param* A higher score indicates a greater distance. *
* - *- * This implementation only need one single-dimensional arrays of length s.length() + 1 - *
- * *
- * unlimitedCompare(null, *) = Throws {@link IllegalArgumentException}
- * unlimitedCompare(*, null) = Throws {@link IllegalArgumentException}
- * unlimitedCompare("","") = 0
- * unlimitedCompare("","a") = 1
- * unlimitedCompare("aaapppp", "") = 7
- * unlimitedCompare("frog", "fog") = 1
- * unlimitedCompare("fly", "ant") = 3
- * unlimitedCompare("elephant", "hippo") = 7
- * unlimitedCompare("hippo", "elephant") = 7
- * unlimitedCompare("hippo", "zzzzzzzz") = 8
- * unlimitedCompare("hello", "hallo") = 1
+ * unlimitedCompare(null, *, *, *, *) = throws {@link IllegalArgumentException}
+ * unlimitedCompare(*, null, *, *, *) = throws {@link IllegalArgumentException}
+ * unlimitedCompare("","", 1, 1, 1) = 0
+ * unlimitedCompare("","a", 1, 1, 1) = 1
+ * unlimitedCompare("aaapppp", "", 1, 1, 1) = 7
+ * unlimitedCompare("frog", "fog", 1, 1, 1) = 1
+ * unlimitedCompare("fly", "ant", 1, 1, 1) = 3
+ * unlimitedCompare("elephant", "hippo", 1, 1, 1) = 7
+ * unlimitedCompare("hippo", "elephant", 1, 1, 1) = 7
+ * unlimitedCompare("hippo", "zzzzzzzz", 1, 1, 1) = 8
+ * unlimitedCompare("hello", "hallo", 1, 1, 1) = 1
*
*
- * @param left the first CharSequence, must not be null.
- * @param right the second CharSequence, must not be null.
- * @return result distance, or -1.
- * @throws IllegalArgumentException if either CharSequence input is {@code null}.
+ * @param + * If the threshold is not null, distance calculations will be limited to that maximum value. + * If the threshold is null, the unlimited version of the algorithm will be used. + *
+ * + * @param threshold if this is null then distance calculations will not be limited; + * otherwise it must not be negative. + * @deprecated Use {@link #builder()}. */ + @Deprecated public LevenshteinDistance(final Integer threshold) { - if (threshold != null && threshold < 0) { + this(builder().setThreshold(threshold)); + } + + /** + * Constructs a new {@link LevenshteinDistance} from a {@link Builder}. + * + * @param builder the builder; must not be null. + * @throws IllegalArgumentException if the threshold is negative, or any cost is negative. + */ + private LevenshteinDistance(final Builder builder) { + if (builder.threshold != null && builder.threshold < 0) { throw new IllegalArgumentException("Threshold must not be negative"); } - this.threshold = threshold; + if (builder.insertCost < 0) { + throw new IllegalArgumentException("Insert cost must not be negative"); + } + if (builder.deleteCost < 0) { + throw new IllegalArgumentException("Delete cost must not be negative"); + } + if (builder.replaceCost < 0) { + throw new IllegalArgumentException("Replace cost must not be negative"); + } + this.threshold = builder.threshold; + this.insertCost = builder.insertCost; + this.deleteCost = builder.deleteCost; + this.replaceCost = builder.replaceCost; } /** @@ -313,29 +546,23 @@ public LevenshteinDistance(final Integer threshold) { * A higher score indicates a greater distance. * * - *- * Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError which can occur when my Java implementation is used with very large - * strings. - *
- * *
- * distance.apply(null, *) = Throws {@link IllegalArgumentException}
- * distance.apply(*, null) = Throws {@link IllegalArgumentException}
+ * distance.apply(null, *) = throws {@link IllegalArgumentException}
+ * distance.apply(*, null) = throws {@link IllegalArgumentException}
* distance.apply("","") = 0
- * distance.apply("","a") = 1
- * distance.apply("aaapppp", "") = 7
- * distance.apply("frog", "fog") = 1
- * distance.apply("fly", "ant") = 3
- * distance.apply("elephant", "hippo") = 7
- * distance.apply("hippo", "elephant") = 7
- * distance.apply("hippo", "zzzzzzzz") = 8
- * distance.apply("hello", "hallo") = 1
+ * distance.apply("","a") = insertCost
+ * distance.apply("aaapppp", "") = 7 * deleteCost
+ * distance.apply("frog", "fog") = 1 * deleteCost
+ * distance.apply("fly", "ant") = 3 * min(replaceCost, insertCost + deleteCost)
+ * distance.apply("elephant", "hippo") = 7 (with default costs)
+ * distance.apply("hippo", "elephant") = 7 (with default costs)
+ * distance.apply("hello", "hallo") = 1 (with default costs)
*
*
* @param left the first input, must not be null.
* @param right the second input, must not be null.
- * @return result distance, or -1.
- * @throws IllegalArgumentException if either String input {@code null}.
+ * @return result distance, or {@code -1} if a threshold is set and the distance exceeds it.
+ * @throws IllegalArgumentException if either String input is {@code null}.
*/
@Override
public Integer apply(final CharSequence left, final CharSequence right) {
@@ -343,47 +570,62 @@ public Integer apply(final CharSequence left, final CharSequence right) {
}
/**
- * Computes the Levenshtein distance between two inputs.
+ * Computes the Levenshtein distance between two {@link SimilarityInput} instances.
*
* * A higher score indicates a greater distance. *
* - *
- * distance.apply(null, *) = Throws {@link IllegalArgumentException}
- * distance.apply(*, null) = Throws {@link IllegalArgumentException}
- * distance.apply("","") = 0
- * distance.apply("","a") = 1
- * distance.apply("aaapppp", "") = 7
- * distance.apply("frog", "fog") = 1
- * distance.apply("fly", "ant") = 3
- * distance.apply("elephant", "hippo") = 7
- * distance.apply("hippo", "elephant") = 7
- * distance.apply("hippo", "zzzzzzzz") = 8
- * distance.apply("hello", "hallo") = 1
- *
- *
- * @param