From 7bf8c7b7acf19536819f39c4ddaecfddc6548b98 Mon Sep 17 00:00:00 2001
From: Ron Ladin
+ * This implementation supports configurable costs for insertion, deletion, and substitution operations. By default, all costs are set to 1 for + * backward compatibility. + *
+ * ** This code has been adapted from Apache Commons Lang 3.3. *
@@ -37,7 +43,22 @@ public class LevenshteinDistance implements EditDistance+ * This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield and + * Chas Emerick's implementation of the Levenshtein distance algorithm. + *
* *- * This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield and Chas Emerick's implementation of the Levenshtein distance - * algorithm. + * Note: The stripe-width optimisation used in the default (all-costs-1) case relies on the + * assumption that each operation costs exactly 1. When custom costs are supplied the stripe + * cannot be reliably bounded to {@code 2*threshold+1}, so the full O(nm) DP table is used + * instead, returning -1 only when the final distance exceeds the threshold. *
* *
- * limitedCompare(null, *, *) = Throws {@link IllegalArgumentException}
- * limitedCompare(*, null, *) = Throws {@link IllegalArgumentException}
- * limitedCompare(*, *, -1) = Throws {@link IllegalArgumentException}
- * limitedCompare("","", 0) = 0
- * limitedCompare("aaapppp", "", 8) = 7
- * limitedCompare("aaapppp", "", 7) = 7
- * limitedCompare("aaapppp", "", 6)) = -1
- * limitedCompare("elephant", "hippo", 7) = 7
- * limitedCompare("elephant", "hippo", 6) = -1
- * limitedCompare("hippo", "elephant", 7) = 7
- * limitedCompare("hippo", "elephant", 6) = -1
+ * limitedCompare(null, *, *, *, *, *) = Throws {@link IllegalArgumentException}
+ * limitedCompare(*, null, *, *, *, *) = Throws {@link IllegalArgumentException}
+ * limitedCompare(*, *, -1, *, *, *) = Throws {@link IllegalArgumentException}
+ * limitedCompare("","", 0, 1, 1, 1) = 0
+ * limitedCompare("aaapppp", "", 8, 1, 1, 1) = 7
+ * limitedCompare("aaapppp", "", 7, 1, 1, 1) = 7
+ * limitedCompare("aaapppp", "", 6, 1, 1, 1)) = -1
+ * limitedCompare("elephant", "hippo", 7, 1, 1, 1) = 7
+ * limitedCompare("elephant", "hippo", 6, 1, 1, 1) = -1
+ * limitedCompare("hippo", "elephant", 7, 1, 1, 1) = 7
+ * limitedCompare("hippo", "elephant", 6, 1, 1, 1) = -1
*
*
- * @param left the first SimilarityInput, must not be null.
- * @param right the second SimilarityInput, must not be null.
- * @param threshold the target threshold, must not be negative.
- * @return result distance, or -1
+ * @param left the first SimilarityInput, must not be null.
+ * @param right the second SimilarityInput, must not be null.
+ * @param threshold the target threshold, must not be negative.
+ * @param insertCost the cost of an insertion operation, must not be negative.
+ * @param deleteCost the cost of a deletion operation, must not be negative.
+ * @param replaceCost the cost of a substitution operation, must not be negative.
+ * @return result distance, or -1 if the distance exceeds the threshold.
*/
- private static 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 | | | |#|#| 6 | | | | |#| 7 | | | | | |- * - * Note how the stripe leads off the table as there is no possible way to turn a string of length 5 into one of length 7 in edit distance of 1. - * - * Additionally, this implementation decreases memory usage by using two single-dimensional arrays and swapping them back and forth instead of - * allocating an entire n by m matrix. This requires a few minor changes, such as immediately returning when it's detected that the stripe has run off - * the matrix and initially filling the arrays with large values so that entries we don't compute are ignored. - * - * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for some discussion. - */ + if (threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } int n = left.length(); // length of left int m = right.length(); // length of right - // if one string is empty, the edit distance is necessarily the length - // of the other + // If one string is empty, the edit distance is the cost of inserting/deleting + // all characters of the other string. if (n == 0) { - return m <= threshold ? m : -1; + final int dist = m * insertCost; + return dist <= threshold ? dist : -1; } if (m == 0) { - return n <= threshold ? n : -1; + final int dist = n * deleteCost; + return dist <= threshold ? dist : -1; + } + + // When all costs equal 1, use the classic diagonal-stripe optimisation. + // For asymmetric costs the stripe width is not reliably bounded, so fall + // back to the full O(nm) table and threshold-check only at the end. + if (insertCost == 1 && deleteCost == 1 && replaceCost == 1) { + return limitedCompareUniformCost(left, right, threshold, n, m); } + return limitedCompareCustomCost(left, right, threshold, insertCost, deleteCost, replaceCost, n, m); + } + + /** + * Classic stripe-optimised O(km) limited compare for uniform unit costs. 
+ * This preserves the original algorithm exactly. + */ + private static
- * This implementation only need one single-dimensional arrays of length s.length() + 1 + * A higher score indicates a greater distance. *
* *
- * unlimitedCompare(null, *) = Throws {@link IllegalArgumentException}
- * unlimitedCompare(*, null) = Throws {@link IllegalArgumentException}
- * unlimitedCompare("","") = 0
- * unlimitedCompare("","a") = 1
- * unlimitedCompare("aaapppp", "") = 7
- * unlimitedCompare("frog", "fog") = 1
- * unlimitedCompare("fly", "ant") = 3
- * unlimitedCompare("elephant", "hippo") = 7
- * unlimitedCompare("hippo", "elephant") = 7
- * unlimitedCompare("hippo", "zzzzzzzz") = 8
- * unlimitedCompare("hello", "hallo") = 1
+ * unlimitedCompare(null, *, *, *, *) = Throws {@link IllegalArgumentException}
+ * unlimitedCompare(*, null, *, *, *) = Throws {@link IllegalArgumentException}
+ * unlimitedCompare("","", 1, 1, 1) = 0
+ * unlimitedCompare("","a", 1, 1, 1) = 1
+ * unlimitedCompare("aaapppp", "", 1, 1, 1) = 7
+ * unlimitedCompare("frog", "fog", 1, 1, 1) = 1
+ * unlimitedCompare("fly", "ant", 1, 1, 1) = 3
+ * unlimitedCompare("elephant", "hippo", 1, 1, 1) = 7
+ * unlimitedCompare("hippo", "elephant", 1, 1, 1) = 7
+ * unlimitedCompare("hippo", "zzzzzzzz", 1, 1, 1) = 8
+ * unlimitedCompare("hello", "hallo", 1, 1, 1) = 1
*
*
- * @param left the first CharSequence, must not be null.
- * @param right the second CharSequence, must not be null.
- * @return result distance, or -1.
+ * @param left the first CharSequence, must not be null.
+ * @param right the second CharSequence, must not be null.
+ * @param insertCost the cost of an insertion operation, must not be negative.
+ * @param deleteCost the cost of a deletion operation, must not be negative.
+ * @param replaceCost the cost of a substitution operation, must not be negative.
+ * @return result distance.
* @throws IllegalArgumentException if either CharSequence input is {@code null}.
*/
- private static + * If the threshold is not null, distance calculations will be limited to a maximum length. If + * the threshold is null, the unlimited version of the algorithm will be used. + *
+ * + * @param threshold If this is null then distances calculations will not be limited. + * This may not be negative. */ public LevenshteinDistance(final Integer threshold) { + this(threshold, DEFAULT_INSERT_COST, DEFAULT_DELETE_COST, DEFAULT_REPLACE_COST); + } + + /** + * Constructs a new instance with the given threshold and custom operation costs. + * + *+ * If the threshold is not null, distance calculations will be limited to a maximum value. + * If the threshold is null, the unlimited version of the algorithm will be used. + *
+ * + *+ * All cost parameters must be non-negative integers. Passing 0 for a cost makes that + * operation free; passing values greater than 1 makes it more expensive relative to + * the other operations. + *
+ * + * @param threshold If this is null then distance calculations will not be limited. + * This may not be negative. + * @param insertCost the cost of inserting a character, must not be negative. + * @param deleteCost the cost of deleting a character, must not be negative. + * @param replaceCost the cost of replacing (substituting) a character, must not be negative. + * @throws IllegalArgumentException if threshold is negative, or any cost is negative. + * @since 1.13.0 + */ + public LevenshteinDistance(final Integer threshold, final int insertCost, final int deleteCost, + final int replaceCost) { if (threshold != null && threshold < 0) { throw new IllegalArgumentException("Threshold must not be negative"); } - this.threshold = threshold; + if (insertCost < 0) { + throw new IllegalArgumentException("Insert cost must not be negative"); + } + if (deleteCost < 0) { + throw new IllegalArgumentException("Delete cost must not be negative"); + } + if (replaceCost < 0) { + throw new IllegalArgumentException("Replace cost must not be negative"); + } + this.threshold = threshold; + this.insertCost = insertCost; + this.deleteCost = deleteCost; + this.replaceCost = replaceCost; } + // ------------------------------------------------------------------------- + // Public API + // ------------------------------------------------------------------------- + /** * Computes the Levenshtein distance between two Strings. * @@ -313,29 +464,23 @@ public LevenshteinDistance(final Integer threshold) { * A higher score indicates a greater distance. * * - *- * Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError which can occur when my Java implementation is used with very large - * strings. - *
- * *
* distance.apply(null, *) = Throws {@link IllegalArgumentException}
* distance.apply(*, null) = Throws {@link IllegalArgumentException}
* distance.apply("","") = 0
- * distance.apply("","a") = 1
- * distance.apply("aaapppp", "") = 7
- * distance.apply("frog", "fog") = 1
- * distance.apply("fly", "ant") = 3
- * distance.apply("elephant", "hippo") = 7
- * distance.apply("hippo", "elephant") = 7
- * distance.apply("hippo", "zzzzzzzz") = 8
- * distance.apply("hello", "hallo") = 1
+ * distance.apply("","a") = insertCost
+ * distance.apply("aaapppp", "") = 7 * deleteCost
+ * distance.apply("frog", "fog") = 1 * deleteCost (one deletion)
+ * distance.apply("fly", "ant") = replaceCost + replaceCost + replaceCost
+ * distance.apply("elephant", "hippo") = 7 (with default costs)
+ * distance.apply("hippo", "elephant") = 7 (with default costs)
+ * distance.apply("hello", "hallo") = 1 (with default costs)
*
*
* @param left the first input, must not be null.
* @param right the second input, must not be null.
- * @return result distance, or -1.
- * @throws IllegalArgumentException if either String input {@code null}.
+ * @return result distance, or -1 if a threshold is set and the distance exceeds it.
+ * @throws IllegalArgumentException if either String input is {@code null}.
*/
@Override
public Integer apply(final CharSequence left, final CharSequence right) {
@@ -349,41 +494,60 @@ public Integer apply(final CharSequence left, final CharSequence right) {
* A higher score indicates a greater distance.
*
*
- *
- * distance.apply(null, *) = Throws {@link IllegalArgumentException}
- * distance.apply(*, null) = Throws {@link IllegalArgumentException}
- * distance.apply("","") = 0
- * distance.apply("","a") = 1
- * distance.apply("aaapppp", "") = 7
- * distance.apply("frog", "fog") = 1
- * distance.apply("fly", "ant") = 3
- * distance.apply("elephant", "hippo") = 7
- * distance.apply("hippo", "elephant") = 7
- * distance.apply("hippo", "zzzzzzzz") = 8
- * distance.apply("hello", "hallo") = 1
- *
- *
* @param - * This is the number of changes needed to change one sequence into another, where each change is a single character modification (deletion, insertion or - * substitution). + * This is the number of changes needed to change one sequence into another, where each change is a + * single character modification (deletion, insertion or substitution). *
* *- * This implementation supports configurable costs for insertion, deletion, and substitution operations. By default, all costs are set to 1 for - * backward compatibility. + * This implementation supports configurable costs for insertion, deletion, and substitution + * operations. By default, all costs are set to 1 for backward compatibility. *
* *+ * Use {@link Builder} to construct instances with custom thresholds and operation costs: + *
+ * + *+ * LevenshteinDistance dist = LevenshteinDistance.builder() + * .threshold(10) + * .insertCost(1) + * .deleteCost(2) + * .replaceCost(3) + * .build(); + *+ * + *
* This code has been adapted from Apache Commons Lang 3.3. *
* @@ -43,19 +56,109 @@ public class LevenshteinDistance implements EditDistance+ * All costs default to 1. The threshold defaults to {@code null} (unlimited). + *
+ * + *+ * LevenshteinDistance dist = LevenshteinDistance.builder() + * .threshold(5) + * .insertCost(1) + * .deleteCost(1) + * .replaceCost(2) + * .build(); + *+ * + * @since 1.13.0 */ - private static final int DEFAULT_INSERT_COST = 1; + public static final class Builder { - /** - * Default cost for a deletion operation. - */ - private static final int DEFAULT_DELETE_COST = 1; + /** + * Default cost for any single edit operation. + */ + private static final int DEFAULT_COST = 1; - /** - * Default cost for a substitution (replace) operation. - */ - private static final int DEFAULT_REPLACE_COST = 1; + /** Threshold for limited compare, or {@code null} for unlimited. */ + private Integer threshold; + + /** Cost of inserting a character. */ + private int insertCost = DEFAULT_COST; + + /** Cost of deleting a character. */ + private int deleteCost = DEFAULT_COST; + + /** Cost of substituting one character for another. */ + private int replaceCost = DEFAULT_COST; + + /** + * Constructs a new builder with default values. + */ + private Builder() { + // use LevenshteinDistance.builder() factory method + } + + /** + * Builds a new {@link LevenshteinDistance} from the current state of this builder. + * + * @return a new {@link LevenshteinDistance}. + * @throws IllegalArgumentException if the threshold is negative, or any cost is negative. + */ + public LevenshteinDistance build() { + return new LevenshteinDistance(this); + } + + /** + * Sets the cost of a deletion operation. + * + * @param deleteCost the cost of deleting a character; must not be negative. + * @return {@code this} builder. + */ + public Builder deleteCost(final int deleteCost) { + this.deleteCost = deleteCost; + return this; + } + + /** + * Sets the cost of an insertion operation. + * + * @param insertCost the cost of inserting a character; must not be negative. + * @return {@code this} builder. 
+ */ + public Builder insertCost(final int insertCost) { + this.insertCost = insertCost; + return this; + } + + /** + * Sets the cost of a substitution (replace) operation. + * + * @param replaceCost the cost of replacing a character; must not be negative. + * @return {@code this} builder. + */ + public Builder replaceCost(final int replaceCost) { + this.replaceCost = replaceCost; + return this; + } + + /** + * Sets the threshold for limited distance calculation. + * + *
+ * When set, {@link LevenshteinDistance#apply} returns {@code -1} if the computed + * distance exceeds this value. When {@code null}, the unlimited algorithm is used. + *
+ * + * @param threshold the maximum distance to report; must not be negative, or {@code null} + * for no limit. + * @return {@code this} builder. + */ + public Builder threshold(final Integer threshold) { + this.threshold = threshold; + return this; + } + } /** * The singleton instance (uses default costs and no threshold). @@ -63,51 +166,62 @@ public class LevenshteinDistance implements EditDistance- * This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield and - * Chas Emerick's implementation of the Levenshtein distance algorithm. + * This implementation follows from Algorithms on Strings, Trees and Sequences by + * Dan Gusfield and Chas Emerick's implementation of the Levenshtein distance algorithm. *
* *- * Note: The stripe-width optimisation used in the default (all-costs-1) case relies on the - * assumption that each operation costs exactly 1. When custom costs are supplied the stripe - * cannot be reliably bounded to {@code 2*threshold+1}, so the full O(nm) DP table is used - * instead, returning -1 only when the final distance exceeds the threshold. + * Note: The stripe-width optimisation used in the unit-cost case relies on the assumption that + * each operation costs exactly 1. When custom costs are supplied the stripe cannot be reliably + * bounded to {@code 2*threshold+1}, so the full O(nm) DP table is used instead, returning + * {@code -1} only when the final distance exceeds the threshold. *
* *
- * limitedCompare(null, *, *, *, *, *) = Throws {@link IllegalArgumentException}
- * limitedCompare(*, null, *, *, *, *) = Throws {@link IllegalArgumentException}
- * limitedCompare(*, *, -1, *, *, *) = Throws {@link IllegalArgumentException}
+ * limitedCompare(null, *, *, *, *, *) = throws {@link IllegalArgumentException}
+ * limitedCompare(*, null, *, *, *, *) = throws {@link IllegalArgumentException}
+ * limitedCompare(*, *, -1, *, *, *) = throws {@link IllegalArgumentException}
* limitedCompare("","", 0, 1, 1, 1) = 0
* limitedCompare("aaapppp", "", 8, 1, 1, 1) = 7
* limitedCompare("aaapppp", "", 7, 1, 1, 1) = 7
- * limitedCompare("aaapppp", "", 6, 1, 1, 1)) = -1
+ * limitedCompare("aaapppp", "", 6, 1, 1, 1) = -1
* limitedCompare("elephant", "hippo", 7, 1, 1, 1) = 7
* limitedCompare("elephant", "hippo", 6, 1, 1, 1) = -1
* limitedCompare("hippo", "elephant", 7, 1, 1, 1) = 7
* limitedCompare("hippo", "elephant", 6, 1, 1, 1) = -1
*
*
+ * @param + * When {@code deleteCost != insertCost} swapping the strings would change the semantics + * (delete on the original becomes insert on the swapped copy), so the orientation is always + * kept as-is and the correct directional cost is applied. + *
+ * + * @param* This preserves the original algorithm exactly. + *
+ * + * @param- * When {@code deleteCost != insertCost} swapping the strings would change the - * semantics (delete on the original becomes insert on the swapped copy), so - * we always keep left as-is and pay the correct directional cost. - *
- */ - private static* A higher score indicates a greater distance. *
* *
- * unlimitedCompare(null, *, *, *, *) = Throws {@link IllegalArgumentException}
- * unlimitedCompare(*, null, *, *, *) = Throws {@link IllegalArgumentException}
+ * unlimitedCompare(null, *, *, *, *) = throws {@link IllegalArgumentException}
+ * unlimitedCompare(*, null, *, *, *) = throws {@link IllegalArgumentException}
* unlimitedCompare("","", 1, 1, 1) = 0
* unlimitedCompare("","a", 1, 1, 1) = 1
* unlimitedCompare("aaapppp", "", 1, 1, 1) = 7
@@ -282,13 +404,14 @@ private static <E> int limitedCompareCustomCost(final SimilarityInput<E> left, f
* unlimitedCompare("hello", "hallo", 1, 1, 1) = 1
*
*
- * @param left the first CharSequence, must not be null.
- * @param right the second CharSequence, must not be null.
+ * @param - * If the threshold is not null, distance calculations will be limited to a maximum length. If - * the threshold is null, the unlimited version of the algorithm will be used. + * If the threshold is not null, distance calculations will be limited to that maximum value. + * If the threshold is null, the unlimited version of the algorithm will be used. *
* - * @param threshold If this is null then distances calculations will not be limited. - * This may not be negative. + * @param threshold if this is null then distance calculations will not be limited; + * otherwise it must not be negative. + * @deprecated Use {@link #builder()}. */ + @Deprecated public LevenshteinDistance(final Integer threshold) { - this(threshold, DEFAULT_INSERT_COST, DEFAULT_DELETE_COST, DEFAULT_REPLACE_COST); + this(builder().threshold(threshold)); } /** - * Constructs a new instance with the given threshold and custom operation costs. - * - *- * If the threshold is not null, distance calculations will be limited to a maximum value. - * If the threshold is null, the unlimited version of the algorithm will be used. - *
- * - *- * All cost parameters must be non-negative integers. Passing 0 for a cost makes that - * operation free; passing values greater than 1 makes it more expensive relative to - * the other operations. - *
+ * Constructs a new {@link LevenshteinDistance} from a {@link Builder}. * - * @param threshold If this is null then distance calculations will not be limited. - * This may not be negative. - * @param insertCost the cost of inserting a character, must not be negative. - * @param deleteCost the cost of deleting a character, must not be negative. - * @param replaceCost the cost of replacing (substituting) a character, must not be negative. - * @throws IllegalArgumentException if threshold is negative, or any cost is negative. - * @since 1.13.0 + * @param builder the builder; must not be null. + * @throws IllegalArgumentException if the threshold is negative, or any cost is negative. */ - public LevenshteinDistance(final Integer threshold, final int insertCost, final int deleteCost, - final int replaceCost) { - if (threshold != null && threshold < 0) { + private LevenshteinDistance(final Builder builder) { + if (builder.threshold != null && builder.threshold < 0) { throw new IllegalArgumentException("Threshold must not be negative"); } - if (insertCost < 0) { + if (builder.insertCost < 0) { throw new IllegalArgumentException("Insert cost must not be negative"); } - if (deleteCost < 0) { + if (builder.deleteCost < 0) { throw new IllegalArgumentException("Delete cost must not be negative"); } - if (replaceCost < 0) { + if (builder.replaceCost < 0) { throw new IllegalArgumentException("Replace cost must not be negative"); } - this.threshold = threshold; - this.insertCost = insertCost; - this.deleteCost = deleteCost; - this.replaceCost = replaceCost; + this.threshold = builder.threshold; + this.insertCost = builder.insertCost; + this.deleteCost = builder.deleteCost; + this.replaceCost = builder.replaceCost; } - // ------------------------------------------------------------------------- - // Public API - // ------------------------------------------------------------------------- - /** * Computes the Levenshtein distance between two Strings. 
* @@ -465,13 +547,13 @@ public LevenshteinDistance(final Integer threshold, final int insertCost, final * * *
- * distance.apply(null, *) = Throws {@link IllegalArgumentException}
- * distance.apply(*, null) = Throws {@link IllegalArgumentException}
+ * distance.apply(null, *) = throws {@link IllegalArgumentException}
+ * distance.apply(*, null) = throws {@link IllegalArgumentException}
* distance.apply("","") = 0
* distance.apply("","a") = insertCost
* distance.apply("aaapppp", "") = 7 * deleteCost
- * distance.apply("frog", "fog") = 1 * deleteCost (one deletion)
- * distance.apply("fly", "ant") = replaceCost + replaceCost + replaceCost
+ * distance.apply("frog", "fog") = 1 * deleteCost
+ * distance.apply("fly", "ant") = 3 * replaceCost
* distance.apply("elephant", "hippo") = 7 (with default costs)
* distance.apply("hippo", "elephant") = 7 (with default costs)
* distance.apply("hello", "hallo") = 1 (with default costs)
@@ -479,7 +561,7 @@ public LevenshteinDistance(final Integer threshold, final int insertCost, final
*
* @param left the first input, must not be null.
* @param right the second input, must not be null.
- * @return result distance, or -1 if a threshold is set and the distance exceeds it.
+ * @return result distance, or {@code -1} if a threshold is set and the distance exceeds it.
* @throws IllegalArgumentException if either String input is {@code null}.
*/
@Override
@@ -488,16 +570,16 @@ public Integer apply(final CharSequence left, final CharSequence right) {
}
/**
- * Computes the Levenshtein distance between two inputs.
+ * Computes the Levenshtein distance between two {@link SimilarityInput} instances.
*
*
* A higher score indicates a greater distance.
*
*
- * @param <E> The type of similarity score unit.
+ * @param <E> the type of element compared by the similarity score.
* @param left the first input, must not be null.
* @param right the second input, must not be null.
- * @return result distance, or -1 if a threshold is set and the distance exceeds it.
+ * @return result distance, or {@code -1} if a threshold is set and the distance exceeds it.
* @throws IllegalArgumentException if either input is {@code null}.
* @since 1.13.0
*/
@@ -508,23 +590,20 @@ public <E> Integer apply(final SimilarityInput<E> left, final SimilarityInput<E>
return unlimitedCompare(left, right, insertCost, deleteCost, replaceCost);
}
- // -------------------------------------------------------------------------
- // Accessors
- // -------------------------------------------------------------------------
-
/**
- * Gets the distance threshold.
+ * Gets the cost of a deletion operation.
*
- * @return The distance threshold, or {@code null} if no threshold is set.
+ * @return the deletion cost.
+ * @since 1.13.0
*/
- public Integer getThreshold() {
- return threshold;
+ public int getDeleteCost() {
+ return deleteCost;
}
/**
* Gets the cost of an insertion operation.
*
- * @return The insertion cost.
+ * @return the insertion cost.
* @since 1.13.0
*/
public int getInsertCost() {
@@ -532,22 +611,21 @@ public int getInsertCost() {
}
/**
- * Gets the cost of a deletion operation.
+ * Gets the cost of a substitution (replace) operation.
*
- * @return The deletion cost.
+ * @return the replacement cost.
* @since 1.13.0
*/
- public int getDeleteCost() {
- return deleteCost;
+ public int getReplaceCost() {
+ return replaceCost;
}
/**
- * Gets the cost of a substitution (replace) operation.
+ * Gets the distance threshold.
*
- * @return The replacement cost.
- * @since 1.13.0
+ * @return the distance threshold, or {@code null} if no threshold is set.
*/
- public int getReplaceCost() {
- return replaceCost;
+ public Integer getThreshold() {
+ return threshold;
}
-}
\ No newline at end of file
+}
diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
index ecf24ceea8..bdddd6ac52 100644
--- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
@@ -6,7 +6,7 @@
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
- * https://www.apache.org/licenses/LICENSE-2.0
+ * https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -181,58 +181,53 @@ void testGetThresholdDirectlyAfterObjectInstantiation() {
assertNull(LevenshteinDistance.getDefaultInstance().getThreshold());
}
- // -------------------------------------------------------------------------
- // New Weighted Levenshtein Distance Tests
- // -------------------------------------------------------------------------
-
@Test
void testConstructorWithNegativeCosts() {
- assertThrows(IllegalArgumentException.class, () -> new LevenshteinDistance(null, -1, 1, 1));
- assertThrows(IllegalArgumentException.class, () -> new LevenshteinDistance(null, 1, -1, 1));
- assertThrows(IllegalArgumentException.class, () -> new LevenshteinDistance(null, 1, 1, -1));
+ assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().insertCost(-1).build());
+ assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().deleteCost(-1).build());
+ assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().replaceCost(-1).build());
}
@Test
void testGetLevenshteinDistance_WeightedUnlimited() {
// Substitution is very expensive (10) vs Insert/Delete (1 each)
- final LevenshteinDistance dist = new LevenshteinDistance(null, 1, 1, 10);
+ final LevenshteinDistance dist = LevenshteinDistance.builder().insertCost(1).deleteCost(1).replaceCost(10).build();
// 'a' -> 'b' should choose delete 'a' (1) and insert 'b' (1) = distance 2,
// instead of replace (10).
assertEquals(2, dist.apply("a", "b"));
// All operations are free (0)
- final LevenshteinDistance freeDist = new LevenshteinDistance(null, 0, 0, 0);
+ final LevenshteinDistance freeDist = LevenshteinDistance.builder().insertCost(0).deleteCost(0).replaceCost(0).build();
assertEquals(0, freeDist.apply("abc", "def"));
// Asymmetric costs: Insert=10, Delete=1, Replace=100
- final LevenshteinDistance asymmetric = new LevenshteinDistance(null, 10, 1, 100);
- assertEquals(1, asymmetric.apply("a", "")); // Delete 'a' = 1
- assertEquals(10, asymmetric.apply("", "a")); // Insert 'a' = 10
+ final LevenshteinDistance asymmetric = LevenshteinDistance.builder().insertCost(10).deleteCost(1).replaceCost(100).build();
+ assertEquals(1, asymmetric.apply("a", "")); // Delete 'a' = 1
+ assertEquals(10, asymmetric.apply("", "a")); // Insert 'a' = 10
}
@Test
void testGetLevenshteinDistance_WeightedThreshold() {
// Distance is 2 (via delete/insert), threshold is 5 -> result 2
- final LevenshteinDistance weighted = new LevenshteinDistance(5, 1, 1, 10);
+ final LevenshteinDistance weighted = LevenshteinDistance.builder().threshold(5).insertCost(1).deleteCost(1).replaceCost(10).build();
assertEquals(2, weighted.apply("a", "b"));
// Distance is 2, threshold is 1 -> result -1
- final LevenshteinDistance strict = new LevenshteinDistance(1, 1, 1, 10);
+ final LevenshteinDistance strict = LevenshteinDistance.builder().threshold(1).insertCost(1).deleteCost(1).replaceCost(10).build();
assertEquals(-1, strict.apply("a", "b"));
// Empty strings with weighted threshold
- assertEquals(0, new LevenshteinDistance(5, 2, 2, 2).apply("", ""));
- assertEquals(4, new LevenshteinDistance(5, 2, 2, 2).apply("aa", ""));
- assertEquals(-1, new LevenshteinDistance(1, 2, 2, 2).apply("aa", ""));
+ assertEquals(0, LevenshteinDistance.builder().threshold(5).insertCost(2).deleteCost(2).replaceCost(2).build().apply("", ""));
+ assertEquals(4, LevenshteinDistance.builder().threshold(5).insertCost(2).deleteCost(2).replaceCost(2).build().apply("aa", ""));
+ assertEquals(-1, LevenshteinDistance.builder().threshold(1).insertCost(2).deleteCost(2).replaceCost(2).build().apply("aa", ""));
}
@Test
void testWeightedAccessors() {
- final LevenshteinDistance dist = new LevenshteinDistance(10, 2, 3, 4);
+ final LevenshteinDistance dist = LevenshteinDistance.builder().threshold(10).insertCost(2).deleteCost(3).replaceCost(4).build();
assertEquals(10, dist.getThreshold());
assertEquals(2, dist.getInsertCost());
assertEquals(3, dist.getDeleteCost());
assertEquals(4, dist.getReplaceCost());
}
-
-}
\ No newline at end of file
+}
From 39c918d50e79fcaa28bc2d676614604ae3273063 Mon Sep 17 00:00:00 2001
From: Ron Ladin
Date: Sun, 15 Mar 2026 21:13:49 +0200
Subject: [PATCH 3/3] Apply PR feedback: Update @since tags to 1.16.0 and use
'set' prefix for builder methods
---
.../text/similarity/LevenshteinDistance.java | 38 +++++++++----------
.../similarity/LevenshteinDistanceTest.java | 24 ++++++------
2 files changed, 31 insertions(+), 31 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
index ea7f5aa890..7e05e68cc4 100644
--- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java
@@ -38,10 +38,10 @@
*
*
* LevenshteinDistance dist = LevenshteinDistance.builder()
- * .threshold(10)
- * .insertCost(1)
- * .deleteCost(2)
- * .replaceCost(3)
+ * .setThreshold(10)
+ * .setInsertCost(1)
+ * .setDeleteCost(2)
+ * .setReplaceCost(3)
* .build();
*
*
@@ -64,14 +64,14 @@ public class LevenshteinDistance implements EditDistance<Integer> {
*
*
* LevenshteinDistance dist = LevenshteinDistance.builder()
- * .threshold(5)
- * .insertCost(1)
- * .deleteCost(1)
- * .replaceCost(2)
+ * .setThreshold(5)
+ * .setInsertCost(1)
+ * .setDeleteCost(1)
+ * .setReplaceCost(2)
* .build();
*
*
- * @since 1.13.0
+ * @since 1.16.0
*/
public static final class Builder {
@@ -115,7 +115,7 @@ public LevenshteinDistance build() {
* @param deleteCost the cost of deleting a character; must not be negative.
* @return {@code this} builder.
*/
- public Builder deleteCost(final int deleteCost) {
+ public Builder setDeleteCost(final int deleteCost) {
this.deleteCost = deleteCost;
return this;
}
@@ -126,7 +126,7 @@ public Builder deleteCost(final int deleteCost) {
* @param insertCost the cost of inserting a character; must not be negative.
* @return {@code this} builder.
*/
- public Builder insertCost(final int insertCost) {
+ public Builder setInsertCost(final int insertCost) {
this.insertCost = insertCost;
return this;
}
@@ -137,7 +137,7 @@ public Builder insertCost(final int insertCost) {
* @param replaceCost the cost of replacing a character; must not be negative.
* @return {@code this} builder.
*/
- public Builder replaceCost(final int replaceCost) {
+ public Builder setReplaceCost(final int replaceCost) {
this.replaceCost = replaceCost;
return this;
}
@@ -154,7 +154,7 @@ public Builder replaceCost(final int replaceCost) {
* for no limit.
* @return {@code this} builder.
*/
- public Builder threshold(final Integer threshold) {
+ public Builder setThreshold(final Integer threshold) {
this.threshold = threshold;
return this;
}
@@ -169,7 +169,7 @@ public Builder threshold(final Integer threshold) {
* Returns a new {@link Builder} for constructing {@link LevenshteinDistance} instances.
*
* @return a new {@link Builder}.
- * @since 1.13.0
+ * @since 1.16.0
*/
public static Builder builder() {
return new Builder();
@@ -511,7 +511,7 @@ public LevenshteinDistance() {
*/
@Deprecated
public LevenshteinDistance(final Integer threshold) {
- this(builder().threshold(threshold));
+ this(builder().setThreshold(threshold));
}
/**
@@ -581,7 +581,7 @@ public Integer apply(final CharSequence left, final CharSequence right) {
* @param right the second input, must not be null.
* @return result distance, or {@code -1} if a threshold is set and the distance exceeds it.
* @throws IllegalArgumentException if either input is {@code null}.
- * @since 1.13.0
+ * @since 1.16.0
*/
public <E> Integer apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
if (threshold != null) {
@@ -594,7 +594,7 @@ public <E> Integer apply(final SimilarityInput<E> left, final SimilarityInput<E>
* Gets the cost of a deletion operation.
*
* @return the deletion cost.
- * @since 1.13.0
+ * @since 1.16.0
*/
public int getDeleteCost() {
return deleteCost;
@@ -604,7 +604,7 @@ public int getDeleteCost() {
* Gets the cost of an insertion operation.
*
* @return the insertion cost.
- * @since 1.13.0
+ * @since 1.16.0
*/
public int getInsertCost() {
return insertCost;
@@ -614,7 +614,7 @@ public int getInsertCost() {
* Gets the cost of a substitution (replace) operation.
*
* @return the replacement cost.
- * @since 1.13.0
+ * @since 1.16.0
*/
public int getReplaceCost() {
return replaceCost;
diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
index bdddd6ac52..ed8b38454a 100644
--- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java
@@ -183,25 +183,25 @@ void testGetThresholdDirectlyAfterObjectInstantiation() {
@Test
void testConstructorWithNegativeCosts() {
- assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().insertCost(-1).build());
- assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().deleteCost(-1).build());
- assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().replaceCost(-1).build());
+ assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().setInsertCost(-1).build());
+ assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().setDeleteCost(-1).build());
+ assertThrows(IllegalArgumentException.class, () -> LevenshteinDistance.builder().setReplaceCost(-1).build());
}
@Test
void testGetLevenshteinDistance_WeightedUnlimited() {
// Substitution is very expensive (10) vs Insert/Delete (1 each)
- final LevenshteinDistance dist = LevenshteinDistance.builder().insertCost(1).deleteCost(1).replaceCost(10).build();
+ final LevenshteinDistance dist = LevenshteinDistance.builder().setInsertCost(1).setDeleteCost(1).setReplaceCost(10).build();
// 'a' -> 'b' should choose delete 'a' (1) and insert 'b' (1) = distance 2,
// instead of replace (10).
assertEquals(2, dist.apply("a", "b"));
// All operations are free (0)
- final LevenshteinDistance freeDist = LevenshteinDistance.builder().insertCost(0).deleteCost(0).replaceCost(0).build();
+ final LevenshteinDistance freeDist = LevenshteinDistance.builder().setInsertCost(0).setDeleteCost(0).setReplaceCost(0).build();
assertEquals(0, freeDist.apply("abc", "def"));
// Asymmetric costs: Insert=10, Delete=1, Replace=100
- final LevenshteinDistance asymmetric = LevenshteinDistance.builder().insertCost(10).deleteCost(1).replaceCost(100).build();
+ final LevenshteinDistance asymmetric = LevenshteinDistance.builder().setInsertCost(10).setDeleteCost(1).setReplaceCost(100).build();
assertEquals(1, asymmetric.apply("a", "")); // Delete 'a' = 1
assertEquals(10, asymmetric.apply("", "a")); // Insert 'a' = 10
}
@@ -209,22 +209,22 @@ void testGetLevenshteinDistance_WeightedUnlimited() {
@Test
void testGetLevenshteinDistance_WeightedThreshold() {
// Distance is 2 (via delete/insert), threshold is 5 -> result 2
- final LevenshteinDistance weighted = LevenshteinDistance.builder().threshold(5).insertCost(1).deleteCost(1).replaceCost(10).build();
+ final LevenshteinDistance weighted = LevenshteinDistance.builder().setThreshold(5).setInsertCost(1).setDeleteCost(1).setReplaceCost(10).build();
assertEquals(2, weighted.apply("a", "b"));
// Distance is 2, threshold is 1 -> result -1
- final LevenshteinDistance strict = LevenshteinDistance.builder().threshold(1).insertCost(1).deleteCost(1).replaceCost(10).build();
+ final LevenshteinDistance strict = LevenshteinDistance.builder().setThreshold(1).setInsertCost(1).setDeleteCost(1).setReplaceCost(10).build();
assertEquals(-1, strict.apply("a", "b"));
// Empty strings with weighted threshold
- assertEquals(0, LevenshteinDistance.builder().threshold(5).insertCost(2).deleteCost(2).replaceCost(2).build().apply("", ""));
- assertEquals(4, LevenshteinDistance.builder().threshold(5).insertCost(2).deleteCost(2).replaceCost(2).build().apply("aa", ""));
- assertEquals(-1, LevenshteinDistance.builder().threshold(1).insertCost(2).deleteCost(2).replaceCost(2).build().apply("aa", ""));
+ assertEquals(0, LevenshteinDistance.builder().setThreshold(5).setInsertCost(2).setDeleteCost(2).setReplaceCost(2).build().apply("", ""));
+ assertEquals(4, LevenshteinDistance.builder().setThreshold(5).setInsertCost(2).setDeleteCost(2).setReplaceCost(2).build().apply("aa", ""));
+ assertEquals(-1, LevenshteinDistance.builder().setThreshold(1).setInsertCost(2).setDeleteCost(2).setReplaceCost(2).build().apply("aa", ""));
}
@Test
void testWeightedAccessors() {
- final LevenshteinDistance dist = LevenshteinDistance.builder().threshold(10).insertCost(2).deleteCost(3).replaceCost(4).build();
+ final LevenshteinDistance dist = LevenshteinDistance.builder().setThreshold(10).setInsertCost(2).setDeleteCost(3).setReplaceCost(4).build();
assertEquals(10, dist.getThreshold());
assertEquals(2, dist.getInsertCost());
assertEquals(3, dist.getDeleteCost());