| 1 | package net.digitaltsunami.word.sequence; |
| 2 | |
| 3 | /** |
| 4 | * Combination of edit distance and normalization strategies used to provide a |
| 5 | * configuration for calculating edit distances. |
| 6 | * <p> |
| 7 | * By allowing the the setting of the distance strategy separate from the |
| 8 | * normalization strategy, distance calculations can be tailored without the |
| 9 | * need to create additional subclasses. |
| 10 | * |
| 11 | * @author dhagberg |
| 12 | * |
| 13 | */ |
| 14 | public class EditDistanceCalculator { |
| 15 | private final EditDistanceStrategy distanceStrategy; |
| 16 | private final EditDistanceNormalization distanceNormalizer; |
| 17 | |
| 18 | /** |
| 19 | * Instantiate an EditDistanceCalculator using the default edit distance |
| 20 | * strategy and default normalization strategy. |
| 21 | */ |
| 22 | public EditDistanceCalculator() { |
| 23 | this(new LevenshteinDistanceStrategy(), new TermLengthNormalization()); |
| 24 | } |
| 25 | |
| 26 | /** |
| 27 | * Instantiate an EditDistanceCalculator using the provided |
| 28 | * {@link EditDistanceStrategy} and default normalization strategy. |
| 29 | * |
| 30 | * @param strategy |
| 31 | * used to calculate edit distance. |
| 32 | */ |
| 33 | public EditDistanceCalculator(EditDistanceStrategy strategy) { |
| 34 | this(strategy, new TermLengthNormalization()); |
| 35 | } |
| 36 | |
| 37 | /** |
| 38 | * Instantiate an EditDistanceCalculator using the default edit distance |
| 39 | * strategy and the provided {@link EditDistanceNormalization}. |
| 40 | * |
| 41 | * @param normalizer |
| 42 | * used to normalize edit distances. |
| 43 | */ |
| 44 | public EditDistanceCalculator(EditDistanceNormalization normalizer) { |
| 45 | this(new LevenshteinDistanceStrategy(), normalizer); |
| 46 | } |
| 47 | |
| 48 | /** |
| 49 | * Instantiate an EditDistanceCalculator using the provided |
| 50 | * {@link EditDistanceStrategy} and {@link EditDistanceNormalization}. |
| 51 | * |
| 52 | * @param strategy |
| 53 | * used to calculate edit distance. |
| 54 | * @param normalizer |
| 55 | * used to normalize edit distances. |
| 56 | */ |
| 57 | public EditDistanceCalculator(EditDistanceStrategy strategy, EditDistanceNormalization normalizer) { |
| 58 | this.distanceStrategy = strategy; |
| 59 | this.distanceNormalizer = normalizer; |
| 60 | } |
| 61 | |
| 62 | /** |
| 63 | * Calculate and return the number of edits required to convert fromTerm |
| 64 | * into toTerm. As this method provides only a count of the required edits, |
| 65 | * no significance will be applied to the length of the two terms or any |
| 66 | * edit weights. |
| 67 | * |
| 68 | * @param fromTerm |
| 69 | * initial term used as baseline |
| 70 | * @param toTerm |
| 71 | * target term from which the edit count will be calculated. |
| 72 | * @return a count of applicable edits required to convert the fromTerm to |
| 73 | * toTerm. Count will be an integer value in the range: 0 >= count |
| 74 | * >= max_length(fromTerm, toTerm) |
| 75 | */ |
| 76 | public int getEditCount(String fromTerm, String toTerm) { |
| 77 | return distanceStrategy.getEditCount(fromTerm, toTerm); |
| 78 | } |
| 79 | |
| 80 | /** |
| 81 | * Calculate and return the edit distance between fromTerm and toTerm. The |
| 82 | * distance is calculated based on features specific to the current edit |
| 83 | * distance strategy. As an example, the strategy may, but is not required |
| 84 | * to, take into account such factors as: common mistakes, keyboard |
| 85 | * location, sounds, etc.. |
| 86 | * |
| 87 | * @param fromTerm |
| 88 | * initial term used as baseline |
| 89 | * @param toTerm |
| 90 | * target term from which the edit count will be calculated. |
| 91 | * @return a value returned will be in the range: 0 >= distance >= |
| 92 | * max_length(fromTerm, toTerm) with an identical term being 0 and |
| 93 | * increasing in size as the difference in the terms increases. |
| 94 | */ |
| 95 | public double getEditDistance(String fromTerm, String toTerm) { |
| 96 | return distanceStrategy.getEditDistance(fromTerm, toTerm); |
| 97 | } |
| 98 | |
| 99 | /** |
| 100 | * Calculate and return the <strong>normalized</strong> edit distance |
| 101 | * between fromTerm and toTerm. The distance is calculated based on features |
| 102 | * specific to the current edit distance strategy. As an example, the |
| 103 | * strategy may, but is not required to, take into account such factors as: |
| 104 | * common mistakes, keyboard location, sounds, etc.. |
| 105 | * <p> |
| 106 | * After the edit distance is calculated, it will be normalized using the |
| 107 | * current normalization strategy to provide a value in the range [0,1] with |
| 108 | * a 0 being a complete mismatch (no characters in common) and 1 being an |
| 109 | * exact match. |
| 110 | * |
| 111 | * @param fromTerm |
| 112 | * initial term used as baseline |
| 113 | * @param toTerm |
| 114 | * target term from which the edit count will be calculated. |
| 115 | * |
| 116 | * @return a value returned will be in the closed interval: [0, 1] with an |
| 117 | * identical term being 1 and decreasing towards zero as the |
| 118 | * difference in the terms increases. |
| 119 | */ |
| 120 | public double getNormalizedEditDistance(String fromTerm, String toTerm) { |
| 121 | double distance = distanceStrategy.getEditDistance(fromTerm, toTerm); |
| 122 | return distanceNormalizer.getNormalizedEditDistance(distance, fromTerm, toTerm); |
| 123 | } |
| 124 | } |