| 1 | package net.digitaltsunami.word.sequence; |
| 2 | |
| 3 | import java.util.ArrayList; |
| 4 | import java.util.Collection; |
| 5 | import java.util.Collections; |
| 6 | import java.util.Iterator; |
| 7 | import java.util.List; |
| 8 | import java.util.concurrent.atomic.AtomicReference; |
| 9 | |
| 10 | /** |
| 11 | * Provides a set of utility methods related to edit distance. |
| 12 | * |
| 13 | * @author dhagberg |
| 14 | * |
| 15 | */ |
| 16 | public class EditDistance { |
| 17 | /** |
| 18 | * Used to calculate edit distance using strategy. Distance will be |
| 19 | * calculated by invoking |
| 20 | * {@link EditDistanceCalculator#getEditDistance(String, String)} |
| 21 | */ |
| 22 | private static AtomicReference<EditDistanceCalculator> defaultDistanceCalculator = new AtomicReference<EditDistanceCalculator>( |
| 23 | new EditDistanceCalculator()); |
| 24 | |
| 25 | /** |
| 26 | * Reduce the provided collection of strings to list of strings within an |
| 27 | * edit distance provided by the maxDistance parameter. Collection passed in |
| 28 | * will not be modified. The results will be returned in a new collection. |
| 29 | * |
| 30 | * @param candidates |
| 31 | * Collection of terms to compare against fromTerm |
| 32 | * @param fromTerm |
| 33 | * Term from which the edit distance will be calculated. |
| 34 | * @param maxDistance |
| 35 | * Maximum edit distance cutoff for matching candidates. |
| 36 | * @return |
| 37 | */ |
| 38 | public static Collection<String> getAllWithinDistance(Collection<String> candidates, |
| 39 | String fromTerm, int maxDistance) { |
| 40 | return getAllWithinDistance(candidates, fromTerm, maxDistance, |
| 41 | defaultDistanceCalculator.get()); |
| 42 | } |
| 43 | |
| 44 | /** |
| 45 | * Reduce the provided collection of strings to list of strings within an |
| 46 | * edit distance provided by the maxDistance parameter. Collection passed in |
| 47 | * will not be modified. The results will be returned in a new collection. |
| 48 | * |
| 49 | * @param candidates |
| 50 | * Collection of terms to compare against fromTerm |
| 51 | * @param fromTerm |
| 52 | * Term from which the edit distance will be calculated. |
| 53 | * @param maxDistance |
| 54 | * Maximum edit distance cutoff for matching candidates. |
| 55 | * @param distanceCalculator |
| 56 | * used to calculate edit distance using strategy. Distance will |
| 57 | * be calculated by invoking |
| 58 | * {@link EditDistanceCalculator#getEditDistance(String, String)} |
| 59 | * @return |
| 60 | */ |
| 61 | public static Collection<String> getAllWithinDistance(Collection<String> candidates, |
| 62 | String fromTerm, int maxDistance, EditDistanceCalculator distanceCalculator) { |
| 63 | |
| 64 | List<WeightedString> withinDistance = new ArrayList<EditDistance.WeightedString>( |
| 65 | candidates.size()); |
| 66 | /* |
| 67 | * Add all terms within edit distance, storing the distance and term to |
| 68 | * order the list after completion. |
| 69 | */ |
| 70 | for (Iterator<String> iterator = candidates.iterator(); iterator.hasNext();) { |
| 71 | String candidate = iterator.next(); |
| 72 | double editDistance = distanceCalculator.getEditDistance(fromTerm, candidate); |
| 73 | if (editDistance <= maxDistance) { |
| 74 | withinDistance.add(new WeightedString(editDistance, candidate)); |
| 75 | } |
| 76 | } |
| 77 | |
| 78 | // Sort the qualified terms by edit distance. |
| 79 | Collections.sort(withinDistance); |
| 80 | /* |
| 81 | * Copy the contents (term) of each WeightedString to a collection for |
| 82 | * return. hold all returned terms to prevent unnecessary expansions. |
| 83 | */ |
| 84 | ArrayList<String> allTerms = new ArrayList<String>(withinDistance.size()); |
| 85 | for (WeightedString ws : withinDistance) { |
| 86 | allTerms.add(ws.string); |
| 87 | } |
| 88 | |
| 89 | return allTerms; |
| 90 | } |
| 91 | |
| 92 | /** |
| 93 | * Testing method to dump out edit matrix. |
| 94 | * |
| 95 | * @param fromTerm |
| 96 | * @param toTerm |
| 97 | * @param prefixLengths |
| 98 | */ |
| 99 | private static void printEditMatrix(char[] fromTerm, char[] toTerm, int[][] prefixLengths) { |
| 100 | System.out.printf("\n%3s", ""); // Print first Row |
| 101 | System.out.printf("%2s", "0"); // Print 0 col hdr |
| 102 | for (int c = 0; c < toTerm.length; c++) { // Print String 2 |
| 103 | System.out.printf("%3s", toTerm[c]); |
| 104 | } |
| 105 | System.out.printf("\n"); |
| 106 | System.out.printf("%2s", " 0"); // Print 0 row hdr |
| 107 | for (int c = 0; c <= toTerm.length; c++) { // Print 0 for each column |
| 108 | System.out.printf("|%2d", prefixLengths[0][c]); |
| 109 | } |
| 110 | for (int r = 1; r <= fromTerm.length; r++) { |
| 111 | System.out.printf("\n%2s", fromTerm[r - 1]); |
| 112 | for (int c = 0; c < toTerm.length + 1; c++) { |
| 113 | System.out.printf("|%2d", prefixLengths[r][c]); |
| 114 | } |
| 115 | } |
| 116 | System.out.println(""); |
| 117 | } |
| 118 | |
| 119 | /** |
| 120 | * Set the {@link EditDistanceCalculator} used as the default when not |
| 121 | * explicitly provided to methods within this class. |
| 122 | * |
| 123 | * @param editDistanceCalculator |
| 124 | */ |
| 125 | public static void setDefaultEditDistanceCalculator( |
| 126 | EditDistanceCalculator editDistanceCalculator) { |
| 127 | defaultDistanceCalculator.set(editDistanceCalculator); |
| 128 | } |
| 129 | |
| 130 | /** |
| 131 | * Temporary result storing string value and edit distance from primary |
| 132 | * term. |
| 133 | * |
| 134 | */ |
| 135 | private static class WeightedString implements Comparable<WeightedString> { |
| 136 | final double weight; |
| 137 | final String string; |
| 138 | |
| 139 | private WeightedString(double weight, String string) { |
| 140 | this.weight = weight; |
| 141 | this.string = string; |
| 142 | } |
| 143 | |
| 144 | @Override |
| 145 | public int compareTo(WeightedString o) { |
| 146 | long delta = Double.doubleToLongBits(weight) - Double.doubleToLongBits(o.weight); |
| 147 | if (delta == 0) { |
| 148 | return this.string.compareTo(o.string); |
| 149 | } else { |
| 150 | return delta > 0 ? 1 : -1; |
| 151 | } |
| 152 | } |
| 153 | |
| 154 | /* |
| 155 | * (non-Javadoc) |
| 156 | * |
| 157 | * @see java.lang.Object#hashCode() |
| 158 | */ |
| 159 | @Override |
| 160 | public int hashCode() { |
| 161 | final int prime = 31; |
| 162 | int result = 1; |
| 163 | result = prime * result + ((string == null) ? 0 : string.hashCode()); |
| 164 | long temp; |
| 165 | temp = Double.doubleToLongBits(weight); |
| 166 | result = prime * result + (int) (temp ^ (temp >>> 32)); |
| 167 | return result; |
| 168 | } |
| 169 | |
| 170 | /* |
| 171 | * (non-Javadoc) |
| 172 | * |
| 173 | * @see java.lang.Object#equals(java.lang.Object) |
| 174 | */ |
| 175 | @Override |
| 176 | public boolean equals(Object obj) { |
| 177 | if (this == obj) |
| 178 | return true; |
| 179 | if (obj == null) |
| 180 | return false; |
| 181 | if (getClass() != obj.getClass()) |
| 182 | return false; |
| 183 | WeightedString other = (WeightedString) obj; |
| 184 | if (string == null) { |
| 185 | if (other.string != null) |
| 186 | return false; |
| 187 | } else if (!string.equals(other.string)) |
| 188 | return false; |
| 189 | if (Double.doubleToLongBits(weight) != Double.doubleToLongBits(other.weight)) |
| 190 | return false; |
| 191 | return true; |
| 192 | } |
| 193 | } |
| 194 | } |