| 1 | /** |
| 2 | * |
| 3 | */ |
| 4 | package net.digitaltsunami.word.sequence; |
| 5 | |
| 6 | /** |
| 7 | * Computes edit distance using the Damerau-Levenshtein distance metric. |
| 8 | * Damerau-Levenshtein differs from Levenshtein in that the transposition of two |
| 9 | * values count as a single edit operation; whereas, in Levenshtein this would |
| 10 | * count as two. Each edit (insertion, deletion, substitution, transposition) |
| 11 | * counts as one edit. |
| 12 | * <p> |
| 13 | * This implementation provides a basic count of edits without weights. |
| 14 | * |
| 15 | * @author dhagberg |
| 16 | * |
| 17 | */ |
| 18 | public class DamerauLevenshteinDistanceStrategy implements EditDistanceStrategy { |
| 19 | /* |
| 20 | * (non-Javadoc) |
| 21 | * |
| 22 | * @see |
| 23 | * net.digitaltsunami.word.sequence.EditDistanceStrategy#getEditCount( |
| 24 | * java.lang.String, java.lang.String) |
| 25 | */ |
| 26 | @Override |
| 27 | public int getEditCount(String fromTerm, String toTerm) { |
| 28 | return calculateEditCount(fromTerm, toTerm); |
| 29 | } |
| 30 | |
| 31 | /* |
| 32 | * (non-Javadoc) |
| 33 | * |
| 34 | * @see |
| 35 | * net.digitaltsunami.word.sequence.EditDistanceStrategy#getEditDistance |
| 36 | * (java.lang.String, java.lang.String) |
| 37 | */ |
| 38 | @Override |
| 39 | public double getEditDistance(String fromTerm, String toTerm) { |
| 40 | return calculateEditCount(fromTerm, toTerm); |
| 41 | } |
| 42 | |
| 43 | /** |
| 44 | * Compute and return the Damerau-Levenshtein edit distance between the two |
| 45 | * provided sequences. Damerau-Levenshtein differs from Levenshtein in that |
| 46 | * the transposition of two values count as a single edit operation; |
| 47 | * whereas, in Levenshtein this would count as two. Each edit (insertion, |
| 48 | * deletion, substitution, transposition) counts as one edit. The total |
| 49 | * number of these edit operations required to convert string 1 to string 2 |
| 50 | * is calculated and returned. |
| 51 | * |
| 52 | * @param fromTerm |
| 53 | * @param toTerm |
| 54 | * @return total number of edits required to convert string 1 to string 2. |
| 55 | */ |
| 56 | public static int calculateEditCount(String fromTerm, String toTerm) { |
| 57 | int nRows = fromTerm.length() + 1; |
| 58 | int nCols = toTerm.length() + 1; |
| 59 | /* |
| 60 | * Create matrix to store edit operations. Each dimension is length + 1 |
| 61 | * to allow for full edit distance values. |
| 62 | */ |
| 63 | int editMatrix[][] = new int[nRows][nCols]; |
| 64 | /* |
| 65 | * Initialize first row and column to contain edit distances as if the |
| 66 | * other string were empty. |
| 67 | */ |
| 68 | for (int r = 0; r < nRows; r++) { |
| 69 | editMatrix[r][0] = r; |
| 70 | } |
| 71 | for (int c = 0; c < nCols; c++) { |
| 72 | editMatrix[0][c] = c; |
| 73 | } |
| 74 | /* |
| 75 | * Compare each character of the string against one another and |
| 76 | * calculate the number of edits needed at each stage. The total number |
| 77 | * of edits will be in editMatrix[fromTerm.len][toTerm.len] |
| 78 | */ |
| 79 | for (int r = 1; r < nRows; r++) { |
| 80 | for (int c = 1; c < nCols; c++) { |
| 81 | // Cost if values are different. Will be used below |
| 82 | int cost = (fromTerm.charAt(r - 1) == toTerm.charAt(c - 1)) ? 0 : 1; |
| 83 | // Minimum value of deletion, insertion, or substitution. |
| 84 | editMatrix[r][c] = |
| 85 | Math.min( |
| 86 | Math.min(editMatrix[r][c - 1] + 1, editMatrix[r - 1][c] + 1), |
| 87 | editMatrix[r - 1][c - 1] + cost); |
| 88 | if ((r > 1 && c > 1) |
| 89 | && (fromTerm.charAt(r - 1) == toTerm.charAt(c - 2)) |
| 90 | && (fromTerm.charAt(r - 2) == toTerm.charAt(c - 1))) { |
| 91 | editMatrix[r][c] = Math.min(editMatrix[r][c], editMatrix[r - 2][c - 2] + cost); |
| 92 | } |
| 93 | } |
| 94 | } |
| 95 | |
| 96 | return editMatrix[nRows - 1][nCols - 1]; |
| 97 | } |
| 98 | |
| 99 | } |