Skip to content

Commit 2fe4275

Browse files
committed
added tests for cosine, damerau and LCS
1 parent a5e647a commit 2fe4275

File tree

7 files changed

+355
-148
lines changed

7 files changed

+355
-148
lines changed

src/main/java/info/debatty/java/stringsimilarity/Cosine.java

Lines changed: 20 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -21,87 +21,66 @@
2121
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2222
* THE SOFTWARE.
2323
*/
24-
2524
package info.debatty.java.stringsimilarity;
2625

2726
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
2827
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
2928

3029
/**
31-
* The similarity between the two strings is the cosine of the angle between
30+
* The similarity between the two strings is the cosine of the angle between
3231
* these two vector representations. It is computed as V1 . V2 / (|V1| * |V2|)
3332
* The cosine distance is computed as 1 - cosine similarity.
33+
*
3434
* @author Thibault Debatty
3535
*/
36-
public class Cosine extends ShingleBased implements
37-
NormalizedStringDistance, NormalizedStringSimilarity{
36+
public class Cosine extends ShingleBased implements
37+
NormalizedStringDistance, NormalizedStringSimilarity {
3838

39-
public static void main(String[] args) {
40-
Cosine cos = new Cosine(3);
41-
42-
// ABC BCE
43-
// 1 0
44-
// 1 1
45-
// angle = 45°
46-
// => similarity = .71
47-
48-
System.out.println(cos.similarity("ABC", "ABCE"));
49-
50-
cos = new Cosine(2);
51-
// AB BA
52-
// 2 1
53-
// 1 1
54-
// similarity = .95
55-
System.out.println(cos.similarity("ABAB", "BAB"));
56-
57-
58-
}
59-
6039
/**
61-
* Implements Cosine Similarity between strings.
62-
* The strings are first transformed into vectors of occurrences of k-shingles
63-
* (sequences of k characters). In this n-dimensional space, the similarity
64-
* between the two strings is the cosine of their respective vectors.
65-
*
66-
* @param k
40+
* Implements Cosine Similarity between strings. The strings are first
41+
* transformed into vectors of occurrences of k-shingles (sequences of k
42+
* characters). In this n-dimensional space, the similarity between the two
43+
* strings is the cosine of their respective vectors.
44+
*
45+
* @param k
6746
*/
6847
public Cosine(int k) {
6948
super(k);
7049
}
71-
50+
7251
public Cosine() {
7352
super();
7453
}
75-
76-
54+
7755
public double similarity(String s1, String s2) {
7856
KShingling ks = new KShingling(k);
7957
int[] profile1 = ks.getArrayProfile(s1);
8058
int[] profile2 = ks.getArrayProfile(s2);
81-
59+
8260
return dotProduct(profile1, profile2) / (norm(profile1) * norm(profile2));
8361
}
84-
62+
8563
/**
8664
* Compute the norm L2 : sqrt(Sum_i( v_i²))
65+
*
8766
* @param profile
8867
* @return L2 norm
8968
*/
9069
protected static double norm(int[] profile) {
9170
double agg = 0;
92-
71+
9372
for (int v : profile) {
9473
agg += v * v;
9574
}
96-
75+
9776
return Math.sqrt(agg);
9877
}
99-
78+
10079
protected static double dotProduct(int[] profile1, int[] profile2) {
10180
int length = Math.max(profile1.length, profile2.length);
10281
profile1 = java.util.Arrays.copyOf(profile1, length);
10382
profile2 = java.util.Arrays.copyOf(profile2, length);
104-
83+
10584
double agg = 0;
10685
for (int i = 0; i < length; i++) {
10786
agg += profile1[i] * profile2[i];
@@ -112,5 +91,5 @@ protected static double dotProduct(int[] profile1, int[] profile2) {
11291
public double distance(String s1, String s2) {
11392
return 1.0 - similarity(s1, s2);
11493
}
115-
94+
11695
}

src/main/java/info/debatty/java/stringsimilarity/Damerau.java

Lines changed: 26 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -27,105 +27,85 @@
2727
import info.debatty.java.stringsimilarity.interfaces.StringDistance;
2828

2929
/**
30-
* Implementation of Damerau-Levenshtein distance, computed as the
31-
* minimum number of operations needed to transform one string into the other,
32-
* where an operation is defined as an insertion, deletion, or substitution of a
33-
* single character, or a transposition of two adjacent characters.
34-
*
35-
* This is not to be confused with the optimal string alignment distance, which
30+
* Implementation of Damerau-Levenshtein distance, computed as the minimum
31+
* number of operations needed to transform one string into the other, where an
32+
* operation is defined as an insertion, deletion, or substitution of a single
33+
* character, or a transposition of two adjacent characters.
34+
*
35+
* This is not to be confused with the optimal string alignment distance, which
3636
* is an extension where no substring can be edited more than once.
37-
*
38-
* Also, Damerau-Levenshtein does not respect triangle inequality, and is thus
37+
*
38+
* Also, Damerau-Levenshtein does not respect triangle inequality, and is thus
3939
* not a metric distance.
40-
*
40+
*
4141
* @author Thibault Debatty
4242
*/
4343
public class Damerau implements StringDistance {
4444

45-
46-
public static void main(String[] args) {
47-
48-
Damerau d = new Damerau();
49-
50-
// 1 substitution
51-
System.out.println(d.distance("ABCDEF", "ABDCEF"));
52-
53-
// 2 substitutions
54-
System.out.println(d.distance("ABCDEF", "BACDFE"));
55-
56-
// 1 deletion
57-
System.out.println(d.distance("ABCDEF", "ABCDE"));
58-
System.out.println(d.distance("ABCDEF", "BCDEF"));
59-
System.out.println(d.distance("ABCDEF", "ABCGDEF"));
60-
61-
// All different
62-
System.out.println(d.distance("ABCDEF", "POIU"));
63-
}
64-
6545
public double distance(String s1, String s2) {
6646

6747
// INFinite distance is the max possible distance
6848
int INF = s1.length() + s2.length();
69-
49+
7050
// Create and initialize the character array indices
7151
HashMap<Character, Integer> DA = new HashMap<Character, Integer>();
72-
52+
7353
for (int d = 0; d < s1.length(); d++) {
7454
if (!DA.containsKey(s1.charAt(d))) {
7555
DA.put(s1.charAt(d), 0);
7656
}
7757
}
78-
58+
7959
for (int d = 0; d < s2.length(); d++) {
8060
if (!DA.containsKey(s2.charAt(d))) {
8161
DA.put(s2.charAt(d), 0);
8262
}
8363
}
84-
64+
8565
// Create the distance matrix H[0 .. s1.length+1][0 .. s2.length+1]
8666
int[][] H = new int[s1.length() + 2][s2.length() + 2];
87-
67+
8868
// initialize the left and top edges of H
8969
for (int i = 0; i <= s1.length(); i++) {
9070
H[i + 1][0] = INF;
9171
H[i + 1][1] = i;
9272
}
93-
73+
9474
for (int j = 0; j <= s2.length(); j++) {
9575
H[0][j + 1] = INF;
9676
H[1][j + 1] = j;
97-
77+
9878
}
99-
79+
10080
// fill in the distance matrix H
10181
// look at each character in s1
10282
for (int i = 1; i <= s1.length(); i++) {
10383
int DB = 0;
104-
84+
10585
// look at each character in s2
10686
for (int j = 1; j <= s2.length(); j++) {
10787
int i1 = DA.get(s2.charAt(j - 1));
10888
int j1 = DB;
109-
89+
11090
int cost = 1;
11191
if (s1.charAt(i - 1) == s2.charAt(j - 1)) {
11292
cost = 0;
11393
DB = j;
11494
}
115-
95+
11696
H[i + 1][j + 1] = min(
117-
H[i][j] + cost, // substitution
118-
H[i + 1][j] + 1, // insertion
119-
H[i][j + 1] + 1, // deletion
97+
H[i][j] + cost, // substitution
98+
H[i + 1][j] + 1, // insertion
99+
H[i][j + 1] + 1, // deletion
120100
H[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1));
121101
}
122-
102+
123103
DA.put(s1.charAt(i - 1), i);
124104
}
125-
105+
126106
return H[s1.length() + 1][s2.length() + 1];
127107
}
128-
108+
129109
protected static int min(int a, int b, int c, int d) {
130110
return Math.min(a, Math.min(b, Math.min(c, d)));
131111
}

src/main/java/info/debatty/java/stringsimilarity/LongestCommonSubsequence.java

Lines changed: 46 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -3,107 +3,92 @@
33
import info.debatty.java.stringsimilarity.interfaces.StringDistance;
44

55
/**
6-
* The longest common subsequence (LCS) problem consists in finding the
7-
* longest subsequence common to two (or more) sequences. It differs from
8-
* problems of finding common substrings: unlike substrings, subsequences are
9-
* not required to occupy consecutive positions within the original sequences.
10-
*
6+
* The longest common subsequence (LCS) problem consists in finding the longest
7+
* subsequence common to two (or more) sequences. It differs from problems of
8+
* finding common substrings: unlike substrings, subsequences are not required
9+
* to occupy consecutive positions within the original sequences.
10+
*
1111
* It is used by the diff utility, by Git for reconciling multiple changes, etc.
12-
*
13-
* The LCS distance between Strings X (length n) and Y (length m) is
14-
* n + m - 2 |LCS(X, Y)|
15-
* min = 0
16-
* max = n + m
17-
*
18-
* LCS distance is equivalent to Levenshtein distance, when only insertion and
19-
* deletion are allowed (no substitution), or when the cost of the substitution
12+
*
13+
* The LCS distance between Strings X (length n) and Y (length m) is n + m - 2
14+
* |LCS(X, Y)| min = 0 max = n + m
15+
*
16+
* LCS distance is equivalent to Levenshtein distance, when only insertion and
17+
* deletion are allowed (no substitution), or when the cost of the substitution
2018
* is double the cost of an insertion or deletion.
21-
*
22-
* ! This class currently implements the dynamic programming approach, which
23-
* has a space requirement O(m * n)!
24-
*
19+
*
20+
* ! This class currently implements the dynamic programming approach, which has
21+
* a space requirement O(m * n)!
22+
*
2523
* @author Thibault Debatty
2624
*/
2725
public class LongestCommonSubsequence implements StringDistance {
2826

2927
/**
30-
* @param args the command line arguments
31-
*/
32-
public static void main(String[] args) {
33-
LongestCommonSubsequence lcs = new LongestCommonSubsequence();
34-
35-
// Will produce 4.0
36-
System.out.println(lcs.distance("AGCAT", "GAC"));
37-
38-
// Will produce 1.0
39-
System.out.println(lcs.distance("AGCAT", "AGCT"));
40-
}
41-
42-
43-
/**
44-
* Return the LCS distance between strings s1 and s2,
45-
* computed as |s1| + |s2| - 2 * |LCS(s1, s2)|
28+
* Return the LCS distance between strings s1 and s2, computed as |s1| +
29+
* |s2| - 2 * |LCS(s1, s2)|
30+
*
4631
* @param s1
4732
* @param s2
48-
* @return the LCS distance between strings s1 and s2, computed as |s1| + |s2| - 2 * |LCS(s1, s2)|
33+
* @return the LCS distance between strings s1 and s2, computed as |s1| +
34+
* |s2| - 2 * |LCS(s1, s2)|
4935
*/
5036
public double distance(String s1, String s2) {
5137
return s1.length() + s2.length() - 2 * length(s1, s2);
5238
}
53-
39+
5440
/**
5541
* Return the length of Longest Common Subsequence (LCS) between strings s1
5642
* and s2.
43+
*
5744
* @param s1
5845
* @param s2
5946
* @return the length of LCS(s1, s2)
6047
*/
6148
protected int length(String s1, String s2) {
6249
/* function LCSLength(X[1..m], Y[1..n])
63-
C = array(0..m, 0..n)
50+
C = array(0..m, 0..n)
6451
65-
for i := 0..m
66-
C[i,0] = 0
52+
for i := 0..m
53+
C[i,0] = 0
6754
68-
for j := 0..n
69-
C[0,j] = 0
55+
for j := 0..n
56+
C[0,j] = 0
7057
71-
for i := 1..m
72-
for j := 1..n
73-
if X[i] = Y[j]
74-
C[i,j] := C[i-1,j-1] + 1
75-
else
76-
C[i,j] := max(C[i,j-1], C[i-1,j])
77-
return C[m,n]
78-
*/
58+
for i := 1..m
59+
for j := 1..n
60+
if X[i] = Y[j]
61+
C[i,j] := C[i-1,j-1] + 1
62+
else
63+
C[i,j] := max(C[i,j-1], C[i-1,j])
64+
return C[m,n]
65+
*/
7966
int m = s1.length();
8067
int n = s2.length();
8168
char[] X = s1.toCharArray();
8269
char[] Y = s2.toCharArray();
83-
84-
int[][] C = new int[m+1][n+1];
85-
70+
71+
int[][] C = new int[m + 1][n + 1];
72+
8673
for (int i = 0; i <= m; i++) {
8774
C[i][0] = 0;
8875
}
89-
76+
9077
for (int j = 0; j <= n; j++) {
9178
C[0][j] = 0;
9279
}
93-
94-
for (int i = 1; i <=m ; i++) {
80+
81+
for (int i = 1; i <= m; i++) {
9582
for (int j = 1; j <= n; j++) {
96-
if (X[i-1] == Y[j-1]) {
97-
C[i][j] = C[i-1][j-1] + 1;
98-
83+
if (X[i - 1] == Y[j - 1]) {
84+
C[i][j] = C[i - 1][j - 1] + 1;
85+
9986
} else {
100-
C[i][j] = Math.max(C[i][j-1], C[i-1][j]);
87+
C[i][j] = Math.max(C[i][j - 1], C[i - 1][j]);
10188
}
10289
}
10390
}
104-
91+
10592
return C[m][n];
106-
10793
}
108-
10994
}

0 commit comments

Comments
 (0)