Skip to content

Commit 5bec82b

Browse files
committed
Cleaned code (shingle/profile based algorithms)
1 parent 55245cc commit 5bec82b

File tree

13 files changed

+164
-548
lines changed

13 files changed

+164
-548
lines changed

src/main/java/info/debatty/java/stringsimilarity/Cosine.java

Lines changed: 33 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525

2626
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
2727
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
28+
import java.util.Map;
2829
import net.jcip.annotations.Immutable;
2930

3031
/**
@@ -72,39 +73,52 @@ public final double similarity(final String s1, final String s2) {
7273
if (s1.length() < getK() || s2.length() < getK()) {
7374
return 0;
7475
}
75-
KShingling ks = new KShingling(getK());
76-
int[] profile1 = ks.getArrayProfile(s1);
77-
int[] profile2 = ks.getArrayProfile(s2);
76+
77+
Map<String, Integer> profile1 = getProfile(s1);
78+
Map<String, Integer> profile2 = getProfile(s2);
7879

7980
return dotProduct(profile1, profile2)
8081
/ (norm(profile1) * norm(profile2));
8182
}
8283

84+
85+
8386
/**
8487
* Compute the norm L2 : sqrt(Sum_i( v_i²)).
8588
*
8689
* @param profile
8790
* @return L2 norm
8891
*/
89-
private static double norm(final int[] profile) {
92+
private static double norm(final Map<String, Integer> profile) {
9093
double agg = 0;
9194

92-
for (int v : profile) {
93-
agg += 1.0 * v * v;
95+
for (Map.Entry<String, Integer> entry : profile.entrySet()) {
96+
agg += 1.0 * entry.getValue() * entry.getValue();
9497
}
9598

9699
return Math.sqrt(agg);
97100
}
98101

99102
private static double dotProduct(
100-
final int[] profile1, final int[] profile2) {
103+
final Map<String, Integer> profile1,
104+
final Map<String, Integer> profile2) {
105+
106+
// Loop over the smallest map
107+
Map<String, Integer> small_profile = profile2;
108+
Map<String, Integer> large_profile = profile1;
109+
if (profile1.size() < profile2.size()) {
110+
small_profile = profile1;
111+
large_profile = profile2;
112+
}
101113

102-
// profiles may not have the same length
103-
int length = Math.min(profile1.length, profile2.length);
104114
double agg = 0;
105-
for (int i = 0; i < length; i++) {
106-
agg += 1.0 * profile1[i] * profile2[i];
115+
for (Map.Entry<String, Integer> entry : small_profile.entrySet()) {
116+
if (!large_profile.containsKey(entry.getKey())) {
117+
continue;
118+
}
119+
agg += 1.0 * entry.getValue() * large_profile.get(entry.getKey());
107120
}
121+
108122
return agg;
109123
}
110124

@@ -118,4 +132,12 @@ public final double distance(final String s1, final String s2) {
118132
return 1.0 - similarity(s1, s2);
119133
}
120134

135+
public double similarity(
136+
final Map<String, Integer> profile1,
137+
final Map<String, Integer> profile2) {
138+
139+
return dotProduct(profile1, profile2)
140+
/ (norm(profile1) * norm(profile2));
141+
}
142+
121143
}

src/main/java/info/debatty/java/stringsimilarity/Jaccard.java

Lines changed: 12 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,9 @@
2727
import info.debatty.java.stringsimilarity.interfaces.MetricStringDistance;
2828
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
2929
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
30+
import java.util.HashSet;
31+
import java.util.Map;
32+
import java.util.Set;
3033
import net.jcip.annotations.Immutable;
3134

3235
/**
@@ -71,28 +74,22 @@ public Jaccard() {
7174
* @return
7275
*/
7376
public final double similarity(final String s1, final String s2) {
74-
KShingling ks = new KShingling(getK());
75-
int[] profile1 = ks.getArrayProfile(s1);
76-
int[] profile2 = ks.getArrayProfile(s2);
77+
Map<String, Integer> profile1 = getProfile(s1);
78+
Map<String, Integer> profile2 = getProfile(s2);
7779

78-
int length = Math.max(profile1.length, profile2.length);
79-
profile1 = java.util.Arrays.copyOf(profile1, length);
80-
profile2 = java.util.Arrays.copyOf(profile2, length);
80+
Set<String> union = new HashSet<String>();
81+
union.addAll(profile1.keySet());
82+
union.addAll(profile2.keySet());
8183

8284
int inter = 0;
83-
int union = 0;
8485

85-
for (int i = 0; i < length; i++) {
86-
if (profile1[i] > 0 || profile2[i] > 0) {
87-
union++;
88-
89-
if (profile1[i] > 0 && profile2[i] > 0) {
90-
inter++;
91-
}
86+
for (String key : union) {
87+
if (profile1.containsKey(key) && profile2.containsKey(key)) {
88+
inter++;
9289
}
9390
}
9491

95-
return 1.0 * inter / union;
92+
return 1.0 * inter / union.size();
9693
}
9794

9895

src/main/java/info/debatty/java/stringsimilarity/KShingling.java

Lines changed: 0 additions & 191 deletions
This file was deleted.

src/main/java/info/debatty/java/stringsimilarity/QGram.java

Lines changed: 23 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,9 @@
11
package info.debatty.java.stringsimilarity;
22

33
import info.debatty.java.stringsimilarity.interfaces.StringDistance;
4+
import java.util.HashSet;
5+
import java.util.Map;
6+
import java.util.Set;
47
import net.jcip.annotations.Immutable;
58

69
/**
@@ -21,7 +24,7 @@ public class QGram extends ShingleBased implements StringDistance {
2124
* string-matching with q-grams and maximal matches",
2225
* http://www.sciencedirect.com/science/article/pii/0304397592901434 The
2326
* distance between two strings is defined as the L1 norm of the difference
24-
* of their profiles (the number of occurence of each k-shingle). Q-gram
27+
* of their profiles (the number of occurrences of each k-shingle). Q-gram
2528
* distance is a lower bound on Levenshtein distance, but can be computed in
2629
* O(|A| + |B|), where Levenshtein requires O(|A|.|B|)
2730
*
@@ -53,19 +56,27 @@ public QGram() {
5356
* @return
5457
*/
5558
public final double distance(final String s1, final String s2) {
56-
KShingling ks = new KShingling(getK());
57-
int[] profile1 = ks.getArrayProfile(s1);
58-
int[] profile2 = ks.getArrayProfile(s2);
59-
int length = Math.max(profile1.length, profile2.length);
59+
Map<String, Integer> profile1 = getProfile(s1);
60+
Map<String, Integer> profile2 = getProfile(s2);
6061

61-
profile1 = java.util.Arrays.copyOf(profile1, length);
62-
profile2 = java.util.Arrays.copyOf(profile2, length);
62+
Set<String> union = new HashSet<String>();
63+
union.addAll(profile1.keySet());
64+
union.addAll(profile2.keySet());
6365

64-
int d = 0;
65-
for (int i = 0; i < length; i++) {
66-
d += Math.abs(profile1[i] - profile2[i]);
67-
}
66+
int agg = 0;
67+
for (String key : union) {
68+
int v1 = 0;
69+
int v2 = 0;
70+
if (profile1.containsKey(key)) {
71+
v1 = profile1.get(key);
72+
}
6873

69-
return d;
74+
75+
if (profile2.containsKey(key)) {
76+
v2 = profile2.get(key);
77+
}
78+
agg += Math.abs(v1 - v2);
79+
}
80+
return agg;
7081
}
7182
}

0 commit comments

Comments
 (0)