Commit 10b7271

Moved examples to "examples" package
1 parent 2fe4275 commit 10b7271

7 files changed (+170, -200 lines)


src/main/java/info/debatty/java/stringsimilarity/NGram.java

Lines changed: 6 additions & 20 deletions
@@ -4,32 +4,19 @@
 
 /**
  * N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance",
- * String Processing and Information Retrieval, Lecture Notes in Computer
+ * String Processing and Information Retrieval, Lecture Notes in Computer
  * Science Volume 3772, 2005, pp 115-126.
- *
- * The algorithm uses affixing with special character '\n' to increase the
- * weight of first characters. The normalization is achieved by dividing the
+ *
+ * The algorithm uses affixing with special character '\n' to increase the
+ * weight of first characters. The normalization is achieved by dividing the
  * total similarity score the original length of the longest word.
- *
+ *
  * http://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf
  */
 public class NGram implements NormalizedStringDistance {
-
-    public static void main(String[] args) {
-
-        // produces 0.416666
-        NGram twogram = new NGram(2);
-        System.out.println(twogram.distance("ABCD", "ABTUIO"));
-
-        // produces 0.97222
-        String s1 = "Adobe CreativeSuite 5 Master Collection from cheap 4zp";
-        String s2 = "Adobe CreativeSuite 5 Master Collection from cheap d1x";
-        NGram ngram = new NGram(4);
-        System.out.println(ngram.distance(s1, s2));
-    }
 
     private final int n;
-
+
     public NGram(int n) {
         this.n = n;
     }
@@ -38,7 +25,6 @@ public NGram() {
         this.n = 2;
     }
 
-
     @Override
     public double distance(String s0, String s1) {
         final char special = '\n';
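For reference, the usage shown by the deleted main() can be reproduced with a standalone sketch like the one below. The wrapper class name is invented for illustration; the actual file added to the "examples" package is not part of this diff.

import info.debatty.java.stringsimilarity.NGram;

// Hypothetical example class; the real one in the "examples" package is not shown in this diff.
public class NGramExample {

    public static void main(String[] args) {
        // Bigram (n = 2) distance; the deleted demo reported about 0.416666
        NGram twogram = new NGram(2);
        System.out.println(twogram.distance("ABCD", "ABTUIO"));

        // 4-gram distance on two near-identical strings; about 0.97222
        NGram fourgram = new NGram(4);
        System.out.println(fourgram.distance(
                "Adobe CreativeSuite 5 Master Collection from cheap 4zp",
                "Adobe CreativeSuite 5 Master Collection from cheap d1x"));
    }
}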

src/main/java/info/debatty/java/stringsimilarity/NormalizedLevenshtein.java

Lines changed: 8 additions & 16 deletions
@@ -21,30 +21,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-
 package info.debatty.java.stringsimilarity;
 
 import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
 import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
 
 /**
- * This distance is computed as levenshtein distance divided by the length of
- * the longest string. The resulting value is always in the interval [0.0 1.0]
- * but it is not a metric anymore!
- * The similarity is computed as 1 - normalized distance.
+ * This distance is computed as levenshtein distance divided by the length of
+ * the longest string. The resulting value is always in the interval [0.0 1.0]
+ * but it is not a metric anymore! The similarity is computed as 1 - normalized
+ * distance.
+ *
  * @author Thibault Debatty
  */
-public class NormalizedLevenshtein implements NormalizedStringDistance, NormalizedStringSimilarity {
-
+public class NormalizedLevenshtein implements
+        NormalizedStringDistance, NormalizedStringSimilarity {
 
-    public static void main(String[] args) {
-        NormalizedLevenshtein l = new NormalizedLevenshtein();
-
-        System.out.println(l.distance("My string", "My $tring"));
-        System.out.println(l.distance("My string", "M string2"));
-        System.out.println(l.distance("My string", "abcd"));
-    }
-
     private final Levenshtein l = new Levenshtein();
 
     public double distance(String s1, String s2) {
@@ -54,5 +46,5 @@ public double distance(String s1, String s2) {
     public double similarity(String s1, String s2) {
         return 1.0 - distance(s1, s2);
     }
-
+
 }
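A minimal sketch equivalent to the removed demo; the class name here is hypothetical, and the real example file under the "examples" package is not shown in this diff.

import info.debatty.java.stringsimilarity.NormalizedLevenshtein;

// Hypothetical example class, not part of this diff.
public class NormalizedLevenshteinExample {

    public static void main(String[] args) {
        NormalizedLevenshtein l = new NormalizedLevenshtein();

        // Levenshtein distance divided by the length of the longest string
        System.out.println(l.distance("My string", "My $tring"));
        System.out.println(l.distance("My string", "M string2"));
        System.out.println(l.distance("My string", "abcd"));

        // similarity is simply 1 - normalized distance
        System.out.println(l.similarity("My string", "My $tring"));
    }
}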

src/main/java/info/debatty/java/stringsimilarity/QGram.java

Lines changed: 20 additions & 39 deletions
@@ -1,72 +1,53 @@
 package info.debatty.java.stringsimilarity;
 
-
 import info.debatty.java.stringsimilarity.interfaces.StringDistance;
 
 /**
- * Q-gram distance, as defined by Ukkonen in "Approximate string-matching with
- * q-grams and maximal matches". The distance between two strings is defined as
- * the L1 norm of the difference of their profiles (the number of occurences of
- * each n-gram): SUM( |V1_i - V2_i| ). Q-gram distance is a lower bound on
- * Levenshtein distance, but can be computed in O(m + n), where Levenshtein
+ * Q-gram distance, as defined by Ukkonen in "Approximate string-matching with
+ * q-grams and maximal matches". The distance between two strings is defined as
+ * the L1 norm of the difference of their profiles (the number of occurences of
+ * each n-gram): SUM( |V1_i - V2_i| ). Q-gram distance is a lower bound on
+ * Levenshtein distance, but can be computed in O(m + n), where Levenshtein
  * requires O(m.n).
+ *
  * @author Thibault Debatty
  */
 public class QGram extends ShingleBased implements StringDistance {
-
-    public static void main(String[] args) {
-        QGram dig = new QGram(2);
-
-        // AB BC CD CE
-        // 1 1 1 0
-        // 1 1 0 1
-        // Total: 2
 
-        System.out.println(dig.distance("ABCD", "ABCE"));
-
-        System.out.println(dig.distance("", "QSDFGHJKLM"));
-
-        System.out.println(dig.distance(
-                "Best Deal Ever! Viagra50/100mg - $1.85 071",
-                "Best Deal Ever! Viagra50/100mg - $1.85 7z3"));
-    }
-
-
     /**
-     * Q-gram similarity and distance.
-     * Defined by Ukkonen in "Approximate string-matching with q-grams and maximal
-     * matches", http://www.sciencedirect.com/science/article/pii/0304397592901434
-     * The distance between two strings is defined as the L1 norm of the difference
-     * of their profiles (the number of occurence of each k-shingle).
-     * Q-gram distance is a lower bound on Levenshtein distance, but can be computed
-     * in O(|A| + |B|), where Levenshtein requires O(|A|.|B|)
-     *
-     * @param n
+     * Q-gram similarity and distance. Defined by Ukkonen in "Approximate
+     * string-matching with q-grams and maximal matches",
+     * http://www.sciencedirect.com/science/article/pii/0304397592901434 The
+     * distance between two strings is defined as the L1 norm of the difference
+     * of their profiles (the number of occurence of each k-shingle). Q-gram
+     * distance is a lower bound on Levenshtein distance, but can be computed in
+     * O(|A| + |B|), where Levenshtein requires O(|A|.|B|)
+     *
+     * @param n
      */
     public QGram(int n) {
         super(n);
     }
-
+
     public QGram() {
         super();
     }
-
 
     public double distance(String s1, String s2) {
         KShingling ks = new KShingling(k);
         int[] profile1 = ks.getArrayProfile(s1);
         int[] profile2 = ks.getArrayProfile(s2);
         int length = Math.max(profile1.length, profile2.length);
-
+
         profile1 = java.util.Arrays.copyOf(profile1, length);
         profile2 = java.util.Arrays.copyOf(profile2, length);
-
+
         int d = 0;
         for (int i = 0; i < length; i++) {
             d += Math.abs(profile1[i] - profile2[i]);
         }
-
+
         return d;
-
+
     }
 }
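The removed demo translates to a sketch along these lines; the class name is made up, and the new examples file itself is not included in this diff.

import info.debatty.java.stringsimilarity.QGram;

// Hypothetical example class, not part of this diff.
public class QGramExample {

    public static void main(String[] args) {
        QGram dig = new QGram(2);

        // Bigram profiles of "ABCD" (AB BC CD) and "ABCE" (AB BC CE)
        // differ in two positions, so the L1 (q-gram) distance is 2
        System.out.println(dig.distance("ABCD", "ABCE"));

        // Against an empty string, the distance should equal the number
        // of bigrams in the other string
        System.out.println(dig.distance("", "QSDFGHJKLM"));
    }
}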

src/main/java/info/debatty/java/stringsimilarity/SorensenDice.java

Lines changed: 18 additions & 35 deletions
@@ -21,81 +21,64 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-
 package info.debatty.java.stringsimilarity;
 
 import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
 import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
 
 /**
- * Similar to Jaccard index, but this time the similarity is computed as
- * 2 * |V1 inter V2| / (|V1| + |V2|).
- * Distance is computed as 1 - cosine similarity.
+ * Similar to Jaccard index, but this time the similarity is computed as 2 * |V1
+ * inter V2| / (|V1| + |V2|). Distance is computed as 1 - cosine similarity.
+ *
  * @author Thibault Debatty
  */
-public class SorensenDice extends ShingleBased implements
+public class SorensenDice extends ShingleBased implements
         NormalizedStringDistance, NormalizedStringSimilarity {
 
     /**
-     * @param args the command line arguments
-     */
-    public static void main(String[] args) {
-        SorensenDice sd = new SorensenDice(2);
-
-        // AB BC CD DE DF FG
-        // 1 1 1 1 0 0
-        // 1 1 1 0 1 1
-        // => 2 x 3 / (4 + 5) = 6/9 = 0.6666
-        System.out.println(sd.similarity("ABCDE", "ABCDFG"));
-    }
-
-
-    /**
-     * Sorensen-Dice coefficient, aka Sørensen index, Dice's coefficient or
+     * Sorensen-Dice coefficient, aka Sørensen index, Dice's coefficient or
      * Czekanowski's binary (non-quantitative) index.
-     *
-     * The strings are first converted to boolean sets of k-shingles (sequences
-     * of k characters), then the similarity is computed as
-     * 2 * |A inter B| / (|A| + |B|).
-     * Attention: Sorensen-Dice distance (and similarity) does not satisfy
-     * triangle inequality.
-     *
-     * @param k
+     *
+     * The strings are first converted to boolean sets of k-shingles (sequences
+     * of k characters), then the similarity is computed as 2 * |A inter B| /
+     * (|A| + |B|). Attention: Sorensen-Dice distance (and similarity) does not
+     * satisfy triangle inequality.
+     *
+     * @param k
      */
     public SorensenDice(int k) {
         super(k);
     }
-
+
     public SorensenDice() {
         super(3);
     }
-
 
     public double similarity(String s1, String s2) {
         KShingling ks = new KShingling(k);
         int[] profile1 = ks.getArrayProfile(s1);
         int[] profile2 = ks.getArrayProfile(s2);
-
+
         int length = Math.max(profile1.length, profile2.length);
         profile1 = java.util.Arrays.copyOf(profile1, length);
         profile2 = java.util.Arrays.copyOf(profile2, length);
-
+
         int inter = 0;
         int sum = 0;
         for (int i = 0; i < length; i++) {
             if (profile1[i] > 0 && profile2[i] > 0) {
                 inter++;
             }
-
+
             if (profile1[i] > 0) {
                 sum++;
             }
-
+
             if (profile2[i] > 0) {
                 sum++;
             }
         }
-
+
         return 2.0 * inter / sum;
     }
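A standalone sketch of the deleted example, assuming an invented wrapper class; the actual file in the "examples" package is not shown here.

import info.debatty.java.stringsimilarity.SorensenDice;

// Hypothetical example class, not part of this diff.
public class SorensenDiceExample {

    public static void main(String[] args) {
        SorensenDice sd = new SorensenDice(2);

        // "ABCDE" -> AB BC CD DE (4 bigrams), "ABCDFG" -> AB BC CD DF FG (5 bigrams)
        // shared bigrams: AB BC CD, so 2 * 3 / (4 + 5) = 0.6666...
        System.out.println(sd.similarity("ABCDE", "ABCDFG"));

        // distance is 1 - similarity
        System.out.println(sd.distance("ABCDE", "ABCDFG"));
    }
}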

src/main/java/info/debatty/java/stringsimilarity/StringProfile.java

Lines changed: 17 additions & 33 deletions
@@ -21,42 +21,26 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-
 package info.debatty.java.stringsimilarity;
 
 import info.debatty.java.utils.SparseIntegerVector;
 
 /**
- * Profile of a string (number of occurences of each shingle/n-gram), computed
+ * Profile of a string (number of occurences of each shingle/n-gram), computed
  * using shingling.
- *
+ *
  * @author Thibault Debatty
  */
 public class StringProfile {
-
-    public static void main(String[] args) {
-        KShingling ks = new KShingling(2);
-        for (String ngram : ks.getProfile("ABCABC").getMostFrequentNGrams(2)) {
-            System.out.println(ngram);
-        }
-
-        for (String ngram : ks.getProfile("A").getMostFrequentNGrams(2)) {
-            System.out.println(ngram);
-        }
-
-        for (String ngram : ks.getProfile("This is a string...").getMostFrequentNGrams(2)) {
-            System.out.println(ngram);
-        }
-    }
-
+
     private final SparseIntegerVector vector;
     private final KShingling ks;
-
+
     public StringProfile(SparseIntegerVector vector, KShingling ks) {
         this.vector = vector;
         this.ks = ks;
    }
-
+
     /**
      *
      * @param other
@@ -67,44 +51,44 @@ public double cosineSimilarity(StringProfile other) throws Exception {
         if (this.ks != other.ks) {
             throw new Exception("Profiles were not created using the same kshingling object!");
         }
-
+
         return this.vector.cosineSimilarity(other.vector);
     }
-
+
     /**
-     *
+     *
      * @param other
      * @return qgram distance between this string and the other
-     * @throws Exception
+     * @throws Exception
      */
     public double qgramDistance(StringProfile other) throws Exception {
         if (this.ks != other.ks) {
             throw new Exception("Profiles were not created using the same kshingling object!");
         }
-
+
         return this.vector.qgram(other.vector);
     }
-
+
     public SparseIntegerVector getSparseVector() {
         return this.vector;
     }
-
+
     public String[] getMostFrequentNGrams(int number) {
         String[] strings = new String[number];
         int[] frequencies = new int[number];
-
+
         int position_smallest_frequency = 0;
-
+
         for (int i = 0; i < vector.size(); i++) {
             int key = vector.getKey(i);
             int frequency = vector.getValue(i);
             String ngram = ks.getNGram(key);
-
+
             if (frequency > frequencies[position_smallest_frequency]) {
                 // 1. replace the element with currently the smallest frequency
                 strings[position_smallest_frequency] = ngram;
                 frequencies[position_smallest_frequency] = frequency;
-
+
                 // 2. loop over frequencies to find which one is now the lowest
                 // frequency
                 int smallest_frequency = Integer.MAX_VALUE;
@@ -115,7 +99,7 @@ public String[] getMostFrequentNGrams(int number) {
                     }
                 }
             }
-
+
         }
         return strings;
     }
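The deleted demo combined KShingling and StringProfile; below is a self-contained sketch of that usage with an invented class name (the real examples file is not part of this diff).

import info.debatty.java.stringsimilarity.KShingling;
import info.debatty.java.stringsimilarity.StringProfile;

// Hypothetical example class, not part of this diff.
public class StringProfileExample {

    public static void main(String[] args) throws Exception {
        // Profiles must be built from the same KShingling instance,
        // otherwise cosineSimilarity() and qgramDistance() throw
        KShingling ks = new KShingling(2);

        StringProfile p1 = ks.getProfile("ABCABC");
        StringProfile p2 = ks.getProfile("This is a string...");

        // The two most frequent bigrams of the first profile
        for (String ngram : p1.getMostFrequentNGrams(2)) {
            System.out.println(ngram);
        }

        // Cosine similarity and q-gram distance between the two profiles
        System.out.println(p1.cosineSimilarity(p2));
        System.out.println(p1.qgramDistance(p2));
    }
}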
