Commit 10b7271

Moved examples to "examples" package
1 parent 2fe4275 commit 10b7271

7 files changed (+170, -200 lines)


src/main/java/info/debatty/java/stringsimilarity/NGram.java

Lines changed: 6 additions & 20 deletions
@@ -4,32 +4,19 @@
 
 /**
  * N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance",
- * String Processing and Information Retrieval, Lecture Notes in Computer
+ * String Processing and Information Retrieval, Lecture Notes in Computer
  * Science Volume 3772, 2005, pp 115-126.
- *
- * The algorithm uses affixing with special character '\n' to increase the
- * weight of first characters. The normalization is achieved by dividing the
+ *
+ * The algorithm uses affixing with special character '\n' to increase the
+ * weight of first characters. The normalization is achieved by dividing the
  * total similarity score the original length of the longest word.
- *
+ *
  * http://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf
  */
 public class NGram implements NormalizedStringDistance {
-
-    public static void main(String[] args) {
-
-        // produces 0.416666
-        NGram twogram = new NGram(2);
-        System.out.println(twogram.distance("ABCD", "ABTUIO"));
-
-        // produces 0.97222
-        String s1 = "Adobe CreativeSuite 5 Master Collection from cheap 4zp";
-        String s2 = "Adobe CreativeSuite 5 Master Collection from cheap d1x";
-        NGram ngram = new NGram(4);
-        System.out.println(ngram.distance(s1, s2));
-    }
 
     private final int n;
-
+
     public NGram(int n) {
         this.n = n;
     }
@@ -38,7 +25,6 @@ public NGram() {
         this.n = 2;
     }
 
-
     @Override
     public double distance(String s0, String s1) {
         final char special = '\n';
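For reference, the usage shown by the deleted main() can be reproduced with a standalone sketch like the one below. The wrapper class name is invented for illustration; the actual file added to the "examples" package is not part of this diff.

import info.debatty.java.stringsimilarity.NGram;

// Hypothetical example class; the real one in the "examples" package is not shown in this diff.
public class NGramExample {

    public static void main(String[] args) {
        // Bigram (n = 2) distance; the deleted demo reported about 0.416666
        NGram twogram = new NGram(2);
        System.out.println(twogram.distance("ABCD", "ABTUIO"));

        // 4-gram distance on two near-identical strings; about 0.97222
        NGram fourgram = new NGram(4);
        System.out.println(fourgram.distance(
                "Adobe CreativeSuite 5 Master Collection from cheap 4zp",
                "Adobe CreativeSuite 5 Master Collection from cheap d1x"));
    }
}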

src/main/java/info/debatty/java/stringsimilarity/NormalizedLevenshtein.java

Lines changed: 8 additions & 16 deletions
@@ -21,30 +21,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-
 package info.debatty.java.stringsimilarity;
 
 import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
 import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
 
 /**
- * This distance is computed as levenshtein distance divided by the length of
- * the longest string. The resulting value is always in the interval [0.0 1.0]
- * but it is not a metric anymore!
- * The similarity is computed as 1 - normalized distance.
+ * This distance is computed as levenshtein distance divided by the length of
+ * the longest string. The resulting value is always in the interval [0.0 1.0]
+ * but it is not a metric anymore! The similarity is computed as 1 - normalized
+ * distance.
+ *
  * @author Thibault Debatty
  */
-public class NormalizedLevenshtein implements NormalizedStringDistance, NormalizedStringSimilarity {
-
+public class NormalizedLevenshtein implements
+        NormalizedStringDistance, NormalizedStringSimilarity {
 
-    public static void main(String[] args) {
-        NormalizedLevenshtein l = new NormalizedLevenshtein();
-
-        System.out.println(l.distance("My string", "My $tring"));
-        System.out.println(l.distance("My string", "M string2"));
-        System.out.println(l.distance("My string", "abcd"));
-    }
-
     private final Levenshtein l = new Levenshtein();
 
     public double distance(String s1, String s2) {
@@ -54,5 +46,5 @@ public double distance(String s1, String s2) {
     public double similarity(String s1, String s2) {
         return 1.0 - distance(s1, s2);
     }
-
+
 }
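A minimal sketch equivalent to the removed demo; the class name here is hypothetical, and the real example file under the "examples" package is not shown in this diff.

import info.debatty.java.stringsimilarity.NormalizedLevenshtein;

// Hypothetical example class, not part of this diff.
public class NormalizedLevenshteinExample {

    public static void main(String[] args) {
        NormalizedLevenshtein l = new NormalizedLevenshtein();

        // Levenshtein distance divided by the length of the longest string
        System.out.println(l.distance("My string", "My $tring"));
        System.out.println(l.distance("My string", "M string2"));
        System.out.println(l.distance("My string", "abcd"));

        // similarity is simply 1 - normalized distance
        System.out.println(l.similarity("My string", "My $tring"));
    }
}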

src/main/java/info/debatty/java/stringsimilarity/QGram.java

Lines changed: 20 additions & 39 deletions
@@ -1,72 +1,53 @@
 package info.debatty.java.stringsimilarity;
 
-
 import info.debatty.java.stringsimilarity.interfaces.StringDistance;
 
 /**
- * Q-gram distance, as defined by Ukkonen in "Approximate string-matching with
- * q-grams and maximal matches". The distance between two strings is defined as
- * the L1 norm of the difference of their profiles (the number of occurences of
- * each n-gram): SUM( |V1_i - V2_i| ). Q-gram distance is a lower bound on
- * Levenshtein distance, but can be computed in O(m + n), where Levenshtein
+ * Q-gram distance, as defined by Ukkonen in "Approximate string-matching with
+ * q-grams and maximal matches". The distance between two strings is defined as
+ * the L1 norm of the difference of their profiles (the number of occurences of
+ * each n-gram): SUM( |V1_i - V2_i| ). Q-gram distance is a lower bound on
+ * Levenshtein distance, but can be computed in O(m + n), where Levenshtein
  * requires O(m.n).
+ *
  * @author Thibault Debatty
  */
 public class QGram extends ShingleBased implements StringDistance {
-
-    public static void main(String[] args) {
-        QGram dig = new QGram(2);
-
-        // AB BC CD CE
-        // 1 1 1 0
-        // 1 1 0 1
-        // Total: 2
 
-        System.out.println(dig.distance("ABCD", "ABCE"));
-
-        System.out.println(dig.distance("", "QSDFGHJKLM"));
-
-        System.out.println(dig.distance(
-                "Best Deal Ever! Viagra50/100mg - $1.85 071",
-                "Best Deal Ever! Viagra50/100mg - $1.85 7z3"));
-    }
-
-
     /**
-     * Q-gram similarity and distance.
-     * Defined by Ukkonen in "Approximate string-matching with q-grams and maximal
-     * matches", http://www.sciencedirect.com/science/article/pii/0304397592901434
-     * The distance between two strings is defined as the L1 norm of the difference
-     * of their profiles (the number of occurence of each k-shingle).
-     * Q-gram distance is a lower bound on Levenshtein distance, but can be computed
-     * in O(|A| + |B|), where Levenshtein requires O(|A|.|B|)
-     *
-     * @param n
+     * Q-gram similarity and distance. Defined by Ukkonen in "Approximate
+     * string-matching with q-grams and maximal matches",
+     * http://www.sciencedirect.com/science/article/pii/0304397592901434 The
+     * distance between two strings is defined as the L1 norm of the difference
+     * of their profiles (the number of occurence of each k-shingle). Q-gram
+     * distance is a lower bound on Levenshtein distance, but can be computed in
+     * O(|A| + |B|), where Levenshtein requires O(|A|.|B|)
+     *
+     * @param n
      */
     public QGram(int n) {
         super(n);
     }
-
+
     public QGram() {
         super();
     }
-
 
     public double distance(String s1, String s2) {
         KShingling ks = new KShingling(k);
         int[] profile1 = ks.getArrayProfile(s1);
         int[] profile2 = ks.getArrayProfile(s2);
         int length = Math.max(profile1.length, profile2.length);
-
+
         profile1 = java.util.Arrays.copyOf(profile1, length);
         profile2 = java.util.Arrays.copyOf(profile2, length);
-
+
         int d = 0;
         for (int i = 0; i < length; i++) {
             d += Math.abs(profile1[i] - profile2[i]);
         }
-
+
         return d;
-
+
     }
 }
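The removed demo translates to a sketch along these lines; the class name is made up, and the new examples file itself is not included in this diff.

import info.debatty.java.stringsimilarity.QGram;

// Hypothetical example class, not part of this diff.
public class QGramExample {

    public static void main(String[] args) {
        QGram dig = new QGram(2);

        // Bigram profiles of "ABCD" (AB BC CD) and "ABCE" (AB BC CE)
        // differ in two positions, so the L1 (q-gram) distance is 2
        System.out.println(dig.distance("ABCD", "ABCE"));

        // Against an empty string, the distance should equal the number
        // of bigrams in the other string
        System.out.println(dig.distance("", "QSDFGHJKLM"));
    }
}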

src/main/java/info/debatty/java/stringsimilarity/SorensenDice.java

Lines changed: 18 additions & 35 deletions
@@ -21,81 +21,64 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-
 package info.debatty.java.stringsimilarity;
 
 import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
 import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
 
 /**
- * Similar to Jaccard index, but this time the similarity is computed as
- * 2 * |V1 inter V2| / (|V1| + |V2|).
- * Distance is computed as 1 - cosine similarity.
+ * Similar to Jaccard index, but this time the similarity is computed as 2 * |V1
+ * inter V2| / (|V1| + |V2|). Distance is computed as 1 - cosine similarity.
+ *
  * @author Thibault Debatty
  */
-public class SorensenDice extends ShingleBased implements
+public class SorensenDice extends ShingleBased implements
         NormalizedStringDistance, NormalizedStringSimilarity {
 
     /**
-     * @param args the command line arguments
-     */
-    public static void main(String[] args) {
-        SorensenDice sd = new SorensenDice(2);
-
-        // AB BC CD DE DF FG
-        // 1 1 1 1 0 0
-        // 1 1 1 0 1 1
-        // => 2 x 3 / (4 + 5) = 6/9 = 0.6666
-        System.out.println(sd.similarity("ABCDE", "ABCDFG"));
-    }
-
-
-    /**
-     * Sorensen-Dice coefficient, aka Sørensen index, Dice's coefficient or
+     * Sorensen-Dice coefficient, aka Sørensen index, Dice's coefficient or
      * Czekanowski's binary (non-quantitative) index.
-     *
-     * The strings are first converted to boolean sets of k-shingles (sequences
-     * of k characters), then the similarity is computed as
-     * 2 * |A inter B| / (|A| + |B|).
-     * Attention: Sorensen-Dice distance (and similarity) does not satisfy
-     * triangle inequality.
-     *
-     * @param k
+     *
+     * The strings are first converted to boolean sets of k-shingles (sequences
+     * of k characters), then the similarity is computed as 2 * |A inter B| /
+     * (|A| + |B|). Attention: Sorensen-Dice distance (and similarity) does not
+     * satisfy triangle inequality.
+     *
+     * @param k
      */
     public SorensenDice(int k) {
         super(k);
     }
-
+
     public SorensenDice() {
         super(3);
     }
-
 
     public double similarity(String s1, String s2) {
         KShingling ks = new KShingling(k);
         int[] profile1 = ks.getArrayProfile(s1);
         int[] profile2 = ks.getArrayProfile(s2);
-
+
         int length = Math.max(profile1.length, profile2.length);
         profile1 = java.util.Arrays.copyOf(profile1, length);
         profile2 = java.util.Arrays.copyOf(profile2, length);
-
+
         int inter = 0;
         int sum = 0;
         for (int i = 0; i < length; i++) {
             if (profile1[i] > 0 && profile2[i] > 0) {
                 inter++;
             }
-
+
             if (profile1[i] > 0) {
                 sum++;
             }
-
+
             if (profile2[i] > 0) {
                 sum++;
             }
         }
-
+
         return 2.0 * inter / sum;
     }
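A standalone sketch of the deleted example, assuming an invented wrapper class; the actual file in the "examples" package is not shown here.

import info.debatty.java.stringsimilarity.SorensenDice;

// Hypothetical example class, not part of this diff.
public class SorensenDiceExample {

    public static void main(String[] args) {
        SorensenDice sd = new SorensenDice(2);

        // "ABCDE" -> AB BC CD DE (4 bigrams), "ABCDFG" -> AB BC CD DF FG (5 bigrams)
        // shared bigrams: AB BC CD, so 2 * 3 / (4 + 5) = 0.6666...
        System.out.println(sd.similarity("ABCDE", "ABCDFG"));

        // distance is 1 - similarity
        System.out.println(sd.distance("ABCDE", "ABCDFG"));
    }
}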

src/main/java/info/debatty/java/stringsimilarity/StringProfile.java

Lines changed: 17 additions & 33 deletions
@@ -21,42 +21,26 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-
 package info.debatty.java.stringsimilarity;
 
 import info.debatty.java.utils.SparseIntegerVector;
 
 /**
- * Profile of a string (number of occurences of each shingle/n-gram), computed
+ * Profile of a string (number of occurences of each shingle/n-gram), computed
  * using shingling.
- *
+ *
  * @author Thibault Debatty
  */
 public class StringProfile {
-
-    public static void main(String[] args) {
-        KShingling ks = new KShingling(2);
-        for (String ngram : ks.getProfile("ABCABC").getMostFrequentNGrams(2)) {
-            System.out.println(ngram);
-        }
-
-        for (String ngram : ks.getProfile("A").getMostFrequentNGrams(2)) {
-            System.out.println(ngram);
-        }
-
-        for (String ngram : ks.getProfile("This is a string...").getMostFrequentNGrams(2)) {
-            System.out.println(ngram);
-        }
-    }
-
+
     private final SparseIntegerVector vector;
     private final KShingling ks;
-
+
     public StringProfile(SparseIntegerVector vector, KShingling ks) {
         this.vector = vector;
         this.ks = ks;
    }
-
+
     /**
      *
      * @param other
@@ -67,44 +51,44 @@ public double cosineSimilarity(StringProfile other) throws Exception {
         if (this.ks != other.ks) {
             throw new Exception("Profiles were not created using the same kshingling object!");
         }
-
+
         return this.vector.cosineSimilarity(other.vector);
     }
-
+
     /**
-     *
+     *
      * @param other
      * @return qgram distance between this string and the other
-     * @throws Exception
+     * @throws Exception
      */
     public double qgramDistance(StringProfile other) throws Exception {
         if (this.ks != other.ks) {
             throw new Exception("Profiles were not created using the same kshingling object!");
         }
-
+
         return this.vector.qgram(other.vector);
     }
-
+
     public SparseIntegerVector getSparseVector() {
         return this.vector;
     }
-
+
     public String[] getMostFrequentNGrams(int number) {
         String[] strings = new String[number];
         int[] frequencies = new int[number];
-
+
         int position_smallest_frequency = 0;
-
+
         for (int i = 0; i < vector.size(); i++) {
             int key = vector.getKey(i);
             int frequency = vector.getValue(i);
             String ngram = ks.getNGram(key);
-
+
             if (frequency > frequencies[position_smallest_frequency]) {
                 // 1. replace the element with currently the smallest frequency
                 strings[position_smallest_frequency] = ngram;
                 frequencies[position_smallest_frequency] = frequency;
-
+
                 // 2. loop over frequencies to find which one is now the lowest
                 // frequency
                 int smallest_frequency = Integer.MAX_VALUE;
@@ -115,7 +99,7 @@ public String[] getMostFrequentNGrams(int number) {
                     }
                 }
             }
-
+
         }
         return strings;
     }
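The deleted demo combined KShingling and StringProfile; below is a self-contained sketch of that usage with an invented class name (the real examples file is not part of this diff).

import info.debatty.java.stringsimilarity.KShingling;
import info.debatty.java.stringsimilarity.StringProfile;

// Hypothetical example class, not part of this diff.
public class StringProfileExample {

    public static void main(String[] args) throws Exception {
        // Profiles must be built from the same KShingling instance,
        // otherwise cosineSimilarity() and qgramDistance() throw
        KShingling ks = new KShingling(2);

        StringProfile p1 = ks.getProfile("ABCABC");
        StringProfile p2 = ks.getProfile("This is a string...");

        // The two most frequent bigrams of the first profile
        for (String ngram : p1.getMostFrequentNGrams(2)) {
            System.out.println(ngram);
        }

        // Cosine similarity and q-gram distance between the two profiles
        System.out.println(p1.cosineSimilarity(p2));
        System.out.println(p1.qgramDistance(p2));
    }
}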
