diff --git a/code/pom.xml b/code/pom.xml
index 003f8f9c0..bcceab803 100644
--- a/code/pom.xml
+++ b/code/pom.xml
@@ -248,6 +248,11 @@
             <artifactId>sqlite-jdbc</artifactId>
             <version>3.20.1</version>
         </dependency>
+        <dependency>
+            <groupId>ru.finam</groupId>
+            <artifactId>liblevenshtein-lite</artifactId>
+            <version>3.0.1</version>
+        </dependency>
         <dependency>
             <groupId>com.esotericsoftware</groupId>
             <artifactId>kryo</artifactId>
diff --git a/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java b/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java
new file mode 100644
index 000000000..3564d573f
--- /dev/null
+++ b/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java
@@ -0,0 +1,234 @@
+package com.googlecode.cqengine.index.levenshtein;
+
+import com.github.liblevenshtein.collection.dictionary.SortedDawg;
+import com.github.liblevenshtein.transducer.Algorithm;
+import com.github.liblevenshtein.transducer.Candidate;
+import com.github.liblevenshtein.transducer.ITransducer;
+import com.github.liblevenshtein.transducer.factory.TransducerBuilder;
+import com.googlecode.cqengine.attribute.Attribute;
+import com.googlecode.cqengine.index.Index;
+import com.googlecode.cqengine.index.support.AbstractAttributeIndex;
+import com.googlecode.cqengine.index.support.CloseableIterator;
+import com.googlecode.cqengine.persistence.support.ObjectSet;
+import com.googlecode.cqengine.persistence.support.ObjectStore;
+import com.googlecode.cqengine.query.Query;
+import com.googlecode.cqengine.query.option.QueryOptions;
+import com.googlecode.cqengine.query.simple.LevenshteinDistance;
+import com.googlecode.cqengine.resultset.ResultSet;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * An index which answers {@link LevenshteinDistance} queries on a {@code String} attribute, by running a
+ * Levenshtein transducer over a dictionary of the distinct attribute values found in the collection.
+ * <p/>
+ * The index is immutable: it is populated once by {@link #init(ObjectStore, QueryOptions)}, and attempts to
+ * add or remove objects afterwards fail with {@link IllegalStateException}.
+ *
+ * @param <O> The type of the objects in the indexed collection
+ * @author Ruslan Sennov
+ */
+public class LevenshteinDistanceIndex<O> extends AbstractAttributeIndex<String, O> {
+
+    private final TransducerFactory transducerFactory;
+    // Both fields below are assigned by init(), and remain null until it has been called
+    private ITransducer<Candidate> transducer;
+    private Map<String, Set<O>> terms;
+
+    /**
+     * Private constructor, used by static factory methods.
+     *
+     * @param attribute The attribute on which the index will be built
+     * @param transducerAlgorithm The algorithm which the transducer will use to measure distance
+     */
+    private LevenshteinDistanceIndex(Attribute<O, String> attribute, Algorithm transducerAlgorithm) {
+        super(attribute, Collections.<Class<? extends Query>>singleton(LevenshteinDistance.class));
+        this.transducerFactory = new TransducerFactory(transducerAlgorithm);
+    }
+
+    @Override
+    public boolean isMutable() {
+        return false;
+    }
+
+    @Override
+    public boolean isQuantized() {
+        return false;
+    }
+
+    /**
+     * Retrieves the objects stored under every dictionary term which the transducer reports as a candidate
+     * for the query's value and maximum distance. The matching objects are collected before this method returns.
+     *
+     * @param query A {@link LevenshteinDistance} query
+     * @param queryOptions Returned as-is by {@link ResultSet#getQueryOptions()} of the result
+     * @return The matching objects, without duplicates
+     * @throws IllegalArgumentException If the query is not exactly of class {@link LevenshteinDistance}
+     */
+    @Override
+    public ResultSet<O> retrieve(final Query<O> query, final QueryOptions queryOptions) {
+        if (!LevenshteinDistance.class.equals(query.getClass())) {
+            throw new IllegalArgumentException("Unsupported query: " + query);
+        }
+        LevenshteinDistance<O> lev = (LevenshteinDistance<O>) query;
+        // A set, because the same object may be stored under more than one candidate term
+        final Set<O> matched = new LinkedHashSet<O>();
+        for (Candidate candidate : transducer.transduce(lev.getValue(), lev.getMaxDistance())) {
+            matched.addAll(terms.get(candidate.term()));
+        }
+        return new ResultSet<O>() {
+            @Override
+            public Iterator<O> iterator() {
+                return matched.iterator();
+            }
+
+            @Override
+            public boolean contains(O object) {
+                return matched.contains(object);
+            }
+
+            @Override
+            public boolean matches(O object) {
+                return matched.contains(object);
+            }
+
+            @Override
+            public Query<O> getQuery() {
+                return query;
+            }
+
+            @Override
+            public QueryOptions getQueryOptions() {
+                return queryOptions;
+            }
+
+            @Override
+            public int getRetrievalCost() {
+                return 10;
+            }
+
+            @Override
+            public int getMergeCost() {
+                // The matches are already materialized, so the exact number of objects is known
+                return matched.size();
+            }
+
+            @Override
+            public int size() {
+                return matched.size();
+            }
+
+            @Override
+            public void close() {
+                matched.clear();
+            }
+        };
+    }
+
+    @Override
+    public Index<O> getEffectiveIndex() {
+        return this;
+    }
+
+    @Override
+    public boolean addAll(ObjectSet<O> objectSet, QueryOptions queryOptions) {
+        // this index is immutable, will never be here
+        throw new IllegalStateException();
+    }
+
+    @Override
+    public boolean removeAll(ObjectSet<O> objectSet, QueryOptions queryOptions) {
+        // this index is immutable, will never be here
+        throw new IllegalStateException();
+    }
+
+    @Override
+    public void clear(QueryOptions queryOptions) {
+    }
+
+    /**
+     * (Re)builds the index from the objects currently in the given store.
+     *
+     * @param objectStore The store whose objects will be indexed
+     * @param queryOptions Passed to the store's iterator and to the attribute
+     */
+    @Override
+    public void init(ObjectStore<O> objectStore, QueryOptions queryOptions) {
+        // Built locally and assigned to the fields only at the end, so that a failure
+        // part-way through does not leave the index partially populated
+        Map<String, Set<O>> objectsByTerm = new HashMap<String, Set<O>>();
+        CloseableIterator<O> objects = objectStore.iterator(queryOptions);
+        try {
+            while (objects.hasNext()) {
+                O object = objects.next();
+                for (String term : attribute.getValues(object, queryOptions)) {
+                    Set<O> objectsForTerm = objectsByTerm.get(term);
+                    if (objectsForTerm == null) {
+                        objectsForTerm = new HashSet<O>();
+                        objectsByTerm.put(term, objectsForTerm);
+                    }
+                    objectsForTerm.add(object);
+                }
+            }
+        } finally {
+            objects.close();
+        }
+
+        List<String> sortedTerms = new ArrayList<String>(objectsByTerm.keySet());
+        Collections.sort(sortedTerms);
+        SortedDawg dictionary = new SortedDawg();
+        dictionary.addAll(sortedTerms);
+        dictionary.finish();
+
+        transducer = transducerFactory.buildTransducer(dictionary);
+        terms = objectsByTerm;
+    }
+
+    /**
+     * Creates an index on the given attribute which uses {@link Algorithm#STANDARD}.
+     */
+    public static <O> LevenshteinDistanceIndex<O> onAttribute(Attribute<O, String> attribute) {
+        return new LevenshteinDistanceIndex<O>(attribute, Algorithm.STANDARD);
+    }
+
+    /**
+     * Creates an index on the given attribute which uses {@link Algorithm#TRANSPOSITION}.
+     */
+    public static <O> LevenshteinDistanceIndex<O> withSpellingCorrectionOnAttribute(Attribute<O, String> attribute) {
+        return new LevenshteinDistanceIndex<O>(attribute, Algorithm.TRANSPOSITION);
+    }
+
+    /**
+     * Creates an index on the given attribute which uses {@link Algorithm#MERGE_AND_SPLIT}.
+     */
+    public static <O> LevenshteinDistanceIndex<O> withOCRCorrectionOnAttribute(Attribute<O, String> attribute) {
+        return new LevenshteinDistanceIndex<O>(attribute, Algorithm.MERGE_AND_SPLIT);
+    }
+}
+
+/**
+ * Builds transducers which use a fixed algorithm, over a given dictionary.
+ */
+class TransducerFactory {
+    private final Algorithm transducerAlgorithm;
+
+    TransducerFactory(Algorithm transducerAlgorithm) {
+        this.transducerAlgorithm = transducerAlgorithm;
+    }
+
+    ITransducer<Candidate> buildTransducer(SortedDawg dictionary) {
+        return new TransducerBuilder()
+                .dictionary(dictionary)
+                .algorithm(transducerAlgorithm)
+                .includeDistance(true)
+                .build();
+    }
+}
\ No newline at end of file
diff --git a/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java b/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java
index b5eaea0a7..86f359a0f 100644
--- a/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java
+++ b/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java
@@ -1287,6 +1287,21 @@ public static <O> OrderByOption<O> orderBy(AttributeOrder<O> attributeOrder1, At
         return new OrderByOption<O>(attributeOrders);
     }
 
+    /**
+     * Creates a {@link LevenshteinDistance} query on the given attribute.
+     * <p/>
+     * The query itself cannot test objects; it is answered by a Levenshtein index on the attribute.
+     *
+     * @param attribute The attribute to which the query refers
+     * @param value The value to which attribute values are compared
+     * @param maxDistance The maximum distance from {@code value} which is accepted
+     * @param <O> The type of the object containing the attribute
+     * @return A {@link LevenshteinDistance} query
+     */
+    public static <O> LevenshteinDistance<O> levenshteinDistance(Attribute<O, String> attribute, String value, int maxDistance) {
+        return new LevenshteinDistance<O>(attribute, value, maxDistance);
+    }
+
     // ***************************************************************************************************************
 
     /**
diff --git a/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java b/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java
new file mode 100644
index 000000000..fe3b8b460
--- /dev/null
+++ b/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java
@@ -0,0 +1,81 @@
+package com.googlecode.cqengine.query.simple;
+
+import com.googlecode.cqengine.attribute.Attribute;
+import com.googlecode.cqengine.attribute.SimpleAttribute;
+import com.googlecode.cqengine.query.option.QueryOptions;
+
+import java.util.Objects;
+
+/**
+ * Asserts that an attribute has a value within a maximum Levenshtein distance of a given value.
+ * <p/>
+ * This query cannot evaluate objects itself: both of its {@code matches...} methods throw
+ * {@link RuntimeException}, reporting that a Levenshtein index is missing on the attribute.
+ *
+ * @param <O> The type of the objects on which the assertion is made
+ * @author Ruslan Sennov
+ */
+public class LevenshteinDistance<O> extends SimpleQuery<O, String> {
+
+    private final String value;
+    private final int maxDistance;
+
+    /**
+     * Creates a new {@link SimpleQuery} initialized to make assertions on values of the specified attribute
+     *
+     * @param attribute The attribute on which the assertion is to be made
+     * @param value The value to which attribute values are compared
+     * @param maxDistance The maximum distance from {@code value} which is accepted
+     */
+    public LevenshteinDistance(Attribute<O, String> attribute, String value, int maxDistance) {
+        super(attribute);
+        this.value = value;
+        this.maxDistance = maxDistance;
+    }
+
+    public String getValue() {
+        return value;
+    }
+
+    public int getMaxDistance() {
+        return maxDistance;
+    }
+
+    @Override
+    protected boolean matchesSimpleAttribute(SimpleAttribute<O, String> attribute, O object, QueryOptions queryOptions) {
+        throw new RuntimeException("Missing Levenshtein index on attribute " + attribute.toString());
+    }
+
+    @Override
+    protected boolean matchesNonSimpleAttribute(Attribute<O, String> attribute, O object, QueryOptions queryOptions) {
+        throw new RuntimeException("Missing Levenshtein index on attribute " + attribute.toString());
+    }
+
+    @Override
+    protected int calcHashCode() {
+        // Combines every field compared by equals(): attribute, value and maxDistance
+        int result = attribute.hashCode();
+        result = 31 * result + Objects.hashCode(value);
+        result = 31 * result + maxDistance;
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        LevenshteinDistance<?> that = (LevenshteinDistance<?>) o;
+
+        if (!attribute.equals(that.attribute)) return false;
+        if (maxDistance != that.maxDistance) return false;
+        return Objects.equals(value, that.value);
+    }
+
+    @Override
+    public String toString() {
+        return "distance("+ asLiteral(super.getAttributeName()) +
+                ", " + asLiteral(value) +
+                ")<=" + maxDistance;
+    }
+}
diff --git a/code/src/test/java/com/googlecode/cqengine/index/levenshtein/LevenshteinTest.java b/code/src/test/java/com/googlecode/cqengine/index/levenshtein/LevenshteinTest.java
new file mode 100644
index 000000000..81651fb29
--- /dev/null
+++ b/code/src/test/java/com/googlecode/cqengine/index/levenshtein/LevenshteinTest.java
@@ -0,0 +1,31 @@
+package com.googlecode.cqengine.index.levenshtein;
+
+import com.googlecode.cqengine.ConcurrentIndexedCollection;
+import com.googlecode.cqengine.IndexedCollection;
+import com.googlecode.cqengine.testutil.Car;
+import com.googlecode.cqengine.testutil.CarFactory;
+import org.junit.Test;
+
+import static com.googlecode.cqengine.query.QueryFactory.levenshteinDistance;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author Ruslan Sennov
+ */
+public class LevenshteinTest {
+
+    @Test(expected = IllegalStateException.class)
+    public void testImmutable() {
+        IndexedCollection<Car> collection = new ConcurrentIndexedCollection<Car>();
+        collection.addIndex(LevenshteinDistanceIndex.onAttribute(Car.MODEL));
+        collection.addAll(CarFactory.createCollectionOfCars(10));
+    }
+
+    @Test
+    public void testQuery() {
+        IndexedCollection<Car> collection = new ConcurrentIndexedCollection<Car>();
+        collection.addAll(CarFactory.createCollectionOfCars(10));
+        collection.addIndex(LevenshteinDistanceIndex.onAttribute(Car.MANUFACTURER));
+        assertEquals(3, collection.retrieve(levenshteinDistance(Car.MANUFACTURER, "Frd", 1)).size());
+    }
+}