Skip to content

Commit 8825036

Browse files
committed
Create similarity module
1 parent f1d9b61 commit 8825036

33 files changed

+1512
-0
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
.idea/
2+
.vscode/
3+
__pycache__/
4+
similarity/__pycache__/

README.md

Lines changed: 461 additions & 0 deletions
Large diffs are not rendered by default.

similarity/__init__.py

Whitespace-only changes.

similarity/cosine.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import math
2+
3+
from .shingle_based import ShingleBased
4+
from .string_distance import NormalizedStringDistance
5+
from .string_similarity import NormalizedStringSimilarity
6+
7+
8+
class Cosine(ShingleBased, NormalizedStringDistance, NormalizedStringSimilarity):
    """Cosine similarity/distance computed on the shingle profiles of two strings.

    A profile is expected to be a mapping from shingle to a numeric weight,
    as produced by ``get_profile`` of the shingle base class.
    """

    def __init__(self, k):
        """Initialise the measure with shingle size ``k``."""
        super().__init__(k)

    def distance(self, s0, s1):
        """Return ``1.0 - similarity(s0, s1)``."""
        return 1.0 - self.similarity(s0, s1)

    def similarity(self, s0, s1):
        """Return the cosine similarity between the profiles of ``s0`` and ``s1``.

        Equal inputs yield 1.0; if either input is shorter than the shingle
        size the result is 0.0.

        Raises:
            TypeError: if ``s0`` or ``s1`` is ``None``.
        """
        if s0 is None:
            raise TypeError("Argument s0 is NoneType.")
        if s1 is None:
            raise TypeError("Argument s1 is NoneType.")
        if s0 == s1:
            return 1.0
        # An input shorter than one shingle cannot contribute any shingle.
        if len(s0) < self.get_k():
            return 0.0
        if len(s1) < self.get_k():
            return 0.0
        first_profile = self.get_profile(s0)
        second_profile = self.get_profile(s1)
        return self.similarity_profiles(first_profile, second_profile)

    def similarity_profiles(self, profile0, profile1):
        """Return the cosine similarity of two already-computed profiles."""
        numerator = self._dot_product(profile0, profile1)
        denominator = self._norm(profile0) * self._norm(profile1)
        return numerator / denominator

    @staticmethod
    def _dot_product(profile0, profile1):
        """Return the dot product of two profiles over their shared keys."""
        # Walk the smaller mapping and probe the larger one.
        if len(profile0) < len(profile1):
            smaller, larger = profile0, profile1
        else:
            smaller, larger = profile1, profile0
        total = 0.0
        for shingle, weight in smaller.items():
            other_weight = larger.get(shingle)
            if other_weight:
                total += 1.0 * weight * other_weight
        return total

    @staticmethod
    def _norm(profile):
        """Return the Euclidean norm of the profile's values."""
        sum_of_squares = 0.0
        for weight in profile.values():
            sum_of_squares += 1.0 * weight * weight
        return math.sqrt(sum_of_squares)
53+
54+
55+
if __name__ == "__main__":
    # Manual demo: print the distance, then the similarity, of two sample strings.
    demo_measure = Cosine(1)
    left, right = "上海市宝山区 你好", "上海浦东新区 你好吗"
    results = (
        demo_measure.distance(left, right),
        demo_measure.similarity(left, right),
    )
    for value in results:
        print(value)

similarity/cosine_test.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import unittest
2+
3+
from .cosine import Cosine
4+
5+
6+
class TestCosine(unittest.TestCase):
    """Smoke test that prints Cosine distances and similarities for sample strings."""

    def test_cosine(self):
        """Print the distance, then the similarity, for every sample pair."""
        a = Cosine(1)
        s0, s1, s2, s3 = "", "", "上海", "上海市"
        # Same pairs, in the same order, for both measures.
        pairs = (
            (s0, s1),
            (s0, s2),
            (s0, s3),
            (s1, s2),
            (s1, s3),
            (s2, s3),
        )
        distance_format = "distance: {:.4}\t between {} and {}"
        similarity_format = "similarity: {:.4}\t between {} and {}"

        for left, right in pairs:
            print(distance_format.format(str(a.distance(left, right)), left, right))

        for left, right in pairs:
            print(similarity_format.format(str(a.similarity(left, right)), left, right))
29+
30+
31+
if __name__ == "__main__":
    # Run the test cases of this module when it is executed as a script.
    unittest.main()

similarity/damerau.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from .string_distance import MetricStringDistance
2+
import numpy as np
3+
4+
5+
class Damerau(MetricStringDistance):
    """Damerau-Levenshtein style edit distance between two sequences."""

    def distance(self, s0, s1):
        """Return the edit distance between ``s0`` and ``s1``.

        Equal inputs return the Python float ``0.0``; otherwise the value is
        read from the NumPy cost matrix (a NumPy floating-point scalar).

        Raises:
            TypeError: if ``s0`` or ``s1`` is ``None``.
        """
        if s0 is None:
            raise TypeError("Argument s0 is NoneType.")
        if s1 is None:
            raise TypeError("Argument s1 is NoneType.")
        if s0 == s1:
            return 0.0

        len0, len1 = len(s0), len(s1)
        # Sentinel larger than any reachable distance.
        inf = int(len0 + len1)

        # Last row (1-based) of s0 in which each character has been seen.
        last_row = {}
        for ch in s0:
            last_row[ch] = 0
        for ch in s1:
            last_row[ch] = 0

        # Cost matrix with one extra leading row and column holding the sentinel.
        h = np.zeros((len0 + 2, len1 + 2))
        for row in range(1, len0 + 2):
            h[row][0] = inf
            h[row][1] = row - 1
        for col in range(1, len1 + 2):
            h[0][col] = inf
            h[1][col] = col - 1

        for i, ch0 in enumerate(s0, start=1):
            # Last column (1-based) in this row where the characters matched.
            last_match_col = 0
            for j, ch1 in enumerate(s1, start=1):
                i1 = last_row[ch1]
                j1 = last_match_col

                if ch0 == ch1:
                    cost = 0
                    last_match_col = j
                else:
                    cost = 1

                substitution = h[i][j] + cost
                insertion = h[i + 1][j] + 1
                deletion = h[i][j + 1] + 1
                transposition = h[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1)
                h[i + 1][j + 1] = min(substitution, insertion, deletion, transposition)
            last_row[ch0] = i

        return h[len0 + 1][len1 + 1]

similarity/damerau_test.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import unittest
2+
3+
from .damerau import Damerau
4+
5+
6+
class TestDamerau(unittest.TestCase):
    """Checks Damerau distances for empty and short sample strings."""

    def test_damerau(self):
        """Print each distance and assert it equals the expected edit count."""
        a = Damerau()
        s0 = ""
        s1 = ""
        s2 = "上海"
        s3 = "上海市"
        distance_format = "distance: {:.4}\t between {} and {}"
        # (left, right, expected distance). Against an empty string the
        # distance is the length of the other string; "上海" -> "上海市" is
        # a single insertion.
        cases = (
            (s0, s1, 0),
            (s0, s2, 2),
            (s0, s3, 3),
            (s1, s2, 2),
            (s1, s3, 3),
            (s2, s3, 1),
        )
        for left, right, expected in cases:
            with self.subTest(left=left, right=right):
                actual = a.distance(left, right)
                print(distance_format.format(str(actual), left, right))
                self.assertEqual(actual, expected)
21+
22+
23+
if __name__ == "__main__":
    # Run the test cases of this module when it is executed as a script.
    unittest.main()

similarity/jaccard.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from .shingle_based import ShingleBased
2+
from .string_distance import NormalizedStringDistance, MetricStringDistance
3+
from .string_similarity import NormalizedStringSimilarity
4+
5+
6+
class Jaccard(ShingleBased, MetricStringDistance, NormalizedStringDistance, NormalizedStringSimilarity):
    """Jaccard index computed on the sets of shingles of two strings."""

    def __init__(self, k):
        """Initialise the measure with shingle size ``k``."""
        super().__init__(k)

    def distance(self, s0, s1):
        """Return ``1.0 - similarity(s0, s1)``.

        Raises:
            TypeError: if ``s0`` or ``s1`` is ``None``.
        """
        # The original computed this expression but never returned it,
        # so every call yielded None.
        return 1.0 - self.similarity(s0, s1)

    def similarity(self, s0, s1):
        """Return ``|intersection| / |union|`` of the two shingle key sets.

        Equal inputs yield 1.0; if either input is shorter than the shingle
        size the result is 0.0.

        Raises:
            TypeError: if ``s0`` or ``s1`` is ``None``.
        """
        if s0 is None:
            raise TypeError("Argument s0 is NoneType.")
        if s1 is None:
            raise TypeError("Argument s1 is NoneType.")
        if s0 == s1:
            return 1.0
        if len(s0) < self.get_k() or len(s1) < self.get_k():
            return 0.0
        profile0 = self.get_profile(s0)
        profile1 = self.get_profile(s1)
        union = set(profile0.keys()).union(profile1.keys())
        # Inclusion-exclusion: |A ∩ B| = |A| + |B| - |A ∪ B|.
        inter = len(profile0.keys()) + len(profile1.keys()) - len(union)
        return 1.0 * inter / len(union)

similarity/jaccard_test.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import unittest
2+
3+
from .jaccard import Jaccard
4+
5+
6+
class TestJaccard(unittest.TestCase):
    """Smoke test that prints Jaccard distances and similarities for sample strings."""

    def test_jaccard(self):
        """Print the distance, then the similarity, for every sample pair."""
        a = Jaccard(1)
        s0, s1, s2, s3 = "", "", "上海", "上海市"
        # Same pairs, in the same order, for both measures.
        pairs = (
            (s0, s1),
            (s0, s2),
            (s0, s3),
            (s1, s2),
            (s1, s3),
            (s2, s3),
        )
        distance_format = "distance: {:.4}\t between {} and {}"
        similarity_format = "similarity: {:.4}\t between {} and {}"

        for left, right in pairs:
            print(distance_format.format(str(a.distance(left, right)), left, right))

        for left, right in pairs:
            print(similarity_format.format(str(a.similarity(left, right)), left, right))
29+
30+
31+
if __name__ == "__main__":
    # Run the test cases of this module when it is executed as a script.
    unittest.main()

similarity/jarowinkler.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
from .string_distance import NormalizedStringDistance
2+
from .string_similarity import NormalizedStringSimilarity
3+
4+
5+
class JaroWinkler(NormalizedStringSimilarity, NormalizedStringDistance):
    """Jaro-Winkler similarity and distance between two strings."""

    def __init__(self, threshold=0.7):
        """Store the Jaro score above which the common-prefix boost is applied."""
        self.threshold = threshold
        self.three = 3
        self.jw_coef = 0.1

    def get_threshold(self):
        """Return the configured boost threshold."""
        return self.threshold

    def similarity(self, s0, s1):
        """Return the Jaro-Winkler similarity of ``s0`` and ``s1``.

        Equal inputs yield 1.0 and inputs without any matching character
        yield 0.0. The prefix boost is added only when the plain Jaro score
        exceeds the threshold.

        Raises:
            TypeError: if ``s0`` or ``s1`` is ``None``.
        """
        if s0 is None:
            raise TypeError("Argument s0 is NoneType.")
        if s1 is None:
            raise TypeError("Argument s1 is NoneType.")
        if s0 == s1:
            return 1.0
        stats = self.matches(s0, s1)
        match_count = stats[0]
        if match_count == 0:
            return 0.0
        share_of_s0 = match_count / len(s0)
        share_of_s1 = match_count / len(s1)
        in_order_share = (match_count - stats[1]) / match_count
        jaro = (share_of_s0 + share_of_s1 + in_order_share) / self.three
        if jaro > self.get_threshold():
            # Scaling factor is capped by the reciprocal of the longer length.
            scaling = min(self.jw_coef, 1.0 / stats[self.three])
            return jaro + scaling * stats[2] * (1 - jaro)
        return jaro

    def distance(self, s0, s1):
        """Return ``1.0 - similarity(s0, s1)``."""
        return 1.0 - self.similarity(s0, s1)

    @staticmethod
    def matches(s0, s1):
        """Collect the matching statistics used by ``similarity``.

        Returns:
            A list ``[matches, transpositions // 2, common prefix length,
            length of the longer input]``.
        """
        if len(s0) > len(s1):
            longer, shorter = s0, s1
        else:
            longer, shorter = s1, s0
        # Half-width of the window in which characters may match.
        window = int(max(len(longer) / 2 - 1, 0))
        matched_at = [-1] * len(shorter)
        taken = [False] * len(longer)
        match_total = 0
        for pos, ch in enumerate(shorter):
            start = max(pos - window, 0)
            stop = min(pos + window + 1, len(longer))
            for candidate in range(start, stop):
                if not taken[candidate] and ch == longer[candidate]:
                    matched_at[pos] = candidate
                    taken[candidate] = True
                    match_total += 1
                    break

        # Matched characters of each input, in their own order.
        shorter_matched = [
            ch for ch, target in zip(shorter, matched_at) if target != -1
        ]
        longer_matched = [ch for ch, used in zip(longer, taken) if used]
        transpositions = 0
        for a, b in zip(shorter_matched, longer_matched):
            if a != b:
                transpositions += 1

        # Length of the common prefix of the two inputs.
        prefix = 0
        for a, b in zip(s0, s1):
            if a != b:
                break
            prefix += 1
        return [match_total, int(transpositions / 2), prefix, len(longer)]

0 commit comments

Comments
 (0)