Skip to content

Commit 0e8d38a

Browse files
authored
pybind: replaced input type to bytearray - MOD-4597 (#304)
* replaced input type to bytearray * added a module to convert numpy to bytearray * moved mybytearray to python bindings lib
1 parent e72cd0d commit 0e8d38a

File tree

4 files changed

+86
-16
lines changed

4 files changed

+86
-16
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ def build_extension(self, ext):
8282
["cmake", "--build", "."] + build_args, cwd=self.build_temp
8383
)
8484

85-
8685
# The information here can also be placed in setup.cfg - better separation of
8786
# logic and declaration, and simpler if you include description/version in a file.
8887
setup(
@@ -93,5 +92,6 @@ def build_extension(self, ext):
9392
description="Python library around collection of vector similarity algorithm",
9493
long_description="",
9594
ext_modules=[CMakeExtension("VecSim", "src/python_bindings")],
95+
py_modules=['src/python_bindings/Mybytearray'],
9696
cmdclass={"build_ext": CMakeBuild}
9797
)

src/python_bindings/Mybytearray.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
# Returns a new bytearray built by passing np_arr to the built-in bytearray()
# constructor. The C++ bindings import this function by name
# ("create_bytearray") from this module.
def create_bytearray(np_arr):
3+
return bytearray(np_arr)

src/python_bindings/bindings.cpp

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -69,24 +69,32 @@ class PyBatchIterator {
6969
// Resets the wrapped batch iterator by calling VecSimBatchIterator_Reset on it.
void reset() { VecSimBatchIterator_Reset(batchIterator.get()); }
7070
// Virtual destructor with an empty body; no explicit cleanup is performed here.
virtual ~PyBatchIterator() {}
7171
};
72+
// The @input and @query arguments are py::object instances (numpy arrays are acceptable).
73+
74+
// To convert input or query to a pointer use input_to_blob(input)
75+
// For example:
76+
// VecSimIndex_AddVector(index, input_to_blob(input), id);
7277

7378
class PyVecSimIndex {
7479
public:
75-
PyVecSimIndex() {}
76-
77-
PyVecSimIndex(const VecSimParams &params) { index = VecSimIndex_New(&params); }
80+
// Default constructor: imports the Python module
// "src.python_bindings.Mybytearray" and stores its "create_bytearray"
// attribute in the create_bytearray member. `index` is not assigned here.
// NOTE(review): presumably derived classes assign `index` themselves - confirm.
PyVecSimIndex()
81+
: create_bytearray(
82+
py::module::import("src.python_bindings.Mybytearray").attr("create_bytearray")) {}
83+
84+
// Constructs the wrapper from generic index parameters.
// Stores the Python "create_bytearray" callable imported from
// "src.python_bindings.Mybytearray", then creates the underlying index
// with VecSimIndex_New(&params).
PyVecSimIndex(const VecSimParams &params)
85+
: create_bytearray(
86+
py::module::import("src.python_bindings.Mybytearray").attr("create_bytearray")) {
87+
index = VecSimIndex_New(&params);
88+
}
7889

7990
// Passes `input` and the label `id` to VecSimIndex_AddVector for this index.
// `input` is a Python object (e.g. a numpy array) that is converted to a raw
// data pointer before the call.
void addVector(py::object input, size_t id) {
80-
py::array_t<float, py::array::c_style | py::array::forcecast> items(input);
81-
VecSimIndex_AddVector(index, (void *)items.data(0), id);
91+
VecSimIndex_AddVector(index, input_to_blob(input), id);
8292
}
83-
8493
// Forwards `id` to VecSimIndex_DeleteVector for this index.
void deleteVector(size_t id) { VecSimIndex_DeleteVector(index, id); }
8594

8695
py::object knn(py::object input, size_t k, VecSimQueryParams *query_params) {
87-
py::array_t<float, py::array::c_style | py::array::forcecast> items(input);
8896
VecSimQueryResult_List res =
89-
VecSimIndex_TopKQuery(index, (void *)items.data(0), k, query_params, BY_SCORE);
97+
VecSimIndex_TopKQuery(index, input_to_blob(input), k, query_params, BY_SCORE);
9098
if (VecSimQueryResult_Len(res) != k) {
9199
throw std::runtime_error("Cannot return the results in a contiguous 2D array. Probably "
92100
"ef or M is too small");
@@ -95,27 +103,32 @@ class PyVecSimIndex {
95103
}
96104

97105
// Runs VecSimIndex_RangeQuery for the query `input` with the given `radius`
// and optional `query_params`, passing BY_SCORE as the ordering argument.
// Returns the result list wrapped by wrap_results, sized by
// VecSimQueryResult_Len(res).
py::object range(py::object input, double radius, VecSimQueryParams *query_params) {
98-
py::array_t<float, py::array::c_style | py::array::forcecast> items(input);
99106
VecSimQueryResult_List res =
100-
VecSimIndex_RangeQuery(index, (void *)items.data(0), radius, query_params, BY_SCORE);
107+
VecSimIndex_RangeQuery(index, input_to_blob(input), radius, query_params, BY_SCORE);
101108
return wrap_results(res, VecSimQueryResult_Len(res));
102109
}
103110

104111
// Returns the value reported by VecSimIndex_IndexSize for this index.
size_t indexSize() { return VecSimIndex_IndexSize(index); }
105112

106-
// Creates a batch iterator over this index for the given query vector.
// The query is converted to a raw data pointer, handed to
// VecSimBatchIterator_New together with `query_params`, and the resulting
// iterator is returned wrapped in a PyBatchIterator.
PyBatchIterator createBatchIterator(py::object &query_blob, VecSimQueryParams *query_params) {
107-
py::array_t<float, py::array::c_style | py::array::forcecast> items(query_blob);
108-
float *vector_data = (float *)items.data(0);
109-
return PyBatchIterator(VecSimBatchIterator_New(index, vector_data, query_params));
113+
PyBatchIterator createBatchIterator(py::object input, VecSimQueryParams *query_params) {
114+
return PyBatchIterator(VecSimBatchIterator_New(index, input_to_blob(input), query_params));
110115
}
111116

112117
// Virtual destructor: passes `index` to VecSimIndex_Free.
virtual ~PyVecSimIndex() { VecSimIndex_Free(index); }
113118

114119
protected:
115120
VecSimIndex *index;
121+
122+
private:
123+
// save the bytearray to keep its pointer valid
124+
py::bytearray tmp_bytearray;
125+
const py::function create_bytearray;
126+
// Converts a Python object (e.g. a numpy array) to a raw byte pointer.
// Calls the stored Python create_bytearray callable on `input`, keeps the
// resulting bytearray in the tmp_bytearray member, and returns the pointer
// obtained from PyByteArray_AS_STRING on that member.
// NOTE(review): each call reassigns tmp_bytearray, so a pointer returned by an
// earlier call presumably stops being valid after the next call - confirm no
// caller holds the pointer across calls.
const char *input_to_blob(py::object input) {
127+
tmp_bytearray = create_bytearray(input);
128+
return PyByteArray_AS_STRING(tmp_bytearray.ptr());
129+
}
116130
};
117131

118-
// Currently supports only floats. TODO change after serializer refactoring
119132
class PyHNSWLibIndex : public PyVecSimIndex {
120133
public:
121134
PyHNSWLibIndex(const HNSWParams &hnsw_params) {

tests/flow/test_bruteforce.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,60 @@
44

55
from common import *
66

7+
# Sanity test for the brute-force index.
# For every combination of data type (FLOAT32, FLOAT64) and metric (Cosine, L2)
# it builds an index of 10 random 16-dimensional vectors, runs knn_query with
# k=10, and checks the returned labels and distances against values computed
# directly with the matching scipy distance function.
def test_sanity_bf():
8+
# Holds one index plus the data, query and reference distance function
# for a single (data type, metric) combination.
class TestData:
9+
# np_fuc is the callable applied to the random arrays; the caller below
# passes np.float32 or np.float64.
def __init__(self, data_type, metric, dist_func, np_fuc):
10+
dim = 16
11+
num_elements = 10
12+
params = VecSimParams()
13+
bfparams = BFParams()
14+
15+
params.algo = VecSimAlgo_BF
16+
bfparams.initialCapacity = num_elements
17+
bfparams.blockSize = num_elements
18+
bfparams.dim = dim
19+
bfparams.type = data_type
20+
bfparams.metric = metric
21+
22+
params.bfParams = bfparams
23+
24+
self.index = VecSimIndex(params)
25+
26+
self.metric = metric
27+
self.type = data_type
28+
self.dist_func = dist_func
29+
30+
# fixed seed, so every TestData instance draws the same random values
np.random.seed(47)
31+
self.data = np_fuc(np.random.random((num_elements, dim)))
32+
self.query = np_fuc(np.random.random((1, dim)))
33+
self.vectors = []
34+
# each row is stored locally as (label, vector) and added to the index under the same label
for i, vector in enumerate(self.data):
35+
self.vectors.append((i, vector))
36+
self.index.add_vector(vector, i)
37+
38+
# Computes dist_func(query, vec) for every stored vector, sorts the
# (distance, key) pairs and returns the first k as (keys, dists).
def measure_dists(self, k):
39+
dists = [(self.dist_func(self.query.flat, vec), key) for key, vec in self.vectors]
40+
dists = sorted(dists)[:k]
41+
keys = [key for _, key in dists]
42+
dists = [dist for dist, _ in dists]
43+
return (keys, dists)
44+
45+
test_datas = []
46+
47+
# (index metric constant, reference scipy distance function) pairs
dist_funcs = [(VecSimMetric_Cosine, spatial.distance.cosine), (VecSimMetric_L2, spatial.distance.sqeuclidean)]
48+
# (index data type constant, numpy type used to build the data) pairs
types = [(VecSimType_FLOAT32, np.float32), (VecSimType_FLOAT64, np.float64)]
49+
for type_name, np_type in types:
50+
for dist_name, dist_func in dist_funcs:
51+
test_datas.append(TestData(type_name, dist_name, dist_func, np_type))
52+
53+
k = 10
54+
for test_data in test_datas:
55+
56+
keys, dists = test_data.measure_dists(k)
57+
bf_labels, bf_distances = test_data.index.knn_query(test_data.query, k=k)
58+
# reference values are wrapped in a list before being compared with the query results
assert_allclose(bf_labels, [keys], rtol=1e-5, atol=0)
59+
assert_allclose(bf_distances, [dists], rtol=1e-5, atol=0)
60+
print(f"\nsanity test for {test_data.metric} and {test_data.type} pass")
761

862
def test_bf_cosine():
963
dim = 128

0 commit comments

Comments
 (0)