From 2d65034a23b5fa1ca0df1625f93ff755f06c5629 Mon Sep 17 00:00:00 2001 From: MattScicluna Date: Thu, 18 Dec 2025 13:48:10 -0500 Subject: [PATCH 1/4] added random_landmarking support for precomputed distance/affinity --- graphtools/graphs.py | 47 ++++++++++++++++++++++++++------- test/test_random_landmarking.py | 40 ++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/graphtools/graphs.py b/graphtools/graphs.py index c53a1dc..403a540 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -1201,16 +1201,45 @@ def build_landmark_op(self): n_samples = self.data.shape[0] rng = np.random.default_rng(self.random_state) landmark_indices = rng.choice(n_samples, self.n_landmark, replace=False) - data = ( - self.data if not hasattr(self, "data_nu") else self.data_nu - ) # because of the scaling to review - if ( - n_samples > 5000 and self.distance == "euclidean" - ): # sklearn.euclidean_distances is faster than cdist for big dataset - distances = euclidean_distances(data, data[landmark_indices]) + precomputed = getattr(self, "precomputed", None) + + if precomputed is not None: + # Use the precomputed affinities/distances directly to avoid Euclidean fallback + landmark_affinities = self.kernel[:, landmark_indices] + + if sparse.issparse(landmark_affinities): + landmark_affinities = landmark_affinities.tocsr() + cluster_assignments = np.asarray( + landmark_affinities.argmax(axis=1) + ).reshape(-1) + row_max = matrix.to_array( + landmark_affinities.max(axis=1) + ).reshape(-1) + else: + landmark_affinities = np.asarray(landmark_affinities) + cluster_assignments = np.argmax(landmark_affinities, axis=1) + row_max = np.max(landmark_affinities, axis=1) + + if np.any(row_max == 0): + warnings.warn( + "Some samples have zero affinity to all randomly selected landmarks; " + "increase n_landmark or ensure the affinity matrix connects all points.", + RuntimeWarning, + ) + self._clusters = cluster_assignments else: - distances = cdist(data, data[landmark_indices], metric=self.distance) - self._clusters = np.argmin(distances, axis=1) + data = ( + self.data if not hasattr(self, "data_nu") else self.data_nu + ) # because of the scaling to review + if ( + n_samples > 5000 and self.distance == "euclidean" + ): # sklearn.euclidean_distances is faster than cdist for big dataset + distances = euclidean_distances(data, data[landmark_indices]) + else: + distances = cdist( + data, data[landmark_indices], metric=self.distance + ) + self._clusters = np.argmin(distances, axis=1) else: with _logger.log_task("SVD"): diff --git a/test/test_random_landmarking.py b/test/test_random_landmarking.py index 329fb98..0e23577 100644 --- a/test/test_random_landmarking.py +++ b/test/test_random_landmarking.py @@ -405,6 +405,46 @@ def test_random_landmarking_distance_parameter_consistency(): assert len(G.clusters) == small_data.shape[0] +def test_random_landmarking_with_precomputed_affinity(): + """Random landmarking should work with precomputed affinity matrices""" + affinity = np.array( + [ + [1.0, 0.8, 0.1, 0.0, 0.0, 0.0], + [0.8, 1.0, 0.2, 0.0, 0.0, 0.0], + [0.1, 0.2, 1.0, 0.9, 0.4, 0.0], + [0.0, 0.0, 0.9, 1.0, 0.5, 0.2], + [0.0, 0.0, 0.4, 0.5, 1.0, 0.9], + [0.0, 0.0, 0.0, 0.2, 0.9, 1.0], + ] + ) + affinity = (affinity + affinity.T) / 2 # ensure symmetry + n_landmark = 3 + random_state = 42 + + G = graphtools.Graph( + affinity, + precomputed="affinity", + n_landmark=n_landmark, + random_landmarking=True, + random_state=random_state, + knn=3, + thresh=0, + ) + + # Trigger landmark construction + _ = G.landmark_op + + rng = np.random.default_rng(random_state) + landmark_indices = rng.choice(affinity.shape[0], n_landmark, replace=False) + expected_clusters = np.asarray( + G.kernel[:, landmark_indices].argmax(axis=1) + ).reshape(-1) + + assert np.array_equal(G.clusters, expected_clusters) + assert G.transitions.shape == (affinity.shape[0], n_landmark) + assert G.landmark_op.shape == (n_landmark, n_landmark) + + ############# # Test API ############# From 45c96cae335a295719b19d47a5e1032fb67a3edd Mon Sep 17 00:00:00 2001 From: MattScicluna Date: Thu, 18 Dec 2025 14:16:36 -0500 Subject: [PATCH 2/4] added random landmarking precomputed distance test --- test/test_random_landmarking.py | 42 +++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/test/test_random_landmarking.py b/test/test_random_landmarking.py index 0e23577..80db6fa 100644 --- a/test/test_random_landmarking.py +++ b/test/test_random_landmarking.py @@ -445,6 +445,48 @@ def test_random_landmarking_with_precomputed_affinity(): assert G.landmark_op.shape == (n_landmark, n_landmark) +def test_random_landmarking_with_precomputed_distance(): + """Random landmarking should work with precomputed distance matrices""" + dist = np.array( + [ + [0, 1, 4, 4, 4, 4], + [1, 0, 4, 4, 4, 4], + [4, 4, 0, 1, 4, 4], + [4, 4, 1, 0, 4, 4], + [4, 4, 4, 4, 0, 1], + [4, 4, 4, 4, 1, 0], + ] + ) + + n_landmark = 3 + random_state = 42 + + G = graphtools.Graph( + dist, + precomputed="distance", + n_landmark=n_landmark, + random_landmarking=True, + random_state=random_state, + bandwidth=1, # deterministic affinity: exp(-dist) + decay=1, + thresh=0, + knn=3, + ) + + # Trigger landmark construction + _ = G.landmark_op + + rng = np.random.default_rng(random_state) + landmark_indices = rng.choice(dist.shape[0], n_landmark, replace=False) + expected_clusters = np.asarray( + G.kernel[:, landmark_indices].argmax(axis=1) + ).reshape(-1) + + assert np.array_equal(G.clusters, expected_clusters) + assert G.transitions.shape == (dist.shape[0], n_landmark) + assert G.landmark_op.shape == (n_landmark, n_landmark) + + ############# # Test API ############# From 4be342ab5855c820ec4382d31eb933a5019d0d07 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 19:30:37 +0000 Subject: [PATCH 3/4] Initial plan From 78f0c8949e82c99d8f7db8bb5658e4b4401d5e78 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 18 Dec 2025 19:35:21 +0000 Subject: [PATCH 4/4] Add test coverage for sparse matrices and zero affinity edge case, improve comment clarity Co-authored-by: MattScicluna <19255250+MattScicluna@users.noreply.github.com> --- graphtools/graphs.py | 2 +- test/test_random_landmarking.py | 121 ++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 1 deletion(-) diff --git a/graphtools/graphs.py b/graphtools/graphs.py index 403a540..d0317d2 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -1204,7 +1204,7 @@ def build_landmark_op(self): precomputed = getattr(self, "precomputed", None) if precomputed is not None: - # Use the precomputed affinities/distances directly to avoid Euclidean fallback + # Use affinities from the kernel computed from the precomputed matrix to avoid Euclidean fallback landmark_affinities = self.kernel[:, landmark_indices] if sparse.issparse(landmark_affinities): diff --git a/test/test_random_landmarking.py b/test/test_random_landmarking.py index 80db6fa..38fb3f3 100644 --- a/test/test_random_landmarking.py +++ b/test/test_random_landmarking.py @@ -8,6 +8,7 @@ from load_tests import generate_swiss_roll from load_tests import graphtools from load_tests import np +from load_tests import sp import pygsp import warnings @@ -487,6 +488,126 @@ def test_random_landmarking_with_precomputed_distance(): assert G.landmark_op.shape == (n_landmark, n_landmark) +def test_random_landmarking_with_sparse_precomputed_affinity(): + """Random landmarking should work with sparse precomputed affinity matrices""" + affinity = np.array( + [ + [1.0, 0.8, 0.1, 0.0, 0.0, 0.0], + [0.8, 1.0, 0.2, 0.0, 0.0, 0.0], + [0.1, 0.2, 1.0, 0.9, 0.4, 0.0], + [0.0, 0.0, 0.9, 1.0, 0.5, 0.2], + [0.0, 0.0, 0.4, 0.5, 1.0, 0.9], + [0.0, 0.0, 0.0, 0.2, 0.9, 1.0], + ] + ) + affinity = (affinity + affinity.T) / 2 # ensure symmetry + affinity_sparse = sp.csr_matrix(affinity) + n_landmark = 3 + random_state = 42 + + G = graphtools.Graph( + affinity_sparse, + precomputed="affinity", + n_landmark=n_landmark, + random_landmarking=True, + random_state=random_state, + knn=3, + thresh=0, + ) + + # Trigger landmark construction + _ = G.landmark_op + + rng = np.random.default_rng(random_state) + landmark_indices = rng.choice(affinity.shape[0], n_landmark, replace=False) + expected_clusters = np.asarray( + G.kernel[:, landmark_indices].argmax(axis=1) + ).reshape(-1) + + assert np.array_equal(G.clusters, expected_clusters) + assert G.transitions.shape == (affinity.shape[0], n_landmark) + assert G.landmark_op.shape == (n_landmark, n_landmark) + + +def test_random_landmarking_with_sparse_precomputed_distance(): + """Random landmarking should work with sparse precomputed distance matrices""" + dist = np.array( + [ + [0, 1, 4, 4, 4, 4], + [1, 0, 4, 4, 4, 4], + [4, 4, 0, 1, 4, 4], + [4, 4, 1, 0, 4, 4], + [4, 4, 4, 4, 0, 1], + [4, 4, 4, 4, 1, 0], + ] + ) + dist_sparse = sp.csr_matrix(dist) + + n_landmark = 3 + random_state = 42 + + G = graphtools.Graph( + dist_sparse, + precomputed="distance", + n_landmark=n_landmark, + random_landmarking=True, + random_state=random_state, + bandwidth=1, # deterministic affinity: exp(-dist) + decay=1, + thresh=0, + knn=3, + ) + + # Trigger landmark construction + _ = G.landmark_op + + rng = np.random.default_rng(random_state) + landmark_indices = rng.choice(dist.shape[0], n_landmark, replace=False) + expected_clusters = np.asarray( + G.kernel[:, landmark_indices].argmax(axis=1) + ).reshape(-1) + + assert np.array_equal(G.clusters, expected_clusters) + assert G.transitions.shape == (dist.shape[0], n_landmark) + assert G.landmark_op.shape == (n_landmark, n_landmark) + + +def test_random_landmarking_zero_affinity_warning(): + """Test warning when samples have zero affinity to all landmarks""" + # Create an affinity matrix where point 5 has no connection to other points + affinity = np.array( + [ + [1.0, 0.8, 0.1, 0.0, 0.0, 0.0], + [0.8, 1.0, 0.2, 0.0, 0.0, 0.0], + [0.1, 0.2, 1.0, 0.9, 0.4, 0.0], + [0.0, 0.0, 0.9, 1.0, 0.5, 0.0], + [0.0, 0.0, 0.4, 0.5, 1.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 1.0], # isolated point + ] + ) + affinity = (affinity + affinity.T) / 2 # ensure symmetry + n_landmark = 2 + random_state = 42 # This seed selects landmarks that don't include point 5 + + # Should warn about zero affinity + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + G = graphtools.Graph( + affinity, + precomputed="affinity", + n_landmark=n_landmark, + random_landmarking=True, + random_state=random_state, + knn=3, + thresh=0, + ) + _ = G.landmark_op + + assert len(w) == 1 + assert issubclass(w[0].category, RuntimeWarning) + assert "zero affinity to all randomly selected landmarks" in str(w[0].message) + + ############# # Test API #############