diff --git a/graphtools/graphs.py b/graphtools/graphs.py
index c53a1dc..d0317d2 100644
--- a/graphtools/graphs.py
+++ b/graphtools/graphs.py
@@ -1201,16 +1201,45 @@ def build_landmark_op(self):
             n_samples = self.data.shape[0]
             rng = np.random.default_rng(self.random_state)
             landmark_indices = rng.choice(n_samples, self.n_landmark, replace=False)
-            data = (
-                self.data if not hasattr(self, "data_nu") else self.data_nu
-            )  # because of the scaling to review
-            if (
-                n_samples > 5000 and self.distance == "euclidean"
-            ):  # sklearn.euclidean_distances is faster than cdist for big dataset
-                distances = euclidean_distances(data, data[landmark_indices])
+            precomputed = getattr(self, "precomputed", None)
+
+            if precomputed is not None:
+                # Assign by kernel affinity from the precomputed matrix instead of falling back to Euclidean distances
+                landmark_affinities = self.kernel[:, landmark_indices]
+
+                if sparse.issparse(landmark_affinities):
+                    landmark_affinities = landmark_affinities.tocsr()
+                    cluster_assignments = np.asarray(
+                        landmark_affinities.argmax(axis=1)
+                    ).reshape(-1)
+                    row_max = matrix.to_array(
+                        landmark_affinities.max(axis=1)
+                    ).reshape(-1)
+                else:
+                    landmark_affinities = np.asarray(landmark_affinities)
+                    cluster_assignments = np.argmax(landmark_affinities, axis=1)
+                    row_max = np.max(landmark_affinities, axis=1)
+
+                if np.any(row_max == 0):
+                    warnings.warn(
+                        "Some samples have zero affinity to all randomly selected landmarks; "
+                        "increase n_landmark or ensure the affinity matrix connects all points.",
+                        RuntimeWarning,
+                    )
+                self._clusters = cluster_assignments
             else:
-                distances = cdist(data, data[landmark_indices], metric=self.distance)
-            self._clusters = np.argmin(distances, axis=1)
+                data = (
+                    self.data if not hasattr(self, "data_nu") else self.data_nu
+                )  # use the scaled data (data_nu) when available
+                if (
+                    n_samples > 5000 and self.distance == "euclidean"
+                ):  # sklearn's euclidean_distances is faster than cdist on large datasets
+                    distances = euclidean_distances(data, data[landmark_indices])
+                else:
+                    distances = cdist(
+                        data, data[landmark_indices], metric=self.distance
+                    )
+                self._clusters = np.argmin(distances, axis=1)
         else:
             with _logger.log_task("SVD"):
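For reference, the assignment rule introduced above reduces to an argmax over the sampled landmark columns of the kernel. A minimal standalone sketch, assuming a dense or scipy-sparse kernel K and an integer index array landmarks (names are illustrative; plain scipy stands in for graphtools' matrix.to_array helper):

    import numpy as np
    from scipy import sparse

    def assign_to_landmarks(K, landmarks):
        # Restrict the kernel to the columns of the sampled landmarks.
        cols = K[:, landmarks]
        if sparse.issparse(cols):
            cols = cols.tocsr()
            clusters = np.asarray(cols.argmax(axis=1)).reshape(-1)
            row_max = np.asarray(cols.max(axis=1).todense()).reshape(-1)
        else:
            cols = np.asarray(cols)
            clusters = np.argmax(cols, axis=1)
            row_max = np.max(cols, axis=1)
        # A zero row maximum means the sample touches no landmark at all.
        return clusters, row_max

Each sample joins the cluster of its highest-affinity landmark, so no ambient coordinates are needed; that is what makes the precomputed path possible.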
diff --git a/test/test_random_landmarking.py b/test/test_random_landmarking.py
index 329fb98..38fb3f3 100644
--- a/test/test_random_landmarking.py
+++ b/test/test_random_landmarking.py
@@ -8,6 +8,7 @@
 from load_tests import generate_swiss_roll
 from load_tests import graphtools
 from load_tests import np
+from load_tests import sp
 import pygsp
 import warnings
 
@@ -405,6 +406,208 @@ def test_random_landmarking_distance_parameter_consistency():
     assert len(G.clusters) == small_data.shape[0]
 
+
+def test_random_landmarking_with_precomputed_affinity():
+    """Random landmarking should work with precomputed affinity matrices"""
+    affinity = np.array(
+        [
+            [1.0, 0.8, 0.1, 0.0, 0.0, 0.0],
+            [0.8, 1.0, 0.2, 0.0, 0.0, 0.0],
+            [0.1, 0.2, 1.0, 0.9, 0.4, 0.0],
+            [0.0, 0.0, 0.9, 1.0, 0.5, 0.2],
+            [0.0, 0.0, 0.4, 0.5, 1.0, 0.9],
+            [0.0, 0.0, 0.0, 0.2, 0.9, 1.0],
+        ]
+    )
+    affinity = (affinity + affinity.T) / 2  # ensure symmetry
+    n_landmark = 3
+    random_state = 42
+
+    G = graphtools.Graph(
+        affinity,
+        precomputed="affinity",
+        n_landmark=n_landmark,
+        random_landmarking=True,
+        random_state=random_state,
+        knn=3,
+        thresh=0,
+    )
+
+    # Trigger landmark construction
+    _ = G.landmark_op
+
+    rng = np.random.default_rng(random_state)
+    landmark_indices = rng.choice(affinity.shape[0], n_landmark, replace=False)
+    expected_clusters = np.asarray(
+        G.kernel[:, landmark_indices].argmax(axis=1)
+    ).reshape(-1)
+
+    assert np.array_equal(G.clusters, expected_clusters)
+    assert G.transitions.shape == (affinity.shape[0], n_landmark)
+    assert G.landmark_op.shape == (n_landmark, n_landmark)
+
+
+def test_random_landmarking_with_precomputed_distance():
+    """Random landmarking should work with precomputed distance matrices"""
+    dist = np.array(
+        [
+            [0, 1, 4, 4, 4, 4],
+            [1, 0, 4, 4, 4, 4],
+            [4, 4, 0, 1, 4, 4],
+            [4, 4, 1, 0, 4, 4],
+            [4, 4, 4, 4, 0, 1],
+            [4, 4, 4, 4, 1, 0],
+        ]
+    )
+
+    n_landmark = 3
+    random_state = 42
+
+    G = graphtools.Graph(
+        dist,
+        precomputed="distance",
+        n_landmark=n_landmark,
+        random_landmarking=True,
+        random_state=random_state,
+        bandwidth=1,  # deterministic affinity: exp(-dist)
+        decay=1,
+        thresh=0,
+        knn=3,
+    )
+
+    # Trigger landmark construction
+    _ = G.landmark_op
+
+    rng = np.random.default_rng(random_state)
+    landmark_indices = rng.choice(dist.shape[0], n_landmark, replace=False)
+    expected_clusters = np.asarray(
+        G.kernel[:, landmark_indices].argmax(axis=1)
+    ).reshape(-1)
+
+    assert np.array_equal(G.clusters, expected_clusters)
+    assert G.transitions.shape == (dist.shape[0], n_landmark)
+    assert G.landmark_op.shape == (n_landmark, n_landmark)
+
+
+def test_random_landmarking_with_sparse_precomputed_affinity():
+    """Random landmarking should work with sparse precomputed affinity matrices"""
+    affinity = np.array(
+        [
+            [1.0, 0.8, 0.1, 0.0, 0.0, 0.0],
+            [0.8, 1.0, 0.2, 0.0, 0.0, 0.0],
+            [0.1, 0.2, 1.0, 0.9, 0.4, 0.0],
+            [0.0, 0.0, 0.9, 1.0, 0.5, 0.2],
+            [0.0, 0.0, 0.4, 0.5, 1.0, 0.9],
+            [0.0, 0.0, 0.0, 0.2, 0.9, 1.0],
+        ]
+    )
+    affinity = (affinity + affinity.T) / 2  # ensure symmetry
+    affinity_sparse = sp.csr_matrix(affinity)
+    n_landmark = 3
+    random_state = 42
+
+    G = graphtools.Graph(
+        affinity_sparse,
+        precomputed="affinity",
+        n_landmark=n_landmark,
+        random_landmarking=True,
+        random_state=random_state,
+        knn=3,
+        thresh=0,
+    )
+
+    # Trigger landmark construction
+    _ = G.landmark_op
+
+    rng = np.random.default_rng(random_state)
+    landmark_indices = rng.choice(affinity.shape[0], n_landmark, replace=False)
+    expected_clusters = np.asarray(
+        G.kernel[:, landmark_indices].argmax(axis=1)
+    ).reshape(-1)
+
+    assert np.array_equal(G.clusters, expected_clusters)
+    assert G.transitions.shape == (affinity.shape[0], n_landmark)
+    assert G.landmark_op.shape == (n_landmark, n_landmark)
+
+
+def test_random_landmarking_with_sparse_precomputed_distance():
+    """Random landmarking should work with sparse precomputed distance matrices"""
+    dist = np.array(
+        [
+            [0, 1, 4, 4, 4, 4],
+            [1, 0, 4, 4, 4, 4],
+            [4, 4, 0, 1, 4, 4],
+            [4, 4, 1, 0, 4, 4],
+            [4, 4, 4, 4, 0, 1],
+            [4, 4, 4, 4, 1, 0],
+        ]
+    )
+    dist_sparse = sp.csr_matrix(dist)
+
+    n_landmark = 3
+    random_state = 42
+
+    G = graphtools.Graph(
+        dist_sparse,
+        precomputed="distance",
+        n_landmark=n_landmark,
+        random_landmarking=True,
+        random_state=random_state,
+        bandwidth=1,  # deterministic affinity: exp(-dist)
+        decay=1,
+        thresh=0,
+        knn=3,
+    )
+
+    # Trigger landmark construction
+    _ = G.landmark_op
+
+    rng = np.random.default_rng(random_state)
+    landmark_indices = rng.choice(dist.shape[0], n_landmark, replace=False)
+    expected_clusters = np.asarray(
+        G.kernel[:, landmark_indices].argmax(axis=1)
+    ).reshape(-1)
+
+    assert np.array_equal(G.clusters, expected_clusters)
+    assert G.transitions.shape == (dist.shape[0], n_landmark)
+    assert G.landmark_op.shape == (n_landmark, n_landmark)
+
+
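+# Note for the regression test below: scipy's sparse argmax reports index 0
+# for an all-zero row, which is indistinguishable from a genuine assignment
+# to landmark 0. For example (illustrative values only):
+#   cols = sp.csr_matrix([[0.0, 0.0], [0.2, 0.9]])
+#   np.asarray(cols.argmax(axis=1)).reshape(-1)         # [0, 1] -- row 0's 0 is arbitrary
+#   np.asarray(cols.max(axis=1).todense()).reshape(-1)  # [0.0, 0.9] -- 0.0 flags row 0
+# Hence the implementation warns when any row maximum is exactly zero, which
+# is what the following test exercises.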
+def test_random_landmarking_zero_affinity_warning():
+    """Test warning when samples have zero affinity to all landmarks"""
+    # Create an affinity matrix where point 5 has no connection to other points
+    affinity = np.array(
+        [
+            [1.0, 0.8, 0.1, 0.0, 0.0, 0.0],
+            [0.8, 1.0, 0.2, 0.0, 0.0, 0.0],
+            [0.1, 0.2, 1.0, 0.9, 0.4, 0.0],
+            [0.0, 0.0, 0.9, 1.0, 0.5, 0.0],
+            [0.0, 0.0, 0.4, 0.5, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],  # isolated point
+        ]
+    )
+    affinity = (affinity + affinity.T) / 2  # ensure symmetry
+    n_landmark = 2
+    random_state = 42  # this seed selects landmarks that do not include point 5
+
+    # Should warn about zero affinity
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        G = graphtools.Graph(
+            affinity,
+            precomputed="affinity",
+            n_landmark=n_landmark,
+            random_landmarking=True,
+            random_state=random_state,
+            knn=3,
+            thresh=0,
+        )
+        _ = G.landmark_op
+
+    assert len(w) == 1
+    assert issubclass(w[0].category, RuntimeWarning)
+    assert "zero affinity to all randomly selected landmarks" in str(w[0].message)
+
+
 
 #############
 # Test API
 #############
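End-to-end, the new code path is exercised as in the tests above. A usage sketch (the affinity matrix here is arbitrary; the parameters mirror the test setup):

    import numpy as np
    import graphtools

    # Small symmetric affinity matrix over six samples.
    affinity = np.eye(6) + 0.5 * (np.eye(6, k=1) + np.eye(6, k=-1))

    G = graphtools.Graph(
        affinity,
        precomputed="affinity",
        n_landmark=3,
        random_landmarking=True,
        random_state=42,
        knn=3,
        thresh=0,
    )

    print(G.clusters)           # per-sample landmark assignment
    print(G.landmark_op.shape)  # (3, 3)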