2020from sklearn .utils .validation import check_is_fitted
2121from sklearn .exceptions import ConvergenceWarning
2222
23+ # cython implementation of swap step in PAM algorithm.
24+ from ._k_medoids_helper import _compute_optimal_swap , _build
25+
2326
2427class KMedoids (BaseEstimator , ClusterMixin , TransformerMixin ):
2528 """k-medoids clustering.
@@ -35,17 +38,27 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
3538 metric : string, or callable, optional, default: 'euclidean'
3639 What distance metric to use. See :func:metrics.pairwise_distances
3740
38- init : {'random', 'heuristic', 'k-medoids++'}, optional, default: 'heuristic'
41+ method : {'alternate', 'pam'}, default: 'alternate'
42+ Which algorithm to use.
43+
44+ init : {'random', 'heuristic', 'k-medoids++', 'build'}, optional, default: 'build'
3945 Specify medoid initialization method. 'random' selects n_clusters
4046 elements from the dataset. 'heuristic' picks the n_clusters points
4147 with the smallest sum distance to every other point. 'k-medoids++'
4248 follows an approach based on k-means++_, and in general, gives initial
4349 medoids which are more separated than those generated by the other methods.
50+ 'build' is a greedy initialization of the medoids used in the original PAM
51+ algorithm. Often 'build' is more efficient but slower than other
52+ initializations on big datasets and it is also very non-robust,
53+ if there are outliers in the dataset, use another initialization.
4454
4555 .. _k-means++: https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf
4656
4757 max_iter : int, optional, default : 300
48- Specify the maximum number of iterations when fitting.
58+ Specify the maximum number of iterations when fitting. It can be zero in
59+ which case only the initialization is computed which may be suitable for
60+ large datasets when the initialization is sufficiently efficient
61+ (i.e. for 'build' init).
4962
5063 random_state : int, RandomState instance or None, optional
5164 Specify random state for the random number generator. Used to
@@ -112,24 +125,25 @@ def __init__(
112125 self ,
113126 n_clusters = 8 ,
114127 metric = "euclidean" ,
128+ method = "alternate" ,
115129 init = "heuristic" ,
116130 max_iter = 300 ,
117131 random_state = None ,
118132 ):
119133 self .n_clusters = n_clusters
120134 self .metric = metric
135+ self .method = method
121136 self .init = init
122137 self .max_iter = max_iter
123138 self .random_state = random_state
124139
125- def _check_nonnegative_int (self , value , desc ):
140+ def _check_nonnegative_int (self , value , desc , strict = True ):
126141 """Validates if value is a valid integer > 0"""
127-
128- if (
129- value is None
130- or value <= 0
131- or not isinstance (value , (int , np .integer ))
132- ):
142+ if strict :
143+ negative = (value is None ) or (value <= 0 )
144+ else :
145+ negative = (value is None ) or (value < 0 )
146+ if negative or not isinstance (value , (int , np .integer )):
133147 raise ValueError (
134148 "%s should be a nonnegative integer. "
135149 "%s was given" % (desc , value )
@@ -140,10 +154,10 @@ def _check_init_args(self):
140154
141155 # Check n_clusters and max_iter
142156 self ._check_nonnegative_int (self .n_clusters , "n_clusters" )
143- self ._check_nonnegative_int (self .max_iter , "max_iter" )
157+ self ._check_nonnegative_int (self .max_iter , "max_iter" , False )
144158
145159 # Check init
146- init_methods = ["random" , "heuristic" , "k-medoids++" ]
160+ init_methods = ["random" , "heuristic" , "k-medoids++" , "build" ]
147161 if self .init not in init_methods :
148162 raise ValueError (
149163 "init needs to be one of "
@@ -183,15 +197,44 @@ def fit(self, X, y=None):
183197 )
184198 labels = None
185199
200+ if self .method == "pam" :
201+ # Compute the distance to the first and second closest points
202+ # among medoids.
203+ Djs , Ejs = np .sort (D [medoid_idxs ], axis = 0 )[[0 , 1 ]]
204+
186205 # Continue the algorithm as long as
187206 # the medoids keep changing and the maximum number
188207 # of iterations is not exceeded
208+
189209 for self .n_iter_ in range (0 , self .max_iter ):
190210 old_medoid_idxs = np .copy (medoid_idxs )
191211 labels = np .argmin (D [medoid_idxs , :], axis = 0 )
192212
193- # Update medoids with the new cluster indices
194- self ._update_medoid_idxs_in_place (D , labels , medoid_idxs )
213+ if self .method == "alternate" :
214+ # Update medoids with the new cluster indices
215+ self ._update_medoid_idxs_in_place (D , labels , medoid_idxs )
216+ elif self .method == "pam" :
217+ not_medoid_idxs = np .delete (np .arange (len (D )), medoid_idxs )
218+ optimal_swap = _compute_optimal_swap (
219+ D ,
220+ medoid_idxs .astype (np .intc ),
221+ not_medoid_idxs .astype (np .intc ),
222+ Djs ,
223+ Ejs ,
224+ self .n_clusters ,
225+ )
226+ if optimal_swap is not None :
227+ i , j , _ = optimal_swap
228+ medoid_idxs [medoid_idxs == i ] = j
229+
230+ # update Djs and Ejs with new medoids
231+ Djs , Ejs = np .sort (D [medoid_idxs ], axis = 0 )[[0 , 1 ]]
232+ else :
233+ raise ValueError (
234+ f"method={ self .method } is not supported. Supported methods "
235+ f"are 'pam' and 'alternate'."
236+ )
237+
195238 if np .all (old_medoid_idxs == medoid_idxs ):
196239 break
197240 elif self .n_iter_ == self .max_iter - 1 :
@@ -210,7 +253,7 @@ def fit(self, X, y=None):
210253
211254 # Expose labels_ which are the assignments of
212255 # the training data to clusters
213- self .labels_ = labels
256+ self .labels_ = np . argmin ( D [ medoid_idxs , :], axis = 0 )
214257 self .medoid_indices_ = medoid_idxs
215258 self .inertia_ = self ._compute_inertia (self .transform (X ))
216259
@@ -252,6 +295,10 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs):
252295 if min_cost < curr_cost :
253296 medoid_idxs [k ] = cluster_k_idxs [min_cost_idx ]
254297
298+ def _compute_cost (self , D , medoid_idxs ):
299+ """ Compute the cose for a given configuration of the medoids"""
300+ return self ._compute_inertia (D [:, medoid_idxs ])
301+
255302 def transform (self , X ):
256303 """Transforms X to cluster-distance space.
257304
@@ -339,6 +386,8 @@ def _initialize_medoids(self, D, n_clusters, random_state_):
339386 medoids = np .argpartition (np .sum (D , axis = 1 ), n_clusters - 1 )[
340387 :n_clusters
341388 ]
389+ elif self .init == "build" : # Build initialization
390+ medoids = _build (D , n_clusters ).astype (np .int64 )
342391 else :
343392 raise ValueError (f"init value '{ self .init } ' not recognized" )
344393
0 commit comments