diff --git a/README.md b/README.md
index 196560c..7e6ed41 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@ Multiscale PHATE
 ================
 
 [![Latest PyPi version](https://img.shields.io/pypi/v/multiscale_phate.svg)](https://pypi.org/project/multiscale_phate/)
-[![Travis CI Build](https://api.travis-ci.com/KrishnaswamyLab/Multiscale_PHATE.svg?branch=master)](https://travis-ci.com/KrishnaswamyLab/Multiscale_PHATE)
-[![Coverage Status](https://coveralls.io/repos/github/KrishnaswamyLab/Multiscale_PHATE/badge.svg?branch=master)](https://coveralls.io/github/KrishnaswamyLab/Multiscale_PHATE?branch=master)
+[![Travis CI Build](https://api.travis-ci.com/KrishnaswamyLab/Multiscale_PHATE.svg?branch=main)](https://travis-ci.com/KrishnaswamyLab/Multiscale_PHATE)
+[![Coverage Status](https://coveralls.io/repos/github/KrishnaswamyLab/Multiscale_PHATE/badge.svg?branch=main)](https://coveralls.io/github/KrishnaswamyLab/Multiscale_PHATE?branch=main)
 [![Twitter](https://img.shields.io/twitter/follow/KrishnaswamyLab.svg?style=social&label=Follow)](https://twitter.com/KrishnaswamyLab)
 [![GitHub stars](https://img.shields.io/github/stars/KrishnaswamyLab/Multiscale_PHATE.svg?style=social&label=Stars)](https://github.com/KrishnaswamyLab/Multiscale_PHATE/)
 [![Code style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
@@ -19,6 +19,12 @@ Installation
 ------------
 Multiscale PHATE is available on `pip`. Install by running the following in a terminal:
 
+```
+pip install --user multiscale_phate
+```
+
+If you wish to install from source, you may do so as follows:
+
 ```
 pip install --user git+https://github.com/KrishnaswamyLab/Multiscale_PHATE
 ```
diff --git a/multiscale_phate/compress.py b/multiscale_phate/compress.py
index 8f4d718..7080322 100644
--- a/multiscale_phate/compress.py
+++ b/multiscale_phate/compress.py
@@ -6,41 +6,40 @@ import scipy.spatial.distance
 
 
-def get_compression_features(N, features, n_pca, partitions, landmarks):
-    """Short summary.
+_logger = tasklogger.get_tasklogger("graphtools")
+
+
+def get_compression_features(N, features, n_pca, partitions):
+    """Short summary. TODO
 
     Parameters
     ----------
-    N : type
-        Description of parameter `N`.
-    features : type
-        Description of parameter `features`.
-    n_pca : type
-        Description of parameter `n_pca`.
-    partitions : type
-        Description of parameter `partitions`.
-    landmarks : type
-        Description of parameter `landmarks`.
+    N : type TODO
+        Description of parameter `N`. TODO
+    features : type TODO
+        Description of parameter `features`. TODO
+    n_pca : type TODO
+        Description of parameter `n_pca`. TODO
+    partitions : type TODO
+        Description of parameter `partitions`. TODO
 
     Returns
     -------
-    type
-        Description of returned object.
+    type TODO
+        Description of returned object. TODO
 
     """
-    if n_pca == None:
+    if n_pca is None:
        n_pca = min(N, features)
     if n_pca > 100:
         n_pca = 100
-    n_pca = 100
-
-    # if N<100000:
-    #    partitions=None
-    if partitions != None and partitions >= N:
+    if partitions is not None and partitions >= N:
         partitions = None
-    if partitions != None and partitions > 50000:
+    if partitions is not None and partitions > 50000:
         partitions = 50000
     elif N > 100000:
         partitions = 20000
@@ -49,16 +48,16 @@ def get_compression_features(N, features, n_pca, partitions, landmarks):
 
 
 def cluster_components(data_subset, num_cluster, size, random_state=None):
-    """Short summary.
+    """Short summary. 
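
Following the README changes above, a minimal usage sketch of the package as installed from PyPI. The synthetic data and parameter values are illustrative assumptions; only the operator name and the three-tuple return of `fit_transform` come from this changeset:

```python
import numpy as np
import multiscale_phate

# Illustrative synthetic data: 1000 observations, 50 features.
X = np.random.normal(size=(1000, 50))

# Build the diffusion condensation tree; fit_transform returns the
# embedding, cluster assignments, and aggregated point sizes.
mp_op = multiscale_phate.Multiscale_PHATE(random_state=42)
embedding, clusters, sizes = mp_op.fit_transform(X)
print(embedding.shape)  # (n_aggregated_points, 2)
```
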
TODO Parameters ---------- - data_subset : type - Description of parameter `data_subset`. - num_cluster : type - Description of parameter `num_cluster`. - size : type - Description of parameter `size`. + data_subset : type TODO + Description of parameter `data_subset`. TODO + num_cluster : type TODO + Description of parameter `num_cluster`. TODO + size : type TODO + Description of parameter `size`. TODO random_state : integer or numpy.RandomState, optional, default: None The generator used to initialize MiniBatchKMeans. If an integer is given, it fixes the seed. @@ -66,8 +67,8 @@ def cluster_components(data_subset, num_cluster, size, random_state=None): Returns ------- - type - Description of returned object. + type TODO + Description of returned object. TODO """ if data_subset.shape[0] == 1: @@ -91,18 +92,18 @@ def cluster_components(data_subset, num_cluster, size, random_state=None): def subset_data(data, desired_num_clusters, n_jobs, num_cluster=100, random_state=None): - """Short summary. + """Short summary. TODO Parameters ---------- - data : type - Description of parameter `data`. - desired_num_clusters : type - Description of parameter `desired_num_clusters`. - n_jobs : type - Description of parameter `n_jobs`. - num_cluster : type - Description of parameter `num_cluster`. + data : type TODO + Description of parameter `data`. TODO + desired_num_clusters : type TODO + Description of parameter `desired_num_clusters`. TODO + n_jobs : type TODO + Description of parameter `n_jobs`. TODO + num_cluster : type TODO + Description of parameter `num_cluster`. TODO random_state : integer or numpy.RandomState, optional, default: None The generator used to initialize MiniBatchKMeans. If an integer is given, it fixes the seed. @@ -110,13 +111,13 @@ def subset_data(data, desired_num_clusters, n_jobs, num_cluster=100, random_stat Returns ------- - type - Description of returned object. + type TODO + Description of returned object. TODO """ N = data.shape[0] size = int(N / desired_num_clusters) - with tasklogger.log_task("partitions"): + with _logger.task("partitions"): mbk = sklearn.cluster.MiniBatchKMeans( init="k-means++", @@ -156,19 +157,19 @@ def subset_data(data, desired_num_clusters, n_jobs, num_cluster=100, random_stat def merge_clusters(diff_pot_unmerged, clusters): - """Short summary. + """Short summary. TODO Parameters ---------- - diff_pot_unmerged : type - Description of parameter `diff_pot_unmerged`. - clusters : type - Description of parameter `clusters`. + diff_pot_unmerged : type TODO + Description of parameter `diff_pot_unmerged`. TODO + clusters : type TODO + Description of parameter `clusters`. TODO Returns ------- - type - Description of returned object. + type TODO + Description of returned object. TODO """ clusters_uni = np.unique(clusters) @@ -185,6 +186,22 @@ def merge_clusters(diff_pot_unmerged, clusters): def get_distance_from_centroids(centroids, data, clusters): + """Short summary. + + Parameters + ---------- + centroids : type + Description of parameter `centroids`. + data : type + Description of parameter `data`. + clusters : type + Description of parameter `clusters`. + + Returns + ------- + type + Description of returned object. + """ distance = np.zeros(centroids.shape[0]) for c in range(centroids.shape[0]): @@ -202,30 +219,30 @@ def get_distance_from_centroids(centroids, data, clusters): def map_update_data(centroids, data, new_data, partition_clusters, nn=5, n_jobs=10): - """Short summary. + """Short summary. 
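
The centroid-distance helper documented above records, per the comment in the following hunk, the maximum distance from each partition centroid to its members. A self-contained sketch of that idea on toy data (an illustrative reimplementation under stated assumptions, not the repository's exact body):

```python
import numpy as np
import scipy.spatial.distance

def max_distance_from_centroids(centroids, data, clusters):
    # For each centroid, find the distance to the farthest point
    # assigned to that partition (a per-partition radius).
    radius = np.zeros(centroids.shape[0])
    for c in range(centroids.shape[0]):
        members = data[clusters == c]
        if members.shape[0] > 0:
            dists = scipy.spatial.distance.cdist(centroids[c][None, :], members)
            radius[c] = dists.max()
    return radius

centroids = np.array([[0.0, 0.0], [5.0, 5.0]])
data = np.array([[0.1, 0.0], [0.0, 0.3], [5.2, 5.1]])
clusters = np.array([0, 0, 1])
print(max_distance_from_centroids(centroids, data, clusters))  # [0.3, ~0.224]
```
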
TODO
 
     Parameters
     ----------
-    centroids : type
-        Description of parameter `centroids`.
-    data : type
-        Description of parameter `data`.
-    new_data : type
-        Description of parameter `new_data`.
-    partition_clusters : type
-        Description of parameter `partition_clusters`.
-    nn : type
-        Description of parameter `nn`.
-    n_jobs : type
-        Description of parameter `n_jobs`.
+    centroids : type TODO
+        Description of parameter `centroids`. TODO
+    data : type TODO
+        Description of parameter `data`. TODO
+    new_data : type TODO
+        Description of parameter `new_data`. TODO
+    partition_clusters : type TODO
+        Description of parameter `partition_clusters`. TODO
+    nn : type TODO
+        Description of parameter `nn`. TODO
+    n_jobs : type TODO
+        Description of parameter `n_jobs`. TODO
 
     Returns
     -------
-    type
-        Description of returned object.
+    type TODO
+        Description of returned object. TODO
 
     """
-    with tasklogger.log_task("map to computed partitions"):
+    with _logger.task("map to computed partitions"):
         # getting max distance to each partition centroid
         distance_merged = get_distance_from_centroids(
             centroids, data, partition_clusters
         )
@@ -246,7 +263,7 @@ def map_update_data(centroids, data, new_data, partition_clusters, nn=5, n_jobs=
         for r in range(len(subset_partition_assignment)):
             c = 0
             while c < nn:
-                if parition_assignment_bool[r, c] == True:
+                if parition_assignment_bool[r, c]:
                     subset_partition_assignment[r] = neighbor_idx[r, c]
                     c = nn + 1
                     break
diff --git a/multiscale_phate/condense.py b/multiscale_phate/condense.py
index 97fc560..4a6251d 100644
--- a/multiscale_phate/condense.py
+++ b/multiscale_phate/condense.py
@@ -7,23 +7,25 @@ import scipy.spatial.distance
 import sklearn.metrics.pairwise
 
+_logger = tasklogger.get_tasklogger("graphtools")
+
 
 def comp(node, neigh, visited):
-    """Short summary.
+    """Short summary. TODO
 
     Parameters
     ----------
-    node : type
-        Description of parameter `node`.
-    neigh : type
-        Description of parameter `neigh`.
-    visited : type
-        Description of parameter `visited`.
+    node : type TODO
+        Description of parameter `node`. TODO
+    neigh : type TODO
+        Description of parameter `neigh`. TODO
+    visited : type TODO
+        Description of parameter `visited`. TODO
 
     Returns
     -------
-    type
-        Description of returned object.
+    type TODO
+        Description of returned object. TODO
 
     """
     vis = visited.add
@@ -37,17 +39,17 @@ def comp(node, neigh, visited):
 
 
 def merge_common(lists):
-    """Short summary.
+    """Short summary. TODO
 
     Parameters
     ----------
-    lists : type
-        Description of parameter `lists`.
+    lists : type TODO
+        Description of parameter `lists`. TODO
 
     Returns
     -------
-    type
-        Description of returned object.
+    type TODO
+        Description of returned object. TODO
 
     """
     neigh = collections.defaultdict(set)
@@ -62,46 +64,46 @@ def merge_common(lists):
 
 
 def compute_condensation_param(X, granularity):
-    """Short summary.
+    """Short summary. TODO
 
     Parameters
     ----------
-    X : type
-        Description of parameter `X`.
-    granularity : type
-        Description of parameter `granularity`.
+    X : type TODO
+        Description of parameter `X`. TODO
+    granularity : type TODO
+        Description of parameter `granularity`. TODO
 
     Returns
     -------
-    type
-        Description of returned object.
+    type TODO
+        Description of returned object. 
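
Note on the `map_update_data` hunk above: the changeset originally replaced `== True` with `is True`, but `is True` silently fails on NumPy booleans, since element access yields a `numpy.bool_`, which is never the Python `True` singleton, so the branch would never fire. The hunk now uses plain truthiness instead. A quick demonstration of the pitfall:

```python
import numpy as np

mask = np.array([True, False])

print(mask[0] is True)   # False: numpy.bool_ is not the Python True object
print(mask[0] == True)   # True, but linters flag comparison to True (E712)
print(bool(mask[0]))     # True: plain truthiness is the idiomatic check
```
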
TODO """ epsilon = granularity * (0.1 * np.mean(np.std(X))) / (X.shape[0] ** (-1 / 5)) D = scipy.spatial.distance.pdist(X, metric="euclidean") merge_threshold = np.percentile(D, 0.001) + 0.001 - tasklogger.log_info("Setting epsilon to " + str(round(epsilon, 4))) - tasklogger.log_info("Setting merge threshold to " + str(round(merge_threshold, 4))) + _logger.info("Setting epsilon to " + str(round(epsilon, 4))) + _logger.info("Setting merge threshold to " + str(round(merge_threshold, 4))) return epsilon, merge_threshold def condense(X, clusters, scale, epsilon, merge_threshold, n_jobs, random_state=None): - """Short summary. + """Short summary. TODO Parameters ---------- - X : type - Description of parameter `X`. - clusters : type - Description of parameter `clusters`. - scale : type - Description of parameter `scale`. - epsilon : type - Description of parameter `epsilon`. - merge_threshold : type - Description of parameter `merge_threshold`. - n_jobs : type - Description of parameter `n_jobs`. + X : type TODO + Description of parameter `X`. TODO + clusters : type TODO + Description of parameter `clusters`. TODO + scale : type TODO + Description of parameter `scale`. TODO + epsilon : type TODO + Description of parameter `epsilon`. TODO + merge_threshold : type TODO + Description of parameter `merge_threshold`. TODO + n_jobs : type TODO + Description of parameter `n_jobs`. TODO random_state : integer or numpy.RandomState, optional, default: None The generator used to initialize graphtools. If an integer is given, it fixes the seed. @@ -109,8 +111,8 @@ def condense(X, clusters, scale, epsilon, merge_threshold, n_jobs, random_state= Returns ------- - type - Description of returned object. + type TODO + Description of returned object. TODO """ NxT = [] @@ -118,8 +120,6 @@ def condense(X, clusters, scale, epsilon, merge_threshold, n_jobs, random_state= NxT.append(clusters) X_cont = [] - N = X.shape[0] - for c in range(len(np.unique(clusters))): loc = np.where(c == clusters)[0] X_cont.append(list(loc)) @@ -130,7 +130,7 @@ def condense(X, clusters, scale, epsilon, merge_threshold, n_jobs, random_state= X_list.append(X_1) P_list = [] merged = [] - with tasklogger.log_task("condensation"): + with _logger.task("condensation"): while X_1.shape[0] > 1: D = sklearn.metrics.pairwise.pairwise_distances( X_1, metric="euclidean", n_jobs=n_jobs diff --git a/multiscale_phate/diffuse.py b/multiscale_phate/diffuse.py index 7241cad..425b000 100644 --- a/multiscale_phate/diffuse.py +++ b/multiscale_phate/diffuse.py @@ -5,28 +5,32 @@ from . import compress +_logger = tasklogger.get_tasklogger("graphtools") + def compute_diffusion_potential( - data, N, decay, gamma, knn, landmarks=2000, n_jobs=10, random_state=None + data, N, decay, gamma, knn, landmarks=2000, n_jobs=10, verbose=0, random_state=None ): - """Short summary. + """Short summary. TODO Parameters ---------- - data : type - Description of parameter `data`. - N : type - Description of parameter `N`. - decay : type - Description of parameter `decay`. - gamma : type - Description of parameter `gamma`. - knn : type - Description of parameter `knn`. - landmarks : type - Description of parameter `landmarks`. - n_jobs : type - Description of parameter `n_jobs`. + data : type TODO + Description of parameter `data`. TODO + N : type TODO + Description of parameter `N`. TODO + decay : type TODO + Description of parameter `decay`. TODO + gamma : type TODO + Description of parameter `gamma`. TODO + knn : type TODO + Description of parameter `knn`. 
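
The condensation parameters above are derived from the data itself. A standalone restatement of that computation on toy data, using the same formulas as `compute_condensation_param` (the toy matrix and granularity value are assumptions):

```python
import numpy as np
import scipy.spatial.distance

def condensation_params(X, granularity):
    # Bandwidth grows with the overall spread of the data, scaled up by
    # N**(1/5) (dividing by N**(-1/5)) and by the user-facing granularity.
    epsilon = granularity * (0.1 * np.mean(np.std(X))) / (X.shape[0] ** (-1 / 5))
    # Merge threshold: slightly above the 0.001th percentile of pairwise
    # distances, so effectively-duplicate points collapse immediately.
    D = scipy.spatial.distance.pdist(X, metric="euclidean")
    merge_threshold = np.percentile(D, 0.001) + 0.001
    return epsilon, merge_threshold

X = np.random.normal(size=(200, 10))
epsilon, merge_threshold = condensation_params(X, granularity=0.1)
print(round(epsilon, 4), round(merge_threshold, 4))
```
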
TODO + landmarks : type TODO + Description of parameter `landmarks`. TODO + n_jobs : type TODO + Description of parameter `n_jobs`. TODO + verbose : `int`, optional (default: 0) + If `> 0`, print status messages random_state : integer or numpy.RandomState, optional, default: None The generator used to initialize PHATE and PCA. If an integer is given, it fixes the seed. @@ -34,23 +38,23 @@ def compute_diffusion_potential( Returns ------- - type - Description of returned object. + type TODO + Description of returned object. TODO """ - with tasklogger.log_task("diffusion potential"): + with _logger.task("diffusion potential"): - if landmarks != None and landmarks > data.shape[0]: + if landmarks is not None and landmarks > data.shape[0]: landmarks = None diff_op = phate.PHATE( - verbose=False, n_landmark=landmarks, decay=decay, gamma=gamma, n_pca=None, knn=knn, n_jobs=n_jobs, + verbose=verbose, random_state=random_state, ) diff_op.fit(data) @@ -68,25 +72,25 @@ def compute_diffusion_potential( def online_update_diffusion_potential(unmapped_data, diff_op, dp_pca): - """Short summary. + """Short summary. TODO Parameters ---------- - unmapped_data : type - Description of parameter `unmapped_data`. - diff_op : type - Description of parameter `diff_op`. - dp_pca : type - Description of parameter `dp_pca`. + unmapped_data : type TODO + Description of parameter `unmapped_data`. TODO + diff_op : type TODO + Description of parameter `diff_op`. TODO + dp_pca : type TODO + Description of parameter `dp_pca`. TODO Returns ------- - type - Description of returned object. + type TODO + Description of returned object. TODO """ - with tasklogger.log_task("extended diffusion potential"): - with tasklogger.log_task("extended kernel"): + with _logger.task("extended diffusion potential"): + with _logger.task("extended kernel"): # Extending kernel to new data transitions = diff_op.graph.extend_to_data(unmapped_data) diff --git a/multiscale_phate/embed.py b/multiscale_phate/embed.py index 07fec97..9214193 100644 --- a/multiscale_phate/embed.py +++ b/multiscale_phate/embed.py @@ -2,19 +2,21 @@ import phate import tasklogger +_logger = tasklogger.get_tasklogger("graphtools") + def repulsion(temp): - """Short summary. + """Short summary. TODO Parameters ---------- - temp : type - Description of parameter `temp`. + temp : type TODO + Description of parameter `temp`. TODO Returns ------- - type - Description of returned object. + type TODO + Description of returned object. TODO """ for r in range(temp.shape[0]): @@ -29,19 +31,19 @@ def repulsion(temp): def condense_visualization(merge_pairs, phate): - """Short summary. + """Short summary. TODO Parameters ---------- - merge_pairs : type - Description of parameter `merge_pairs`. - phate : type - Description of parameter `phate`. + merge_pairs : type TODO + Description of parameter `merge_pairs`. TODO + phate : type TODO + Description of parameter `phate`. TODO Returns ------- - type - Description of returned object. + type TODO + Description of returned object. TODO """ to_delete = [] @@ -54,56 +56,54 @@ def condense_visualization(merge_pairs, phate): def compute_gradient(Xs, merges): - """Short summary. + """Short summary. TODO Parameters ---------- - Xs : type - Description of parameter `Xs`. - merges : type - Description of parameter `merges`. + Xs : type TODO + Description of parameter `Xs`. TODO + merges : type TODO + Description of parameter `merges`. TODO Returns ------- - type - Description of returned object. + type TODO + Description of returned object. 
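
`compute_diffusion_potential` delegates to PHATE with the keyword wiring shown above. A minimal sketch of that construction on synthetic data; the data and parameter values are illustrative, and only the keyword arguments mirror the diff:

```python
import numpy as np
import phate

data = np.random.normal(size=(500, 20))  # stands in for PCA-reduced input

landmarks = 2000
if landmarks is not None and landmarks > data.shape[0]:
    landmarks = None  # fewer points than landmarks: landmarking is disabled

diff_op = phate.PHATE(
    n_landmark=landmarks,
    decay=40,
    gamma=1,
    n_pca=None,   # input is already PCA-compressed upstream
    knn=5,
    n_jobs=1,
    verbose=0,    # now forwarded from the caller instead of hard-coded False
    random_state=42,
)
diff_op.fit(data)  # the operator's diffusion potential is then PCA-compressed
```
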
TODO
 
     """
-    tasklogger.log_info("Computing gradient...")
+    _logger.info("Computing gradient...")
     gradient = []
     m = 0
     X = Xs[0]
-    for l in range(0, len(Xs) - 1):
-        if X.shape[0] != Xs[l + 1].shape[0]:
+    for layer in range(0, len(Xs) - 1):
+        if X.shape[0] != Xs[layer + 1].shape[0]:
             X_1 = condense_visualization(merges[m], X)
             m = m + 1
-            while X_1.shape[0] != Xs[l + 1].shape[0]:
+            while X_1.shape[0] != Xs[layer + 1].shape[0]:
                 X_1 = condense_visualization(merges[m], X_1)
                 m = m + 1
         else:
             X_1 = X
-        gradient.append(np.sum(np.abs(X_1 - Xs[l + 1])))
-        X = Xs[l + 1]
+        gradient.append(np.sum(np.abs(X_1 - Xs[layer + 1])))
+        X = Xs[layer + 1]
     return np.array(gradient)
 
 
 def get_levels(grad):
-    """Short summary.
+    """Short summary. TODO
 
     Parameters
     ----------
-    grad : type
-        Description of parameter `Xs`.
+    grad : type TODO
+        Description of parameter `grad`. TODO
 
     Returns
     -------
-    type
-        Description of returned object.
-
-
+    type TODO
+        Description of returned object. TODO
     """
-    tasklogger.log_info("Identifying salient levels of resolution...")
+    _logger.info("Identifying salient levels of resolution...")
     minimum = np.max(grad)
     levels = []
     levels.append(0)
@@ -125,17 +125,16 @@ def get_zoom_visualization(
     n_jobs,
     random_state=None,
 ):
-    """Short summary
+    """Short summary TODO
 
     Parameters
     ----------
-
+    TODO
     random_state : integer or numpy.RandomState, optional, default: None
         The generator used to initialize MDS.
         If an integer is given, it fixes the seed.
         Defaults to the global `numpy` random number generator
     """
-
     unique = np.unique(
         NxTs[zoom_visualization_level], return_index=True, return_counts=True
     )
@@ -148,56 +147,54 @@ def get_zoom_visualization(
 
 
 def compute_ideal_visualization_layer(gradient, Xs, min_cells=100):
-    """Short summary.
+    """Short summary. TODO
 
     Parameters
     ----------
-    gradient : type
-        Description of parameter `gradient`.
-    Xs : type
-        Description of parameter `Xs`.
-    min_cells : type
-        Description of parameter `min_cells`.
+    gradient : type TODO
+        Description of parameter `gradient`. TODO
+    Xs : type TODO
+        Description of parameter `Xs`. TODO
+    min_cells : type TODO
+        Description of parameter `min_cells`. TODO
 
     Returns
     -------
-    type
-        Description of returned object.
-
+    type TODO
+        Description of returned object. TODO
     """
     minimum = np.max(gradient)
     min_layer = 0
-    for l in range(1, len(Xs)):
-        if Xs[l].shape[0] < min_cells:
+    for layer in range(1, len(Xs)):
+        if Xs[layer].shape[0] < min_cells:
             break
-        if gradient[l] < minimum:
+        if gradient[layer] < minimum:
             # print("New minimum!")
-            minimum = gradient[l]
-            min_layer = l
+            minimum = gradient[layer]
+            min_layer = layer
     return min_layer
 
 
 def get_clusters_sizes_2(
     clusters_full, layer, NxT, X, repulse=False, n_jobs=10, random_state=None
 ):
-    """Short summary.
+    """Short summary. TODO
 
-    Parameters
     Parameters
     ----------
-    clusters_full : type
-        Description of parameter `clusters_full`.
-    layer : type
-        Description of parameter `layer`.
-    NxT : type
-        Description of parameter `NxT`.
-    X : type
-        Description of parameter `X`.
-    repulse : type
-        Description of parameter `repulse`.
-    n_jobs : type
-        Description of parameter `n_jobs`.
+    clusters_full : type TODO
+        Description of parameter `clusters_full`. TODO
+    layer : type TODO
+        Description of parameter `layer`. TODO
+    NxT : type TODO
+        Description of parameter `NxT`. TODO
+    X : type TODO
+        Description of parameter `X`. TODO
+    repulse : type TODO
+        Description of parameter `repulse`. TODO
+    n_jobs : type TODO
+        Description of parameter `n_jobs`. 
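
`compute_ideal_visualization_layer` above scans the gradient for new minima while layers still contain at least `min_cells` points. The same logic with precomputed layer sizes substituted for `Xs` (a hedged adaptation for readability, not the original function):

```python
import numpy as np

def ideal_layer(gradient, layer_sizes, min_cells=100):
    # Track the layer with the smallest gradient (the most stable
    # embedding), stopping once layers shrink below min_cells points.
    minimum = np.max(gradient)
    min_layer = 0
    for layer in range(1, len(layer_sizes)):
        if layer_sizes[layer] < min_cells:
            break
        if gradient[layer] < minimum:
            minimum = gradient[layer]
            min_layer = layer
    return min_layer

gradient = np.array([5.0, 3.2, 4.1, 1.7, 2.0])
sizes = [1000, 600, 300, 150, 40]
print(ideal_layer(gradient, sizes))  # 3: the last new minimum before the cutoff
```
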
TODO
     random_state : integer or numpy.RandomState, optional, default: None
         The generator used to initialize MDS.
         If an integer is given, it fixes the seed.
         Defaults to the global `numpy` random number generator
 
     Returns
     -------
-    type
-        Description of returned object.
-
+    type TODO
+        Description of returned object. TODO
     """
 
     unique = np.unique(NxT[layer], return_index=True, return_counts=True)
diff --git a/multiscale_phate/multiscale_phate.py b/multiscale_phate/multiscale_phate.py
index df0765d..53d08b0 100644
--- a/multiscale_phate/multiscale_phate.py
+++ b/multiscale_phate/multiscale_phate.py
@@ -1,8 +1,14 @@
+import tasklogger
+
 from . import tree, embed, utils, visualize
 
+_logger = tasklogger.get_tasklogger("graphtools")
+
 
 class Multiscale_PHATE(object):
-    """Multscale PHATE operator which performs dimensionality reduction and clustering across granularities.
+    """Multiscale PHATE operator.
+
+    Performs dimensionality reduction and clustering across granularities.
 
     Parameters
     ----------
@@ -39,6 +45,15 @@ class Multiscale_PHATE(object):
         used at all, which is useful for debugging. For n_jobs below -1,
         (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs
         but one are used
+    verbose : `int` or `boolean`, optional (default: 1)
+        If `True` or `> 0`, print status messages
+    random_state : integer or numpy.RandomState, optional, default: None
+        The generator used to initialize SMACOF (metric, nonmetric) MDS
+        If an integer is given, it fixes the seed
+        Defaults to the global `numpy` random number generator
+
+    Attributes
+    ----------
     NxTs : list of lists
         Cluster assignment for every point at all levels of Diffusion
         Condensation tree
@@ -70,37 +85,6 @@ class Multiscale_PHATE(object):
     levels : list
         List of salient resolutions for downstream analysis, computed
         via gradient analysis
-    random_state : integer or numpy.RandomState, optional, default: None
-        The generator used to initialize SMACOF (metric, nonmetric) MDS
-        If an integer is given, it fixes the seed
-        Defaults to the global `numpy` random number generator
-
-    Attributes
-    ----------
-    scale
-    landmarks
-    partitions
-    granularity
-    n_pca
-    decay
-    gamma
-    knn
-    n_jobs
-    NxTs
-    Xs
-    Ks
-    merges
-    Ps
-    diff_op
-    data_pca
-    pca_op
-    partition_clusters
-    dp_pca
-    epsilon
-    merge_threshold
-    gradient
-    levels
-
     """
 
     def __init__(
@@ -114,6 +98,7 @@ def __init__(
         gamma=1,
         knn=5,
         n_jobs=1,
+        verbose=1,
         random_state=None,
     ):
         self.scale = scale
@@ -125,7 +110,12 @@ def __init__(
         self.gamma = gamma
         self.knn = knn
         self.n_jobs = n_jobs
+        self.verbose = verbose
         self.random_state = random_state
+
+        _logger.set_level(int(verbose))
+
+        # TODO: remove all of the below? Why are they here
         self.NxTs = None
         self.Xs = None
         self.Ks = None
@@ -144,7 +134,7 @@ def __init__(
         super().__init__()
 
     def fit(self, X):
-        """Builds Diffusion Condensation tree and computes ideal resolutions.
+        """Build Diffusion Condensation tree and compute ideal resolutions.
 
         Parameters
         ----------
@@ -184,6 +174,7 @@ def fit(self, X):
             gamma=self.gamma,
             knn=self.knn,
             n_jobs=self.n_jobs,
+            verbose=self.verbose,
             random_state=self.random_state,
         )
 
@@ -201,6 +192,7 @@ def transform(
         repulse=False,
     ):
         """Short summary.
+
         Parameters
         ----------
         visualization_level : int, default = levels[-2]
             Level of the Diffusion Condensation tree to visualize.
         cluster_level : int, default = levels[-1]
             Level of the Diffusion Condensation tree at which to compute clusters.
         coarse_cluster_level : int, default = None
             Level of tree at which to identify coarse grain cluster to zoom in on.
         coarse_cluster : int, default = None
             Cluster in 'coarse_cluster_level' to zoom in on.
         repulse : bool, default = False
             Allows for repulsion between points in multiscale embedding.
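
The module-level `_logger = tasklogger.get_tasklogger("graphtools")` pattern introduced throughout this diff means every module shares one named logger, so the operator's `verbose` argument controls all status output through a single `set_level` call. A small demonstration of that API as it is used in this changeset (the example task is illustrative):

```python
import tasklogger

# All modules fetch the same named logger, so one set_level call
# controls status output everywhere.
_logger = tasklogger.get_tasklogger("graphtools")
_logger.set_level(1)  # 0 silences; 1 prints task start/completion messages

with _logger.task("example computation"):
    total = sum(i * i for i in range(1000))
_logger.info("Result: {}".format(total))
```
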
+
         Returns
         -------
         embedding : array, shape=[number of points in visualization_level, 2]
             Multiscale PHATE embedding of all points in visualization_level
         clusters : list, shape=[number of points in visualization_level]
             Cluster labels of all points in visualization_level as computed
             at cluster_level
         sizes : list, shape=[number of points in visualization_level]
             Number of points aggregated into each point as visualized
             at the granularity of visualization_level
         """
-
         if visualization_level is None:
             visualization_level = self.levels[2]
         if cluster_level is None:
             cluster_level = self.levels[-1]
@@ -255,7 +247,7 @@ def transform(
         )
 
     def build_tree(self):
-        """Computes and returns a tree from the Diffusion Condensation process.
+        """Compute and return a tree from the Diffusion Condensation process.
 
         Returns
         -------
@@ -268,8 +260,9 @@ def build_tree(self):
         )
 
     def fit_transform(self, X):
-        """Builds Diffusion Condensation tree, identifies ideal resolutions and returns
-        Multiscale PHATE embedding and clusters.
+        """Build Diffusion Condensation tree and identify ideal resolutions.
+
+        Returns Multiscale PHATE embedding and clusters.
 
         Parameters
         ----------
@@ -294,7 +287,7 @@ def fit_transform(self, X):
         return self.transform()
 
     def get_tree_clusters(self, cluster_level):
-        """Colors Diffusion Condensation tree by a granularity of clusters.
+        """Color Diffusion Condensation tree by a granularity of clusters.
 
         Parameters
         ----------
@@ -306,6 +299,5 @@ def get_tree_clusters(self, cluster_level):
         clusters_tree : list, shape=[n_points_aggregated]
             Cluster labels of each point in computed diffusion condensation
             tree as dictated by a granularity of the tree
-
         """
         return visualize.map_clusters_to_tree(self.NxTs[cluster_level], self.NxTs)
diff --git a/multiscale_phate/tree.py b/multiscale_phate/tree.py
index 807a454..4e7d2d1 100644
--- a/multiscale_phate/tree.py
+++ b/multiscale_phate/tree.py
@@ -3,6 +3,8 @@ import sklearn.decomposition
 
 from . import compress, diffuse, condense
 
+_logger = tasklogger.get_tasklogger("graphtools")
+
 
 def build_tree(
     data_input,
@@ -15,32 +17,35 @@ def build_tree(
     gamma=1,
     knn=5,
     n_jobs=10,
+    verbose=1,
     random_state=None,
 ):
-    """Short summary.
+    """Short summary. TODO
 
     Parameters
     ----------
-    data_input : type
-        Description of parameter `data_input`.
-    scale : type
-        Description of parameter `scale`.
-    landmarks : type
-        Description of parameter `landmarks`.
-    partitions : type
-        Description of parameter `partitions`.
-    granularity : type
-        Description of parameter `granularity`.
-    n_pca : type
-        Description of parameter `n_pca`.
-    decay : type
-        Description of parameter `decay`.
-    gamma : type
-        Description of parameter `gamma`.
-    knn : type
-        Description of parameter `knn`.
-    n_jobs : type
-        Description of parameter `n_jobs`.
+    data_input : type TODO
+        Description of parameter `data_input`. TODO
+    scale : type TODO
+        Description of parameter `scale`. TODO
+    landmarks : type TODO
+        Description of parameter `landmarks`. TODO
+    partitions : type TODO
+        Description of parameter `partitions`. TODO
+    granularity : type TODO
+        Description of parameter `granularity`. TODO
+    n_pca : type TODO
+        Description of parameter `n_pca`. TODO
+    decay : type TODO
+        Description of parameter `decay`. TODO
+    gamma : type TODO
+        Description of parameter `gamma`. TODO
+    knn : type TODO
+        Description of parameter `knn`. TODO
+    n_jobs : type TODO
+        Description of parameter `n_jobs`. TODO
+    verbose : `int`, optional (default: 1)
+        If `> 0`, print status messages
     random_state : integer or numpy.RandomState, optional, default: None
         The random number generator.
         If an integer is given, it fixes the seed.
         Defaults to the global `numpy` random number generator
 
     Returns
     -------
-    type
-        Description of returned object.
+    type TODO
+        Description of returned object. 
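
Because `fit` stores the salient resolutions in `self.levels`, `transform` can re-embed at any granularity without recomputing the tree. A hedged usage sketch with synthetic data; the level choices mirror the documented defaults:

```python
import numpy as np
import multiscale_phate

X = np.random.normal(size=(500, 30))
mp_op = multiscale_phate.Multiscale_PHATE(random_state=42)
mp_op.fit(X)

# Re-embed at a chosen salient resolution; clusters come from a finer level.
embedding, clusters, sizes = mp_op.transform(
    visualization_level=mp_op.levels[-2],
    cluster_level=mp_op.levels[-1],
)
```
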
TODO """ - with tasklogger.log_task("Multiscale PHATE tree"): + with _logger.task("Multiscale PHATE tree"): N, features = data_input.shape # Computing compression features n_pca, partitions = compress.get_compression_features( - N, features, n_pca, partitions, landmarks + N, features, n_pca, partitions ) - with tasklogger.log_task("PCA"): + with _logger.task("PCA"): pca_op = sklearn.decomposition.PCA(n_components=n_pca) data_pca = pca_op.fit_transform(np.array(data_input)) clusters = np.arange(N) # Subsetting if required - if partitions != None: + if partitions is not None: partition_clusters = compress.subset_data( data_pca, partitions, n_jobs=n_jobs, random_state=random_state ) @@ -74,7 +79,15 @@ def build_tree( clusters = partition_clusters X, diff_op, diff_pca = diffuse.compute_diffusion_potential( - data_pca, N, decay, gamma, knn, landmarks, n_jobs, random_state=random_state + data_pca, + N, + decay, + gamma, + knn, + landmarks, + n_jobs, + verbose=verbose - 1, + random_state=random_state, ) epsilon, merge_threshold = condense.compute_condensation_param( @@ -124,38 +137,38 @@ def online_update_tree( n_jobs=10, random_state=None, ): - """Short summary. + """Short summary. TODO Parameters ---------- - data_1 : type - Description of parameter `data_1`. - data_2 : type - Description of parameter `data_2`. - pca_centroid : type - Description of parameter `pca_centroid`. - pca_op : type - Description of parameter `pca_op`. - partitions : type - Description of parameter `partitions`. - diff_operator : type - Description of parameter `diff_operator`. - diff_pca_op : type - Description of parameter `diff_pca_op`. - Xs : type - Description of parameter `Xs`. - NxTs : type - Description of parameter `NxTs`. - Ks : type - Description of parameter `Ks`. - Merges : type - Description of parameter `Merges`. - Ps : type - Description of parameter `Ps`. - scale : type - Description of parameter `scale`. - n_jobs : type - Description of parameter `n_jobs`. + data_1 : type TODO + Description of parameter `data_1`. TODO + data_2 : type TODO + Description of parameter `data_2`. TODO + pca_centroid : type TODO + Description of parameter `pca_centroid`. TODO + pca_op : type TODO + Description of parameter `pca_op`. TODO + partitions : type TODO + Description of parameter `partitions`. TODO + diff_operator : type TODO + Description of parameter `diff_operator`. TODO + diff_pca_op : type TODO + Description of parameter `diff_pca_op`. TODO + Xs : type TODO + Description of parameter `Xs`. TODO + NxTs : type TODO + Description of parameter `NxTs`. TODO + Ks : type TODO + Description of parameter `Ks`. TODO + Merges : type TODO + Description of parameter `Merges`. TODO + Ps : type TODO + Description of parameter `Ps`. TODO + scale : type TODO + Description of parameter `scale`. TODO + n_jobs : type TODO + Description of parameter `n_jobs`. TODO random_state : integer or numpy.RandomState, optional, default: None The random number generator. If an integer is given, it fixes the seed. @@ -163,13 +176,13 @@ def online_update_tree( Returns ------- - type - Description of returned object. + type TODO + Description of returned object. 
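
The tree construction above runs PCA before anything else, with `n_pca` capped by `get_compression_features`. A schematic of just that first stage on toy data (the cap mirrors the logic earlier in this diff; everything else is assumed):

```python
import numpy as np
import sklearn.decomposition

data_input = np.random.normal(size=(300, 40))
N, features = data_input.shape

# get_compression_features caps n_pca at min(N, features) and at 100.
n_pca = min(N, features, 100)

pca_op = sklearn.decomposition.PCA(n_components=n_pca)
data_pca = pca_op.fit_transform(np.array(data_input))
print(data_pca.shape)  # (300, 40)
```
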
TODO
 
     """
-    with tasklogger.log_task("Multiscale PHATE tree mapping"):
+    with _logger.task("Multiscale PHATE tree mapping"):
         if data_1.shape[0] != len(np.unique(partitions)):
-            tasklogger.log_info("PCA compressing new data...")
+            _logger.info("PCA compressing new data...")
 
             data_pca_1 = pca_op.transform(np.array(data_1))
             data_pca_2 = pca_op.transform(np.array(data_2))
@@ -177,9 +190,10 @@ def online_update_tree(
             partition_assignments = compress.map_update_data(
                 pca_centroid, data_pca_1, data_pca_2, partitions, nn=5, n_jobs=n_jobs
             )
-            tasklogger.log_info(
-                "Points not mapped to partitions: "
-                + str(sum(partition_assignments == -1))
+            _logger.info(
+                "Points not mapped to partitions: {}".format(
+                    sum(partition_assignments == -1)
+                )
             )
 
             # creating new joint paritions mapping
@@ -223,7 +237,7 @@ def online_update_tree(
             else:
                 clusters = new_partition_clusters
 
-            tasklogger.log_info("Rebuilding condensation tree...")
+            _logger.info("Rebuilding condensation tree...")
 
             clusters_idx = []
             for c in clusters:
@@ -231,12 +245,12 @@ def online_update_tree(
 
             NxTs_l = []
 
-            for l in range(len(NxTs)):
-                NxTs_l.append(NxTs[l][clusters_idx])
+            for layer in range(len(NxTs)):
+                NxTs_l.append(NxTs[layer][clusters_idx])
 
             return NxTs_l, Xs, Ks, Merges, Ps, pca_centroid
         else:
-            tasklogger.log_info("PCA compressing new data...")
+            _logger.info("PCA compressing new data...")
             data_pca_2 = pca_op.transform(np.array(data_2))
             diff_pot_1 = diffuse.online_update_diffusion_potential(
                 data_pca_2, diff_operator, diff_pca_op
diff --git a/multiscale_phate/utils.py b/multiscale_phate/utils.py
index 63b013e..82c6125 100644
--- a/multiscale_phate/utils.py
+++ b/multiscale_phate/utils.py
@@ -2,17 +2,16 @@
 
 
 def hash_object(X):
-    """Short summary.
+    """Compute a hash of a Python object.
 
     Parameters
     ----------
-    X : type
-        Description of parameter `X`.
+    X : object
+        Object for which to compute the hash
 
     Returns
     -------
-    type
-        Description of returned object.
-
+    hash : int
+        Hash based on the pickle dump of X.
     """
     return hash(pickle.dumps(X))
diff --git a/multiscale_phate/version.py b/multiscale_phate/version.py
index 76fe15d..8f02035 100644
--- a/multiscale_phate/version.py
+++ b/multiscale_phate/version.py
@@ -1 +1 @@
-__version__ = "0.0"
+__version__ = "0.1.0a0"
diff --git a/multiscale_phate/visualize.py b/multiscale_phate/visualize.py
index e82a7b9..eafa23d 100644
--- a/multiscale_phate/visualize.py
+++ b/multiscale_phate/visualize.py
@@ -4,19 +4,21 @@
 
 from . import embed
 
+_logger = tasklogger.get_tasklogger("graphtools")
+
 
 def get_visualization(
     Xs, NxTs, cluster_level, visualization_level, repulse, random_state=None
 ):
-    """Short summary.
+    """Short summary. TODO
 
     Parameters
     ----------
-    Xs : type
+    Xs : type TODO
         Description of parameter `Xs`.
-    NxTs : type
+    NxTs : type TODO
         Description of parameter `NxTs`.
-    merges : type
+    merges : type TODO
         Description of parameter `merges`.
     random_state : integer or numpy.RandomState, optional, default: None
         The generator used to initialize MDS.
@@ -25,7 +27,7 @@ def get_visualization(
 
     Returns
     -------
-    type
+    type TODO
         Description of returned object.
 
     """
@@ -41,15 +43,15 @@ def get_visualization(
 
 
 def build_visualization(Xs, NxTs, merges, gradient, min_cells, random_state=None):
-    """Short summary.
+    """Short summary. TODO
 
     Parameters
     ----------
-    Xs : type
+    Xs : type TODO
         Description of parameter `Xs`.
-    NxTs : type
+    NxTs : type TODO
         Description of parameter `NxTs`.
-    merges : type
+    merges : type TODO
         Description of parameter `merges`. 
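
On the `hash_object` docstring rewritten above: `hash()` returns an `int`, and since Python salts the hashing of `bytes` per interpreter session, values are stable within a run but not across runs. Usage of the function as it appears verbatim in `utils.py`:

```python
import pickle

def hash_object(X):
    # Hash the pickled byte string of X; identical pickles hash identically
    # within one interpreter session.
    return hash(pickle.dumps(X))

params = {"knn": 5, "decay": 40}
print(hash_object(params) == hash_object({"knn": 5, "decay": 40}))  # True
```
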
random_state : integer or numpy.RandomState, optional, default: None The generator used to initialize MDS. @@ -58,11 +60,10 @@ def build_visualization(Xs, NxTs, merges, gradient, min_cells, random_state=None Returns ------- - type + type TODO Description of returned object. """ - min_layer = embed.compute_ideal_visualization_layer(gradient, Xs, min_cells) (hp_embedding, cluster_viz, sizes_viz,) = embed.get_clusters_sizes_2( np.array(NxTs[-35]), @@ -76,44 +77,60 @@ def build_visualization(Xs, NxTs, merges, gradient, min_cells, random_state=None def map_clusters_to_tree(clusters, NxTs): + """Short summary. + + Parameters + ---------- + clusters : type + Description of parameter `clusters`. + NxTs : type + Description of parameter `NxTs`. + + Returns + ------- + type + Description of returned object. + + """ clusters_tree = [] - for l in range(len(NxTs) - 1): - _, ind = np.unique(NxTs[l], return_index=True) + for layer in range(len(NxTs) - 1): + _, ind = np.unique(NxTs[layer], return_index=True) clusters_tree.extend(clusters[ind]) return clusters_tree def build_condensation_tree(data_pca, diff_op, NxT, merged_list, Ps): - """Short summary. + """Short summary. TODO Parameters ---------- - data_pca : type + data_pca : type TODO Description of parameter `data_pca`. - diff_op : type + diff_op : type TODO Description of parameter `diff_op`. - NxT : type + NxT : type TODO Description of parameter `NxT`. - merged_list : type + merged_list : type TODO Description of parameter `merged_list`. - Ps : type + Ps : type TODO Description of parameter `Ps`. Returns ------- - type + type TODO Description of returned object. """ - with tasklogger.log_task("base visualization"): + with _logger.task("base visualization"): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", category=RuntimeWarning, - message="Pre-fit PHATE should not be used to transform a new data matrix. " - "Please fit PHATE to the new data by running 'fit' with the new data.", + message="Pre-fit PHATE should not be used to transform a new data " + "matrix. 
Please fit PHATE to the new data by running 'fit' with the " + "new data.", ) tree_phate = diff_op.transform(data_pca) @@ -131,22 +148,22 @@ def build_condensation_tree(data_pca, diff_op, NxT, merged_list, Ps): m = 0 - with tasklogger.log_task("tree"): - for l in range(0, len(Ps)): - if len(np.unique(NxT[l])) != len(np.unique(NxT[l + 1])): + with _logger.task("tree"): + for layer in range(0, len(Ps)): + if len(np.unique(NxT[layer])) != len(np.unique(NxT[layer + 1])): tree_phate_1 = embed.condense_visualization(merged_list[m], tree_phate) m = m + 1 - if Ps[l].shape[0] != tree_phate_1.shape[0]: + if Ps[layer].shape[0] != tree_phate_1.shape[0]: tree_phate_1 = embed.condense_visualization( merged_list[m], tree_phate_1 ) m = m + 1 - tree_phate = Ps[l] @ tree_phate_1 + tree_phate = Ps[layer] @ tree_phate_1 embeddings.append( np.concatenate( [ tree_phate, - np.repeat(l + 1, tree_phate.shape[0]).reshape( + np.repeat(layer + 1, tree_phate.shape[0]).reshape( tree_phate.shape[0], 1 ), ], diff --git a/test/test.py b/test/test.py index 68369f3..7dc1ad6 100644 --- a/test/test.py +++ b/test/test.py @@ -74,4 +74,56 @@ def test_random_seed(): mp_op = multiscale_phate.Multiscale_PHATE(partitions=100, landmarks=50) hp_embedding, _, _ = mp_op.fit_transform(X) hp_embedding2, _, _ = mp_op.fit_transform(X) - # np.testing.assert_all_close(hp_embedding, hp_embedding2) + if hp_embedding.shape[0] == hp_embedding2.shape[0]: + assert not np.all(hp_embedding == hp_embedding2) + + +@parameterized.parameterized( + [ + # n_pca is None -> min(N, features) + (100, 50, None, 50), + (50, 100, None, 50), + # n_pca < min(N, features) -> n_pca + (100, 50, 25, 25), + # n_pca > 100 -> 100 + (200, 150, 200, 100), + (200, 150, 125, 100), + # n_pca > min(N, features) -> min(N, features) + (100, 50, 75, 50), + (50, 100, 75, 50), + (100, 50, 125, 50), + (50, 100, 125, 50), + ] +) +def test_compression_features_pca(N, features, n_pca, expected): + partitions = None + output, _ = multiscale_phate.compress.get_compression_features( + N, features, n_pca, partitions + ) + assert output == expected, (output, expected) + + +@parameterized.parameterized( + [ + # TODO: is this desired behavior? seems pathological + # partitions is None -> None + (100, None, None), + # partitions > N -> None + (100, 101, None), + (200000, 200001, None), + # partitions > 50000 -> 50000 + (110000, 50001, 50000), + # N > 100000 -> 20000 + (110000, None, 20000), + (110000, 100, 20000), + (110000, 50000, 20000), + (110000, 110001, 20000), + ] +) +def test_compression_features_partitions(N, partitions, expected): + n_pca = None + features = 50 + _, output = multiscale_phate.compress.get_compression_features( + N, features, n_pca, partitions + ) + assert output == expected, (output, expected) diff --git a/tutorial/10X_pbmc.ipynb b/tutorial/10X_pbmc.ipynb index 236d892..2c897b1 100644 --- a/tutorial/10X_pbmc.ipynb +++ b/tutorial/10X_pbmc.ipynb @@ -41,7 +41,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install --user -q git+https://github.com/KrishnaswamyLab/Multiscale_PHATE" + "!pip install --user -q multiscale_phate" ] }, {
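
The new parameterized tests above pin down the partition-capping rules of `get_compression_features`. Restated as a standalone function with a few of the test cases as assertions (a hedged mirror of the logic, kept here for readability):

```python
def cap_partitions(N, partitions):
    # Mirrors get_compression_features: drop partitioning when it would not
    # compress, cap at 50000, and force 20000 for very large inputs.
    if partitions is not None and partitions >= N:
        partitions = None
    if partitions is not None and partitions > 50000:
        partitions = 50000
    elif N > 100000:
        partitions = 20000
    return partitions

assert cap_partitions(100, 101) is None
assert cap_partitions(110000, 50001) == 50000
assert cap_partitions(110000, 100) == 20000
assert cap_partitions(110000, None) == 20000
```
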