diff --git a/README.md b/README.md
index 0016035..797bcfa 100644
--- a/README.md
+++ b/README.md
@@ -10,10 +10,14 @@ Installing
 ----------
 For usage and installation instructions, see the [install guide](docs/install.rst).
 
+Examples
+--------
+You can find example usage of this package in the examples directory.
+
 Contributing
 ------------
 We welcome contributions! Please see our [contribution guide](CONTRIBUTING.md) for more information. Thank you!
 
 Level of Support
 ----------------
-We are planning on occasional updating this tool with no fixed schedule. Community involvement is encouraged through both issues and pull requests.
\ No newline at end of file
+We plan to update this tool occasionally, with no fixed schedule. Community involvement is encouraged through both issues and pull requests.
diff --git a/examples/clustering_example.py b/examples/clustering_example.py
new file mode 100644
index 0000000..839c2f6
--- /dev/null
+++ b/examples/clustering_example.py
@@ -0,0 +1,115 @@
+import sys
+import pickle
+
+import scanpy as sc
+import pandas as pd
+import numpy as np
+
+# Skip this line if transcriptomic_clustering is installed
+sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')
+
+import transcriptomic_clustering
+from transcriptomic_clustering.iterative_clustering import (
+    build_cluster_dict, iter_clust, OnestepKwargs
+)
+
+# Load the data that contains the raw counts in the 'X' slot. If using normalized data, skip the normalization step.
+adata = sc.read('path/to/your/data.h5ad')
+
+# Normalize the data. Skip if adata.X is already normalized. This is equivalent to
+# sc.pp.normalize_total(adata, target_sum=1e6) followed by sc.pp.log1p(adata).
+adata = transcriptomic_clustering.normalize(adata)
+
+# Add scVI latent space to the adata object. Skip if adata.obsm['X_scVI'] is already present.
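+# Note: the read below assumes the CSV rows are already in the same order as
+# adata.obs_names. If the CSV is indexed by cell name but possibly reordered,
+# align it first, e.g.: scvi = scvi.reindex(adata.obs_names)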
+scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
+adata.obsm['X_scVI'] = np.asarray(scvi)
+
+# Set up the clustering parameters
+def setup_transcriptomic_clustering():
+    means_vars_kwargs = {
+        'low_thresh': 0.6931472,
+        'min_cells': 4
+    }
+    highly_variable_kwargs = {
+        'max_genes': 4000
+    }
+    pca_kwargs = {  # not used if using a scVI latent space
+        'cell_select': 30000,
+        'n_comps': 50,
+        'svd_solver': 'randomized'
+    }
+    filter_pcs_kwargs = {  # not used if using a scVI latent space
+        'known_components': None,
+        'similarity_threshold': 0.7,
+        'method': 'zscore',
+        'zth': 2,
+        'max_pcs': None,
+    }
+    filter_known_modes_kwargs = {
+        # 'known_modes': known_modes_df,
+        'similarity_threshold': 0.7
+    }
+    latent_kwargs = {
+        'latent_component': "X_scVI"
+    }
+    cluster_louvain_kwargs = {
+        'k': 15,
+        'nn_measure': 'euclidean',
+        'knn_method': 'annoy',
+        'louvain_method': 'taynaud',
+        'weighting_method': 'jaccard',
+        'n_jobs': 30,
+        'resolution': 1.0,
+    }
+    merge_clusters_kwargs = {
+        'thresholds': {
+            'q1_thresh': 0.5,
+            'q2_thresh': None,
+            'cluster_size_thresh': 10,
+            'qdiff_thresh': 0.7,
+            'padj_thresh': 0.05,
+            'lfc_thresh': 1,
+            'score_thresh': 100,
+            'low_thresh': 0.6931472,
+            'min_genes': 5
+        },
+        'k': 4,
+        'de_method': 'ebayes'
+    }
+    onestep_kwargs = OnestepKwargs(
+        means_vars_kwargs = means_vars_kwargs,
+        highly_variable_kwargs = highly_variable_kwargs,
+        pca_kwargs = pca_kwargs,
+        filter_pcs_kwargs = filter_pcs_kwargs,
+        filter_known_modes_kwargs = filter_known_modes_kwargs,
+        latent_kwargs = latent_kwargs,
+        cluster_louvain_kwargs = cluster_louvain_kwargs,
+        merge_clusters_kwargs = merge_clusters_kwargs
+    )
+    return onestep_kwargs
+
+onestep_kwargs = setup_transcriptomic_clustering()
+
+# run the iterative clustering; a tmp folder is needed to store intermediate results
+clusters, markers = iter_clust(
+    adata,
+    min_samples=10,
+    onestep_kwargs=onestep_kwargs,
+    random_seed=123,
+    tmp_dir="/path/to/your/tmp"
+)
+
+# save the results in .pkl format, which will be used for final merging
+with open('clustering_results.pkl', 'wb') as f:
+    pickle.dump(clusters, f)
+
+with open('markers.pkl', 'wb') as f:
+    pickle.dump(markers, f)
+
+# (Optional) save the clustering results in a data frame.
+cluster_dict = build_cluster_dict(clusters)
+
+adata.obs["scrattch_py_cluster"] = ""
+col = adata.obs.columns.get_loc("scrattch_py_cluster")
+for cluster, cell_idx in cluster_dict.items():
+    # assign by position to avoid chained-assignment pitfalls
+    adata.obs.iloc[cell_idx, col] = cluster
+
+adata.obs.scrattch_py_cluster = adata.obs.scrattch_py_cluster.astype('category')
+
+res = pd.DataFrame({'sample_id': adata.obs_names, 'cl': adata.obs.scrattch_py_cluster})
+res.to_csv('/path/to/your/clustering_results.csv')
diff --git a/examples/final_merging_example.py b/examples/final_merging_example.py
new file mode 100644
index 0000000..9917c91
--- /dev/null
+++ b/examples/final_merging_example.py
@@ -0,0 +1,119 @@
+import sys
+import os
+import pickle
+
+import numpy as np
+import pandas as pd
+import scanpy as sc
+
+# Skip this line if transcriptomic_clustering is installed
+sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')
+
+from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs
+
+# Load the data that contains the raw counts in the 'X' slot. If adata.X is normalized, skip the next normalization step.
+adata = sc.read('path/to/your/adata.h5ad')
+
+# Normalize the data. Skip if adata.X is already normalized.
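+# (Optional sanity check, on the assumption that log1p(CPM) values top out
+# around ~15 while raw counts run much higher:)
+# print(adata.X.max())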
+sc.pp.normalize_total(adata, target_sum=1e6)
+sc.pp.log1p(adata)
+
+# Add scVI latent space to the adata object. Skip if adata.obsm['scVI'] is already present.
+scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
+adata.obsm['scVI'] = np.asarray(scvi)
+
+# Load the clustering results
+cl_pth = "/path/to/clustering_results"
+with open(os.path.join(cl_pth, 'clustering_results.pkl'), 'rb') as f:
+    clusters = pickle.load(f)
+
+# marker genes are only needed for computing PCA
+with open(os.path.join(cl_pth, 'markers.pkl'), 'rb') as f:
+    markers = pickle.load(f)
+
+# The first four kwargs dicts are used only when computing PCA; modify latent_kwargs if using a pre-computed latent space
+def setup_merging():
+    pca_kwargs = {
+        # 'cell_select': 30000,  # do not set this for final merging; we need to sample from each cluster when computing PCA
+        'n_comps': 50,
+        'svd_solver': 'randomized'
+    }
+    filter_pcs_kwargs = {
+        'known_components': None,
+        'similarity_threshold': 0.7,
+        'method': 'zscore',
+        'zth': 2,
+        'max_pcs': None}
+    filter_known_modes_kwargs = {
+        'known_modes': 'log2ngene',
+        'similarity_threshold': 0.7}
+    project_kwargs = {}
+    merge_clusters_kwargs = {
+        'thresholds': {
+            'q1_thresh': 0.5,
+            'q2_thresh': None,
+            'cluster_size_thresh': 10,
+            'qdiff_thresh': 0.7,
+            'padj_thresh': 0.05,
+            'lfc_thresh': 1,
+            'score_thresh': 100,
+            'low_thresh': 0.6931472,
+            'min_genes': 5
+        },
+        'k': 4,
+        'de_method': 'ebayes',
+        'n_markers': None,  # None bypasses the marker-calculation step, which is the time-consuming part
+    }
+    latent_kwargs = {
+        'latent_component': "scVI"  # None or a key in adata.obsm; if None, PCA is run by default, otherwise the named latent space is used
+    }
+
+    merge_kwargs = FinalMergeKwargs(
+        pca_kwargs = pca_kwargs,
+        filter_pcs_kwargs = filter_pcs_kwargs,
+        filter_known_modes_kwargs = filter_known_modes_kwargs,
+        project_kwargs = project_kwargs,
+        merge_clusters_kwargs = merge_clusters_kwargs,
+        latent_kwargs = latent_kwargs
+    )
+    return merge_kwargs
+
+merge_kwargs = setup_merging()
+
+# Run the final merging
+clusters_after_merging, markers_after_merging = final_merge(
+    adata,
+    clusters,
+    markers,  # required for PCA, but optional if using a pre-computed latent space
+    n_samples_per_clust=20,
+    random_seed=2024,
+    n_jobs=30,  # set this to the number of cores you want to use
+    return_markers_df=False,  # if True, return the pairwise DE results for every cluster pair; if False (default), return a set of markers (top 20 up- and down-regulated genes per pair comparison)
+    final_merge_kwargs=merge_kwargs
+)
+
+out_dir = "/path/to/output"
+
+with open(os.path.join(out_dir, "clustering_results_after_merging.pkl"), 'wb') as f:
+    pickle.dump(clusters_after_merging, f)
+
+# determine the datatype of markers_after_merging and save accordingly
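+# markers_after_merging is None when marker calculation was skipped
+# (n_markers=None and return_markers_df=False), a pandas DataFrame of pairwise
+# DE results when return_markers_df=True, and a set of marker gene names otherwise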
Did not save markers.") +elif isinstance(markers_after_merging, pd.DataFrame): + markers_after_merging.to_csv(os.path.join(out_dir,'markers_after_merging.csv')) +else: + with open(os.path.join(out_dir,'markers_after_merging.pkl'), 'wb') as f: + pickle.dump(markers_after_merging, f) + +# convert the clustering results to a .csv file +n_cells = sum(len(i) for i in clusters_after_merging) +cl = ['unknown']*n_cells +for i in range(len(clusters_after_merging)): + for j in clusters_after_merging[i]: + cl[j] = i+1 +res = pd.DataFrame({'cl': cl}, index=adata.obs_names) +res.to_csv(os.path.join(out_dir,'clustering_results_after_merging.csv')) \ No newline at end of file diff --git a/examples/pairwiseDE_example.py b/examples/pairwiseDE_example.py new file mode 100644 index 0000000..b9496e7 --- /dev/null +++ b/examples/pairwiseDE_example.py @@ -0,0 +1,51 @@ +# this example script is used to get pair-wise DEGs for all clusters given the normalized count and the cluster assignment +import sys +import scanpy as sc +import pickle +from collections import defaultdict + +sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/') +from transcriptomic_clustering.pairwise_DEGs import pairwise_degs + +thresholds = { + 'q1_thresh': 0.5, + 'q2_thresh': None, + 'cluster_size_thresh': 10, + 'qdiff_thresh': 0.7, + 'padj_thresh': 0.05, + 'lfc_thresh': 1, + 'score_thresh': 100, + 'low_thresh': 0.6931472, + 'min_genes': 5 +} + +# reading in the adata that adata.X is already normalizedd +adata = sc.read('/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/HMBA_analysis/iscANVI_mapping/troubleshoot/test_data/adata_query_MHGlut.h5ad') + +# reading in the clustering results, a list of lists of cell indices in adata +clusters_pth = '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/custom_packages/mpi_tc/archive_versions/v3_githubVersion/test_70k_worked/out/clustering_results.pkl' +with open(clusters_pth, 'rb') as f: + clusters = pickle.load(f) + +# convert to a dictionary of cluster id to cell indices +obs_by_cluster = defaultdict(lambda: []) +for i, cell_ids in enumerate(clusters): + obs_by_cluster[i] = cell_ids + +# subset the adata to include only the first two clusters for testing +adata = adata[obs_by_cluster[0] + obs_by_cluster[1], :].copy() + +# create the new obs_by_cluster of cluster id to the cell indices in the subsetted adata +obs_by_cluster_subset = {} +obs_by_cluster_subset[0] = [i for i in range(len(obs_by_cluster[0]))] +obs_by_cluster_subset[1] = [i + len(obs_by_cluster[0]) for i in range(len(obs_by_cluster[1]))] + +# returns a set of markers (20 up regulated and 20 down regulated) +degs = pairwise_degs( + adata, + obs_by_cluster_subset, + thresholds, + n_markers=20, + de_method='ebayes', + n_jobs=30 + ) \ No newline at end of file diff --git a/transcriptomic_clustering/__init__.py b/transcriptomic_clustering/__init__.py index e69b5a1..68b7e5f 100644 --- a/transcriptomic_clustering/__init__.py +++ b/transcriptomic_clustering/__init__.py @@ -29,5 +29,6 @@ from .cluster_means import get_cluster_means from .merging import merge_clusters from .diff_expression import de_pairs_chisq, vec_chisq_test -from .de_ebayes import de_pairs_ebayes +from .de_ebayes import de_pairs_ebayes, de_pairs_ebayes_parallel from .merging import merge_clusters +from .final_merging import final_merge diff --git a/transcriptomic_clustering/de_ebayes.py b/transcriptomic_clustering/de_ebayes.py index ba9514d..240b6d6 100644 --- a/transcriptomic_clustering/de_ebayes.py +++ 
+degs = pairwise_degs(
+    adata,
+    obs_by_cluster_subset,
+    thresholds,
+    n_markers=20,
+    de_method='ebayes',
+    n_jobs=30
+)
\ No newline at end of file
diff --git a/transcriptomic_clustering/__init__.py b/transcriptomic_clustering/__init__.py
index e69b5a1..68b7e5f 100644
--- a/transcriptomic_clustering/__init__.py
+++ b/transcriptomic_clustering/__init__.py
@@ -29,5 +29,6 @@
 from .cluster_means import get_cluster_means
 from .merging import merge_clusters
 from .diff_expression import de_pairs_chisq, vec_chisq_test
-from .de_ebayes import de_pairs_ebayes
+from .de_ebayes import de_pairs_ebayes, de_pairs_ebayes_parallel
 from .merging import merge_clusters
+from .final_merging import final_merge
diff --git a/transcriptomic_clustering/de_ebayes.py b/transcriptomic_clustering/de_ebayes.py
index ba9514d..240b6d6 100644
--- a/transcriptomic_clustering/de_ebayes.py
+++ b/transcriptomic_clustering/de_ebayes.py
@@ -15,6 +15,9 @@
 from .diff_expression import (
     get_qdiff, filter_gene_stats, calc_de_score
 )
+import multiprocessing as mp
+from functools import partial
+
 logger = logging.getLogger(__name__)
 
 """
@@ -232,3 +235,90 @@
     de_pairs = pd.DataFrame(de_pairs).T
 
     return de_pairs
+
+def process_pair(cl_means, cl_present, cl_size, stdev_unscaled, df, df_prior, sigma_sq_post, de_thresholds, pair):
+    """Score one cluster pair: moderated t-test, then up/down DE scores."""
+    cluster_a, cluster_b = pair
+    # t-test with ebayes adjusted variances
+    means_diff = cl_means.loc[cluster_a] - cl_means.loc[cluster_b]
+    means_diff = means_diff.to_frame()
+    stdev_unscaled_comb = np.sqrt(np.sum(stdev_unscaled.loc[[cluster_a, cluster_b]] ** 2))[0]
+
+    df_total = df + df_prior
+    df_pooled = np.sum(df)
+    df_total = min(df_total, df_pooled)
+
+    t_vals = means_diff / np.sqrt(sigma_sq_post) / stdev_unscaled_comb
+
+    p_vals = 2 * stats.t.sf(np.abs(t_vals[0]), df_total)
+    reject, p_adj, alphacSidak, alphacBonf = multipletests(p_vals, alpha=de_thresholds['padj_thresh'], method='holm')
+    lfc = means_diff
+
+    # Get DE score
+    de_pair_stats = pd.DataFrame(index=cl_means.columns)
+    de_pair_stats['p_value'] = p_vals
+    de_pair_stats['p_adj'] = p_adj
+    de_pair_stats['lfc'] = lfc
+    de_pair_stats["meanA"] = cl_means.loc[cluster_a]
+    de_pair_stats["meanB"] = cl_means.loc[cluster_b]
+    de_pair_stats["q1"] = cl_present.loc[cluster_a]
+    de_pair_stats["q2"] = cl_present.loc[cluster_b]
+    de_pair_stats["qdiff"] = get_qdiff(cl_present.loc[cluster_a], cl_present.loc[cluster_b])
+
+    de_pair_up = filter_gene_stats(
+        de_stats=de_pair_stats,
+        gene_type='up-regulated',
+        cl1_size=cl_size[cluster_a],
+        cl2_size=cl_size[cluster_b],
+        **de_thresholds
+    )
+    up_score = calc_de_score(de_pair_up['p_adj'].values)
+
+    de_pair_down = filter_gene_stats(
+        de_stats=de_pair_stats,
+        gene_type='down-regulated',
+        cl1_size=cl_size[cluster_a],
+        cl2_size=cl_size[cluster_b],
+        **de_thresholds
+    )
+    down_score = calc_de_score(de_pair_down['p_adj'].values)
+
+    return (pair, {
+        'score': up_score + down_score,
+        'up_score': up_score,
+        'down_score': down_score,
+        'up_genes': de_pair_up.index.to_list(),
+        'down_genes': de_pair_down.index.to_list(),
+        'up_num': len(de_pair_up.index),
+        'down_num': len(de_pair_down.index),
+        'num': len(de_pair_up.index) + len(de_pair_down.index)
+    })
+
+def de_pairs_ebayes_parallel(
+        pairs: List[Tuple[Any, Any]],
+        cl_means: pd.DataFrame,
+        cl_vars: pd.DataFrame,
+        cl_present: pd.DataFrame,
+        cl_size: Dict[Any, int],
+        de_thresholds: Dict[str, Any],
+        n_jobs: int = 1
+    ) -> pd.DataFrame:
+    """Parallel version of de_pairs_ebayes: variances are fit and moderated
+    once, then each cluster pair is scored in a separate worker process."""
+
+    logger.info('Fitting Variances')
+    sigma_sq, df, stdev_unscaled = get_linear_fit_vals(cl_vars, cl_size)
+    logger.info('Moderating Variances')
+    sigma_sq_post, var_prior, df_prior = moderate_variances(sigma_sq, df)
+
+    logger.info(f'Comparing {len(pairs)} pairs')
+
+    partial_process = partial(process_pair, cl_means, cl_present, cl_size, stdev_unscaled, df, df_prior, sigma_sq_post, de_thresholds)
+
+    # Use multiprocessing to parallelize over cluster pairs
+    with mp.Pool(processes=n_jobs) as pool:
+        results = pool.map(partial_process, pairs)
+
+    de_pairs = dict(results)
+    de_pairs = pd.DataFrame(de_pairs).T
+    return de_pairs
\ No newline at end of file
diff --git a/transcriptomic_clustering/dimension_reduction.py b/transcriptomic_clustering/dimension_reduction.py
index 499e083..2a39f39 100644
--- a/transcriptomic_clustering/dimension_reduction.py
+++ b/transcriptomic_clustering/dimension_reduction.py
@@ -116,7 +116,7 @@ def pca(
         _, vidx = adata._normalize_indices((slice(None), gene_mask))  # handle gene mask like anndata would
         vidx_bool = np.zeros((adata.n_vars,), dtype=bool)
         vidx_bool[vidx] = True
-        n_genes = len(vidx)
+        n_genes = int(vidx_bool.sum())  # count selected genes; robust whether vidx holds positions or a boolean mask
 
     # select n_comps
     max_comps = min(n_cells, n_genes) - 1
diff --git a/transcriptomic_clustering/filter_known_modes.py b/transcriptomic_clustering/filter_known_modes.py
index 7b17c43..98ce71c 100644
--- a/transcriptomic_clustering/filter_known_modes.py
+++ b/transcriptomic_clustering/filter_known_modes.py
@@ -10,7 +10,7 @@
 def filter_known_modes(
     projected_adata: ad.AnnData,
-    known_modes: Union[pd.DataFrame, pd.Series],
+    known_modes: Optional[str] = None,
     similarity_threshold: Optional[float] = 0.7):
     """
     Filters out principal components which correlate strongly with the known modes
@@ -27,6 +27,12 @@
         projected_adata: after filtering out correlated principal components
 
     """
+    # known_modes may be given as the name of a column in adata.obs
+    if isinstance(known_modes, str):
+        if known_modes in projected_adata.obs.columns:
+            known_modes = projected_adata.obs[known_modes]
+        else:
+            raise ValueError(f'{known_modes} not found in adata.obs')
+
     if isinstance(known_modes, pd.Series):
         known_modes = known_modes.to_frame()
diff --git a/transcriptomic_clustering/final_merging.py b/transcriptomic_clustering/final_merging.py
index 0ed9471..f64080f 100644
--- a/transcriptomic_clustering/final_merging.py
+++ b/transcriptomic_clustering/final_merging.py
@@ -1,4 +1,4 @@
-fromt typing import Optional, Dict, Set, List, Any
+from typing import Optional, Dict, Set, List, Any, Tuple, Union
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -9,11 +9,9 @@
 import anndata as ad
 
 import transcriptomic_clustering as tc
-from transcriptomic_clustering.iterative_clustering import (
-    build_cluster_dict, summarize_final_clusters
-)
-
+import logging
+import time
 
 logger = logging.getLogger(__name__)
@@ -25,6 +23,7 @@
     filter_known_modes_kwargs: Dict = field(default_factory = lambda: ({}))
     project_kwargs: Dict = field(default_factory = lambda: ({}))
     merge_clusters_kwargs: Dict = field(default_factory = lambda: ({}))
+    latent_kwargs: Dict = field(default_factory = lambda: ({}))
 
 
 def sample_clusters(
@@ -38,7 +37,10 @@
 
     Parameters
     ----------
-    Adata
+    adata
+        AnnData object
+    cluster_dict
+        Dictionary of cluster assignments; values are lists of cell indices (positions in adata)
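+    n_samples_per_clust
+        Maximum number of cells sampled from each cluster
+    random_seed
+        Seed for the random number generator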
 
     Returns
     -------
@@ -46,13 +48,15 @@
     """
     rng = default_rng(random_seed)
-    cell_samples = []
+    cell_samples_ids = []
     for k, v in cluster_dict.items():
         if len(v) > n_samples_per_clust:
-            choices = rng.choice(v, size=(n_samples_per_clust,))
+            choices = rng.choice(v, size=n_samples_per_clust, replace=False)
         else:
             choices = v
-        cell_samples.extend(choices)
+        cell_samples_ids.extend(choices)
+
+    cell_samples = adata.obs_names[cell_samples_ids]
 
     cell_mask = pd.Series(
         index=adata.obs.index,
@@ -62,112 +66,156 @@
     return cell_mask
 
 
+def _cluster_obs_dict_to_list(obs_by_cluster: Dict[int, List[int]]) -> List[int]:
+    """
+    Convert a dictionary of cluster assignments to a list of cluster assignments
+    """
+    # Find the total number of observations
+    max_index = max(max(indices) for indices in obs_by_cluster.values())
+
+    # Initialize the list of clusters with None (or some other default value)
+    cluster_by_obs = [None] * (max_index + 1)
+
+    # Fill in the list with the corresponding cluster for each observation
+    for cluster, cell_ids in obs_by_cluster.items():
+        for obs in cell_ids:
+            cluster_by_obs[obs] = cluster
+
+    return cluster_by_obs
+
+
 def final_merge(
     adata: ad.AnnData,
-    cluster_assignments: pd.Series,
-    marker_genes: Set,
+    cluster_assignments: List,
+    marker_genes: Optional[set] = None,
     n_samples_per_clust: int = 20,
     random_seed: Optional[int]=None,
+    n_jobs: Optional[int] = 1,
+    return_markers_df: Optional[bool] = False,
     final_merge_kwargs: FinalMergeKwargs = FinalMergeKwargs(),
-) -> pd.DataFrame:
+) -> Tuple[List[List[int]], Union[pd.DataFrame, set]]:
     """
     Runs a final merging step on cluster assignment results
+    Step 1: Use a pre-defined latent space, or compute PCA as below:
     * Do PCA on random sample of cells per cluster and selected marker genes
     * Filter PCA results to select top eigenvectors
     * Project to reduced space
     * remove known eigen vector
-    * Do differential expression merge
+    Step 2: Do differential expression merge
 
     Parameters
     ----------
+    adata
+        AnnData object
+    cluster_assignments
+        List of arrays of cell ids, one array per cluster. This is the result
+        returned by onestep_clust/iter_clust
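+    marker_genes
+        Set of marker genes from clustering; required when PCA is computed,
+        optional when a pre-computed latent space is used
+    n_samples_per_clust
+        Number of cells sampled per cluster for the PCA step
+    n_jobs
+        Number of processes used for the pairwise DE computations
+    return_markers_df
+        If True, return the full pairwise DE results as a DataFrame instead of
+        a set of marker genes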
+    """
-    # Quick data rearranging for convenience
-    cluster_dict = defaultdict(lambda: [])
-    for cell_name, cl_label in cluster_assignments.iteritems():
-        cluster_dict[row].append(cell_name)
-
-    cell_mask = sample_clusters(
-        adata=adata,
-        cluster_dict=cluster_dict,
-        n_samples_per_clust=n_samples_per_clust,
-        random_seed=random_seed
-    )
-    gene_mask = pd.Series(
-        index=adata.var.index,
-        dtype=bool
-    )
-    gene_mask[markers] = True
+    obs_by_cluster = defaultdict(lambda: [])
+    for i, cell_ids in enumerate(cluster_assignments):
+        obs_by_cluster[i] = cell_ids
+
+    cluster_by_obs = _cluster_obs_dict_to_list(obs_by_cluster)
 
-    # Do PCA on cell samples and marker genes
-    logger.info('Computing PCA on cell samples and marker genes')
-    tic = time.perf_counter()
-    (components, explained_variance_ratio, explained_variance, means) = tc.pca(
-        adata,
-        gene_mask=gene_mask,
-        cell_select=cell_mask,
-        random_state=random_seed,
-        **final_merge_kwargs.pca_kwargs
-    )
-    logger.info(f'Computed {components.shape[1]} principal components')
-    toc = time.perf_counter()
-    logger.info(f'PCA Elapsed Time: {toc - tic}')
+    if final_merge_kwargs.latent_kwargs.get("latent_component") is None:
 
-    # Filter PCA
-    logger.info('Filtering PCA Components')
-    tic = time.perf_counter()
-    components = tc.dimension_reduction.filter_components(
-        components,
-        explained_variance,
-        explained_variance_ratio,
-        **final_merge_kwargs.filter_pcs_kwargs
-    )
-    logger.info(f'Filtered to {components.shape[1]} principal components')
-    toc = time.perf_counter()
-    logger.info(f'Filter PCA Elapsed Time: {toc - tic}')
+        if marker_genes is None:
+            raise ValueError("Need marker genes to run PCA")
 
-    # Project
-    logger.info("Projecting into PCA space")
-    tic = time.perf_counter()
-    projected_adata = tc.project(
-        adata, components, means,
-        **final_merge_kwargs.project_kwargs
-    )
-    logger.info(f'Projected Adata Dimensions: {projected_adata.shape}')
-    toc = time.perf_counter()
-    logger.info(f'Projection Elapsed Time: {toc - tic}')
+        cell_mask = sample_clusters(
+            adata=adata,
+            cluster_dict=obs_by_cluster,
+            n_samples_per_clust=n_samples_per_clust,
+            random_seed=random_seed
+        )
+        gene_mask = pd.Series(
+            index=adata.var.index,
+            dtype=bool
+        )
+        gene_mask[list(marker_genes)] = True  # list() because pandas rejects set indexers
 
-    # Filter Projection
-    #Filter Known Modes
-    if final_merge_kwargs.filter_known_modes_kwargs:
-        logger.info('Filtering Known Modes')
+        # Do PCA on cell samples and marker genes
+        logger.info('Computing PCA on cell samples and marker genes')
         tic = time.perf_counter()
+        (components, explained_variance_ratio, explained_variance, means) = tc.pca(
+            adata,
+            gene_mask=gene_mask,
+            cell_select=cell_mask,
+            random_state=random_seed,
+            **final_merge_kwargs.pca_kwargs
+        )
+        logger.info(f'Computed {components.shape[1]} principal components')
+        toc = time.perf_counter()
+        logger.info(f'PCA Elapsed Time: {toc - tic}')
 
-        projected_adata = tc.filter_known_modes(
-            projected_adata,
-            **final_merge_kwargs.filter_known_modes_kwargs
+        # Filter PCA
+        logger.info('Filtering PCA Components')
+        tic = time.perf_counter()
+        components = tc.dimension_reduction.filter_components(
+            components,
+            explained_variance,
+            explained_variance_ratio,
+            **final_merge_kwargs.filter_pcs_kwargs
         )
+        logger.info(f'Filtered to {components.shape[1]} principal components')
+        toc = time.perf_counter()
+        logger.info(f'Filter PCA Elapsed Time: {toc - tic}')
 
-        logger.info(f'Projected Adata Dimensions after Filtering Known Modes: {projected_adata.shape}')
+        # Project
logger.info("Projecting into PCA space") + tic = time.perf_counter() + projected_adata = tc.project( + adata, components, means, + **final_merge_kwargs.project_kwargs + ) + logger.info(f'Projected Adata Dimensions: {projected_adata.shape}') toc = time.perf_counter() - logger.info(f'Filter Known Modes Elapsed Time: {toc - tic}') - else: - logger.info('No known modes, skipping Filter Known Modes') + logger.info(f'Projection Elapsed Time: {toc - tic}') + + # Filter Projection + #Filter Known Modes + if final_merge_kwargs.filter_known_modes_kwargs: + logger.info('Filtering Known Modes') + tic = time.perf_counter() + + projected_adata = tc.filter_known_modes( + projected_adata, + **final_merge_kwargs.filter_known_modes_kwargs + ) + + logger.info(f'Projected Adata Dimensions after Filtering Known Modes: {projected_adata.shape}') + toc = time.perf_counter() + logger.info(f'Filter Known Modes Elapsed Time: {toc - tic}') + else: + logger.info('No known modes, skipping Filter Known Modes') + else: + logger.info('Extracting latent dims') + tic = time.perf_counter() + + ## Extract latent dimensions + projected_adata = tc.latent_project(adata, **final_merge_kwargs.latent_kwargs) + + toc = time.perf_counter() + logger.info(f'Extracting latent dims Elapsed Time: {toc - tic}') # Merging logger.info('Starting Cluster Merging') tic = time.perf_counter() - cluster_assignments_after_merging = tc.merge_clusters( + cluster_assignments_after_merging, markers = tc.merge_clusters( adata_norm=adata, adata_reduced=projected_adata, - cluster_assignments=cluster_dict, - cluster_by_obs=cluster_assignments, + cluster_assignments=obs_by_cluster, + cluster_by_obs=cluster_by_obs, + return_markers_df=return_markers_df, + n_jobs=n_jobs, **final_merge_kwargs.merge_clusters_kwargs ) logger.info(f'Completed Merging') toc = time.perf_counter() logger.info(f'Merging Elapsed Time: {toc - tic}') - return cluster_assignments_after_merging + cluster_assignments_after_merging = list(cluster_assignments_after_merging.values()) + + return cluster_assignments_after_merging, markers diff --git a/transcriptomic_clustering/markers.py b/transcriptomic_clustering/markers.py index 1ac0700..04b8c56 100644 --- a/transcriptomic_clustering/markers.py +++ b/transcriptomic_clustering/markers.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Set, Literal, Any, Optional +from typing import Dict, List, Set, Literal, Any, Optional, Union import logging from itertools import combinations @@ -20,7 +20,9 @@ def select_marker_genes( thresholds: Dict[str, Any], n_markers: int = 20, de_method: Optional[Literal['ebayes', 'chisq']] = 'ebayes', -) -> Set: + return_markers_df: Optional[bool] = False, + n_jobs: Optional[int] = 1 +) -> Union[pd.DataFrame, set]: """ Selects n up genes and n down genes from the differentially expressed genes between each pair of clusters, and saves the combined set for all cluster pairs. 
@@ -58,13 +60,14 @@
     thresholds.pop('score_thresh', None)
     neighbor_pairs = list(combinations(cl_names, 2))
     if de_method == 'ebayes':
-        de_df = tc.de_pairs_ebayes(
+        de_df = tc.de_pairs_ebayes_parallel(
             neighbor_pairs,
             cluster_means,
             cluster_variances,
             present_cluster_means,
             cl_size,
             thresholds,
+            n_jobs=n_jobs
         )
     elif de_method == 'chisq':
         de_df = tc.de_pairs_chisq(
@@ -77,6 +80,9 @@
     else:
         raise ValueError(f'Unknown de_method {de_method}, must be one of [chisq, ebayes]')
 
+    if return_markers_df:
+        return de_df
+
     markers = set()
     for pair, row in de_df.iterrows():
         markers.update(row.up_genes[:n_markers])
diff --git a/transcriptomic_clustering/merging.py b/transcriptomic_clustering/merging.py
index 97c469c..6c7ce9b 100644
--- a/transcriptomic_clustering/merging.py
+++ b/transcriptomic_clustering/merging.py
@@ -34,7 +34,9 @@ def merge_clusters(
     k: Optional[int] = 2,
     de_method: Optional[str] = 'ebayes',
     n_markers: Optional[int] = 20,
-    chunk_size: Optional[int] = None
+    chunk_size: Optional[int] = None,
+    return_markers_df: Optional[bool] = False,
+    n_jobs: Optional[int] = 1
 ) -> Tuple[Dict[Any, np.ndarray], Set]:
     """
     Merge clusters based on size and differential gene expression score
@@ -63,6 +65,10 @@
         if 0 or None will skip
     chunk_size:
         number of observations to process in a single chunk
+    return_markers_df:
+        if True, return the full pairwise DE results as a DataFrame instead of a set of marker genes
+    n_jobs:
+        number of processes to run in parallel
 
     Returns
     -------
@@ -132,7 +138,9 @@
     logger.info(f'Merging DE Elapsed Time: {toc - tic}')
 
     # Select marker genes
-    if n_markers:
+    if return_markers_df is False and n_markers is None:
+        markers = None
+    else:
         logger.info('Starting Marker Selection')
         tic = time.perf_counter()
         markers = select_marker_genes(
@@ -143,12 +151,13 @@
             thresholds=thresholds,
             n_markers=n_markers,
             de_method=de_method,
+            return_markers_df=return_markers_df,
+            n_jobs=n_jobs
         )
         logger.info('Completed Marker Selection')
         toc = time.perf_counter()
         logger.info(f'Marker Selection Elapsed Time: {toc - tic}')
-    else:
-        markers = None
+
 
     return cluster_assignments_merge, markers
diff --git a/transcriptomic_clustering/pairwise_DEGs.py b/transcriptomic_clustering/pairwise_DEGs.py
new file mode 100644
index 0000000..6c28e95
--- /dev/null
+++ b/transcriptomic_clustering/pairwise_DEGs.py
@@ -0,0 +1,79 @@
+import logging
+import time
+from typing import Dict, List, Any, Optional
+
+import anndata as ad
+
+import transcriptomic_clustering as tc
+from transcriptomic_clustering.markers import select_marker_genes
+
+logger = logging.getLogger(__name__)
+
+
+def _cluster_obs_dict_to_list(obs_by_cluster: Dict[int, List[int]]) -> List[int]:
+    """
+    Convert a dictionary of cluster assignments to a list of cluster assignments
+    """
+    # Find the total number of observations
+    max_index = max(max(indices) for indices in obs_by_cluster.values())
+
+    # Initialize the list of clusters with None (or some other default value)
+    cluster_by_obs = [None] * (max_index + 1)
+
+    # Fill in the list with the corresponding cluster for each observation
+    for cluster, cell_ids in obs_by_cluster.items():
+        for obs in cell_ids:
+            cluster_by_obs[obs] = cluster
+
+    return cluster_by_obs
+
+
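+# Standalone entry point: given a normalized AnnData and a cluster assignment,
+# compute the pairwise DE marker set across all cluster pairs without
+# re-running the merging step.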
+def pairwise_degs(
+        adata_norm: ad.AnnData,
+        obs_by_cluster: Dict[Any, List],  # keys are cluster names, values are lists of cell indices
+        thresholds: Dict[str, Any],
+        n_markers: int = 20,
+        de_method: str = 'ebayes',
+        n_jobs: int = 30,
+        chunk_size: Optional[int] = None
+    ):
+    """
+    Perform pairwise differential expression analysis on clusters in an AnnData object.
+
+    Parameters
+    ----------
+    adata_norm : AnnData
+        The normalized AnnData object containing the data.
+    obs_by_cluster : Dict[Any, List]
+        A dictionary where keys are cluster names and values are lists of cell
+        indices belonging to those clusters
+    thresholds : Dict[str, Any]
+        Thresholds used to filter the DE statistics (see merge_clusters)
+    n_markers : int
+        The number of up-regulated and down-regulated markers to return for all pairs of clusters
+    de_method : str
+        Differential expression method, 'ebayes' or 'chisq'
+    n_jobs : int
+        Number of processes to run in parallel
+    chunk_size : Optional[int]
+        The size of chunks to process at a time. If None, the entire dataset is processed
+    """
+    cluster_by_obs = _cluster_obs_dict_to_list(obs_by_cluster)
+
+    logger.info("Computing Cluster Means")
+    tic = time.perf_counter()
+    cl_means, present_cl_means, cl_vars = tc.get_cluster_means(adata_norm,
+                                                               obs_by_cluster,
+                                                               cluster_by_obs,
+                                                               chunk_size,
+                                                               low_th=thresholds['low_thresh'])
+    logger.info('Completed Cluster Means')
+    toc = time.perf_counter()
+    logger.info(f'Cluster Means Elapsed Time: {toc - tic}')
+
+    logger.info('Starting Marker Selection')
+    tic = time.perf_counter()
+    markers = select_marker_genes(
+        cluster_assignments=obs_by_cluster,
+        cluster_means=cl_means,
+        cluster_variances=cl_vars,
+        present_cluster_means=present_cl_means,
+        thresholds=thresholds,
+        n_markers=n_markers,
+        de_method=de_method,
+        n_jobs=n_jobs
+    )
+    logger.info('Completed Marker Selection')
+    toc = time.perf_counter()
+    logger.info(f'Marker Selection Elapsed Time: {toc - tic}')
+
+    return markers
\ No newline at end of file