25 changes: 25 additions & 0 deletions docker/tc/Dockerfile
@@ -0,0 +1,25 @@
FROM python:3.9.16-buster

RUN python -m pip install --upgrade pip

WORKDIR /tc
COPY transcriptomic_clustering ./transcriptomic_clustering
COPY scripts ./scripts
COPY requirements.txt setup.py README.md ./

# Create a virtual environment and put it first on PATH so that every
# subsequent RUN and the container CMD use it. (A bare
# `RUN . tc_env/bin/activate` has no effect: each RUN executes in its
# own shell, so the activation does not persist to later layers.)
RUN python3 -m venv tc_env
ENV PATH="/tc/tc_env/bin:$PATH"

# Mount points for the run script, input AnnData, temp files, and outputs
RUN mkdir -p /mnt/tmp /mnt/scripts /mnt/adata /mnt/output

RUN pip install numpy
RUN pip install -r requirements.txt && pip install -e .

CMD ["python", "/mnt/scripts/run_iter_clust.py"]
21 changes: 21 additions & 0 deletions docker/tc/README.md
@@ -0,0 +1,21 @@
# Build
From the top-level `transcriptomic_clustering` directory:

`docker build -f docker/tc/Dockerfile -t tc .`
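
If you want a quick sanity check of the image (a hypothetical smoke test, not part of the build itself), you can override the default command and import the package:

```
docker run --rm tc python -c "import transcriptomic_clustering"
```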


# Run with docker
To run, you'll need four things: the script you want to run; a folder of input files containing the selected genes and the normalized AnnData (or an unnormalized one, if your script includes the normalization step); a temporary directory where the program can save intermediate AnnData files (as a rule of thumb, keep at least 2x the size of the AnnData file free on disk); and a directory for the output. Each of these is mounted into the container:

```
docker run \
  -v /local1/marmot/matt_dev/transcriptomic_clustering/scripts/run_iter_clust_docker.py:/mnt/scripts/run_iter_clust.py:ro \
  -v /localssd/marmot/matt_dev/tc_data:/mnt/adata:ro \
  -v /localssd/marmot/matt_dev/tc_data/tmp_data/MacoskoTmp:/mnt/tmp \
  -v /localssd/marmot/matt_dev/tc_data/output/macosko:/mnt/output \
  tc
```

- Replace the first `-v` line with the script you want to run (base it on `scripts/run_iter_clust_docker.py`). The script must be mounted as `/mnt/scripts/run_iter_clust.py`; otherwise `docker run` won't find it.
- Replace the second `-v` line with the folder containing the normalized h5ad file and the rm.eigen file (see the layout sketch below).
- Replace the third `-v` line with a path to a temporary directory where temporary files can be written.
- Replace the fourth `-v` line with a path where the output folder will be written.
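
With the example script in this PR, the input mount is expected to look roughly like this (the `macosko/` subdirectory and file names come from the paths hard-coded in `scripts/run_iter_clust_docker.py`):

```
tc_data/
└── macosko/
    ├── rm.eigen.csv
    └── Macosko_BICCN.fbm.1004.fixed.selgene_normalized.h5ad
```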
187 changes: 187 additions & 0 deletions scripts/run_iter_clust_docker.py
@@ -0,0 +1,187 @@
import tempfile
from pathlib import Path
import logging
import datetime

import numpy as np
import scanpy as sc
import pandas as pd
import transcriptomic_clustering as tc
from transcriptomic_clustering.iterative_clustering import (
    build_cluster_dict, iter_clust, OnestepKwargs
)


logger = logging.getLogger(__name__)


def setup_filelogger(logfile: Path):
    fhandler = logging.FileHandler(filename=logfile, mode='a')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fhandler.setFormatter(formatter)
    fhandler.setLevel(logging.INFO)

    root_logger = logging.getLogger()
    # The root logger defaults to WARNING, which would silently drop all the
    # logger.info(...) calls below, so lower it to INFO explicitly.
    root_logger.setLevel(logging.INFO)
    root_logger.addHandler(fhandler)


def run_iter_clust():
"""

"""
# Paths
    run_datetime = datetime.datetime.now().isoformat()
    tc_data_path = Path('/mnt/adata')
    tmp_path = Path('/mnt/tmp')
    output_path = Path('/mnt/output') / run_datetime
    output_path.mkdir(parents=True, exist_ok=True)
    log_file_path = output_path / "clustering.log"

rm_eigen_path = tc_data_path / "macosko" / "rm.eigen.csv"
normalized_adata_path = tc_data_path / "macosko" / 'Macosko_BICCN.fbm.1004.fixed.selgene_normalized.h5ad'

# Outputs
clusters_path = output_path / "Macosko_BICCN.fbm.1004.fixed.selgene_clusters.csv"
markers_path = output_path / "Macosko_BICCN.fbm.1004.fixed.selgene_markers.csv"

# Setup logging
setup_filelogger(log_file_path)

    # Setup tmpfile: create a unique working directory under the tmp mount
    tmp_dir = Path(tempfile.mkdtemp(dir=tmp_path))
    logger.debug(f"tmp_dir: {tmp_dir}")

# Set memory params
tc.memory.set_memory_limit(GB=100)
tc.memory.allow_chunking = True
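    # With a limit set and chunking allowed, steps that would exceed the
    # limit can process the backed AnnData in chunks rather than loading
    # the full matrix into memory (behavior assumed from the tc.memory API).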

    # Log run info
logger.info(f"normalized_adata: {normalized_adata_path}")
logger.info(f"rm eigen: {rm_eigen_path}")
logger.info(f"temporary directory: {tmp_dir}")
logger.info(f"output directory: {output_path}")

# Load normalized adata
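    # backed='r' keeps the data matrix on disk; chunks are read as needed
    # instead of loading the whole matrix into memory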
normalized_adata = sc.read(normalized_adata_path, backed='r')
normalized_adata.var_names_make_unique()

    # Load rm.eigen (used below as known modes to filter out of the PCs)
    # and align its rows to the cells by reindexing on the AnnData obs index
    rm_eigen = pd.read_csv(rm_eigen_path)
    rm_eigen_df = rm_eigen.set_index("Unnamed: 0").reindex(normalized_adata.obs.index)
    logger.info(f'rm_eigen mapped: {rm_eigen_df[0:5]}')

# Assign kwargs. Any unassigned args will be set to their respective function defaults
means_vars_kwargs = {
'low_thresh': 1,
'min_cells': 4
}
highly_variable_kwargs = {
'max_genes': 3000
}

pca_kwargs = {
'cell_select': 500000,
'n_comps': 200,
'svd_solver': 'randomized'
}

filter_pcs_kwargs = {
'known_components': None,
'similarity_threshold': 0.7,
'method': 'elbow', #'elbow' or 'zscore'
'zth': 1.3,
'max_pcs': 20,
}

# project_kwargs = {

# }

# Leave empty if you don't want to use known_modes
filter_known_modes_kwargs = {
'known_modes': rm_eigen_df,
'similarity_threshold': 0.7
}

cluster_louvain_kwargs = {
'k': 15, # number of nn
'nn_measure': 'euclidean',
'knn_method': 'annoy',
'louvain_method': 'taynaud',
'weighting_method': 'jaccard',
'n_jobs': 8, # cpus
'resolution': 1., # resolution of louvain for taynaud method
}

merge_clusters_kwargs = {
'thresholds': {
'q1_thresh': 0.4,
'q2_thresh': 0.7,
'cluster_size_thresh': 20,
'qdiff_thresh': 0.7,
'padj_thresh': 0.05,
'lfc_thresh': 1.0,
'score_thresh': 150,
'low_thresh': 1,
'min_genes': 5,
},
'k': 2, # number of nn for de merge
'de_method': 'ebayes',
'n_markers': 20,
}

onestep_kwargs = OnestepKwargs(
means_vars_kwargs = means_vars_kwargs,
highly_variable_kwargs = highly_variable_kwargs,
pca_kwargs = pca_kwargs,
filter_pcs_kwargs = filter_pcs_kwargs,
# project_kwargs = project_kwargs,
filter_known_modes_kwargs = filter_known_modes_kwargs,
cluster_louvain_kwargs = cluster_louvain_kwargs,
merge_clusters_kwargs = merge_clusters_kwargs
)

# Run Iter Clust
clusters, markers = iter_clust(
normalized_adata,
min_samples=4,
onestep_kwargs=onestep_kwargs,
random_seed=345,
tmp_dir=tmp_dir
)

# Log cluster size
logger.info(f'final number of clusters: {len(clusters)}')
cl_sizes = [len(cluster) for cluster in clusters]
logger.info(f'final cluster sizes: {cl_sizes}')
logger.info(f'max cluster size: {np.max(cl_sizes)}')
logger.info(f'clusters: {clusters}, \nmarkers: {markers}')

    # Save clusters to csv: map each cell (obs) to its cluster id
    cluster_dict = build_cluster_dict(clusters)
    cluster_by_obs = np.zeros(normalized_adata.n_obs, dtype=int)
    for cluster, obs in cluster_dict.items():
        cluster_by_obs[obs] = cluster

    df = pd.DataFrame(data=cluster_by_obs, index=normalized_adata.obs.index, columns=["cl"])
    df.to_csv(clusters_path)

# Save Markers to csv
df_m = pd.Series(data=list(markers), name="markers")
df_m.to_csv(markers_path, header=True)

logger.info(f"Don't forget to delete temporary directory {tmp_dir}")


if __name__ == "__main__":
run_iter_clust()