28 commits
ab81d83
Bug fix
AntoinePassemiers Oct 1, 2025
fc0b408
Merge branch 'new_metrics' of https://github.com/openproblems-bio/tas…
AntoinePassemiers Oct 1, 2025
8019522
Fix VC metric + replace anchor_regression by new metric 'regression3'
AntoinePassemiers Oct 4, 2025
33a21f3
Merge remote-tracking branch 'origin/jalil' into new_metrics
AntoinePassemiers Oct 4, 2025
2b974ee
Merge
AntoinePassemiers Oct 4, 2025
347a4e1
Fix recovery_2 metric
AntoinePassemiers Oct 5, 2025
b632939
recovery_2 metric: Add TG-TG correlations back
AntoinePassemiers Oct 5, 2025
244c365
Merge
AntoinePassemiers Oct 6, 2025
27c6fc3
Fix recovery_2, regression_3 and sem metrics
AntoinePassemiers Oct 9, 2025
fa5f99c
Minor changes
AntoinePassemiers Oct 9, 2025
0e83e1e
Merge remote-tracking branch 'origin/jalil' into new_metrics
AntoinePassemiers Oct 9, 2025
ecc7828
Fix issues with tf_binding metric
AntoinePassemiers Oct 10, 2025
33c324a
Deterministic algorithm for GRN baseline creation
AntoinePassemiers Oct 11, 2025
1c29769
Use immediate early genes as reporter genes
AntoinePassemiers Oct 12, 2025
f739a5c
Merge
AntoinePassemiers Oct 13, 2025
8bce32a
Minor change
AntoinePassemiers Oct 17, 2025
5651d34
Minor changes
AntoinePassemiers Oct 20, 2025
0f6cf0b
Minor change
AntoinePassemiers Oct 20, 2025
7db1968
tf_binding GT: strand-aware promoter region
AntoinePassemiers Oct 22, 2025
71cc64f
Add BEELINE datasets
AntoinePassemiers Oct 25, 2025
17aac8a
Merge
AntoinePassemiers Oct 25, 2025
a640995
Merge remote-tracking branch 'origin/jalil' into new_metrics
AntoinePassemiers Oct 26, 2025
b79593c
Improve anchor_regression metric
AntoinePassemiers Oct 27, 2025
c6f14f8
Minor change
AntoinePassemiers Oct 31, 2025
8471f2d
Merge remote-tracking branch 'origin/jalil' into new_metrics
AntoinePassemiers Oct 31, 2025
0def7ef
Fix GRN baseline generation algorithm
AntoinePassemiers Nov 3, 2025
83b2cda
Dictys: fix version conflicts + improve use of subprocess
AntoinePassemiers Nov 4, 2025
7122ef4
Dictys: minor change
AntoinePassemiers Nov 4, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@

# related to files
.pybiomart.sqlite
.venv/
logs/
params*
resources*
122 changes: 122 additions & 0 deletions dockers/dictys_v4/Dockerfile
@@ -0,0 +1,122 @@
FROM ubuntu:22.04

ARG DEBIAN_FRONTEND=noninteractive
ENV TZ="America/New_York"

# Base OS deps (build tools + common libs) + libpng for matplotlib
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl ca-certificates git unzip zip \
    build-essential pkg-config \
    zlib1g-dev libbz2-dev liblzma-dev \
    libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev \
    libfreetype6-dev libpng-dev \
    python3-venv python3-distutils python3-dev \
    # bio tools via apt instead of building
    samtools tabix \
    perl \
    && rm -rf /var/lib/apt/lists/*

# Install CPython 3.9.17 from source
ARG PYTHON_VERSION=3.9.17
RUN set -eux; \
    cd /tmp; \
    curl -fsSLO https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz; \
    tar -xzf Python-${PYTHON_VERSION}.tgz; \
    cd Python-${PYTHON_VERSION}; \
    ./configure --enable-optimizations; \
    make -j"$(nproc)"; \
    make install; \
    cd /; rm -rf /tmp/Python-${PYTHON_VERSION}*; \
    ln -s /usr/local/bin/python3 /usr/local/bin/python; \
    ln -s /usr/local/bin/pip3 /usr/local/bin/pip

# Make constraints global for all pip installs
COPY constraints.txt /tmp/constraints.txt
ENV PIP_CONSTRAINT=/tmp/constraints.txt \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_DEFAULT_TIMEOUT=180

# Clean any existing numpy/matplotlib remnants aggressively
RUN python - <<'PY'
import sys, site, pkgutil, shutil, pathlib
paths = set(site.getsitepackages() + [site.getusersitepackages()])
for p in list(paths):
    if not p:
        continue
    for name in ("numpy", "matplotlib"):
        for m in pathlib.Path(p).glob(name):
            shutil.rmtree(m, ignore_errors=True)
        for m in pathlib.Path(p).glob(f"{name}-*.dist-info"):
            shutil.rmtree(m, ignore_errors=True)
        for m in pathlib.Path(p).glob(f"{name}-*.egg-info"):
            shutil.rmtree(m, ignore_errors=True)
print("Cleaned numpy/matplotlib from:", *paths, sep="\n - ")
PY

# Install bedtools
RUN apt-get update && apt-get install -y --no-install-recommends bedtools \
    && rm -rf /var/lib/apt/lists/*

# Install tools + exact pins
RUN python -m pip install --no-cache-dir -U pip setuptools wheel \
    && pip install --no-cache-dir --upgrade --force-reinstall \
       "numpy==1.26.4" "matplotlib==3.8.4" "cython<3"

# Install MACS2. Build-from-source packages must reuse our pinned toolchain
RUN pip install --no-cache-dir --no-build-isolation MACS2==2.2.9.1
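# (--no-build-isolation compiles MACS2 against the numpy and cython<3 pinned above,
# rather than letting pip pull fresh build dependencies into an isolated env.)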

# Install Dictys without dependencies (we'll install them manually right after)
RUN pip install --no-cache-dir --no-build-isolation --no-deps \
    git+https://github.com/pinellolab/dictys.git@a82930fe8030af2785f9069ef5e909e49acc866f

# Install Dictys dependencies and more
RUN pip install --no-cache-dir --prefer-binary \
    pandas scipy networkx h5py threadpoolctl joblib \
    jupyter jupyterlab adjustText pyro-ppl docutils requests

# Install pyDNase, anndata, and their support packages without dependencies so pyDNase can't pin matplotlib<2
RUN pip install --no-cache-dir --no-build-isolation --no-deps pyDNase clint pysam packaging array_api_compat legacy-api-wrap zarr natsort anndata

# Install pybedtools version that works with cython<3
RUN pip install --no-cache-dir --no-build-isolation "pybedtools==0.9.1"

# Install pytorch
# RUN pip install --no-cache-dir --prefer-binary --index-url https://download.pytorch.org/whl/cpu torch

# HOMER prerequisites
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget perl unzip build-essential zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install HOMER core + hg38 genome
RUN set -eux; \
    mkdir -p /opt/homer && cd /opt/homer; \
    curl -fsSLO http://homer.ucsd.edu/homer/configureHomer.pl; \
    chmod +x configureHomer.pl; \
    perl configureHomer.pl -install homer; \
    perl configureHomer.pl -install homerTools; \
    perl configureHomer.pl -install hg38
ENV PATH="/opt/homer/bin:${PATH}"

# hg38 annotations
RUN set -eux; \
    cd /opt/homer; \
    grep "hg38" update.txt > tmp.txt && mv tmp.txt update.txt; \
    cd update && ./updateUCSCGenomeAnnotations.pl ../update.txt
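# (Filtering update.txt to its hg38 rows keeps updateUCSCGenomeAnnotations.pl from
# fetching annotations for every genome HOMER knows about.)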

# Install CUDA
# RUN curl -fsSLo /etc/apt/preferences.d/cuda-repository-pin-600 \
# https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
# curl -fsSLo /usr/share/keyrings/nvidia-cuda.gpg \
# https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
# echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" \
# > /etc/apt/sources.list.d/cuda.list && \
# apt-get update && apt-get install -y --no-install-recommends cuda && \
# rm -rf /var/lib/apt/lists/*

# Install AWS CLI
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
    unzip awscliv2.zip && \
    ./aws/install

CMD ["/bin/bash"]
3 changes: 3 additions & 0 deletions dockers/dictys_v4/constraints.txt
@@ -0,0 +1,3 @@
numpy==1.26.4
matplotlib<3.9
cython<3
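Because the Dockerfile exports PIP_CONSTRAINT=/tmp/constraints.txt, these three pins govern every subsequent pip invocation in the image. A minimal sketch of the effect (illustrative only; the path matches the Dockerfile above):

    import os
    import subprocess

    # With the constraint file active, "pip install numpy" resolves to the
    # pinned 1.26.4 even if some dependency requests a newer release.
    os.environ["PIP_CONSTRAINT"] = "/tmp/constraints.txt"
    subprocess.run(["python", "-m", "pip", "install", "numpy"], check=True)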
118 changes: 84 additions & 34 deletions src/methods/dictys/helper.py
@@ -1,6 +1,8 @@
import os
os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
os.environ["MKL_THREADING_LAYER"] = "GNU"
import shutil
from typing import Optional, List

import numpy as np
import pandas as pd
@@ -11,6 +13,27 @@
warnings.filterwarnings("ignore")


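# When True, reference files (genome, gene annotation, motifs) are re-downloaded
# even if already present on disk; leaving it False makes get_priors idempotent.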
OVERRIDE_DOWNLOAD = False


def run_cmd(cmd: List[str], cwd: Optional[str] = None) -> None:
    kwargs = {}
    if cwd is not None:
        kwargs['cwd'] = cwd
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        **kwargs
    ) as proc:
        for line in proc.stdout:
            print(line, end="")
    rc = proc.wait()
    if rc != 0:
        raise RuntimeError(f"Command {cmd} failed with exit code {rc}")
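# Usage sketch (illustrative): output streams line-by-line while the child runs,
# and a non-zero exit raises instead of being silently printed, e.g.:
#   run_cmd(["samtools", "--version"])
#   run_cmd(["ls", "-l"], cwd="/tmp")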


def define_vars(par):
    os.makedirs(par['temp_dir'], exist_ok=True)
@@ -28,6 +51,7 @@ def define_vars(par):
    par['bams_dir'] = f"{par['data_dir']}/bams/"

    par['gene_bed'] = f"{par['data_dir']}/gene.bed"
    par['make_dir'] = f"{par['temp_dir']}/makefiles"


def extract_exp(par):
@@ -88,6 +112,7 @@ def extract_atac(par):
    print(f'Sort and compress tsv file {frags_path}')
    os.system(f"sort -k1,1 -k2,2n {temp_path} | bgzip -c > {frags_path}")


def create_bam(par):
    print('Creating BAM file from fragments', flush=True)
    cmd = f"python {par['frag_to_bam']} --fnames {par['frags_path']} --barcodes {par['barcodes']}"
@@ -107,9 +132,24 @@ def bam_to_bams(par):
    - 'bams_dir': path to output folder for per-cell BAMs
    - 'exp_path': path to reference expression file
    """

    print('Delete temp BAM directories', flush=True)
    folders = [
        par['bams_dir'],
        os.path.join(par['bams_dir'], '..', 'bams_text'),
        os.path.join(par['bams_dir'], '..', 'bams_header')
    ]
    for folder in folders:
        if os.path.exists(folder):
            shutil.rmtree(folder)

    print('Splitting BAM into per-cell BAMs', flush=True)
-   cmd = f"bash dictys_helper split_bam.sh {par['bam_name']} {par['bams_dir']} --section CB:Z: --ref_expression {par['exp_path']}"
-   run_cmd(cmd)
    run_cmd([
        "bash", "dictys_helper", "split_bam.sh", par['bam_name'], par['bams_dir'],
        "--section", "CB:Z:", "--ref_expression", par['exp_path']
    ])


def extrac_clusters(par):
    print('Extracting clusters', flush=True)
    subsets = f"{par['data_dir']}/subsets.txt"
@@ -127,15 +167,6 @@ def extrac_clusters(par):
    subprocess.run(cp, shell=True, check=True)
    print('Extracting clusters successful', flush=True)

-   def run_cmd(cmd):
-       try:
-           result = subprocess.run(cmd, check=True, text=True, capture_output=True, shell=True)
-           print("STDOUT:", result.stdout)
-           print("STDERR:", result.stderr)
-       except subprocess.CalledProcessError as e:
-           print("Command failed with exit code", e.returncode)
-           print("STDOUT:", e.stdout)
-           print("STDERR:", e.stderr)

def download_file(url, dest):
    import requests
@@ -145,66 +176,85 @@ def download_file(url, dest):
        with open(dest, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)


def get_priors(par):
    import gzip
    import shutil
    # - get the genome
    print('Getting genome ...', flush=True)
-   os.makedirs(f"{par['data_dir']}/genome/", exist_ok=True)
-   cmd = f"aws s3 cp s3://openproblems-data/resources/grn/supp_data/genome/genome.fa {par['data_dir']}/genome/ --no-sign-request"
-   try:
-       run_cmd(cmd)
-   except:
-       try:
-           cmd = f"cp resources/supp_data/genome/genome.fa {par['data_dir']}/genome/"
-           run_cmd(cmd)
-       except:
-           raise ValueError("Could not get the genome")
    if OVERRIDE_DOWNLOAD or (not os.path.exists(f"{par['data_dir']}/genome/genome.fa")):
        os.makedirs(f"{par['data_dir']}/genome/", exist_ok=True)
        try:
            run_cmd([
                "aws", "s3", "cp", "s3://openproblems-data/resources/grn/supp_data/genome/genome.fa",
                f"{par['data_dir']}/genome/", "--no-sign-request"
            ])
        except:
            try:
                run_cmd([
                    "cp", "resources/supp_data/genome/genome.fa", f"{par['data_dir']}/genome/"
                ])
            except:
                raise ValueError("Could not get the reference genome")

    # - get gene annotation
    print('Getting gene annotation ...', flush=True)
    data_dir = Path(par['data_dir'])
    gtf_gz = data_dir / "gene.gtf.gz"
    gtf = data_dir / "gene.gtf"
    url = "http://ftp.ensembl.org/pub/release-107/gtf/homo_sapiens/Homo_sapiens.GRCh38.107.gtf.gz"
-   download_file(url, gtf_gz)
    if OVERRIDE_DOWNLOAD or (not os.path.exists(gtf_gz)):
        download_file(url, gtf_gz)

    with gzip.open(gtf_gz, "rb") as f_in, open(gtf, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    gtf_gz.unlink()

    print('Making bed files for gene annotation ...', flush=True)
-   cmd = f"bash dictys_helper gene_gtf.sh {gtf} {par['gene_bed']}"
-   run_cmd(cmd)
    run_cmd([
        "bash", "dictys_helper", "gene_gtf.sh", gtf, par['gene_bed']
    ])

    print('Downloading motif file...', flush=True)

    url = 'https://hocomoco11.autosome.org/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.0001.motif'
    motif_file = data_dir / 'motifs.motif'
-   download_file(url, motif_file)
    if OVERRIDE_DOWNLOAD or (not os.path.exists(motif_file)):
        download_file(url, motif_file)


def configure(par):
    import json
    device = 'cuda:0'  # cuda:0, cpu
-   par['make_dir'] = f"{par['temp_dir']}/makefiles"
    os.makedirs(par['make_dir'], exist_ok=True)
-   cmd = f"cd {par['make_dir']} && bash dictys_helper makefile_template.sh common.mk config.mk env_none.mk static.mk"
-   run_cmd(cmd)
    run_cmd([
        "bash", "dictys_helper", "makefile_template.sh", "common.mk", "config.mk", "env_none.mk", "static.mk"
    ], cwd=par['make_dir'])

    json_arg = json.dumps({
        "DEVICE": device,
        "GENOME_MACS2": "hs",
        "JOINT": "1"
    })

-   cmd = f"cd {par['make_dir']} && bash dictys_helper makefile_update.py config.mk '{json_arg}'"
-   run_cmd(cmd)
    run_cmd([
        "bash", "dictys_helper", "makefile_update.py", "config.mk", json_arg
    ], cwd=par['make_dir'])

-   cmd = f"cd {par['temp_dir']} && bash dictys_helper makefile_check.py"
-   run_cmd(cmd)
    run_cmd([
        "bash", "dictys_helper", "makefile_check.py", "--dir_makefiles", par['make_dir'],
        "--dir_data", par['data_dir']
    ])


def infer_grn(par):
    print('Inferring GRNs', flush=True)
-   cmd = f"cd {par['temp_dir']} && bash dictys_helper network_inference.sh -j {par['num_workers']} -J 1 static"
-   run_cmd(cmd)
    run_cmd([
        "bash", "dictys_helper", "network_inference.sh", "-j", str(par['num_workers']), "-J", "1", "static"
    ], cwd=par['temp_dir'])


def export_net(par):
    from util import process_links
    from dictys.net import network
@@ -224,8 +274,8 @@ def export_net(par):
        output.write(par['prediction'])

def main(par):
-   define_vars(par)

    define_vars(par)
    extract_exp(par)
    extract_atac(par)
    create_bam(par)
4 changes: 1 addition & 3 deletions src/methods/portia/script.py
@@ -44,19 +44,17 @@ def main(par):
    tf_names = [gene_name for gene_name in gene_names if (gene_name in tfs)]
    tf_idx = np.asarray([i for i, gene_name in enumerate(gene_names) if gene_name in tf_names], dtype=int)

    print('Inferring grn')
    dataset = pt.GeneExpressionDataset()
    for exp_id, data in enumerate(X):
        dataset.add(pt.Experiment(exp_id, data))

    M_bar = pt.run(dataset, tf_idx=tf_idx, method='no-transform')
    ranked_scores = pt.rank_scores(M_bar, gene_names, limit=par['max_n_links'])
    sources, targets, weights = zip(*[(gene_a, gene_b, score) for gene_a, gene_b, score in ranked_scores])

    grn = pd.DataFrame({'source': sources, 'target': targets, 'weight': weights})
-   print(grn.shape)
    grn = grn[grn.source.isin(tf_names)]

    grn = process_links(grn, par)