28 commits
ab81d83
Bug fix
AntoinePassemiers Oct 1, 2025
fc0b408
Merge branch 'new_metrics' of https://github.com/openproblems-bio/tas…
AntoinePassemiers Oct 1, 2025
8019522
Fix VC metric + replace anchor_regression by new metric 'regression3'
AntoinePassemiers Oct 4, 2025
33a21f3
Merge remote-tracking branch 'origin/jalil' into new_metrics
AntoinePassemiers Oct 4, 2025
2b974ee
Merge
AntoinePassemiers Oct 4, 2025
347a4e1
Fix recovery_2 metric
AntoinePassemiers Oct 5, 2025
b632939
recovery_2 metric: Add TG-TG correlations back
AntoinePassemiers Oct 5, 2025
244c365
Merge
AntoinePassemiers Oct 6, 2025
27c6fc3
Fix recovery_2, regression_3 and sem metrics
AntoinePassemiers Oct 9, 2025
fa5f99c
Minor changes
AntoinePassemiers Oct 9, 2025
0e83e1e
Merge remote-tracking branch 'origin/jalil' into new_metrics
AntoinePassemiers Oct 9, 2025
ecc7828
Fix issues with tf_binding metric
AntoinePassemiers Oct 10, 2025
33c324a
Deterministic algorithm for GRN baseline creation
AntoinePassemiers Oct 11, 2025
1c29769
Use immediate early genes as reporter genes
AntoinePassemiers Oct 12, 2025
f739a5c
Merge
AntoinePassemiers Oct 13, 2025
8bce32a
Minor change
AntoinePassemiers Oct 17, 2025
5651d34
Minor changes
AntoinePassemiers Oct 20, 2025
0f6cf0b
Minor change
AntoinePassemiers Oct 20, 2025
7db1968
tf_binding GT: strand-aware promoter region
AntoinePassemiers Oct 22, 2025
71cc64f
Add BEELINE datasets
AntoinePassemiers Oct 25, 2025
17aac8a
Merge
AntoinePassemiers Oct 25, 2025
a640995
Merge remote-tracking branch 'origin/jalil' into new_metrics
AntoinePassemiers Oct 26, 2025
b79593c
Improve anchor_regression metric
AntoinePassemiers Oct 27, 2025
c6f14f8
Minor change
AntoinePassemiers Oct 31, 2025
8471f2d
Merge remote-tracking branch 'origin/jalil' into new_metrics
AntoinePassemiers Oct 31, 2025
0def7ef
Fix GRN baseline generation algorithm
AntoinePassemiers Nov 3, 2025
83b2cda
Dictys: fix version conflicts + improve use of subprocess
AntoinePassemiers Nov 4, 2025
7122ef4
Dictys: minor change
AntoinePassemiers Nov 4, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@

# related to files
.pybiomart.sqlite
.venv/
logs/
params*
resources*
122 changes: 122 additions & 0 deletions dockers/dictys_v4/Dockerfile
@@ -0,0 +1,122 @@
FROM ubuntu:22.04

ARG DEBIAN_FRONTEND=noninteractive
ENV TZ="America/New_York"

# Base OS deps (build tools + common libs) + libpng for matplotlib
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl ca-certificates git unzip zip \
    build-essential pkg-config \
    zlib1g-dev libbz2-dev liblzma-dev \
    libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev \
    libfreetype6-dev libpng-dev \
    python3-venv python3-distutils python3-dev \
    # bio tools via apt instead of building
    samtools tabix \
    perl \
    && rm -rf /var/lib/apt/lists/*

# Install CPython 3.9.17 from source
ARG PYTHON_VERSION=3.9.17
RUN set -eux; \
    cd /tmp; \
    curl -fsSLO https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz; \
    tar -xzf Python-${PYTHON_VERSION}.tgz; \
    cd Python-${PYTHON_VERSION}; \
    ./configure --enable-optimizations; \
    make -j"$(nproc)"; \
    make install; \
    cd /; rm -rf /tmp/Python-${PYTHON_VERSION}*; \
    ln -s /usr/local/bin/python3 /usr/local/bin/python; \
    ln -s /usr/local/bin/pip3 /usr/local/bin/pip

# Make constraints global for all pip installs
COPY constraints.txt /tmp/constraints.txt
ENV PIP_CONSTRAINT=/tmp/constraints.txt \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_DEFAULT_TIMEOUT=180

# Clean any existing numpy/matplotlib remnants aggressively
RUN python - <<'PY'
import sys, site, pkgutil, shutil, pathlib
paths = set(site.getsitepackages() + [site.getusersitepackages()])
for p in list(paths):
    if not p:
        continue
    for name in ("numpy", "matplotlib"):
        for m in pathlib.Path(p).glob(name):
            shutil.rmtree(m, ignore_errors=True)
        for m in pathlib.Path(p).glob(f"{name}-*.dist-info"):
            shutil.rmtree(m, ignore_errors=True)
        for m in pathlib.Path(p).glob(f"{name}-*.egg-info"):
            shutil.rmtree(m, ignore_errors=True)
print("Cleaned numpy/matplotlib from:", *paths, sep="\n - ")
PY

# Install bedtools
RUN apt-get update && apt-get install -y --no-install-recommends bedtools \
    && rm -rf /var/lib/apt/lists/*

# Install tools + exact pins
RUN python -m pip install --no-cache-dir -U pip setuptools wheel \
    && pip install --no-cache-dir --upgrade --force-reinstall \
       "numpy==1.26.4" "matplotlib==3.8.4" "cython<3"

# Install MACS2. Build-from-source packages must reuse our pinned toolchain
RUN pip install --no-cache-dir --no-build-isolation MACS2==2.2.9.1
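# (--no-build-isolation compiles MACS2 against the numpy and cython<3 pinned above,
# rather than letting pip pull fresh build dependencies into an isolated env.)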

# Install Dictys without dependencies (we'll install them manually right after)
RUN pip install --no-cache-dir --no-build-isolation --no-deps \
    git+https://github.com/pinellolab/dictys.git@a82930fe8030af2785f9069ef5e909e49acc866f

# Install Dictys dependencies and more
RUN pip install --no-cache-dir --prefer-binary \
    pandas scipy networkx h5py threadpoolctl joblib \
    jupyter jupyterlab adjustText pyro-ppl docutils requests

# Install pyDNase, anndata, and their support packages without dependencies so pyDNase can't pin matplotlib<2
RUN pip install --no-cache-dir --no-build-isolation --no-deps pyDNase clint pysam packaging array_api_compat legacy-api-wrap zarr natsort anndata

# Install pybedtools version that works with cython<3
RUN pip install --no-cache-dir --no-build-isolation "pybedtools==0.9.1"

# Install pytorch
# RUN pip install --no-cache-dir --prefer-binary --index-url https://download.pytorch.org/whl/cpu torch

# HOMER prerequisites
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget perl unzip build-essential zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install HOMER core + hg38 genome
RUN set -eux; \
    mkdir -p /opt/homer && cd /opt/homer; \
    curl -fsSLO http://homer.ucsd.edu/homer/configureHomer.pl; \
    chmod +x configureHomer.pl; \
    perl configureHomer.pl -install homer; \
    perl configureHomer.pl -install homerTools; \
    perl configureHomer.pl -install hg38
ENV PATH="/opt/homer/bin:${PATH}"

# hg38 annotations
RUN set -eux; \
    cd /opt/homer; \
    grep "hg38" update.txt > tmp.txt && mv tmp.txt update.txt; \
    cd update && ./updateUCSCGenomeAnnotations.pl ../update.txt
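# (Filtering update.txt to its hg38 rows keeps updateUCSCGenomeAnnotations.pl from
# fetching annotations for every genome HOMER knows about.)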

# Install CUDA
# RUN curl -fsSLo /etc/apt/preferences.d/cuda-repository-pin-600 \
# https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
# curl -fsSLo /usr/share/keyrings/nvidia-cuda.gpg \
# https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
# echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" \
# > /etc/apt/sources.list.d/cuda.list && \
# apt-get update && apt-get install -y --no-install-recommends cuda && \
# rm -rf /var/lib/apt/lists/*

# Install AWS CLI
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
    unzip awscliv2.zip && \
    ./aws/install

CMD ["/bin/bash"]
3 changes: 3 additions & 0 deletions dockers/dictys_v4/constraints.txt
@@ -0,0 +1,3 @@
numpy==1.26.4
matplotlib<3.9
cython<3
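Because the Dockerfile exports PIP_CONSTRAINT=/tmp/constraints.txt, these three pins govern every subsequent pip invocation in the image. A minimal sketch of the effect (illustrative only; the path matches the Dockerfile above):

    import os
    import subprocess

    # With the constraint file active, "pip install numpy" resolves to the
    # pinned 1.26.4 even if some dependency requests a newer release.
    os.environ["PIP_CONSTRAINT"] = "/tmp/constraints.txt"
    subprocess.run(["python", "-m", "pip", "install", "numpy"], check=True)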
118 changes: 84 additions & 34 deletions src/methods/dictys/helper.py
@@ -1,6 +1,8 @@
import os
os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
os.environ["MKL_THREADING_LAYER"] = "GNU"
import shutil
from typing import Optional, List

import numpy as np
import pandas as pd
@@ -11,6 +13,27 @@
warnings.filterwarnings("ignore")


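# When True, reference files (genome, gene annotation, motifs) are re-downloaded
# even if already present on disk; leaving it False makes get_priors idempotent.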
OVERRIDE_DOWNLOAD = False


def run_cmd(cmd: List[str], cwd: Optional[str] = None) -> None:
    kwargs = {}
    if cwd is not None:
        kwargs['cwd'] = cwd
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        **kwargs
    ) as proc:
        for line in proc.stdout:
            print(line, end="")
    rc = proc.wait()
    if rc != 0:
        raise RuntimeError(f"Command {cmd} failed with exit code {rc}")
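# Usage sketch (illustrative): output streams line-by-line while the child runs,
# and a non-zero exit raises instead of being silently printed, e.g.:
#   run_cmd(["samtools", "--version"])
#   run_cmd(["ls", "-l"], cwd="/tmp")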


def define_vars(par):
    os.makedirs(par['temp_dir'], exist_ok=True)
@@ -28,6 +51,7 @@ def define_vars(par):
    par['bams_dir'] = f"{par['data_dir']}/bams/"

    par['gene_bed'] = f"{par['data_dir']}/gene.bed"
    par['make_dir'] = f"{par['temp_dir']}/makefiles"


def extract_exp(par):
@@ -88,6 +112,7 @@ def extract_atac(par):
    print(f'Sort and compress tsv file {frags_path}')
    os.system(f"sort -k1,1 -k2,2n {temp_path} | bgzip -c > {frags_path}")


def create_bam(par):
    print('Creating BAM file from fragments', flush=True)
    cmd = f"python {par['frag_to_bam']} --fnames {par['frags_path']} --barcodes {par['barcodes']}"
@@ -107,9 +132,24 @@ def bam_to_bams(par):
    - 'bams_dir': path to output folder for per-cell BAMs
    - 'exp_path': path to reference expression file
    """

    print('Delete temp BAM directories', flush=True)
    folders = [
        par['bams_dir'],
        os.path.join(par['bams_dir'], '..', 'bams_text'),
        os.path.join(par['bams_dir'], '..', 'bams_header')
    ]
    for folder in folders:
        if os.path.exists(folder):
            shutil.rmtree(folder)

    print('Splitting BAM into per-cell BAMs', flush=True)
-   cmd = f"bash dictys_helper split_bam.sh {par['bam_name']} {par['bams_dir']} --section CB:Z: --ref_expression {par['exp_path']}"
-   run_cmd(cmd)
    run_cmd([
        "bash", "dictys_helper", "split_bam.sh", par['bam_name'], par['bams_dir'],
        "--section", "CB:Z:", "--ref_expression", par['exp_path']
    ])


def extrac_clusters(par):
    print('Extracting clusters', flush=True)
    subsets = f"{par['data_dir']}/subsets.txt"
@@ -127,15 +167,6 @@ def extrac_clusters(par):
    subprocess.run(cp, shell=True, check=True)
    print('Extracting clusters successful', flush=True)

-   def run_cmd(cmd):
-       try:
-           result = subprocess.run(cmd, check=True, text=True, capture_output=True, shell=True)
-           print("STDOUT:", result.stdout)
-           print("STDERR:", result.stderr)
-       except subprocess.CalledProcessError as e:
-           print("Command failed with exit code", e.returncode)
-           print("STDOUT:", e.stdout)
-           print("STDERR:", e.stderr)

def download_file(url, dest):
    import requests
@@ -145,66 +176,85 @@ def download_file(url, dest):
        with open(dest, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)


def get_priors(par):
    import gzip
    import shutil
    # - get the genome
    print('Getting genome ...', flush=True)
-   os.makedirs(f"{par['data_dir']}/genome/", exist_ok=True)
-   cmd = f"aws s3 cp s3://openproblems-data/resources/grn/supp_data/genome/genome.fa {par['data_dir']}/genome/ --no-sign-request"
-   try:
-       run_cmd(cmd)
-   except:
-       try:
-           cmd = f"cp resources/supp_data/genome/genome.fa {par['data_dir']}/genome/"
-           run_cmd(cmd)
-       except:
-           raise ValueError("Could not get the genome")
    if OVERRIDE_DOWNLOAD or (not os.path.exists(f"{par['data_dir']}/genome/genome.fa")):
        os.makedirs(f"{par['data_dir']}/genome/", exist_ok=True)
        try:
            run_cmd([
                "aws", "s3", "cp", "s3://openproblems-data/resources/grn/supp_data/genome/genome.fa",
                f"{par['data_dir']}/genome/", "--no-sign-request"
            ])
        except:
            try:
                run_cmd([
                    "cp", "resources/supp_data/genome/genome.fa", f"{par['data_dir']}/genome/"
                ])
            except:
                raise ValueError("Could not get the reference genome")

    # - get gene annotation
    print('Getting gene annotation ...', flush=True)
    data_dir = Path(par['data_dir'])
    gtf_gz = data_dir / "gene.gtf.gz"
    gtf = data_dir / "gene.gtf"
    url = "http://ftp.ensembl.org/pub/release-107/gtf/homo_sapiens/Homo_sapiens.GRCh38.107.gtf.gz"
-   download_file(url, gtf_gz)
    if OVERRIDE_DOWNLOAD or (not os.path.exists(gtf_gz)):
        download_file(url, gtf_gz)

    with gzip.open(gtf_gz, "rb") as f_in, open(gtf, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    gtf_gz.unlink()

    print('Making bed files for gene annotation ...', flush=True)
-   cmd = f"bash dictys_helper gene_gtf.sh {gtf} {par['gene_bed']}"
-   run_cmd(cmd)
    run_cmd([
        "bash", "dictys_helper", "gene_gtf.sh", gtf, par['gene_bed']
    ])

    print('Downloading motif file...', flush=True)

    url = 'https://hocomoco11.autosome.org/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.0001.motif'
    motif_file = data_dir / 'motifs.motif'
-   download_file(url, motif_file)
    if OVERRIDE_DOWNLOAD or (not os.path.exists(motif_file)):
        download_file(url, motif_file)


def configure(par):
    import json
    device = 'cuda:0'  # cuda:0, cpu
-   par['make_dir'] = f"{par['temp_dir']}/makefiles"
    os.makedirs(par['make_dir'], exist_ok=True)
-   cmd = f"cd {par['make_dir']} && bash dictys_helper makefile_template.sh common.mk config.mk env_none.mk static.mk"
-   run_cmd(cmd)
    run_cmd([
        "bash", "dictys_helper", "makefile_template.sh", "common.mk", "config.mk", "env_none.mk", "static.mk"
    ], cwd=par['make_dir'])

    json_arg = json.dumps({
        "DEVICE": device,
        "GENOME_MACS2": "hs",
        "JOINT": "1"
    })

-   cmd = f"cd {par['make_dir']} && bash dictys_helper makefile_update.py config.mk '{json_arg}'"
-   run_cmd(cmd)
    run_cmd([
        "bash", "dictys_helper", "makefile_update.py", "config.mk", json_arg
    ], cwd=par['make_dir'])

-   cmd = f"cd {par['temp_dir']} && bash dictys_helper makefile_check.py"
-   run_cmd(cmd)
    run_cmd([
        "bash", "dictys_helper", "makefile_check.py", "--dir_makefiles", par['make_dir'],
        "--dir_data", par['data_dir']
    ])


def infer_grn(par):
    print('Inferring GRNs', flush=True)
-   cmd = f"cd {par['temp_dir']} && bash dictys_helper network_inference.sh -j {par['num_workers']} -J 1 static"
-   run_cmd(cmd)
    run_cmd([
        "bash", "dictys_helper", "network_inference.sh", "-j", str(par['num_workers']), "-J", "1", "static"
    ], cwd=par['temp_dir'])


def export_net(par):
    from util import process_links
    from dictys.net import network
@@ -224,8 +274,8 @@ def export_net(par):
        output.write(par['prediction'])

def main(par):
-   define_vars(par)

    define_vars(par)
    extract_exp(par)
    extract_atac(par)
    create_bam(par)
4 changes: 1 addition & 3 deletions src/methods/portia/script.py
@@ -44,19 +44,17 @@ def main(par):
    tf_names = [gene_name for gene_name in gene_names if (gene_name in tfs)]
    tf_idx = np.asarray([i for i, gene_name in enumerate(gene_names) if gene_name in tf_names], dtype=int)

    print('Inferring grn')
    dataset = pt.GeneExpressionDataset()
    for exp_id, data in enumerate(X):
        dataset.add(pt.Experiment(exp_id, data))

    M_bar = pt.run(dataset, tf_idx=tf_idx, method='no-transform')
    ranked_scores = pt.rank_scores(M_bar, gene_names, limit=par['max_n_links'])
    sources, targets, weights = zip(*[(gene_a, gene_b, score) for gene_a, gene_b, score in ranked_scores])

    grn = pd.DataFrame({'source': sources, 'target': targets, 'weight': weights})
-   print(grn.shape)
    grn = grn[grn.source.isin(tf_names)]

    grn = process_links(grn, par)