From 479437adba35d20952eabdc0f874e8ccb61591ae Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Jul 2025 13:02:05 +0200 Subject: [PATCH 01/21] Add transcriptformer_mlflow method component --- .../transcriptformer_mlflow/config.vsh.yaml | 60 +++++++++++++++ src/methods/transcriptformer_mlflow/script.py | 76 +++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 src/methods/transcriptformer_mlflow/config.vsh.yaml create mode 100644 src/methods/transcriptformer_mlflow/script.py diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml new file mode 100644 index 00000000..e693428a --- /dev/null +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -0,0 +1,60 @@ +__merge__: ../../api/base_method.yaml + +name: transcriptformer_mlflow +label: TranscriptFormer (MLflow model) +summary: "Context-aware representations of single-cell transcriptomes by jointly modeling genes and transcripts" +description: | + TranscriptFormer is designed to learn rich, context-aware representations of + single-cell transcriptomes while jointly modeling genes and transcripts using + a novel generative architecture. + + It is a family of generative foundation models representing a cross-species + generative cell atlas trained on up to 112 million cells spanning 1.53 billion + years of evolution across 12 species. + + Here, we use a version packaged as an MLflow model. +references: + doi: + - 10.1101/2025.04.25.650731 +links: + documentation: https://github.com/czi-ai/transcriptformer#readme + repository: https://github.com/czi-ai/transcriptformer + +info: + method_types: [embedding] + preferred_normalization: counts + +arguments: + - name: --model + type: file + description: | + An MLflow model URL for the transcriptformer model + required: true + +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + - path: /src/utils/exit_codes.py + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1 + setup: + - type: docker + add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh + run: sh /uv-installer.sh && rm /uv-installer.sh + env: PATH="/root/.local/bin/:$PATH" + - type: docker + run: uv venv --python 3.11 /opt/venv + - type: docker + env: + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + run: uv pip install mlflow==3.0.0 "transcriptformer>=0.3.0" + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, midmem, midcpu, gpu] diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py new file mode 100644 index 00000000..db8bda4d --- /dev/null +++ b/src/methods/transcriptformer_mlflow/script.py @@ -0,0 +1,76 @@ +import anndata as ad +import sys +import mlflow.pyfunc +from tempfile import NamedTemporaryFile +import os +import pandas as pd + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
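+#
+# A hedged usage sketch (the paths below are illustrative placeholders, not
+# files guaranteed to exist in the repo): a component like this is normally
+# executed directly from its config with `viash run`, passing arguments after
+# the `--` separator:
+#
+#   viash run src/methods/transcriptformer_mlflow/config.vsh.yaml -- \
+#     --input path/to/input.h5ad \
+#     --model path/to/transcriptformer-mlflow-model.zip \
+#     --output output.h5ad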
+par = { + "input": "resources_test/.../input.h5ad", + "output": "output.h5ad", + "model": "resources_test/.../model", +} +meta = {"name": "transcriptformer_mlflow"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata +from exit_codes import exit_non_applicable + +print(f"====== TranscriptFormer (MLflow model) ======", flush=True) + +print("\n>>> Reading input files...", flush=True) +print(f"Input H5AD file: '{par['input']}'", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + exit_non_applicable( + f"Transcriptformer can only be used with human data " + f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + ) + +print(adata, flush=True) + +print("\n>>> Writing temporary H5AD file...", flush=True) +input_adata = ad.AnnData(X = adata.X.copy(), var = adata.var.filter(items=["feature_id"]).rename(columns = {"feature_id": "ensembl_id"})) +input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing +print(input_adata, flush=True) +h5ad_file = NamedTemporaryFile(suffix=".h5ad", delete=False) +print(f"Temporary H5AD file: '{h5ad_file}'", flush=True) +input_adata.write(h5ad_file.name) +del input_adata + +print("\n>>> Loading model...", flush=True) +model = mlflow.pyfunc.load_model(par["model"]) + +print("\n>>> Running model...", flush=True) +input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) +model.predict(input_df) + +print("\n>>> Storing output...", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + # obsm={ + # "X_emb": embedded.X, + # }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("\n>>> Writing output to file...", flush=True) +print(f"Output H5AD file: '{par['output']}'", flush=True) +output.write_h5ad(par["output"], compression="gzip") + +print("\n>>> Cleaning up temporary files...", flush=True) +h5ad_file.close() +os.unlink(h5ad_file.name) + +print("\n>>> Done!", flush=True) From 95ced9ba05092d9f83b14a61db9c538f6307149c Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Jul 2025 16:58:37 +0200 Subject: [PATCH 02/21] Adjust transcriptformer_mlflow dependencies --- src/methods/transcriptformer_mlflow/config.vsh.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index e693428a..ee874547 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -49,9 +49,9 @@ engines: run: uv venv --python 3.11 /opt/venv - type: docker env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - run: uv pip install mlflow==3.0.0 "transcriptformer>=0.3.0" + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + run: uv pip install mlflow==3.1.0 pandas==2.3.0 scanpy==1.11.2 scipy==1.16.0 "transcriptformer>=0.3.0" runners: - type: executable From 6e8dce2e7006a5bdc80593a69dc8972ef34dd73b Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Jul 2025 17:00:50 +0200 Subject: [PATCH 03/21] Output embedding in transcriptformer_mlflow --- src/methods/transcriptformer_mlflow/script.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index db8bda4d..6c62f3f4 100644 --- 
a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -39,7 +39,7 @@ input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing print(input_adata, flush=True) h5ad_file = NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file}'", flush=True) +print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) input_adata.write(h5ad_file.name) del input_adata @@ -48,15 +48,15 @@ print("\n>>> Running model...", flush=True) input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -model.predict(input_df) +embedding = model.predict(input_df) print("\n>>> Storing output...", flush=True) output = ad.AnnData( obs=adata.obs[[]], var=adata.var[[]], - # obsm={ - # "X_emb": embedded.X, - # }, + obsm={ + "X_emb": embedding, + }, uns={ "dataset_id": adata.uns["dataset_id"], "normalization_id": adata.uns["normalization_id"], From 440a18b8c5ea0446cf839f9009f56538194b66c3 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Jul 2025 17:26:59 +0200 Subject: [PATCH 04/21] Install transcriptformer requirements from file --- src/methods/transcriptformer_mlflow/config.vsh.yaml | 6 +++++- src/methods/transcriptformer_mlflow/script.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index ee874547..c1524f87 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -36,6 +36,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: requirements.txt engines: - type: docker @@ -51,7 +52,10 @@ engines: env: - VIRTUAL_ENV=/opt/venv - PATH="/opt/venv/bin:$PATH" - run: uv pip install mlflow==3.1.0 pandas==2.3.0 scanpy==1.11.2 scipy==1.16.0 "transcriptformer>=0.3.0" + add: requirements.txt /requirements.txt + run: uv pip install -r /requirements.txt + - type: docker + run: uv pip install mlflow==3.1.0 runners: - type: executable diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index 6c62f3f4..f2cd1e40 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -45,6 +45,7 @@ print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(par["model"]) +print(model, flush=True) print("\n>>> Running model...", flush=True) input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) From 5332e00cd7a71b26f5044c803dfd94931d28b941 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 9 Jul 2025 10:16:16 +0200 Subject: [PATCH 05/21] Add extracting model archive to transcriptformer --- .../transcriptformer_mlflow/config.vsh.yaml | 3 +- .../transcriptformer_mlflow/requirements.txt | 338 ++++++++++++++++++ src/methods/transcriptformer_mlflow/script.py | 44 ++- 3 files changed, 377 insertions(+), 8 deletions(-) create mode 100644 src/methods/transcriptformer_mlflow/requirements.txt diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index c1524f87..ba708fa1 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -28,7 +28,8 @@ arguments: - name: --model type: file description: | - An MLflow model URL for the transcriptformer model + An MLflow model URI for the transcriptformer model. 
If it is a .zip or + .tar.gz file it will be extracted to a temporary directory. required: true resources: diff --git a/src/methods/transcriptformer_mlflow/requirements.txt b/src/methods/transcriptformer_mlflow/requirements.txt new file mode 100644 index 00000000..70d923d1 --- /dev/null +++ b/src/methods/transcriptformer_mlflow/requirements.txt @@ -0,0 +1,338 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt +aiobotocore==2.23.0 + # via s3fs +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.13 + # via + # aiobotocore + # fsspec + # s3fs +aioitertools==0.12.0 + # via aiobotocore +aiosignal==1.3.2 + # via aiohttp +anndata==0.11.4 + # via + # cellxgene-census + # scanpy + # somacore + # tiledbsoma + # transcriptformer +antlr4-python3-runtime==4.9.3 + # via + # hydra-core + # omegaconf +array-api-compat==1.12.0 + # via anndata +attrs==25.3.0 + # via + # aiohttp + # somacore + # tiledbsoma +boto3==1.38.27 + # via transcriptformer +botocore==1.38.27 + # via + # aiobotocore + # boto3 + # s3transfer +cellxgene-census==1.17.0 + # via transcriptformer +certifi==2025.6.15 + # via requests +charset-normalizer==3.4.2 + # via requests +contourpy==1.3.2 + # via matplotlib +cycler==0.12.1 + # via matplotlib +filelock==3.18.0 + # via + # torch + # triton +fonttools==4.58.4 + # via matplotlib +frozenlist==1.7.0 + # via + # aiohttp + # aiosignal +fsspec==2025.5.1 + # via + # pytorch-lightning + # s3fs + # torch +h5py==3.14.0 + # via + # anndata + # scanpy + # transcriptformer +hydra-core==1.3.2 + # via transcriptformer +idna==3.10 + # via + # requests + # yarl +iniconfig==2.1.0 + # via pytest +jinja2==3.1.6 + # via torch +jmespath==1.0.1 + # via + # aiobotocore + # boto3 + # botocore +joblib==1.5.1 + # via + # pynndescent + # scanpy + # scikit-learn +kiwisolver==1.4.8 + # via matplotlib +legacy-api-wrap==1.4.1 + # via scanpy +lightning-utilities==0.14.3 + # via + # pytorch-lightning + # torchmetrics +llvmlite==0.44.0 + # via + # numba + # pynndescent +markupsafe==3.0.2 + # via jinja2 +matplotlib==3.10.3 + # via + # scanpy + # seaborn +more-itertools==10.7.0 + # via tiledbsoma +mpmath==1.3.0 + # via sympy +multidict==6.6.0 + # via + # aiobotocore + # aiohttp + # yarl +natsort==8.4.0 + # via + # anndata + # scanpy +networkx==3.5 + # via + # scanpy + # torch +numba==0.61.2 + # via + # pynndescent + # scanpy + # umap-learn +numpy==2.2.6 + # via + # anndata + # cellxgene-census + # contourpy + # h5py + # matplotlib + # numba + # pandas + # patsy + # scanpy + # scikit-learn + # scipy + # seaborn + # shapely + # somacore + # statsmodels + # tiledbsoma + # torchmetrics + # transcriptformer + # umap-learn +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.575.51 + # via pynvml +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +omegaconf==2.3.0 + # via hydra-core +packaging==25.0 + # via + # anndata + # hydra-core + # lightning-utilities + # matplotlib + # 
pytest + # pytorch-lightning + # scanpy + # statsmodels + # torchmetrics +pandas==2.3.0 + # via + # anndata + # scanpy + # seaborn + # somacore + # statsmodels + # tiledbsoma + # transcriptformer +patsy==1.0.1 + # via + # scanpy + # statsmodels +pillow==11.2.1 + # via matplotlib +pluggy==1.6.0 + # via pytest +propcache==0.3.2 + # via + # aiohttp + # yarl +psutil==7.0.0 + # via transcriptformer +pyarrow==20.0.0 + # via + # somacore + # tiledbsoma +pyarrow-hotfix==0.7 + # via somacore +pygments==2.19.2 + # via pytest +pynndescent==0.5.13 + # via + # scanpy + # umap-learn +pynvml==12.0.0 + # via transcriptformer +pyparsing==3.2.3 + # via matplotlib +pytest==8.4.1 + # via transcriptformer +python-dateutil==2.9.0.post0 + # via + # aiobotocore + # botocore + # matplotlib + # pandas +pytorch-lightning==2.5.2 + # via transcriptformer +pytz==2025.2 + # via pandas +pyyaml==6.0.2 + # via + # omegaconf + # pytorch-lightning +requests==2.32.4 + # via cellxgene-census +s3fs==2025.5.1 + # via cellxgene-census +s3transfer==0.13.0 + # via boto3 +scanpy==1.11.2 + # via + # tiledbsoma + # transcriptformer +scikit-learn==1.7.0 + # via + # pynndescent + # scanpy + # umap-learn +scipy==1.16.0 + # via + # anndata + # pynndescent + # scanpy + # scikit-learn + # somacore + # statsmodels + # tiledbsoma + # transcriptformer + # umap-learn +seaborn==0.13.2 + # via scanpy +session-info2==0.1.2 + # via scanpy +setuptools==80.9.0 + # via lightning-utilities +shapely==2.1.1 + # via somacore +six==1.17.0 + # via python-dateutil +somacore==1.0.28 + # via tiledbsoma +statsmodels==0.14.4 + # via scanpy +sympy==1.13.1 + # via torch +threadpoolctl==3.6.0 + # via scikit-learn +tiledbsoma==1.17.0 + # via cellxgene-census +timeout-decorator==0.5.0 + # via transcriptformer +torch==2.5.1 + # via + # pytorch-lightning + # torchmetrics + # transcriptformer +torchmetrics==1.7.3 + # via pytorch-lightning +tqdm==4.67.1 + # via + # pytorch-lightning + # scanpy + # umap-learn +transcriptformer==0.3.0 + # via -r requirements.in +triton==3.1.0 + # via torch +typing-extensions==4.14.0 + # via + # cellxgene-census + # lightning-utilities + # pytorch-lightning + # scanpy + # somacore + # tiledbsoma + # torch +tzdata==2025.2 + # via pandas +umap-learn==0.5.7 + # via scanpy +urllib3==2.5.0 + # via + # botocore + # requests +wrapt==1.17.2 + # via aiobotocore +yarl==1.20.1 + # via aiohttp diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index f2cd1e40..76b41023 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -1,9 +1,11 @@ import anndata as ad import sys import mlflow.pyfunc -from tempfile import NamedTemporaryFile +import tempfile import os import pandas as pd +import zipfile +import tarfile ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -34,19 +36,45 @@ print(adata, flush=True) -print("\n>>> Writing temporary H5AD file...", flush=True) +if os.path.isdir(par["model"]): + print("\n>>> Using model directory...", flush=True) + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("\n>>> Extracting model from .zip...", flush=True) + print(f".zip path: '{par['model']}'", flush=True) + with zipfile.ZipFile(par["model"], "r") as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith( + ".tar.gz" + ): + print("\n>>> Extracting model from .tar.gz...", flush=True) + print(f".tar.gz path: '{par['model']}'", flush=True) + with tarfile.open(par["model"], "r:gz") as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError( + "The 'model' argument should be a directory a .zip file or a .tar.gz file" + ) + +print("\n>>> Loading model...", flush=True) +model = mlflow.pyfunc.load_model(model_dir) +print(model, flush=True) + +print("\n>>> Writing temporary input H5AD file...", flush=True) input_adata = ad.AnnData(X = adata.X.copy(), var = adata.var.filter(items=["feature_id"]).rename(columns = {"feature_id": "ensembl_id"})) input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing print(input_adata, flush=True) -h5ad_file = NamedTemporaryFile(suffix=".h5ad", delete=False) +h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) input_adata.write(h5ad_file.name) del input_adata -print("\n>>> Loading model...", flush=True) -model = mlflow.pyfunc.load_model(par["model"]) -print(model, flush=True) - print("\n>>> Running model...", flush=True) input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) embedding = model.predict(input_df) @@ -71,6 +99,8 @@ output.write_h5ad(par["output"], compression="gzip") print("\n>>> Cleaning up temporary files...", flush=True) +if model_temp is not None: + model_temp.cleanup() h5ad_file.close() os.unlink(h5ad_file.name) From b84a40aec116be12dd1b4069cd617ea8ac1485b4 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 9 Jul 2025 10:16:46 +0200 Subject: [PATCH 06/21] Add transcriptformer_mlflow to benchmark workflow --- scripts/run_benchmark/run_full_local.sh | 2 +- scripts/run_benchmark/run_test_local.sh | 2 +- src/workflows/run_benchmark/config.vsh.yaml | 1 + src/workflows/run_benchmark/main.nf | 3 +++ 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 20e434b3..b60940c9 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -26,7 +26,7 @@ input_states: resources/datasets/**/state.yaml rename_keys: 'input_dataset:output_dataset;input_solution:output_solution' output_state: "state.yaml" publish_dir: "$publish_dir" -settings: '{"methods_exclude": ["uce", "scgpt_finetuned"]}' +settings: '{"methods_exclude": ["uce", "scgpt_finetuned", "transcriptformer_mlflow"]}' HERE # run the benchmark diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh index 85e39583..4b7bf15e 100755 --- a/scripts/run_benchmark/run_test_local.sh +++ b/scripts/run_benchmark/run_test_local.sh @@ -21,7 +21,7 @@ input_states: resources_test/task_batch_integration/**/state.yaml rename_keys: 
'input_dataset:output_dataset;input_solution:output_solution' output_state: "state.yaml" publish_dir: "$publish_dir" -settings: '{"methods_exclude": ["uce", "scgpt_finetuned"]}' +settings: '{"methods_exclude": ["uce", "scgpt_finetuned", "transcriptformer_mlflow"]}' HERE nextflow run . \ diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 09905ad0..d9fe9504 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -106,6 +106,7 @@ dependencies: - name: methods/scimilarity - name: methods/scprint - name: methods/scvi + - name: methods/transcriptformer_mlflow - name: methods/uce # metrics - name: metrics/asw_batch diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 6196f749..104485bd 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -40,6 +40,9 @@ methods = [ ), scprint, scvi, + transcriptformer_mlflow.run( + args: [model: file("s3://openproblems-work/cache/transcriptformer-mlflow-model.zip")] + ), uce.run( args: [model: file("s3://openproblems-work/cache/uce-model-v5.zip")] ) From 74be8558a5d2d31701acf338c908615f03d54d8e Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 9 Jul 2025 11:38:05 +0200 Subject: [PATCH 07/21] Install openproblems package for transcriptformer --- src/methods/transcriptformer_mlflow/config.vsh.yaml | 2 ++ src/methods/transcriptformer_mlflow/script.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index ba708fa1..9cb3544f 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -57,6 +57,8 @@ engines: run: uv pip install -r /requirements.txt - type: docker run: uv pip install mlflow==3.1.0 + - type: docker + run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems runners: - type: executable diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index 76b41023..c5c37575 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -38,6 +38,7 @@ if os.path.isdir(par["model"]): print("\n>>> Using model directory...", flush=True) + print(f"Directory path: '{par['model']}'", flush=True) model_temp = None model_dir = par["model"] else: From 8693f35176d8245b61ae18c76c1fc5b8181ee51a Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 9 Jul 2025 12:16:12 +0200 Subject: [PATCH 08/21] Style transcriptformer_mlflow script --- src/methods/transcriptformer_mlflow/script.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index c5c37575..b16806d3 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -1,11 +1,12 @@ -import anndata as ad +import os import sys -import mlflow.pyfunc +import tarfile import tempfile -import os -import pandas as pd import zipfile -import tarfile + +import anndata as ad +import mlflow.pyfunc +import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -19,10 +20,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata from exit_codes import exit_non_applicable +from read_anndata_partial import read_anndata -print(f"====== TranscriptFormer (MLflow model) ======", flush=True) +print("====== TranscriptFormer (MLflow model) ======", flush=True) print("\n>>> Reading input files...", flush=True) print(f"Input H5AD file: '{par['input']}'", flush=True) @@ -31,7 +32,7 @@ if adata.uns["dataset_organism"] != "homo_sapiens": exit_non_applicable( f"Transcriptformer can only be used with human data " - f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + f'(dataset_organism == "{adata.uns["dataset_organism"]}")' ) print(adata, flush=True) @@ -50,9 +51,7 @@ print(f".zip path: '{par['model']}'", flush=True) with zipfile.ZipFile(par["model"], "r") as zip_file: zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith( - ".tar.gz" - ): + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): print("\n>>> Extracting model from .tar.gz...", flush=True) print(f".tar.gz path: '{par['model']}'", flush=True) with tarfile.open(par["model"], "r:gz") as tar_file: @@ -68,8 +67,13 @@ print(model, flush=True) print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData(X = adata.X.copy(), var = adata.var.filter(items=["feature_id"]).rename(columns = {"feature_id": "ensembl_id"})) -input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing +input_adata = ad.AnnData( + X=adata.X.copy(), + var=adata.var.filter(items=["feature_id"]).rename( + columns={"feature_id": "ensembl_id"} + ), +) +input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing print(input_adata, flush=True) h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) From b7dae0b4b7b98a17a3966ded2ffaaa8335379a20 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 29 Jul 2025 10:45:10 +0200 Subject: [PATCH 09/21] Increase transcriptformer memory label --- src/methods/transcriptformer_mlflow/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index 9cb3544f..2d144c23 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -64,4 +64,4 @@ runners: - type: executable - type: nextflow directives: - label: [hightime, midmem, midcpu, gpu] + label: [hightime, highmem, midcpu, gpu] From 5d3c6e0f213f5e2e025173fe19cf1a0354857a74 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 13 Aug 2025 10:59:28 +0200 Subject: [PATCH 10/21] Add scvi_mlflow method --- src/methods/scvi_mlflow/config.vsh.yaml | 61 +++ src/methods/scvi_mlflow/requirements.txt | 459 ++++++++++++++++++++ src/methods/scvi_mlflow/script.py | 114 +++++ src/workflows/run_benchmark/config.vsh.yaml | 1 + src/workflows/run_benchmark/main.nf | 3 + 5 files changed, 638 insertions(+) create mode 100644 src/methods/scvi_mlflow/config.vsh.yaml create mode 100644 src/methods/scvi_mlflow/requirements.txt create mode 100644 src/methods/scvi_mlflow/script.py diff --git a/src/methods/scvi_mlflow/config.vsh.yaml b/src/methods/scvi_mlflow/config.vsh.yaml new file mode 100644 index 00000000..d50a6e62 --- /dev/null +++ b/src/methods/scvi_mlflow/config.vsh.yaml @@ -0,0 +1,61 @@ +__merge__: 
../../api/base_method.yaml + +name: scvi_mlflow +label: scVI (MLflow model) +summary: scVI combines a variational autoencoder with a hierarchical Bayesian model (MLflow model) +description: | + scVI combines a variational autoencoder with a hierarchical Bayesian model. + It uses the negative binomial distribution to describe gene expression of + each cell, conditioned on unobserved factors and the batch variable. + + This version uses a pre-trained MLflow model. +references: + doi: + - 10.1038/s41592-018-0229-2 +links: + repository: https://github.com/scverse/scvi-tools + documentation: https://docs.scvi-tools.org/en/stable/user_guide/models/scvi.html + +info: + method_types: [embedding] + preferred_normalization: counts + +arguments: + - name: --model + type: file + description: | + An MLflow model URI for the scVI model. If it is a .zip or + .tar.gz file it will be extracted to a temporary directory. + required: true + +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + - path: /src/utils/exit_codes.py + - path: requirements.txt + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1 + setup: + - type: docker + add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh + run: sh /uv-installer.sh && rm /uv-installer.sh + env: PATH="/root/.local/bin/:$PATH" + - type: docker + run: uv venv --python 3.11 /opt/venv + - type: docker + env: + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + add: requirements.txt /requirements.txt + run: uv pip install -r /requirements.txt && uv pip install mlflow==3.1.0 + - type: docker + run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, highmem, midcpu, gpu] diff --git a/src/methods/scvi_mlflow/requirements.txt b/src/methods/scvi_mlflow/requirements.txt new file mode 100644 index 00000000..c3c79df5 --- /dev/null +++ b/src/methods/scvi_mlflow/requirements.txt @@ -0,0 +1,459 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o /tmp/tmp6b02zuzi/requirements_initial.txt +absl-py==2.3.1 + # via + # chex + # ml-collections + # optax + # orbax-checkpoint +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.15 + # via fsspec +aiosignal==1.4.0 + # via aiohttp +alembic==1.16.4 + # via mlflow +anndata==0.10.8 + # via + # -r requirements.in + # mudata + # scvi-tools +annotated-types==0.7.0 + # via pydantic +anyio==4.10.0 + # via starlette +array-api-compat==1.12.0 + # via anndata +attrs==25.3.0 + # via aiohttp +blinker==1.9.0 + # via flask +cachetools==5.5.2 + # via + # google-auth + # mlflow-skinny +certifi==2025.8.3 + # via requests +charset-normalizer==3.4.3 + # via requests +chex==0.1.90 + # via optax +click==8.2.1 + # via + # flask + # mlflow-skinny + # uvicorn +cloudpickle==3.1.1 + # via mlflow-skinny +contourpy==1.3.3 + # via matplotlib +cycler==0.12.1 + # via matplotlib +databricks-sdk==0.62.0 + # via mlflow-skinny +docker==7.1.0 + # via mlflow +docrep==0.3.2 + # via scvi-tools +etils==1.13.0 + # via orbax-checkpoint +fastapi==0.116.1 + # via mlflow-skinny +filelock==3.18.0 + # via + # torch + # triton +flask==3.1.1 + # via mlflow +flax==0.10.4 + # via scvi-tools +fonttools==4.59.0 + # via matplotlib +frozenlist==1.7.0 + # via + # aiohttp + # aiosignal +fsspec==2025.7.0 + # via + # etils + # lightning + # pytorch-lightning + # torch +gitdb==4.0.12 + # via gitpython +gitpython==3.1.45 + # via 
mlflow-skinny +google-auth==2.40.3 + # via databricks-sdk +graphene==3.4.3 + # via mlflow +graphql-core==3.2.6 + # via + # graphene + # graphql-relay +graphql-relay==3.2.0 + # via graphene +greenlet==3.2.4 + # via sqlalchemy +gunicorn==23.0.0 + # via mlflow +h11==0.16.0 + # via uvicorn +h5py==3.14.0 + # via + # anndata + # scvi-tools +humanize==4.12.3 + # via orbax-checkpoint +idna==3.10 + # via + # anyio + # requests + # yarl +importlib-metadata==8.7.0 + # via + # mlflow-skinny + # opentelemetry-api +importlib-resources==6.5.2 + # via etils +itsdangerous==2.2.0 + # via flask +jax==0.4.33 + # via + # -r requirements.in + # chex + # flax + # numpyro + # optax + # orbax-checkpoint + # scvi-tools +jaxlib==0.4.33 + # via + # -r requirements.in + # chex + # jax + # numpyro + # optax + # orbax-checkpoint + # scvi-tools +jinja2==3.1.6 + # via + # flask + # torch +joblib==1.5.1 + # via scikit-learn +kiwisolver==1.4.9 + # via matplotlib +lightning==2.5.2 + # via scvi-tools +lightning-utilities==0.15.2 + # via + # lightning + # pytorch-lightning + # torchmetrics +mako==1.3.10 + # via alembic +markdown-it-py==4.0.0 + # via rich +markupsafe==3.0.2 + # via + # flask + # jinja2 + # mako + # werkzeug +matplotlib==3.10.5 + # via mlflow +mdurl==0.1.2 + # via markdown-it-py +ml-collections==1.1.0 + # via scvi-tools +ml-dtypes==0.5.3 + # via + # jax + # jaxlib + # tensorstore +mlflow==3.1.0 + # via -r requirements.in +mlflow-skinny==3.1.0 + # via mlflow +mpmath==1.3.0 + # via sympy +msgpack==1.1.1 + # via + # flax + # orbax-checkpoint +mudata==0.3.2 + # via scvi-tools +multidict==6.6.4 + # via + # aiohttp + # yarl +multipledispatch==1.0.0 + # via numpyro +natsort==8.4.0 + # via anndata +nest-asyncio==1.6.0 + # via orbax-checkpoint +networkx==3.5 + # via torch +numpy==1.26.4 + # via + # anndata + # chex + # contourpy + # flax + # h5py + # jax + # jaxlib + # matplotlib + # ml-dtypes + # mlflow + # numpyro + # optax + # orbax-checkpoint + # pandas + # pyro-ppl + # scikit-learn + # scipy + # scvi-tools + # tensorstore + # torchmetrics + # treescope +numpyro==0.19.0 + # via scvi-tools +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opentelemetry-api==1.36.0 + # via + # mlflow-skinny + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-sdk==1.36.0 + # via mlflow-skinny +opentelemetry-semantic-conventions==0.57b0 + # via opentelemetry-sdk +opt-einsum==3.4.0 + # via + # jax + # pyro-ppl +optax==0.2.5 + # via + # flax + # scvi-tools +orbax-checkpoint==0.6.4 + # via flax +packaging==25.0 + # via + # anndata + # gunicorn + # lightning + # lightning-utilities + # matplotlib + # mlflow-skinny + # pytorch-lightning + # torchmetrics +pandas==2.2.3 + # via + # -r requirements.in + # anndata + # mlflow + # scvi-tools +pillow==11.3.0 + # via matplotlib +propcache==0.3.2 + # via + # aiohttp + # yarl +protobuf==6.31.1 + # via + # mlflow-skinny + # orbax-checkpoint +pyarrow==20.0.0 + # 
via mlflow +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +pydantic==2.11.7 + # via + # fastapi + # mlflow-skinny +pydantic-core==2.33.2 + # via pydantic +pygments==2.19.2 + # via rich +pyparsing==3.2.3 + # via matplotlib +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.9.1 + # via scvi-tools +python-dateutil==2.9.0.post0 + # via + # graphene + # matplotlib + # pandas +pytorch-lightning==2.5.2 + # via lightning +pytz==2025.2 + # via pandas +pyyaml==6.0.2 + # via + # flax + # lightning + # ml-collections + # mlflow-skinny + # orbax-checkpoint + # pytorch-lightning +requests==2.32.4 + # via + # databricks-sdk + # docker + # mlflow-skinny +rich==14.1.0 + # via + # flax + # scvi-tools +rsa==4.9.1 + # via google-auth +scikit-learn==1.7.1 + # via + # mlflow + # scvi-tools +scipy==1.16.1 + # via + # anndata + # jax + # jaxlib + # mlflow + # scikit-learn + # scvi-tools +scvi-tools==1.1.6.post2 + # via -r requirements.in +setuptools==80.9.0 + # via lightning-utilities +six==1.17.0 + # via + # docrep + # python-dateutil +smmap==5.0.2 + # via gitdb +sniffio==1.3.1 + # via anyio +sqlalchemy==2.0.43 + # via + # alembic + # mlflow +sqlparse==0.5.3 + # via mlflow-skinny +starlette==0.47.2 + # via fastapi +sympy==1.13.1 + # via torch +tensorstore==0.1.76 + # via + # flax + # orbax-checkpoint +threadpoolctl==3.6.0 + # via scikit-learn +toolz==1.0.0 + # via chex +torch==2.5.1 + # via + # -r requirements.in + # lightning + # pyro-ppl + # pytorch-lightning + # scvi-tools + # torchmetrics +torchmetrics==1.8.1 + # via + # lightning + # pytorch-lightning + # scvi-tools +tqdm==4.67.1 + # via + # lightning + # numpyro + # pyro-ppl + # pytorch-lightning + # scvi-tools +treescope==0.1.10 + # via flax +triton==3.1.0 + # via torch +typing-extensions==4.14.1 + # via + # aiosignal + # alembic + # anyio + # chex + # etils + # fastapi + # flax + # graphene + # lightning + # lightning-utilities + # mlflow-skinny + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # orbax-checkpoint + # pydantic + # pydantic-core + # pytorch-lightning + # sqlalchemy + # starlette + # torch + # typing-inspection +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via pandas +urllib3==2.5.0 + # via + # docker + # requests +uvicorn==0.35.0 + # via mlflow-skinny +werkzeug==3.1.3 + # via flask +yarl==1.20.1 + # via aiohttp +zipp==3.23.0 + # via + # etils + # importlib-metadata diff --git a/src/methods/scvi_mlflow/script.py b/src/methods/scvi_mlflow/script.py new file mode 100644 index 00000000..04ff94d5 --- /dev/null +++ b/src/methods/scvi_mlflow/script.py @@ -0,0 +1,114 @@ +import os +import sys +import tarfile +import tempfile +import zipfile + +import anndata as ad +import mlflow.pyfunc +import pandas as pd + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
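+#
+# A minimal sketch of the MLflow pyfunc contract this script relies on.
+# mlflow.pyfunc.load_model() and Model.predict(..., params=...) are standard
+# MLflow APIs; the "input_uri" column and the params keys are conventions of
+# this particular wrapper (they mirror the calls further down in this script),
+# and the paths are hypothetical:
+#
+#   import mlflow.pyfunc
+#   import pandas as pd
+#
+#   model = mlflow.pyfunc.load_model("path/to/model")
+#   embedding = model.predict(
+#       pd.DataFrame({"input_uri": ["input.h5ad"]}),
+#       params={"organism": "human", "return_dist": True, "batch_keys": "batch"},
+#   )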
+par = { + "input": "resources_test/.../input.h5ad", + "output": "output.h5ad", + "model": "resources_test/.../model", +} +meta = {"name": "scvi_mlflow"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from exit_codes import exit_non_applicable +from read_anndata_partial import read_anndata + +print("====== scVI (MLflow model) ======", flush=True) + +print("\n>>> Reading input files...", flush=True) +print(f"Input H5AD file: '{par['input']}'", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] == "homo_sapiens": + organism = "human" +elif adata.uns["dataset_organism"] == "mus_musculus": + organism = "mouse" +else: + exit_non_applicable( + f"scVI (MLflow) can only be used with human or mouse data " + f'(dataset_organism == "{adata.uns["dataset_organism"]}")' + ) + +print(adata, flush=True) + +if os.path.isdir(par["model"]): + print("\n>>> Using model directory...", flush=True) + print(f"Directory path: '{par['model']}'", flush=True) + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("\n>>> Extracting model from .zip...", flush=True) + print(f".zip path: '{par['model']}'", flush=True) + with zipfile.ZipFile(par["model"], "r") as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): + print("\n>>> Extracting model from .tar.gz...", flush=True) + print(f".tar.gz path: '{par['model']}'", flush=True) + with tarfile.open(par["model"], "r:gz") as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError( + "The 'model' argument should be a directory a .zip file or a .tar.gz file" + ) + +print("\n>>> Loading model...", flush=True) +model = mlflow.pyfunc.load_model(model_dir) +print(model, flush=True) + +print("\n>>> Writing temporary input H5AD file...", flush=True) +input_adata = ad.AnnData(X=adata.X.copy()) +input_adata.var_names = adata.var["feature_id"].values +input_adata.obs["batch"] = adata.obs["batch"].values +print(input_adata, flush=True) + +h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) +print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) +input_adata.write(h5ad_file.name) +del input_adata + +print("\n>>> Running model...", flush=True) +input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) +input_params = {"organism": organism, "return_dist": True, "batch_keys": "batch"} +embedding = model.predict(input_df, params=input_params) + +print("\n>>> Storing output...", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": embedding, + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("\n>>> Writing output to file...", flush=True) +print(f"Output H5AD file: '{par['output']}'", flush=True) +output.write_h5ad(par["output"], compression="gzip") + +print("\n>>> Cleaning up temporary files...", flush=True) +if model_temp is not None: + model_temp.cleanup() +h5ad_file.close() +os.unlink(h5ad_file.name) + +print("\n>>> Done!", flush=True) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index d9fe9504..af6f51b7 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ 
b/src/workflows/run_benchmark/config.vsh.yaml @@ -106,6 +106,7 @@ dependencies: - name: methods/scimilarity - name: methods/scprint - name: methods/scvi + - name: methods/scvi_mlflow - name: methods/transcriptformer_mlflow - name: methods/uce # metrics diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 104485bd..2db049f6 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -40,6 +40,9 @@ methods = [ ), scprint, scvi, + scvi_mlflow.run( + args: [model: file("s3://openproblems-work/cache/scvi-mlflow-model.zip")] + ), transcriptformer_mlflow.run( args: [model: file("s3://openproblems-work/cache/transcriptformer-mlflow-model.zip")] ), From 0243f8d042082969c4a7571a95ee2b401d5059ce Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 13 Aug 2025 11:32:57 +0200 Subject: [PATCH 11/21] Add geneformer_mlflow method --- src/methods/geneformer_mlflow/config.vsh.yaml | 65 +++ .../geneformer_mlflow/requirements.txt | 540 ++++++++++++++++++ src/methods/geneformer_mlflow/script.py | 112 ++++ src/workflows/run_benchmark/config.vsh.yaml | 1 + src/workflows/run_benchmark/main.nf | 5 +- 5 files changed, 722 insertions(+), 1 deletion(-) create mode 100644 src/methods/geneformer_mlflow/config.vsh.yaml create mode 100644 src/methods/geneformer_mlflow/requirements.txt create mode 100644 src/methods/geneformer_mlflow/script.py diff --git a/src/methods/geneformer_mlflow/config.vsh.yaml b/src/methods/geneformer_mlflow/config.vsh.yaml new file mode 100644 index 00000000..b9d08eda --- /dev/null +++ b/src/methods/geneformer_mlflow/config.vsh.yaml @@ -0,0 +1,65 @@ +__merge__: ../../api/base_method.yaml + +name: geneformer_mlflow +label: Geneformer (MLflow model) +summary: Geneformer is a foundation transformer model pretrained on a large-scale corpus of single cell transcriptomes +description: | + Geneformer is a foundation transformer model pretrained on a large-scale + corpus of single cell transcriptomes to enable context-aware predictions in + network biology. For this task, Geneformer is used to create a batch-corrected + cell embedding. + + Here, we use a version packaged as an MLflow model. +references: + doi: + - 10.1038/s41586-023-06139-9 + - 10.1101/2024.08.16.608180 +links: + documentation: https://geneformer.readthedocs.io/en/latest/index.html + repository: https://huggingface.co/ctheodoris/Geneformer + +info: + method_types: [embedding] + preferred_normalization: counts + +arguments: + - name: --model + type: file + description: | + An MLflow model URI for the Geneformer model. If it is a .zip or + .tar.gz file it will be extracted to a temporary directory. 
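+    # For example, the benchmark workflow passes a cached archive for this
+    # argument: s3://openproblems-work/cache/geneformer-mlflow-model.zip
+    # (see src/workflows/run_benchmark/main.nf).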
+ required: true + +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + - path: /src/utils/exit_codes.py + - path: requirements.txt + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1 + setup: + - type: docker + add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh + run: sh /uv-installer.sh && rm /uv-installer.sh + env: PATH="/root/.local/bin/:$PATH" + - type: docker + run: uv venv --python 3.11 /opt/venv + - type: docker + env: + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + add: requirements.txt /requirements.txt + run: uv pip install -r /requirements.txt + - type: docker + run: uv pip install mlflow==3.1.0 + - type: docker + run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, highmem, midcpu, gpu] diff --git a/src/methods/geneformer_mlflow/requirements.txt b/src/methods/geneformer_mlflow/requirements.txt new file mode 100644 index 00000000..21bec26b --- /dev/null +++ b/src/methods/geneformer_mlflow/requirements.txt @@ -0,0 +1,540 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --output-file=/tmp/tmpmz65ifid/requirements_pip_final.txt requirements.in +# +absl-py==2.3.1 + # via tensorboard +accelerate==1.10.0 + # via peft +accumulation-tree==0.6.4 + # via tdigest +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.15 + # via fsspec +aiosignal==1.4.0 + # via aiohttp +alembic==1.16.4 + # via + # mlflow + # optuna +anndata==0.10.9 + # via + # -r requirements.in + # geneformer + # scanpy +annotated-types==0.7.0 + # via pydantic +antlr4-python3-runtime==4.9.3 + # via omegaconf +anyio==4.10.0 + # via starlette +array-api-compat==1.12.0 + # via anndata +attrs==25.3.0 + # via + # aiohttp + # jsonschema + # referencing +blinker==1.9.0 + # via flask +cachetools==5.5.2 + # via + # google-auth + # mlflow-skinny +certifi==2025.8.3 + # via requests +charset-normalizer==3.4.3 + # via requests +click==8.2.1 + # via + # flask + # loompy + # mlflow-skinny + # ray + # uvicorn +cloudpickle==3.1.1 + # via mlflow-skinny +colorlog==6.9.0 + # via optuna +contourpy==1.3.3 + # via matplotlib +cycler==0.12.1 + # via matplotlib +databricks-sdk==0.62.0 + # via mlflow-skinny +datasets==4.0.0 + # via geneformer +dill==0.3.8 + # via + # datasets + # multiprocess +docker==7.1.0 + # via mlflow +fastapi==0.116.1 + # via mlflow-skinny +filelock==3.18.0 + # via + # datasets + # huggingface-hub + # ray + # torch + # transformers +flask==3.1.1 + # via mlflow +fonttools==4.59.0 + # via matplotlib +frozenlist==1.7.0 + # via + # aiohttp + # aiosignal +fsspec[http]==2025.3.0 + # via + # datasets + # huggingface-hub + # torch +geneformer @ git+https://huggingface.co/ctheodoris/Geneformer@69e6887 + # via -r requirements.in +gitdb==4.0.12 + # via gitpython +gitpython==3.1.45 + # via mlflow-skinny +google-auth==2.40.3 + # via databricks-sdk +graphene==3.4.3 + # via mlflow +graphql-core==3.2.6 + # via + # graphene + # graphql-relay +graphql-relay==3.2.0 + # via graphene +greenlet==3.2.4 + # via sqlalchemy +grpcio==1.74.0 + # via tensorboard +gunicorn==23.0.0 + # via mlflow +h11==0.16.0 + # via uvicorn +h5py==3.14.0 + # via + # anndata + # loompy + # scanpy +hf-xet==1.1.7 + # via huggingface-hub +huggingface-hub==0.34.4 + # via + # accelerate + # datasets + # peft + # tokenizers + # transformers +idna==3.10 + # via + # anyio + # requests + # 
yarl +importlib-metadata==8.7.0 + # via + # mlflow-skinny + # opentelemetry-api +itsdangerous==2.2.0 + # via flask +jinja2==3.1.6 + # via + # flask + # torch +joblib==1.5.1 + # via + # pynndescent + # scanpy + # scikit-learn +jsonschema==4.25.0 + # via ray +jsonschema-specifications==2025.4.1 + # via jsonschema +kiwisolver==1.4.9 + # via matplotlib +legacy-api-wrap==1.4.1 + # via scanpy +llvmlite==0.44.0 + # via + # numba + # pynndescent +loompy==3.0.8 + # via geneformer +mako==1.3.10 + # via alembic +markdown==3.8.2 + # via tensorboard +markupsafe==3.0.2 + # via + # flask + # jinja2 + # mako + # werkzeug +matplotlib==3.10.5 + # via + # geneformer + # mlflow + # scanpy + # seaborn +mlflow==3.1.0 + # via -r requirements.in +mlflow-skinny==3.1.0 + # via mlflow +mpmath==1.3.0 + # via sympy +msgpack==1.1.1 + # via ray +multidict==6.6.4 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via datasets +natsort==8.4.0 + # via + # anndata + # scanpy +networkx==3.5 + # via + # scanpy + # torch +numba==0.61.2 + # via + # loompy + # pynndescent + # scanpy + # umap-learn +numpy==2.2.6 + # via + # accelerate + # anndata + # contourpy + # datasets + # geneformer + # h5py + # loompy + # matplotlib + # mlflow + # numba + # numpy-groupies + # optuna + # pandas + # patsy + # peft + # scanpy + # scikit-learn + # scipy + # seaborn + # statsmodels + # tensorboard + # transformers + # umap-learn +numpy-groupies==0.11.3 + # via loompy +nvidia-cublas-cu12==12.8.4.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.8.90 + # via torch +nvidia-cuda-nvrtc-cu12==12.8.93 + # via torch +nvidia-cuda-runtime-cu12==12.8.90 + # via torch +nvidia-cudnn-cu12==9.10.2.21 + # via torch +nvidia-cufft-cu12==11.3.3.83 + # via torch +nvidia-cufile-cu12==1.13.1.3 + # via torch +nvidia-curand-cu12==10.3.9.90 + # via torch +nvidia-cusolver-cu12==11.7.3.90 + # via torch +nvidia-cusparse-cu12==12.5.8.93 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.7.1 + # via torch +nvidia-nccl-cu12==2.27.3 + # via torch +nvidia-nvjitlink-cu12==12.8.93 + # via + # nvidia-cufft-cu12 + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.8.90 + # via torch +omegaconf==2.3.0 + # via -r requirements.in +opentelemetry-api==1.36.0 + # via + # mlflow-skinny + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-sdk==1.36.0 + # via mlflow-skinny +opentelemetry-semantic-conventions==0.57b0 + # via opentelemetry-sdk +optuna==4.4.0 + # via + # geneformer + # optuna-integration +optuna-integration==4.4.0 + # via geneformer +packaging==25.0 + # via + # accelerate + # anndata + # datasets + # geneformer + # gunicorn + # huggingface-hub + # matplotlib + # mlflow-skinny + # optuna + # peft + # ray + # scanpy + # statsmodels + # tensorboard + # transformers +pandas==2.3.1 + # via + # anndata + # datasets + # geneformer + # mlflow + # scanpy + # seaborn + # statsmodels +patsy==1.0.1 + # via + # scanpy + # statsmodels +peft==0.17.0 + # via geneformer +pillow==11.3.0 + # via + # matplotlib + # tensorboard +propcache==0.3.2 + # via + # aiohttp + # yarl +protobuf==6.31.1 + # via + # mlflow-skinny + # ray + # tensorboard +psutil==7.0.0 + # via + # accelerate + # peft +pyarrow==20.0.0 + # via + # datasets + # geneformer + # mlflow +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +pydantic==2.11.7 + # via + # fastapi + # mlflow-skinny +pydantic-core==2.33.2 + # via pydantic +pynndescent==0.5.13 + # via + # scanpy + # 
umap-learn +pyparsing==3.2.3 + # via matplotlib +python-dateutil==2.9.0.post0 + # via + # graphene + # matplotlib + # pandas +pytz==2025.2 + # via + # geneformer + # pandas +pyudorandom==1.0.0 + # via tdigest +pyyaml==6.0.2 + # via + # accelerate + # datasets + # huggingface-hub + # mlflow-skinny + # omegaconf + # optuna + # peft + # ray + # transformers +ray==2.48.0 + # via geneformer +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications +regex==2025.7.34 + # via transformers +requests==2.32.4 + # via + # databricks-sdk + # datasets + # docker + # huggingface-hub + # mlflow-skinny + # ray + # transformers +rpds-py==0.27.0 + # via + # jsonschema + # referencing +rsa==4.9.1 + # via google-auth +safetensors==0.6.2 + # via + # accelerate + # peft + # transformers +scanpy==1.11.4 + # via geneformer +scikit-learn==1.7.1 + # via + # geneformer + # mlflow + # pynndescent + # scanpy + # umap-learn +scipy==1.16.1 + # via + # anndata + # geneformer + # loompy + # mlflow + # pynndescent + # scanpy + # scikit-learn + # statsmodels + # umap-learn +seaborn==0.13.2 + # via + # geneformer + # scanpy +session-info2==0.2 + # via scanpy +six==1.17.0 + # via python-dateutil +smmap==5.0.2 + # via gitdb +sniffio==1.3.1 + # via anyio +sqlalchemy==2.0.43 + # via + # alembic + # mlflow + # optuna +sqlparse==0.5.3 + # via mlflow-skinny +starlette==0.47.2 + # via fastapi +statsmodels==0.14.5 + # via + # geneformer + # scanpy +sympy==1.14.0 + # via torch +tdigest==0.5.2.2 + # via geneformer +tensorboard==2.20.0 + # via geneformer +tensorboard-data-server==0.7.2 + # via tensorboard +threadpoolctl==3.6.0 + # via scikit-learn +tokenizers==0.21.4 + # via transformers +torch==2.8.0 + # via + # accelerate + # geneformer + # peft +tqdm==4.67.1 + # via + # datasets + # geneformer + # huggingface-hub + # optuna + # peft + # scanpy + # transformers + # umap-learn +transformers==4.49.0 + # via + # -r requirements.in + # geneformer + # peft +triton==3.4.0 + # via torch +typing-extensions==4.14.1 + # via + # aiosignal + # alembic + # anyio + # fastapi + # graphene + # huggingface-hub + # mlflow-skinny + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # referencing + # scanpy + # sqlalchemy + # starlette + # torch + # typing-inspection +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via pandas +umap-learn==0.5.9.post2 + # via scanpy +urllib3==2.5.0 + # via + # docker + # requests +uvicorn==0.35.0 + # via mlflow-skinny +werkzeug==3.1.3 + # via + # flask + # tensorboard +xxhash==3.5.0 + # via datasets +yarl==1.20.1 + # via aiohttp +zipp==3.23.0 + # via importlib-metadata + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/src/methods/geneformer_mlflow/script.py b/src/methods/geneformer_mlflow/script.py new file mode 100644 index 00000000..800ab80b --- /dev/null +++ b/src/methods/geneformer_mlflow/script.py @@ -0,0 +1,112 @@ +import os +import sys +import tarfile +import tempfile +import zipfile + +import anndata as ad +import mlflow.pyfunc +import pandas as pd + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
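+#
+# A hedged sketch of sanity-checking the embedding this component writes
+# ("output.h5ad" is a hypothetical path; anndata is already a dependency):
+#
+#   import anndata as ad
+#
+#   out = ad.read_h5ad("output.h5ad")
+#   assert out.obsm["X_emb"].shape[0] == out.n_obs  # one embedding row per cell
+#   assert out.uns["method_id"] == "geneformer_mlflow"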
+par = {
+    "input": "resources_test/.../input.h5ad",
+    "output": "output.h5ad",
+    "model": "resources_test/.../model",
+}
+meta = {"name": "geneformer_mlflow"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from exit_codes import exit_non_applicable
+from read_anndata_partial import read_anndata
+
+print("====== Geneformer (MLflow model) ======", flush=True)
+
+print("\n>>> Reading input files...", flush=True)
+print(f"Input H5AD file: '{par['input']}'", flush=True)
+adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns")
+
+if adata.uns["dataset_organism"] != "homo_sapiens":
+    exit_non_applicable(
+        f"Geneformer (MLflow) can only be used with human data "
+        f'(dataset_organism == "{adata.uns["dataset_organism"]}")'
+    )
+
+print(adata, flush=True)
+
+if os.path.isdir(par["model"]):
+    print("\n>>> Using model directory...", flush=True)
+    print(f"Directory path: '{par['model']}'", flush=True)
+    model_temp = None
+    model_dir = par["model"]
+else:
+    model_temp = tempfile.TemporaryDirectory()
+    model_dir = model_temp.name
+
+    if zipfile.is_zipfile(par["model"]):
+        print("\n>>> Extracting model from .zip...", flush=True)
+        print(f".zip path: '{par['model']}'", flush=True)
+        with zipfile.ZipFile(par["model"], "r") as zip_file:
+            zip_file.extractall(model_dir)
+    elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"):
+        print("\n>>> Extracting model from .tar.gz...", flush=True)
+        print(f".tar.gz path: '{par['model']}'", flush=True)
+        with tarfile.open(par["model"], "r:gz") as tar_file:
+            tar_file.extractall(model_dir)
+        model_dir = os.path.join(model_dir, os.listdir(model_dir)[0])
+    else:
+        raise ValueError(
+            "The 'model' argument should be a directory, a .zip file, or a .tar.gz file"
+        )
+
+print("\n>>> Loading model...", flush=True)
+model = mlflow.pyfunc.load_model(model_dir)
+print(model, flush=True)
+
+print("\n>>> Writing temporary input H5AD file...", flush=True)
+input_adata = ad.AnnData(
+    X=adata.X.copy(),
+    var=adata.var.filter(items=["feature_id"]).rename(
+        columns={"feature_id": "ensembl_id"}
+    ),
+)
+print(input_adata, flush=True)
+
+h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
+print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
+input_adata.write(h5ad_file.name)
+del input_adata
+
+print("\n>>> Running model...", flush=True)
+input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
+embedding = model.predict(input_df)
+
+print("\n>>> Storing output...", flush=True)
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    obsm={
+        "X_emb": embedding,
+    },
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "normalization_id": adata.uns["normalization_id"],
+        "method_id": meta["name"],
+    },
+)
+print(output)
+
+print("\n>>> Writing output to file...", flush=True)
+print(f"Output H5AD file: '{par['output']}'", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
+
+print("\n>>> Cleaning up temporary files...", flush=True)
+if model_temp is not None:
+    model_temp.cleanup()
+h5ad_file.close()
+os.unlink(h5ad_file.name)
+
+print("\n>>> Done!", flush=True)
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index af6f51b7..34db6276 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -93,6 +93,7 @@ dependencies:
   - name: methods/bbknn
   - name: methods/combat
   - name: methods/geneformer
+  - name: methods/geneformer_mlflow
   - name: methods/harmony
   - name: methods/harmonypy
   - name: methods/liger
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index 2db049f6..44ff5ed5 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -21,6 +21,9 @@ methods = [
   bbknn,
   combat,
   geneformer,
+  geneformer_mlflow.run(
+    args: [model: file("s3://openproblems-work/cache/geneformer-mlflow-model.zip")]
+  ),
   harmony,
   harmonypy,
   liger,
@@ -61,7 +64,7 @@ metrics = [
   hvg_overlap,
   isolated_label_asw,
   isolated_label_f1,
-  kbet,
+  // kbet,
   kbet_pg,
   kbet_pg_label,
   lisi,

From 04d8a5da3cee43307223ab88b03aa36f27efb497 Mon Sep 17 00:00:00 2001
From: Luke Zappia
Date: Wed, 13 Aug 2025 12:37:20 +0200
Subject: [PATCH 12/21] Add scgpt_mlflow method

---
 src/methods/scgpt_mlflow/config.vsh.yaml    |  62 ++
 src/methods/scgpt_mlflow/requirements.txt   | 684 ++++++++++++++++++++
 src/methods/scgpt_mlflow/script.py          | 111 ++++
 src/workflows/run_benchmark/config.vsh.yaml |   1 +
 src/workflows/run_benchmark/main.nf         |   3 +
 5 files changed, 861 insertions(+)
 create mode 100644 src/methods/scgpt_mlflow/config.vsh.yaml
 create mode 100644 src/methods/scgpt_mlflow/requirements.txt
 create mode 100644 src/methods/scgpt_mlflow/script.py

diff --git a/src/methods/scgpt_mlflow/config.vsh.yaml b/src/methods/scgpt_mlflow/config.vsh.yaml
new file mode 100644
index 00000000..b8455165
--- /dev/null
+++ b/src/methods/scgpt_mlflow/config.vsh.yaml
@@ -0,0 +1,62 @@
+__merge__: ../../api/base_method.yaml
+
+name: scgpt_mlflow
+label: scGPT (MLflow model)
+summary: A foundation model for single-cell biology
+description: |
+  scGPT is a foundation model for single-cell biology based on a generative
+  pre-trained transformer and trained on a repository of over 33 million cells.
+
+  Here, we use a version packaged as an MLflow model.
+references:
+  doi:
+    - 10.1038/s41592-024-02201-0
+links:
+  documentation: https://scgpt.readthedocs.io/en/latest/
+  repository: https://github.com/bowang-lab/scGPT
+
+info:
+  method_types: [embedding]
+  preferred_normalization: counts
+
+arguments:
+  - name: --model
+    type: file
+    description: |
+      An MLflow model URI for the scGPT model. If it is a .zip or
+      .tar.gz file, it will be extracted to a temporary directory.
+ required: true + +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + - path: /src/utils/exit_codes.py + - path: requirements.txt + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1 + setup: + - type: docker + add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh + run: sh /uv-installer.sh && rm /uv-installer.sh + env: PATH="/root/.local/bin/:$PATH" + - type: docker + run: uv venv --python 3.11 /opt/venv + - type: docker + env: + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + add: requirements.txt /requirements.txt + run: uv pip install -r /requirements.txt + - type: docker + run: uv pip install mlflow==3.1.0 + - type: docker + run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, highmem, midcpu, gpu] diff --git a/src/methods/scgpt_mlflow/requirements.txt b/src/methods/scgpt_mlflow/requirements.txt new file mode 100644 index 00000000..2ad53dc3 --- /dev/null +++ b/src/methods/scgpt_mlflow/requirements.txt @@ -0,0 +1,684 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o /tmp/tmp7yfkiop2/requirements_initial.txt +absl-py==2.3.1 + # via + # chex + # ml-collections + # optax + # orbax + # orbax-checkpoint +aiofiles==24.1.0 + # via orbax-checkpoint +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.15 + # via + # datasets + # fsspec +aiosignal==1.4.0 + # via aiohttp +alembic==1.16.4 + # via mlflow +anndata==0.10.9 + # via + # -r requirements.in + # mudata + # scanpy + # scib + # scvi-tools +annotated-types==0.7.0 + # via pydantic +anyio==4.10.0 + # via starlette +array-api-compat==1.12.0 + # via anndata +asttokens==3.0.0 + # via stack-data +async-timeout==5.0.1 + # via aiohttp +attrs==25.3.0 + # via aiohttp +blinker==1.9.0 + # via flask +cached-property==2.0.1 + # via orbax +cachetools==5.5.2 + # via + # google-auth + # mlflow-skinny +cell-gears==0.0.2 + # via scgpt +certifi==2025.8.3 + # via requests +charset-normalizer==3.4.3 + # via requests +chex==0.1.90 + # via + # optax + # scvi-tools +click==8.2.1 + # via + # flask + # mlflow-skinny + # uvicorn +cloudpickle==3.1.1 + # via mlflow-skinny +contourpy==1.3.2 + # via matplotlib +cycler==0.12.1 + # via matplotlib +databricks-sdk==0.62.0 + # via mlflow-skinny +datasets==2.14.4 + # via scgpt +dcor==0.6 + # via cell-gears +decorator==5.2.1 + # via ipython +deprecated==1.2.18 + # via scib +dill==0.3.7 + # via + # datasets + # multiprocess +docker==7.1.0 + # via mlflow +docrep==0.3.2 + # via scvi-tools +et-xmlfile==2.0.0 + # via openpyxl +etils==1.13.0 + # via + # orbax + # orbax-checkpoint +exceptiongroup==1.3.0 + # via + # anndata + # anyio + # ipython +executing==2.2.0 + # via stack-data +fastapi==0.116.1 + # via mlflow-skinny +filelock==3.18.0 + # via + # huggingface-hub + # torch + # triton +flask==3.1.1 + # via mlflow +flax==0.10.7 + # via scvi-tools +fonttools==4.59.0 + # via matplotlib +frozenlist==1.7.0 + # via + # aiohttp + # aiosignal +fsspec==2025.7.0 + # via + # datasets + # etils + # huggingface-hub + # pytorch-lightning + # torch +gitdb==4.0.12 + # via gitpython +gitpython==3.1.45 + # via mlflow-skinny +google-auth==2.40.3 + # via databricks-sdk +graphene==3.4.3 + # via mlflow +graphql-core==3.2.6 + # via + # graphene + # graphql-relay +graphql-relay==3.2.0 + # via graphene +greenlet==3.2.4 + # via sqlalchemy +gunicorn==23.0.0 + # via mlflow +h11==0.16.0 
+ # via uvicorn +h5py==3.14.0 + # via + # anndata + # scanpy + # scib + # scvi-tools +hf-xet==1.1.7 + # via huggingface-hub +huggingface-hub==0.34.4 + # via datasets +humanize==4.12.3 + # via orbax-checkpoint +idna==3.10 + # via + # anyio + # requests + # yarl +igraph==0.11.9 + # via + # leidenalg + # scib +importlib-metadata==8.7.0 + # via + # mlflow-skinny + # opentelemetry-api +importlib-resources==6.5.2 + # via + # etils + # orbax +ipython==8.27.0 + # via -r requirements.in +itsdangerous==2.2.0 + # via flask +jax==0.6.2 + # via + # chex + # flax + # numpyro + # optax + # orbax + # orbax-checkpoint + # scvi-tools +jaxlib==0.6.2 + # via + # chex + # jax + # numpyro + # optax + # orbax + # scvi-tools +jedi==0.19.2 + # via ipython +jinja2==3.1.6 + # via + # flask + # torch +joblib==1.5.1 + # via + # dcor + # pynndescent + # scanpy + # scikit-learn +kiwisolver==1.4.9 + # via matplotlib +legacy-api-wrap==1.4.1 + # via scanpy +leidenalg==0.10.2 + # via + # scgpt + # scib +lightning-utilities==0.15.2 + # via + # pytorch-lightning + # torchmetrics +llvmlite==0.44.0 + # via + # numba + # pynndescent + # scib +mako==1.3.10 + # via alembic +markdown-it-py==4.0.0 + # via rich +markupsafe==3.0.2 + # via + # flask + # jinja2 + # mako + # werkzeug +matplotlib==3.10.5 + # via + # mlflow + # scanpy + # scib + # seaborn +matplotlib-inline==0.1.7 + # via ipython +mdurl==0.1.2 + # via markdown-it-py +ml-collections==1.1.0 + # via scvi-tools +ml-dtypes==0.5.3 + # via + # jax + # jaxlib + # tensorstore +mlflow==3.1.0 + # via -r requirements.in +mlflow-skinny==3.1.0 + # via mlflow +mpmath==1.3.0 + # via sympy +msgpack==1.1.1 + # via + # flax + # orbax + # orbax-checkpoint +mudata==0.3.2 + # via scvi-tools +multidict==6.6.4 + # via + # aiohttp + # yarl +multipledispatch==1.0.0 + # via numpyro +multiprocess==0.70.15 + # via datasets +natsort==8.4.0 + # via + # anndata + # scanpy +nest-asyncio==1.6.0 + # via + # orbax + # orbax-checkpoint +networkx==3.4.2 + # via + # cell-gears + # scanpy + # torch +numba==0.61.2 + # via + # dcor + # pynndescent + # scanpy + # scgpt + # scib + # umap-learn +numpy==1.26.4 + # via + # anndata + # cell-gears + # chex + # contourpy + # datasets + # dcor + # h5py + # jax + # jaxlib + # matplotlib + # ml-dtypes + # mlflow + # numba + # numpyro + # optax + # orbax + # orbax-checkpoint + # pandas + # patsy + # pyro-ppl + # pytorch-lightning + # scanpy + # scib + # scikit-learn + # scikit-misc + # scipy + # scvi-tools + # seaborn + # statsmodels + # tensorstore + # torchmetrics + # torchtext + # treescope + # umap-learn +numpyro==0.19.0 + # via scvi-tools +nvidia-cublas-cu12==12.1.3.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==8.9.2.26 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.18.1 + # via torch +nvidia-nvjitlink-cu12==12.9.86 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch +openpyxl==3.1.5 + # via scvi-tools +opentelemetry-api==1.36.0 + # via + # mlflow-skinny + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-sdk==1.36.0 + # via mlflow-skinny +opentelemetry-semantic-conventions==0.57b0 + # via opentelemetry-sdk 
+opt-einsum==3.4.0 + # via + # jax + # pyro-ppl +optax==0.2.5 + # via + # flax + # scvi-tools +orbax==0.1.7 + # via scgpt +orbax-checkpoint==0.11.21 + # via flax +packaging==25.0 + # via + # anndata + # datasets + # gunicorn + # huggingface-hub + # lightning-utilities + # matplotlib + # mlflow-skinny + # pytorch-lightning + # scanpy + # statsmodels + # torchmetrics +pandas==2.3.1 + # via + # anndata + # cell-gears + # datasets + # mlflow + # scanpy + # scgpt + # scib + # scvi-tools + # seaborn + # statsmodels +parso==0.8.4 + # via jedi +patsy==1.0.1 + # via + # scanpy + # statsmodels +pexpect==4.9.0 + # via ipython +pillow==11.3.0 + # via matplotlib +prompt-toolkit==3.0.51 + # via ipython +propcache==0.3.2 + # via + # aiohttp + # yarl +protobuf==6.31.1 + # via + # mlflow-skinny + # orbax-checkpoint +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.3 + # via stack-data +pyarrow==20.0.0 + # via + # datasets + # mlflow +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +pydantic==2.11.7 + # via + # fastapi + # mlflow-skinny +pydantic-core==2.33.2 + # via pydantic +pydot==4.0.1 + # via scib +pygments==2.19.2 + # via + # ipython + # rich +pynndescent==0.5.13 + # via + # scanpy + # umap-learn +pyparsing==3.2.3 + # via + # matplotlib + # pydot +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.9.1 + # via scvi-tools +python-dateutil==2.9.0.post0 + # via + # graphene + # matplotlib + # pandas +pytorch-lightning==1.9.5 + # via scvi-tools +pytz==2025.2 + # via pandas +pyyaml==6.0.2 + # via + # datasets + # flax + # huggingface-hub + # ml-collections + # mlflow-skinny + # orbax + # orbax-checkpoint + # pytorch-lightning +requests==2.32.4 + # via + # databricks-sdk + # datasets + # docker + # huggingface-hub + # mlflow-skinny + # torchdata + # torchtext +rich==14.1.0 + # via + # flax + # scvi-tools +rsa==4.9.1 + # via google-auth +scanpy==1.11.4 + # via + # cell-gears + # scgpt + # scib +scgpt==0.2.1 + # via -r requirements.in +scib==1.1.7 + # via scgpt +scikit-learn==1.7.1 + # via + # cell-gears + # mlflow + # pynndescent + # scanpy + # scib + # scvi-tools + # umap-learn +scikit-misc==0.5.1 + # via + # scgpt + # scib +scipy==1.12.0 + # via + # -r requirements.in + # anndata + # dcor + # jax + # jaxlib + # mlflow + # pynndescent + # scanpy + # scib + # scikit-learn + # scvi-tools + # statsmodels + # umap-learn +scvi-tools==0.20.3 + # via scgpt +seaborn==0.13.2 + # via + # scanpy + # scib +session-info2==0.2 + # via scanpy +setuptools==80.9.0 + # via lightning-utilities +simplejson==3.20.1 + # via orbax-checkpoint +six==1.17.0 + # via + # docrep + # python-dateutil +smmap==5.0.2 + # via gitdb +sniffio==1.3.1 + # via anyio +sqlalchemy==2.0.43 + # via + # alembic + # mlflow +sqlparse==0.5.3 + # via mlflow-skinny +stack-data==0.6.3 + # via ipython +starlette==0.47.2 + # via fastapi +statsmodels==0.14.5 + # via scanpy +sympy==1.14.0 + # via torch +tensorstore==0.1.76 + # via + # flax + # orbax + # orbax-checkpoint +texttable==1.7.0 + # via igraph +threadpoolctl==3.6.0 + # via scikit-learn +tomli==2.2.1 + # via alembic +toolz==1.0.0 + # via chex +torch==2.1.2 + # via + # cell-gears + # pyro-ppl + # pytorch-lightning + # scgpt + # scvi-tools + # torchdata + # torchmetrics + # torchtext +torchdata==0.7.1 + # via torchtext +torchmetrics==1.8.1 + # via + # pytorch-lightning + # scvi-tools +torchtext==0.16.2 + # via scgpt +tqdm==4.67.1 + # via + # cell-gears + # datasets + # huggingface-hub + # numpyro + # pyro-ppl + # pytorch-lightning + # scanpy + # scvi-tools + # 
torchtext + # umap-learn +traitlets==5.14.3 + # via + # ipython + # matplotlib-inline +treescope==0.1.10 + # via flax +triton==2.1.0 + # via torch +typing-extensions==4.14.1 + # via + # aiosignal + # alembic + # anyio + # chex + # etils + # exceptiongroup + # fastapi + # flax + # graphene + # huggingface-hub + # ipython + # lightning-utilities + # mlflow-skinny + # multidict + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # orbax + # orbax-checkpoint + # pydantic + # pydantic-core + # pytorch-lightning + # scanpy + # scgpt + # sqlalchemy + # starlette + # torch + # typing-inspection + # uvicorn +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via pandas +umap-learn==0.5.9.post2 + # via + # scanpy + # scgpt + # scib +urllib3==2.5.0 + # via + # docker + # requests + # torchdata +uvicorn==0.35.0 + # via mlflow-skinny +wcwidth==0.2.13 + # via prompt-toolkit +werkzeug==3.1.3 + # via flask +wrapt==1.17.3 + # via deprecated +xxhash==3.5.0 + # via datasets +yarl==1.20.1 + # via aiohttp +zipp==3.23.0 + # via + # etils + # importlib-metadata diff --git a/src/methods/scgpt_mlflow/script.py b/src/methods/scgpt_mlflow/script.py new file mode 100644 index 00000000..7c70c6a0 --- /dev/null +++ b/src/methods/scgpt_mlflow/script.py @@ -0,0 +1,111 @@ +import os +import sys +import tarfile +import tempfile +import zipfile + +import anndata as ad +import mlflow.pyfunc +import pandas as pd + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + "input": "resources_test/.../input.h5ad", + "output": "output.h5ad", + "model": "resources_test/.../model", +} +meta = {"name": "scGPT_mlflow"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from exit_codes import exit_non_applicable +from read_anndata_partial import read_anndata + +print("====== scGPT (MLflow model) ======", flush=True) + +print("\n>>> Reading input files...", flush=True) +print(f"Input H5AD file: '{par['input']}'", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + exit_non_applicable( + f"scGPT (MLflow) can only be used with human data " + f'(dataset_organism == "{adata.uns["dataset_organism"]}")' + ) + +print(adata, flush=True) + +if os.path.isdir(par["model"]): + print("\n>>> Using model directory...", flush=True) + print(f"Directory path: '{par['model']}'", flush=True) + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("\n>>> Extracting model from .zip...", flush=True) + print(f".zip path: '{par['model']}'", flush=True) + with zipfile.ZipFile(par["model"], "r") as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): + print("\n>>> Extracting model from .tar.gz...", flush=True) + print(f".tar.gz path: '{par['model']}'", flush=True) + with tarfile.open(par["model"], "r:gz") as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError( + "The 'model' argument should be a directory a .zip file or a .tar.gz file" + ) + +print("\n>>> Loading model...", flush=True) +model = mlflow.pyfunc.load_model(model_dir) +print(model, flush=True) + +print("\n>>> Writing temporary input H5AD file...", 
flush=True)
+input_adata = ad.AnnData(
+    X=adata.X.copy(),
+    var=adata.var.filter(items=["feature_name"]),
+)
+print(input_adata, flush=True)
+
+h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
+print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
+input_adata.write(h5ad_file.name)
+del input_adata
+
+print("\n>>> Running model...", flush=True)
+input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
+input_params = {"gene_col": "feature_name"}
+embedding = model.predict(input_df, params=input_params)
+
+print("\n>>> Storing output...", flush=True)
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    obsm={
+        "X_emb": embedding,
+    },
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "normalization_id": adata.uns["normalization_id"],
+        "method_id": meta["name"],
+    },
+)
+print(output)
+
+print("\n>>> Writing output to file...", flush=True)
+print(f"Output H5AD file: '{par['output']}'", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
+
+print("\n>>> Cleaning up temporary files...", flush=True)
+if model_temp is not None:
+    model_temp.cleanup()
+h5ad_file.close()
+os.unlink(h5ad_file.name)
+
+print("\n>>> Done!", flush=True)
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 34db6276..f5da4fc0 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -103,6 +103,7 @@ dependencies:
   - name: methods/scanorama
   - name: methods/scanvi
   - name: methods/scgpt_finetuned
+  - name: methods/scgpt_mlflow
   - name: methods/scgpt_zeroshot
   - name: methods/scimilarity
   - name: methods/scprint
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index 44ff5ed5..0e2f656e 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -35,6 +35,9 @@ methods = [
   scgpt_finetuned.run(
     args: [model: file("s3://openproblems-work/cache/scGPT_human.zip")]
   ),
+  scgpt_mlflow.run(
+    args: [model: file("s3://openproblems-work/cache/scgpt-mlflow-model.zip")]
+  ),
   scgpt_zeroshot.run(
     args: [model: file("s3://openproblems-work/cache/scGPT_human.zip")]
   ),

From 5c46c4c6c272f503002e9bbd8508d5fd6582df1e Mon Sep 17 00:00:00 2001
From: Luke Zappia
Date: Wed, 13 Aug 2025 13:41:46 +0200
Subject: [PATCH 13/21] Add uce_mlflow method

---
 src/methods/uce_mlflow/config.vsh.yaml      |  63 ++++
 src/methods/uce_mlflow/requirements.txt     | 366 ++++++++++++++++++++
 src/methods/uce_mlflow/script.py            | 110 ++++++
 src/workflows/run_benchmark/config.vsh.yaml |   1 +
 src/workflows/run_benchmark/main.nf         |   3 +
 5 files changed, 543 insertions(+)
 create mode 100644 src/methods/uce_mlflow/config.vsh.yaml
 create mode 100644 src/methods/uce_mlflow/requirements.txt
 create mode 100644 src/methods/uce_mlflow/script.py

diff --git a/src/methods/uce_mlflow/config.vsh.yaml b/src/methods/uce_mlflow/config.vsh.yaml
new file mode 100644
index 00000000..354cbd63
--- /dev/null
+++ b/src/methods/uce_mlflow/config.vsh.yaml
@@ -0,0 +1,63 @@
+__merge__: ../../api/base_method.yaml
+
+name: uce_mlflow
+label: UCE (MLflow model)
+summary: UCE offers a unified biological latent space that can represent any cell
+description: |
+  Universal Cell Embedding (UCE) is a single-cell foundation model that offers a
+  unified biological latent space that can represent any cell, regardless of
+  tissue or species.
+
+  Here, we use a version packaged as an MLflow model.
+references: + doi: + - 10.1101/2023.11.28.568918 +links: + documentation: https://github.com/snap-stanford/UCE/blob/main/README.md + repository: https://github.com/snap-stanford/UCE + +info: + method_types: [embedding] + preferred_normalization: counts + +arguments: + - name: --model + type: file + description: | + An MLflow model URI for the UCE model. If it is a .zip or + .tar.gz file it will be extracted to a temporary directory. + required: true + +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + - path: /src/utils/exit_codes.py + - path: requirements.txt + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1 + setup: + - type: docker + add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh + run: sh /uv-installer.sh && rm /uv-installer.sh + env: PATH="/root/.local/bin/:$PATH" + - type: docker + run: uv venv --python 3.11 /opt/venv + - type: docker + env: + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + add: requirements.txt /requirements.txt + run: uv pip install -r /requirements.txt + - type: docker + run: uv pip install mlflow==3.1.0 + - type: docker + run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, highmem, midcpu, gpu] diff --git a/src/methods/uce_mlflow/requirements.txt b/src/methods/uce_mlflow/requirements.txt new file mode 100644 index 00000000..b2f4227b --- /dev/null +++ b/src/methods/uce_mlflow/requirements.txt @@ -0,0 +1,366 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o /tmp/tmpg2ov1w_7/requirements_initial.txt +accelerate==0.34.2 + # via -r requirements.in +alembic==1.16.4 + # via mlflow +anndata==0.10.9 + # via + # -r requirements.in + # scanpy +annotated-types==0.7.0 + # via pydantic +antlr4-python3-runtime==4.9.3 + # via omegaconf +anyio==4.10.0 + # via starlette +array-api-compat==1.12.0 + # via anndata +blinker==1.9.0 + # via flask +cachetools==5.5.2 + # via + # google-auth + # mlflow-skinny +certifi==2025.8.3 + # via requests +charset-normalizer==3.4.3 + # via requests +click==8.2.1 + # via + # flask + # mlflow-skinny + # uvicorn +cloudpickle==3.1.1 + # via mlflow-skinny +contourpy==1.3.3 + # via matplotlib +cycler==0.12.1 + # via matplotlib +databricks-sdk==0.62.0 + # via mlflow-skinny +docker==7.1.0 + # via mlflow +fastapi==0.116.1 + # via mlflow-skinny +filelock==3.18.0 + # via + # huggingface-hub + # torch + # triton +flask==3.1.1 + # via mlflow +fonttools==4.59.0 + # via matplotlib +fsspec==2025.7.0 + # via + # huggingface-hub + # torch +gitdb==4.0.12 + # via gitpython +gitpython==3.1.45 + # via mlflow-skinny +google-auth==2.40.3 + # via databricks-sdk +graphene==3.4.3 + # via mlflow +graphql-core==3.2.6 + # via + # graphene + # graphql-relay +graphql-relay==3.2.0 + # via graphene +greenlet==3.2.4 + # via sqlalchemy +gunicorn==23.0.0 + # via mlflow +h11==0.16.0 + # via uvicorn +h5py==3.14.0 + # via + # anndata + # scanpy +hf-xet==1.1.7 + # via huggingface-hub +huggingface-hub==0.34.4 + # via accelerate +idna==3.10 + # via + # anyio + # requests +importlib-metadata==8.7.0 + # via + # mlflow-skinny + # opentelemetry-api +itsdangerous==2.2.0 + # via flask +jinja2==3.1.6 + # via + # flask + # torch +joblib==1.5.1 + # via + # pynndescent + # scanpy + # scikit-learn +kiwisolver==1.4.9 + # via matplotlib +legacy-api-wrap==1.4.1 + # via scanpy +llvmlite==0.44.0 + # via + # numba + # 
pynndescent +mako==1.3.10 + # via alembic +markupsafe==3.0.2 + # via + # flask + # jinja2 + # mako + # werkzeug +matplotlib==3.10.5 + # via + # mlflow + # scanpy + # seaborn +mlflow==3.1.0 + # via -r requirements.in +mlflow-skinny==3.1.0 + # via mlflow +mpmath==1.3.0 + # via sympy +natsort==8.4.0 + # via + # anndata + # scanpy +networkx==3.5 + # via + # scanpy + # torch +numba==0.61.2 + # via + # pynndescent + # scanpy + # umap-learn +numpy==1.26.4 + # via + # -r requirements.in + # accelerate + # anndata + # contourpy + # h5py + # matplotlib + # mlflow + # numba + # pandas + # patsy + # scanpy + # scikit-learn + # scipy + # seaborn + # statsmodels + # umap-learn +nvidia-cublas-cu12==12.1.3.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.20.5 + # via torch +nvidia-nvjitlink-cu12==12.9.86 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch +omegaconf==2.3.0 + # via -r requirements.in +opentelemetry-api==1.36.0 + # via + # mlflow-skinny + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-sdk==1.36.0 + # via mlflow-skinny +opentelemetry-semantic-conventions==0.57b0 + # via opentelemetry-sdk +packaging==25.0 + # via + # accelerate + # anndata + # gunicorn + # huggingface-hub + # matplotlib + # mlflow-skinny + # scanpy + # statsmodels +pandas==2.2.3 + # via + # -r requirements.in + # anndata + # mlflow + # scanpy + # seaborn + # statsmodels +patsy==1.0.1 + # via + # scanpy + # statsmodels +pillow==11.3.0 + # via matplotlib +protobuf==6.31.1 + # via mlflow-skinny +psutil==7.0.0 + # via accelerate +pyarrow==20.0.0 + # via mlflow +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +pydantic==2.11.7 + # via + # fastapi + # mlflow-skinny +pydantic-core==2.33.2 + # via pydantic +pynndescent==0.5.13 + # via + # scanpy + # umap-learn +pyparsing==3.2.3 + # via matplotlib +python-dateutil==2.9.0.post0 + # via + # graphene + # matplotlib + # pandas +pytz==2025.2 + # via pandas +pyyaml==6.0.2 + # via + # accelerate + # huggingface-hub + # mlflow-skinny + # omegaconf +requests==2.32.4 + # via + # databricks-sdk + # docker + # huggingface-hub + # mlflow-skinny +rsa==4.9.1 + # via google-auth +safetensors==0.6.2 + # via accelerate +scanpy==1.10.2 + # via -r requirements.in +scikit-learn==1.7.1 + # via + # mlflow + # pynndescent + # scanpy + # umap-learn +scipy==1.14.1 + # via + # -r requirements.in + # anndata + # mlflow + # pynndescent + # scanpy + # scikit-learn + # statsmodels + # umap-learn +seaborn==0.13.2 + # via scanpy +session-info==1.0.1 + # via scanpy +six==1.17.0 + # via python-dateutil +smmap==5.0.2 + # via gitdb +sniffio==1.3.1 + # via anyio +sqlalchemy==2.0.43 + # via + # alembic + # mlflow +sqlparse==0.5.3 + # via mlflow-skinny +starlette==0.47.2 + # via fastapi +statsmodels==0.14.5 + # via scanpy +stdlib-list==0.11.1 + # via session-info +sympy==1.14.0 + # via torch +threadpoolctl==3.6.0 + # via scikit-learn +torch==2.4.1 + # via + # -r requirements.in + # accelerate +tqdm==4.66.5 + # via + # -r requirements.in + # huggingface-hub + # scanpy + # 
umap-learn +triton==3.0.0 + # via torch +typing-extensions==4.14.1 + # via + # alembic + # anyio + # fastapi + # graphene + # huggingface-hub + # mlflow-skinny + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # sqlalchemy + # starlette + # torch + # typing-inspection +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via pandas +umap-learn==0.5.9.post2 + # via scanpy +urllib3==1.26.6 + # via + # -r requirements.in + # docker + # requests +uvicorn==0.35.0 + # via mlflow-skinny +werkzeug==3.1.3 + # via flask +zipp==3.23.0 + # via importlib-metadata diff --git a/src/methods/uce_mlflow/script.py b/src/methods/uce_mlflow/script.py new file mode 100644 index 00000000..c61b2a68 --- /dev/null +++ b/src/methods/uce_mlflow/script.py @@ -0,0 +1,110 @@ +import os +import sys +import tarfile +import tempfile +import zipfile + +import anndata as ad +import mlflow.pyfunc +import pandas as pd + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + "input": "resources_test/.../input.h5ad", + "output": "output.h5ad", + "model": "resources_test/.../model", +} +meta = {"name": "uce_mlflow"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from exit_codes import exit_non_applicable +from read_anndata_partial import read_anndata + +print("====== UCE (MLflow model) ======", flush=True) + +print("\n>>> Reading input files...", flush=True) +print(f"Input H5AD file: '{par['input']}'", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + exit_non_applicable( + f"UCE (MLflow) can only be used with human data " + f'(dataset_organism == "{adata.uns["dataset_organism"]}")' + ) + +print(adata, flush=True) + +if os.path.isdir(par["model"]): + print("\n>>> Using model directory...", flush=True) + print(f"Directory path: '{par['model']}'", flush=True) + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("\n>>> Extracting model from .zip...", flush=True) + print(f".zip path: '{par['model']}'", flush=True) + with zipfile.ZipFile(par["model"], "r") as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): + print("\n>>> Extracting model from .tar.gz...", flush=True) + print(f".tar.gz path: '{par['model']}'", flush=True) + with tarfile.open(par["model"], "r:gz") as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError( + "The 'model' argument should be a directory a .zip file or a .tar.gz file" + ) + +print("\n>>> Loading model...", flush=True) +model = mlflow.pyfunc.load_model(model_dir) +print(model, flush=True) + +print("\n>>> Writing temporary input H5AD file...", flush=True) +input_adata = ad.AnnData( + X=adata.X.copy(), + var=adata.var.filter(items=["feature_name"]), +) +print(input_adata, flush=True) + +h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) +print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) +input_adata.write(h5ad_file.name) +del input_adata + +print("\n>>> Running model...", flush=True) +input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) +embedding = model.predict(input_df) + 
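+# The MLflow pyfunc wrapper reads the H5AD at 'input_uri' and is expected to
+# return one embedding row per cell in that file. An illustrative sanity
+# check (an assumption about the return value, not a documented model
+# contract) would be:
+#
+#     assert embedding.shape[0] == adata.n_obs
+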
+print("\n>>> Storing output...", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": embedding, + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("\n>>> Writing output to file...", flush=True) +print(f"Output H5AD file: '{par['output']}'", flush=True) +output.write_h5ad(par["output"], compression="gzip") + +print("\n>>> Cleaning up temporary files...", flush=True) +if model_temp is not None: + model_temp.cleanup() +h5ad_file.close() +os.unlink(h5ad_file.name) + +print("\n>>> Done!", flush=True) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index f5da4fc0..f7d472d0 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -111,6 +111,7 @@ dependencies: - name: methods/scvi_mlflow - name: methods/transcriptformer_mlflow - name: methods/uce + - name: methods/uce_mlflow # metrics - name: metrics/asw_batch - name: metrics/asw_label diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 0e2f656e..1fbebb45 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -54,6 +54,9 @@ methods = [ ), uce.run( args: [model: file("s3://openproblems-work/cache/uce-model-v5.zip")] + ), + uce_mlflow.run( + args: [model: file("s3://openproblems-work/cache/uce-mlflow-model.zip")] ) ] From 9c82ff4986979fb47606c9d4502764521a0b345b Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 13 Aug 2025 14:23:23 +0200 Subject: [PATCH 14/21] Add unpack_directory() utils helper --- src/methods/geneformer_mlflow/config.vsh.yaml | 1 + src/methods/geneformer_mlflow/script.py | 27 ++---------- src/methods/scgpt_mlflow/script.py | 27 ++---------- src/methods/scvi_mlflow/script.py | 27 ++---------- .../transcriptformer_mlflow/config.vsh.yaml | 1 + src/methods/transcriptformer_mlflow/script.py | 27 ++---------- src/methods/uce_mlflow/config.vsh.yaml | 1 + src/methods/uce_mlflow/script.py | 27 ++---------- src/utils/unpack.py | 43 +++++++++++++++++++ 9 files changed, 61 insertions(+), 120 deletions(-) create mode 100644 src/utils/unpack.py diff --git a/src/methods/geneformer_mlflow/config.vsh.yaml b/src/methods/geneformer_mlflow/config.vsh.yaml index b9d08eda..acacc638 100644 --- a/src/methods/geneformer_mlflow/config.vsh.yaml +++ b/src/methods/geneformer_mlflow/config.vsh.yaml @@ -35,6 +35,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: /src/utils/unpack.py - path: requirements.txt engines: diff --git a/src/methods/geneformer_mlflow/script.py b/src/methods/geneformer_mlflow/script.py index 800ab80b..a6860e9f 100644 --- a/src/methods/geneformer_mlflow/script.py +++ b/src/methods/geneformer_mlflow/script.py @@ -22,6 +22,7 @@ sys.path.append(meta["resources_dir"]) from exit_codes import exit_non_applicable from read_anndata_partial import read_anndata +from unpack import unpack_directory print("====== Geneformer (MLflow model) ======", flush=True) @@ -37,30 +38,8 @@ print(adata, flush=True) -if os.path.isdir(par["model"]): - print("\n>>> Using model directory...", flush=True) - print(f"Directory path: '{par['model']}'", flush=True) - model_temp = None - model_dir = par["model"] -else: - model_temp = tempfile.TemporaryDirectory() - model_dir = model_temp.name - - if zipfile.is_zipfile(par["model"]): - print("\n>>> Extracting 
model from .zip...", flush=True) - print(f".zip path: '{par['model']}'", flush=True) - with zipfile.ZipFile(par["model"], "r") as zip_file: - zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): - print("\n>>> Extracting model from .tar.gz...", flush=True) - print(f".tar.gz path: '{par['model']}'", flush=True) - with tarfile.open(par["model"], "r:gz") as tar_file: - tar_file.extractall(model_dir) - model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) - else: - raise ValueError( - "The 'model' argument should be a directory a .zip file or a .tar.gz file" - ) +print("\n>>> Unpacking model...", flush=True) +model_dir, model_temp = unpack_directory(par["model"]) print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(model_dir) diff --git a/src/methods/scgpt_mlflow/script.py b/src/methods/scgpt_mlflow/script.py index 7c70c6a0..fdb6ca3a 100644 --- a/src/methods/scgpt_mlflow/script.py +++ b/src/methods/scgpt_mlflow/script.py @@ -22,6 +22,7 @@ sys.path.append(meta["resources_dir"]) from exit_codes import exit_non_applicable from read_anndata_partial import read_anndata +from unpack import unpack_directory print("====== scGPT (MLflow model) ======", flush=True) @@ -37,30 +38,8 @@ print(adata, flush=True) -if os.path.isdir(par["model"]): - print("\n>>> Using model directory...", flush=True) - print(f"Directory path: '{par['model']}'", flush=True) - model_temp = None - model_dir = par["model"] -else: - model_temp = tempfile.TemporaryDirectory() - model_dir = model_temp.name - - if zipfile.is_zipfile(par["model"]): - print("\n>>> Extracting model from .zip...", flush=True) - print(f".zip path: '{par['model']}'", flush=True) - with zipfile.ZipFile(par["model"], "r") as zip_file: - zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): - print("\n>>> Extracting model from .tar.gz...", flush=True) - print(f".tar.gz path: '{par['model']}'", flush=True) - with tarfile.open(par["model"], "r:gz") as tar_file: - tar_file.extractall(model_dir) - model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) - else: - raise ValueError( - "The 'model' argument should be a directory a .zip file or a .tar.gz file" - ) +print("\n>>> Unpacking model...", flush=True) +model_dir, model_temp = unpack_directory(par["model"]) print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(model_dir) diff --git a/src/methods/scvi_mlflow/script.py b/src/methods/scvi_mlflow/script.py index 04ff94d5..7fe71b46 100644 --- a/src/methods/scvi_mlflow/script.py +++ b/src/methods/scvi_mlflow/script.py @@ -22,6 +22,7 @@ sys.path.append(meta["resources_dir"]) from exit_codes import exit_non_applicable from read_anndata_partial import read_anndata +from unpack import unpack_directory print("====== scVI (MLflow model) ======", flush=True) @@ -41,30 +42,8 @@ print(adata, flush=True) -if os.path.isdir(par["model"]): - print("\n>>> Using model directory...", flush=True) - print(f"Directory path: '{par['model']}'", flush=True) - model_temp = None - model_dir = par["model"] -else: - model_temp = tempfile.TemporaryDirectory() - model_dir = model_temp.name - - if zipfile.is_zipfile(par["model"]): - print("\n>>> Extracting model from .zip...", flush=True) - print(f".zip path: '{par['model']}'", flush=True) - with zipfile.ZipFile(par["model"], "r") as zip_file: - zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): - print("\n>>> 
Extracting model from .tar.gz...", flush=True) - print(f".tar.gz path: '{par['model']}'", flush=True) - with tarfile.open(par["model"], "r:gz") as tar_file: - tar_file.extractall(model_dir) - model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) - else: - raise ValueError( - "The 'model' argument should be a directory a .zip file or a .tar.gz file" - ) +print("\n>>> Unpacking model...", flush=True) +model_dir, model_temp = unpack_directory(par["model"]) print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(model_dir) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index 2d144c23..3c017991 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -37,6 +37,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: /src/utils/unpack.py - path: requirements.txt engines: diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index b16806d3..9c675ba5 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -22,6 +22,7 @@ sys.path.append(meta["resources_dir"]) from exit_codes import exit_non_applicable from read_anndata_partial import read_anndata +from unpack import unpack_directory print("====== TranscriptFormer (MLflow model) ======", flush=True) @@ -37,30 +38,8 @@ print(adata, flush=True) -if os.path.isdir(par["model"]): - print("\n>>> Using model directory...", flush=True) - print(f"Directory path: '{par['model']}'", flush=True) - model_temp = None - model_dir = par["model"] -else: - model_temp = tempfile.TemporaryDirectory() - model_dir = model_temp.name - - if zipfile.is_zipfile(par["model"]): - print("\n>>> Extracting model from .zip...", flush=True) - print(f".zip path: '{par['model']}'", flush=True) - with zipfile.ZipFile(par["model"], "r") as zip_file: - zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): - print("\n>>> Extracting model from .tar.gz...", flush=True) - print(f".tar.gz path: '{par['model']}'", flush=True) - with tarfile.open(par["model"], "r:gz") as tar_file: - tar_file.extractall(model_dir) - model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) - else: - raise ValueError( - "The 'model' argument should be a directory a .zip file or a .tar.gz file" - ) +print("\n>>> Unpacking model...", flush=True) +model_dir, model_temp = unpack_directory(par["model"]) print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(model_dir) diff --git a/src/methods/uce_mlflow/config.vsh.yaml b/src/methods/uce_mlflow/config.vsh.yaml index 354cbd63..a5e6b77c 100644 --- a/src/methods/uce_mlflow/config.vsh.yaml +++ b/src/methods/uce_mlflow/config.vsh.yaml @@ -33,6 +33,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: /src/utils/unpack.py - path: requirements.txt engines: diff --git a/src/methods/uce_mlflow/script.py b/src/methods/uce_mlflow/script.py index c61b2a68..eb594544 100644 --- a/src/methods/uce_mlflow/script.py +++ b/src/methods/uce_mlflow/script.py @@ -22,6 +22,7 @@ sys.path.append(meta["resources_dir"]) from exit_codes import exit_non_applicable from read_anndata_partial import read_anndata +from unpack import unpack_directory print("====== UCE (MLflow model) ======", flush=True) @@ -37,30 +38,8 @@ 
print(adata, flush=True) -if os.path.isdir(par["model"]): - print("\n>>> Using model directory...", flush=True) - print(f"Directory path: '{par['model']}'", flush=True) - model_temp = None - model_dir = par["model"] -else: - model_temp = tempfile.TemporaryDirectory() - model_dir = model_temp.name - - if zipfile.is_zipfile(par["model"]): - print("\n>>> Extracting model from .zip...", flush=True) - print(f".zip path: '{par['model']}'", flush=True) - with zipfile.ZipFile(par["model"], "r") as zip_file: - zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): - print("\n>>> Extracting model from .tar.gz...", flush=True) - print(f".tar.gz path: '{par['model']}'", flush=True) - with tarfile.open(par["model"], "r:gz") as tar_file: - tar_file.extractall(model_dir) - model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) - else: - raise ValueError( - "The 'model' argument should be a directory a .zip file or a .tar.gz file" - ) +print("\n>>> Unpacking model...", flush=True) +model_dir, model_temp = unpack_directory(par["model"]) print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(model_dir) diff --git a/src/utils/unpack.py b/src/utils/unpack.py new file mode 100644 index 00000000..443aa39f --- /dev/null +++ b/src/utils/unpack.py @@ -0,0 +1,43 @@ +import os +import tarfile +import tempfile +import zipfile + +def unpack_directory(directory): + """ + Unpack a directory to a temporary location (if needed) + + Args: + directory (str): Path to a directory, .zip, or .tar.gz file. + + Returns: + tuple: (unpacked_directory (str), temp_directory (TemporaryDirectory or None)) + unpacked_directory: Path to the unpacked directory. + temp_directory: TemporaryDirectory object if a temp dir was created, else None. 
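+
+    Example (illustrative usage):
+        >>> model_dir, model_temp = unpack_directory("model.zip")
+        >>> # ... load the model from model_dir ...
+        >>> if model_temp is not None:
+        ...     model_temp.cleanup()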
+ """ + print(f"Unpacking directory: '{directory}'", flush=True) + + if os.path.isdir(directory): + print(f"Returning provided directory: '{directory}'", flush=True) + temp_directory = None + unpacked_directory = directory + else: + temp_directory = tempfile.TemporaryDirectory() + unpacked_directory = temp_directory.name + + if zipfile.is_zipfile(directory): + print("Extracting .zip...", flush=True) + with zipfile.ZipFile(directory, "r") as zip_file: + zip_file.extractall(unpacked_directory) + elif tarfile.is_tarfile(directory) and directory.endswith(".tar.gz"): + print("Extracting .tar.gz...", flush=True) + with tarfile.open(directory, "r:gz") as tar_file: + tar_file.extractall(unpacked_directory) + unpacked_directory = os.path.join(unpacked_directory, os.listdir(unpacked_directory)[0]) + else: + raise ValueError( + "The 'directory' argument should be a directory, a .zip file or a .tar.gz file" + ) + print(f"Extracted to '{unpacked_directory}'", flush=True) + + return (unpacked_directory, temp_directory) From 2a2a61b17583430d74137d837d487a7fc00c46a6 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 13 Aug 2025 14:31:33 +0200 Subject: [PATCH 15/21] Add unpack helper to scgpt_mlflow --- src/methods/scgpt_mlflow/config.vsh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/methods/scgpt_mlflow/config.vsh.yaml b/src/methods/scgpt_mlflow/config.vsh.yaml index b8455165..a748a1de 100644 --- a/src/methods/scgpt_mlflow/config.vsh.yaml +++ b/src/methods/scgpt_mlflow/config.vsh.yaml @@ -32,6 +32,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: /src/utils/unpack.py - path: requirements.txt engines: From 814365192de61894effe76b2918e7321ee9b48c8 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 13 Aug 2025 14:43:03 +0200 Subject: [PATCH 16/21] Add unpack helper to scvi_mlflow --- src/methods/scvi_mlflow/config.vsh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/methods/scvi_mlflow/config.vsh.yaml b/src/methods/scvi_mlflow/config.vsh.yaml index d50a6e62..61ed6e78 100644 --- a/src/methods/scvi_mlflow/config.vsh.yaml +++ b/src/methods/scvi_mlflow/config.vsh.yaml @@ -33,6 +33,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: /src/utils/unpack.py - path: requirements.txt engines: From 5e49b7c6116847cf85a5ccb70d5f4c46fb6c482a Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Thu, 18 Sep 2025 08:30:33 +0200 Subject: [PATCH 17/21] Update scvi_mlflow Changes to match label projection --- src/methods/scvi_mlflow/script.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/methods/scvi_mlflow/script.py b/src/methods/scvi_mlflow/script.py index 7fe71b46..d8e92bb9 100644 --- a/src/methods/scvi_mlflow/script.py +++ b/src/methods/scvi_mlflow/script.py @@ -45,8 +45,8 @@ print("\n>>> Unpacking model...", flush=True) model_dir, model_temp = unpack_directory(par["model"]) -print("\n>>> Loading model...", flush=True) -model = mlflow.pyfunc.load_model(model_dir) +print(f"\n>>> Loading {organism} model...", flush=True) +model = mlflow.pyfunc.load_model(model_dir, model_config={"organism": organism}) print(model, flush=True) print("\n>>> Writing temporary input H5AD file...", flush=True) @@ -62,8 +62,7 @@ print("\n>>> Running model...", flush=True) input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -input_params = {"organism": organism, "return_dist": True, "batch_keys": "batch"} -embedding = model.predict(input_df, 
params=input_params) +embedding = model.predict(input_df) print("\n>>> Storing output...", flush=True) output = ad.AnnData( From 1884f811b6a49e9b7f13171003053def96252ab5 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Sun, 12 Oct 2025 08:24:16 +0200 Subject: [PATCH 18/21] use helper functions in mlflow methods --- src/methods/geneformer_mlflow/config.vsh.yaml | 19 +- src/methods/geneformer_mlflow/script.py | 46 +++-- src/methods/scgpt_mlflow/config.vsh.yaml | 19 +- src/methods/scgpt_mlflow/script.py | 36 ++-- src/methods/scvi_mlflow/config.vsh.yaml | 17 +- src/methods/scvi_mlflow/script.py | 36 ++-- .../transcriptformer_mlflow/config.vsh.yaml | 19 +- src/methods/transcriptformer_mlflow/script.py | 43 ++--- src/methods/uce_mlflow/config.vsh.yaml | 19 +- src/methods/uce_mlflow/script.py | 34 +--- src/utils/mlflow.py | 174 ++++++++++++++++++ src/utils/mlflow_docker_setup.yaml | 14 ++ 12 files changed, 270 insertions(+), 206 deletions(-) create mode 100644 src/utils/mlflow.py create mode 100644 src/utils/mlflow_docker_setup.yaml diff --git a/src/methods/geneformer_mlflow/config.vsh.yaml b/src/methods/geneformer_mlflow/config.vsh.yaml index acacc638..e1d187cf 100644 --- a/src/methods/geneformer_mlflow/config.vsh.yaml +++ b/src/methods/geneformer_mlflow/config.vsh.yaml @@ -36,28 +36,13 @@ resources: - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py - path: /src/utils/unpack.py + - path: /src/utils/mlflow.py - path: requirements.txt engines: - type: docker image: openproblems/base_pytorch_nvidia:1 - setup: - - type: docker - add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh - run: sh /uv-installer.sh && rm /uv-installer.sh - env: PATH="/root/.local/bin/:$PATH" - - type: docker - run: uv venv --python 3.11 /opt/venv - - type: docker - env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - add: requirements.txt /requirements.txt - run: uv pip install -r /requirements.txt - - type: docker - run: uv pip install mlflow==3.1.0 - - type: docker - run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + __merge__: /src/utils/mlflow_docker_setup.yaml runners: - type: executable diff --git a/src/methods/geneformer_mlflow/script.py b/src/methods/geneformer_mlflow/script.py index a6860e9f..dc710040 100644 --- a/src/methods/geneformer_mlflow/script.py +++ b/src/methods/geneformer_mlflow/script.py @@ -1,12 +1,9 @@ import os import sys -import tarfile -import tempfile -import zipfile import anndata as ad import mlflow.pyfunc -import pandas as pd +import numpy as np ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -20,9 +17,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from exit_codes import exit_non_applicable -from read_anndata_partial import read_anndata -from unpack import unpack_directory +from exit_codes import exit_non_applicable # noqa: E402 +from mlflow import embed # noqa: E402 +from read_anndata_partial import read_anndata # noqa: E402 +from unpack import unpack_directory # noqa: E402 print("====== Geneformer (MLflow model) ======", flush=True) @@ -45,23 +43,25 @@ model = mlflow.pyfunc.load_model(model_dir) print(model, flush=True) -print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData( - X=adata.X.copy(), - var=adata.var.filter(items=["feature_id"]).rename( - columns={"feature_id": "ensembl_id"} - ), -) -print(input_adata, flush=True) +n_processors = meta.get("cpus") or os.cpu_count() +print(f"Available processors: {n_processors}", flush=True) + -h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) -input_adata.write(h5ad_file.name) -del input_adata +def process_geneformer_input(input_adata): + """Add Geneformer-specific fields to input AnnData.""" + input_adata.obs["cell_idx"] = np.arange(input_adata.n_obs) + input_adata.obs["n_counts"] = input_adata.X.sum(axis=1) -print("\n>>> Running model...", flush=True) -input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -embedding = model.predict(input_df) + +print("\n>>> Embedding data...", flush=True) +embedding = embed( + adata, + model, + layers=["counts"], + var={"feature_id": "ensembl_id"}, + model_params={"nproc": n_processors}, + process_adata=process_geneformer_input, +) print("\n>>> Storing output...", flush=True) output = ad.AnnData( @@ -85,7 +85,5 @@ print("\n>>> Cleaning up temporary files...", flush=True) if model_temp is not None: model_temp.cleanup() -h5ad_file.close() -os.unlink(h5ad_file.name) print("\n>>> Done!", flush=True) diff --git a/src/methods/scgpt_mlflow/config.vsh.yaml b/src/methods/scgpt_mlflow/config.vsh.yaml index a748a1de..d684085e 100644 --- a/src/methods/scgpt_mlflow/config.vsh.yaml +++ b/src/methods/scgpt_mlflow/config.vsh.yaml @@ -33,28 +33,13 @@ resources: - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py - path: /src/utils/unpack.py + - path: /src/utils/mlflow.py - path: requirements.txt engines: - type: docker image: openproblems/base_pytorch_nvidia:1 - setup: - - type: docker - add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh - run: sh /uv-installer.sh && rm /uv-installer.sh - env: PATH="/root/.local/bin/:$PATH" - - type: docker - run: uv venv --python 3.11 /opt/venv - - type: docker - env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - add: requirements.txt /requirements.txt - run: uv pip install -r /requirements.txt - - type: docker - run: uv pip install mlflow==3.1.0 - - type: docker - run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + __merge__: /src/utils/mlflow_docker_setup.yaml runners: - type: executable diff --git a/src/methods/scgpt_mlflow/script.py b/src/methods/scgpt_mlflow/script.py index fdb6ca3a..db54fd23 100644 --- a/src/methods/scgpt_mlflow/script.py +++ b/src/methods/scgpt_mlflow/script.py @@ -1,12 +1,7 @@ -import os import sys -import tarfile -import tempfile -import zipfile import anndata as ad import mlflow.pyfunc -import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -20,9 +15,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from exit_codes import exit_non_applicable -from read_anndata_partial import read_anndata -from unpack import unpack_directory +from exit_codes import exit_non_applicable # noqa: E402 +from mlflow import embed # noqa: E402 +from read_anndata_partial import read_anndata # noqa: E402 +from unpack import unpack_directory # noqa: E402 print("====== scGPT (MLflow model) ======", flush=True) @@ -45,22 +41,14 @@ model = mlflow.pyfunc.load_model(model_dir) print(model, flush=True) -print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData( - X=adata.X.copy(), - var=adata.var.filter(items=["feature_name"]), +print("\n>>> Embedding data...", flush=True) +embedding = embed( + adata, + model, + layers=["counts"], + var={"feature_name": "feature_name"}, + model_params={"gene_col": "feature_name"}, ) -print(input_adata, flush=True) - -h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) -input_adata.write(h5ad_file.name) -del input_adata - -print("\n>>> Running model...", flush=True) -input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -input_params = {"gene_col": "feature_name"} -embedding = model.predict(input_df, params=input_params) print("\n>>> Storing output...", flush=True) output = ad.AnnData( @@ -84,7 +72,5 @@ print("\n>>> Cleaning up temporary files...", flush=True) if model_temp is not None: model_temp.cleanup() -h5ad_file.close() -os.unlink(h5ad_file.name) print("\n>>> Done!", flush=True) diff --git a/src/methods/scvi_mlflow/config.vsh.yaml b/src/methods/scvi_mlflow/config.vsh.yaml index 61ed6e78..85b6520f 100644 --- a/src/methods/scvi_mlflow/config.vsh.yaml +++ b/src/methods/scvi_mlflow/config.vsh.yaml @@ -34,26 +34,13 @@ resources: - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py - path: /src/utils/unpack.py + - path: /src/utils/mlflow.py - path: requirements.txt engines: - type: docker image: openproblems/base_pytorch_nvidia:1 - setup: - - type: docker - add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh - run: sh /uv-installer.sh && rm /uv-installer.sh - env: PATH="/root/.local/bin/:$PATH" - - type: docker - run: uv venv --python 3.11 /opt/venv - - type: docker - env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - add: requirements.txt /requirements.txt - run: uv pip install -r /requirements.txt && uv pip install mlflow==3.1.0 - - type: docker - run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + __merge__: /src/utils/mlflow_docker_setup.yaml runners: - type: executable diff --git a/src/methods/scvi_mlflow/script.py b/src/methods/scvi_mlflow/script.py index d8e92bb9..0c27a11a 100644 --- a/src/methods/scvi_mlflow/script.py +++ b/src/methods/scvi_mlflow/script.py @@ -1,12 +1,7 @@ -import os import sys -import tarfile -import tempfile -import zipfile import anndata as ad import mlflow.pyfunc -import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -20,9 +15,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from exit_codes import exit_non_applicable -from read_anndata_partial import read_anndata -from unpack import unpack_directory +from exit_codes import exit_non_applicable # noqa: E402 +from mlflow import embed # noqa: E402 +from read_anndata_partial import read_anndata # noqa: E402 +from unpack import unpack_directory # noqa: E402 print("====== scVI (MLflow model) ======", flush=True) @@ -49,20 +45,14 @@ model = mlflow.pyfunc.load_model(model_dir, model_config={"organism": organism}) print(model, flush=True) -print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData(X=adata.X.copy()) -input_adata.var_names = adata.var["feature_id"].values -input_adata.obs["batch"] = adata.obs["batch"].values -print(input_adata, flush=True) - -h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) -input_adata.write(h5ad_file.name) -del input_adata - -print("\n>>> Running model...", flush=True) -input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -embedding = model.predict(input_df) +print("\n>>> Embedding data...", flush=True) +embedding = embed( + adata, + model, + layers=["counts"], + obs=["batch"], + var={"feature_id": "feature_id"} +) print("\n>>> Storing output...", flush=True) output = ad.AnnData( @@ -86,7 +76,5 @@ print("\n>>> Cleaning up temporary files...", flush=True) if model_temp is not None: model_temp.cleanup() -h5ad_file.close() -os.unlink(h5ad_file.name) print("\n>>> Done!", flush=True) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index 3c017991..453ba275 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -38,28 +38,13 @@ resources: - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py - path: /src/utils/unpack.py + - path: /src/utils/mlflow.py - path: requirements.txt engines: - type: docker image: openproblems/base_pytorch_nvidia:1 - setup: - - type: docker - add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh - run: sh /uv-installer.sh && rm /uv-installer.sh - env: PATH="/root/.local/bin/:$PATH" - - type: docker - run: uv venv --python 3.11 /opt/venv - - type: docker - env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - add: requirements.txt /requirements.txt - run: uv pip install -r /requirements.txt - - type: docker - run: uv pip install mlflow==3.1.0 - - type: docker - run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + __merge__: /src/utils/mlflow_docker_setup.yaml runners: - type: executable diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index 9c675ba5..0ddacee8 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -1,12 +1,7 @@ -import os import sys -import tarfile -import tempfile -import zipfile import anndata as ad import mlflow.pyfunc -import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -20,9 +15,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from exit_codes import exit_non_applicable -from read_anndata_partial import read_anndata -from unpack import unpack_directory +from exit_codes import exit_non_applicable # noqa: E402 +from mlflow import embed # noqa: E402 +from read_anndata_partial import read_anndata # noqa: E402 +from unpack import unpack_directory # noqa: E402 print("====== TranscriptFormer (MLflow model) ======", flush=True) @@ -45,23 +41,20 @@ model = mlflow.pyfunc.load_model(model_dir) print(model, flush=True) -print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData( - X=adata.X.copy(), - var=adata.var.filter(items=["feature_id"]).rename( - columns={"feature_id": "ensembl_id"} - ), -) -input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing -print(input_adata, flush=True) -h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) -input_adata.write(h5ad_file.name) -del input_adata -print("\n>>> Running model...", flush=True) -input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -embedding = model.predict(input_df) +def process_transcriptformer_input(input_adata): + """Add TranscriptFormer-specific fields to input AnnData.""" + input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing + + +print("\n>>> Embedding data...", flush=True) +embedding = embed( + adata, + model, + layers=["counts"], + var={"feature_id": "ensembl_id"}, + process_adata=process_transcriptformer_input, +) print("\n>>> Storing output...", flush=True) output = ad.AnnData( @@ -85,7 +78,5 @@ print("\n>>> Cleaning up temporary files...", flush=True) if model_temp is not None: model_temp.cleanup() -h5ad_file.close() -os.unlink(h5ad_file.name) print("\n>>> Done!", flush=True) diff --git a/src/methods/uce_mlflow/config.vsh.yaml b/src/methods/uce_mlflow/config.vsh.yaml index a5e6b77c..564bc7de 100644 --- a/src/methods/uce_mlflow/config.vsh.yaml +++ b/src/methods/uce_mlflow/config.vsh.yaml @@ -34,28 +34,13 @@ resources: - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py - path: /src/utils/unpack.py + - path: /src/utils/mlflow.py - path: requirements.txt engines: - type: docker image: openproblems/base_pytorch_nvidia:1 - setup: - - type: docker - add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh - run: sh /uv-installer.sh && rm /uv-installer.sh - env: PATH="/root/.local/bin/:$PATH" - - type: docker - run: uv venv --python 3.11 /opt/venv - - type: docker - env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - add: requirements.txt /requirements.txt - run: uv pip install -r /requirements.txt - - type: docker - run: uv pip install mlflow==3.1.0 - - type: docker - run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + __merge__: /src/utils/mlflow_docker_setup.yaml runners: - type: executable diff --git a/src/methods/uce_mlflow/script.py b/src/methods/uce_mlflow/script.py index eb594544..6e6fffb6 100644 --- a/src/methods/uce_mlflow/script.py +++ b/src/methods/uce_mlflow/script.py @@ -1,12 +1,7 @@ -import os import sys -import tarfile -import tempfile -import zipfile import anndata as ad import mlflow.pyfunc -import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -20,9 +15,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from exit_codes import exit_non_applicable -from read_anndata_partial import read_anndata -from unpack import unpack_directory +from exit_codes import exit_non_applicable # noqa: E402 +from mlflow import embed # noqa: E402 +from read_anndata_partial import read_anndata # noqa: E402 +from unpack import unpack_directory # noqa: E402 print("====== UCE (MLflow model) ======", flush=True) @@ -45,21 +41,13 @@ model = mlflow.pyfunc.load_model(model_dir) print(model, flush=True) -print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData( - X=adata.X.copy(), - var=adata.var.filter(items=["feature_name"]), +print("\n>>> Embedding data...", flush=True) +embedding = embed( + adata, + model, + layers=["counts"], + var={"feature_name": "feature_name"}, ) -print(input_adata, flush=True) - -h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) -input_adata.write(h5ad_file.name) -del input_adata - -print("\n>>> Running model...", flush=True) -input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -embedding = model.predict(input_df) print("\n>>> Storing output...", flush=True) output = ad.AnnData( @@ -83,7 +71,5 @@ print("\n>>> Cleaning up temporary files...", flush=True) if model_temp is not None: model_temp.cleanup() -h5ad_file.close() -os.unlink(h5ad_file.name) print("\n>>> Done!", flush=True) diff --git a/src/utils/mlflow.py b/src/utils/mlflow.py new file mode 100644 index 00000000..447614e6 --- /dev/null +++ b/src/utils/mlflow.py @@ -0,0 +1,174 @@ +""" +Common utilities for MLflow-based methods. +""" +import os +import tempfile + +import anndata as ad +import pandas as pd +import sklearn.neighbors + + +def create_temp_h5ad( + adata, layers=None, obs=None, var=None, obsm=None, varm=None, uns=None +): + """ + Create a temporary H5AD file with specified data from an AnnData object. 
+
+    Args:
+        adata: Input AnnData object
+        layers: List of layer names; the first listed layer is used as X (e.g., ["counts"])
+        obs: List of obs column names to include (e.g., ["batch"])
+        var: Dict mapping var column names to new names (e.g., {"feature_id": "ensembl_id"})
+        obsm: List of obsm keys to include
+        varm: List of varm keys to include
+        uns: List of uns keys to include
+
+    Returns:
+        tuple: (h5ad_file, input_adata) where h5ad_file is the NamedTemporaryFile and
+            input_adata is the created AnnData object
+    """
+    # Use the first listed layer as X, otherwise fall back to adata.X
+    if layers and len(layers) > 0:
+        X = adata.layers[layers[0]].copy()
+    else:
+        X = adata.X.copy()
+
+    # Create new AnnData
+    input_adata = ad.AnnData(X=X)
+
+    # Set var_names
+    input_adata.var_names = adata.var_names
+
+    # Add obs columns
+    if obs:
+        for obs_key in obs:
+            if obs_key in adata.obs:
+                input_adata.obs[obs_key] = adata.obs[obs_key].values
+
+    # Add var columns (with optional renaming)
+    if var:
+        for old_name, new_name in var.items():
+            if old_name in adata.var:
+                input_adata.var[new_name] = adata.var[old_name].values
+
+    # Add obsm
+    if obsm:
+        for obsm_key in obsm:
+            if obsm_key in adata.obsm:
+                input_adata.obsm[obsm_key] = adata.obsm[obsm_key].copy()
+
+    # Add varm
+    if varm:
+        for varm_key in varm:
+            if varm_key in adata.varm:
+                input_adata.varm[varm_key] = adata.varm[varm_key].copy()
+
+    # Add uns
+    if uns:
+        for uns_key in uns:
+            if uns_key in adata.uns:
+                input_adata.uns[uns_key] = adata.uns[uns_key]
+
+    # Write to temp file
+    h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
+    input_adata.write(h5ad_file.name)
+
+    return h5ad_file, input_adata
+
+
+def embed(adata, model, layers=None, obs=None, var=None, model_params=None, process_adata=None):
+    """
+    Embed data using an MLflow model.
+
+    Args:
+        adata: Input AnnData object to embed
+        model: Loaded MLflow model
+        layers: List of layer names; the first listed layer is used as X (e.g., ["counts"])
+        obs: List of obs column names to include (e.g., ["batch"])
+        var: Dict mapping var column names to new names (e.g., {"feature_id": "ensembl_id"})
+        model_params: Optional dict of parameters to pass to model.predict()
+        process_adata: Optional function applied to the input AnnData before the model runs (e.g., to add defaults)
+
+    Returns:
+        np.ndarray: Embeddings for the input data
+    """
+    print("Writing temporary input H5AD file...", flush=True)
+    h5ad_file, input_adata = create_temp_h5ad(adata, layers=layers, obs=obs, var=var)
+
+    # Apply any post-processing to input_adata
+    if process_adata:
+        process_adata(input_adata)
+
+    print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
+    print(input_adata, flush=True)
+
+    # Re-write the temporary file so the processed AnnData is what the model reads
+    input_adata.write(h5ad_file.name)
+
+    print("Running model...", flush=True)
+    input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
+    if model_params:
+        embedding = model.predict(input_df, params=model_params)
+    else:
+        embedding = model.predict(input_df)
+
+    # Clean up
+    h5ad_file.close()
+    os.unlink(h5ad_file.name)
+
+    return embedding
+
+
+def embed_and_classify(
+    train_adata,
+    test_adata,
+    model,
+    layers=None,
+    obs=None,
+    var=None,
+    model_params=None,
+    process_adata=None,
+    n_neighbors=5,
+):
+    """
+    Generic pipeline for embedding data and training a kNN classifier.
+
+    Args:
+        train_adata: Training AnnData object with labels
+        test_adata: Test AnnData object to predict
+        model: Loaded MLflow model
+        layers: List of layer names; the first listed layer is used as X (e.g., ["counts"])
+        obs: List of obs column names to include (e.g., ["batch"])
+        var: Dict mapping var column names to new names (e.g., {"feature_id": "ensembl_id"})
+        model_params: Optional dict of parameters to pass to model.predict()
+        process_adata: Optional function applied to the input AnnData before the model runs (e.g., to add defaults)
+        n_neighbors: Number of neighbors for kNN classifier
+
+    Returns:
+        np.ndarray: Predicted labels for test data
+    """
+    # Embed training data
+    print("\n>>> Embedding training data...", flush=True)
+    embedding_train = embed(
+        train_adata, model, layers=layers, obs=obs, var=var,
+        model_params=model_params, process_adata=process_adata
+    )
+
+    # Train kNN classifier
+    print("\n>>> Training kNN classifier...", flush=True)
+    classifier = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
+    classifier.fit(embedding_train, train_adata.obs["label"].astype(str))
+
+    # Embed test data
+    print("\n>>> Embedding test data...", flush=True)
+    embedding_test = embed(
+        test_adata, model, layers=layers, obs=obs, var=var,
+        model_params=model_params, process_adata=process_adata
+    )
+
+    # Classify
+    print("\n>>> Classifying test data...", flush=True)
+    predictions = classifier.predict(embedding_test)
+
+    return predictions
diff --git a/src/utils/mlflow_docker_setup.yaml b/src/utils/mlflow_docker_setup.yaml
new file mode 100644
index 00000000..aa03e9a7
--- /dev/null
+++ b/src/utils/mlflow_docker_setup.yaml
@@ -0,0 +1,14 @@
+- type: docker
+  add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh
+  run: sh /uv-installer.sh && rm /uv-installer.sh
+  env: PATH="/root/.local/bin/:$PATH"
+- type: docker
+  run: uv venv --python 3.11 /opt/venv
+- type: docker
+  env:
+    - VIRTUAL_ENV=/opt/venv
+    - PATH="/opt/venv/bin:$PATH"
+  add: requirements.txt /requirements.txt
+  run: uv pip install -r /requirements.txt && uv pip install mlflow==3.1.0
+- type: docker
+  run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems

From 28bb336ec5e099adbd2189181c78a99cc21c698d Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Sun, 12 Oct 2025 08:58:49 +0200
Subject: [PATCH 19/21] Use biggpu label for UCE

---
 src/methods/uce_mlflow/config.vsh.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/methods/uce_mlflow/config.vsh.yaml b/src/methods/uce_mlflow/config.vsh.yaml
index 564bc7de..96ccbd8b 100644
--- a/src/methods/uce_mlflow/config.vsh.yaml
+++ b/src/methods/uce_mlflow/config.vsh.yaml
@@ -46,4 +46,4 @@ runners:
   - type: executable
   - type: nextflow
     directives:
-      label: [hightime, highmem, midcpu, gpu]
+      label: [hightime, highmem, midcpu, biggpu]

From e480211b78b00ec84c167cbbe9783e24f165a66b Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Sun, 12 Oct 2025 08:59:01 +0200
Subject: [PATCH 20/21] Disable old methods

---
 src/workflows/run_benchmark/config.vsh.yaml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index f7d472d0..7872823e 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -92,7 +92,6 @@ dependencies:
   - name: methods/batchelor_mnn_correct
   - name: methods/bbknn
   - name: methods/combat
-  - name: methods/geneformer
   - name: methods/geneformer_mlflow
   - name: methods/harmony
   - name: methods/harmonypy
@@ -102,9 +101,7 @@
   - name: methods/scalex
   - name: methods/scanorama
   - name: methods/scanvi
-  - name: methods/scgpt_finetuned
   - name: methods/scgpt_mlflow
-  - name: methods/scgpt_zeroshot
   - name: methods/scimilarity
   - name: methods/scprint
   - name: methods/scvi
@@ -112,6 +109,10 @@
   - name: methods/transcriptformer_mlflow
   - name: methods/uce
   - name: methods/uce_mlflow
+  # outdated methods
+  # - name: methods/geneformer
+  # - name: methods/scgpt_finetuned
+  # - name: methods/scgpt_zeroshot
   # metrics
   - name: metrics/asw_batch
   - name: metrics/asw_label

From 329d25e2b182636234d82ee00a9948e997af75d7 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Sun, 19 Oct 2025 22:53:18 +0200
Subject: [PATCH 21/21] Fix workflow after disabling old methods

Signed-off-by: Robrecht Cannoodt
---
 src/workflows/run_benchmark/config.vsh.yaml | 4 ++--
 src/workflows/run_benchmark/main.nf         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 7872823e..d9ae23f4 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -102,6 +102,8 @@ dependencies:
   - name: methods/scanorama
   - name: methods/scanvi
   - name: methods/scgpt_mlflow
+  - name: methods/scgpt_finetuned
+  - name: methods/scgpt_zeroshot
   - name: methods/scimilarity
   - name: methods/scprint
   - name: methods/scvi
@@ -111,8 +113,6 @@
   - name: methods/uce_mlflow
   # outdated methods
   # - name: methods/geneformer
-  # - name: methods/scgpt_finetuned
-  # - name: methods/scgpt_zeroshot
   # metrics
   - name: metrics/asw_batch
   - name: metrics/asw_label

diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index 1fbebb45..88f83327 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -20,7 +20,7 @@ methods = [
   batchelor_mnn_correct,
   bbknn,
   combat,
-  geneformer,
+  // geneformer,
   geneformer_mlflow.run(
     args: [model: file("s3://openproblems-work/cache/geneformer-mlflow-model.zip")]
   ),
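
Usage note: a minimal sketch of how the shared helpers introduced in src/utils/mlflow.py
(patch 18) might be called outside of viash, for example when testing a model locally.
The model directory and H5AD paths below are hypothetical. The helper module is loaded
via importlib under a separate name ("mlflow_utils" is arbitrary) so that it does not
shadow the installed mlflow package; the method scripts above instead append
meta["resources_dir"] to sys.path and import it directly. The embed() and
embed_and_classify() calls follow the signatures defined in the diff above.

    import importlib.util

    import anndata as ad
    import mlflow.pyfunc

    # Load src/utils/mlflow.py under a different module name so it does not
    # collide with the installed mlflow package
    spec = importlib.util.spec_from_file_location("mlflow_utils", "src/utils/mlflow.py")
    helpers = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(helpers)

    # Hypothetical inputs: an MLflow model directory and an H5AD with a "counts" layer
    model = mlflow.pyfunc.load_model("path/to/mlflow_model")
    adata = ad.read_h5ad("dataset.h5ad")

    # Embed the dataset, exposing adata.var["feature_id"] to the model as "ensembl_id"
    embedding = helpers.embed(
        adata,
        model,
        layers=["counts"],
        var={"feature_id": "ensembl_id"},
    )
    adata.obsm["X_emb"] = embedding

    # Embed train/test data and predict labels with a kNN classifier;
    # train.obs must contain a "label" column
    train = ad.read_h5ad("train.h5ad")
    test = ad.read_h5ad("test.h5ad")
    labels = helpers.embed_and_classify(train, test, model, layers=["counts"], n_neighbors=5)

embed_and_classify() is not exercised by the diffs in this series; per its docstring it
is presumably intended for label-prediction-style tasks where train_adata.obs["label"]
is available.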