From 479437adba35d20952eabdc0f874e8ccb61591ae Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Jul 2025 13:02:05 +0200 Subject: [PATCH 01/21] Add transcriptformer_mlflow method component --- .../transcriptformer_mlflow/config.vsh.yaml | 60 +++++++++++++++ src/methods/transcriptformer_mlflow/script.py | 76 +++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 src/methods/transcriptformer_mlflow/config.vsh.yaml create mode 100644 src/methods/transcriptformer_mlflow/script.py diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml new file mode 100644 index 00000000..e693428a --- /dev/null +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -0,0 +1,60 @@ +__merge__: ../../api/base_method.yaml + +name: transcriptformer_mlflow +label: TranscriptFormer (MLflow model) +summary: "Context-aware representations of single-cell transcriptomes by jointly modeling genes and transcripts" +description: | + TranscriptFormer is designed to learn rich, context-aware representations of + single-cell transcriptomes while jointly modeling genes and transcripts using + a novel generative architecture. + + It is a family of generative foundation models representing a cross-species + generative cell atlas trained on up to 112 million cells spanning 1.53 billion + years of evolution across 12 species. + + Here, we use a version packaged as an MLflow model. +references: + doi: + - 10.1101/2025.04.25.650731 +links: + documentation: https://github.com/czi-ai/transcriptformer#readme + repository: https://github.com/czi-ai/transcriptformer + +info: + method_types: [embedding] + preferred_normalization: counts + +arguments: + - name: --model + type: file + description: | + An MLflow model URL for the transcriptformer model + required: true + +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + - path: /src/utils/exit_codes.py + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1 + setup: + - type: docker + add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh + run: sh /uv-installer.sh && rm /uv-installer.sh + env: PATH="/root/.local/bin/:$PATH" + - type: docker + run: uv venv --python 3.11 /opt/venv + - type: docker + env: + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + run: uv pip install mlflow==3.0.0 "transcriptformer>=0.3.0" + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, midmem, midcpu, gpu] diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py new file mode 100644 index 00000000..db8bda4d --- /dev/null +++ b/src/methods/transcriptformer_mlflow/script.py @@ -0,0 +1,76 @@ +import anndata as ad +import sys +import mlflow.pyfunc +from tempfile import NamedTemporaryFile +import os +import pandas as pd + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
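+#
+# A hedged usage sketch (the paths below are illustrative placeholders, not
+# files guaranteed to exist in the repo): a component like this is normally
+# executed directly from its config with `viash run`, passing arguments after
+# the `--` separator:
+#
+#   viash run src/methods/transcriptformer_mlflow/config.vsh.yaml -- \
+#     --input path/to/input.h5ad \
+#     --model path/to/transcriptformer-mlflow-model.zip \
+#     --output output.h5ad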
+par = { + "input": "resources_test/.../input.h5ad", + "output": "output.h5ad", + "model": "resources_test/.../model", +} +meta = {"name": "transcriptformer_mlflow"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata +from exit_codes import exit_non_applicable + +print(f"====== TranscriptFormer (MLflow model) ======", flush=True) + +print("\n>>> Reading input files...", flush=True) +print(f"Input H5AD file: '{par['input']}'", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + exit_non_applicable( + f"Transcriptformer can only be used with human data " + f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + ) + +print(adata, flush=True) + +print("\n>>> Writing temporary H5AD file...", flush=True) +input_adata = ad.AnnData(X = adata.X.copy(), var = adata.var.filter(items=["feature_id"]).rename(columns = {"feature_id": "ensembl_id"})) +input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing +print(input_adata, flush=True) +h5ad_file = NamedTemporaryFile(suffix=".h5ad", delete=False) +print(f"Temporary H5AD file: '{h5ad_file}'", flush=True) +input_adata.write(h5ad_file.name) +del input_adata + +print("\n>>> Loading model...", flush=True) +model = mlflow.pyfunc.load_model(par["model"]) + +print("\n>>> Running model...", flush=True) +input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) +model.predict(input_df) + +print("\n>>> Storing output...", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + # obsm={ + # "X_emb": embedded.X, + # }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("\n>>> Writing output to file...", flush=True) +print(f"Output H5AD file: '{par['output']}'", flush=True) +output.write_h5ad(par["output"], compression="gzip") + +print("\n>>> Cleaning up temporary files...", flush=True) +h5ad_file.close() +os.unlink(h5ad_file.name) + +print("\n>>> Done!", flush=True) From 95ced9ba05092d9f83b14a61db9c538f6307149c Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Jul 2025 16:58:37 +0200 Subject: [PATCH 02/21] Adjust transcriptformer_mlflow dependencies --- src/methods/transcriptformer_mlflow/config.vsh.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index e693428a..ee874547 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -49,9 +49,9 @@ engines: run: uv venv --python 3.11 /opt/venv - type: docker env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - run: uv pip install mlflow==3.0.0 "transcriptformer>=0.3.0" + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + run: uv pip install mlflow==3.1.0 pandas==2.3.0 scanpy==1.11.2 scipy==1.16.0 "transcriptformer>=0.3.0" runners: - type: executable From 6e8dce2e7006a5bdc80593a69dc8972ef34dd73b Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Jul 2025 17:00:50 +0200 Subject: [PATCH 03/21] Output embedding in transcriptformer_mlflow --- src/methods/transcriptformer_mlflow/script.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index db8bda4d..6c62f3f4 100644 --- 
a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -39,7 +39,7 @@ input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing print(input_adata, flush=True) h5ad_file = NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file}'", flush=True) +print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) input_adata.write(h5ad_file.name) del input_adata @@ -48,15 +48,15 @@ print("\n>>> Running model...", flush=True) input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -model.predict(input_df) +embedding = model.predict(input_df) print("\n>>> Storing output...", flush=True) output = ad.AnnData( obs=adata.obs[[]], var=adata.var[[]], - # obsm={ - # "X_emb": embedded.X, - # }, + obsm={ + "X_emb": embedding, + }, uns={ "dataset_id": adata.uns["dataset_id"], "normalization_id": adata.uns["normalization_id"], From 440a18b8c5ea0446cf839f9009f56538194b66c3 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Jul 2025 17:26:59 +0200 Subject: [PATCH 04/21] Install transcriptformer requirements from file --- src/methods/transcriptformer_mlflow/config.vsh.yaml | 6 +++++- src/methods/transcriptformer_mlflow/script.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index ee874547..c1524f87 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -36,6 +36,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: requirements.txt engines: - type: docker @@ -51,7 +52,10 @@ engines: env: - VIRTUAL_ENV=/opt/venv - PATH="/opt/venv/bin:$PATH" - run: uv pip install mlflow==3.1.0 pandas==2.3.0 scanpy==1.11.2 scipy==1.16.0 "transcriptformer>=0.3.0" + add: requirements.txt /requirements.txt + run: uv pip install -r /requirements.txt + - type: docker + run: uv pip install mlflow==3.1.0 runners: - type: executable diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index 6c62f3f4..f2cd1e40 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -45,6 +45,7 @@ print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(par["model"]) +print(model, flush=True) print("\n>>> Running model...", flush=True) input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) From 5332e00cd7a71b26f5044c803dfd94931d28b941 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 9 Jul 2025 10:16:16 +0200 Subject: [PATCH 05/21] Add extracting model archive to transcriptformer --- .../transcriptformer_mlflow/config.vsh.yaml | 3 +- .../transcriptformer_mlflow/requirements.txt | 338 ++++++++++++++++++ src/methods/transcriptformer_mlflow/script.py | 44 ++- 3 files changed, 377 insertions(+), 8 deletions(-) create mode 100644 src/methods/transcriptformer_mlflow/requirements.txt diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index c1524f87..ba708fa1 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -28,7 +28,8 @@ arguments: - name: --model type: file description: | - An MLflow model URL for the transcriptformer model + An MLflow model URI for the transcriptformer model. 
If it is a .zip or + .tar.gz file it will be extracted to a temporary directory. required: true resources: diff --git a/src/methods/transcriptformer_mlflow/requirements.txt b/src/methods/transcriptformer_mlflow/requirements.txt new file mode 100644 index 00000000..70d923d1 --- /dev/null +++ b/src/methods/transcriptformer_mlflow/requirements.txt @@ -0,0 +1,338 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt +aiobotocore==2.23.0 + # via s3fs +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.13 + # via + # aiobotocore + # fsspec + # s3fs +aioitertools==0.12.0 + # via aiobotocore +aiosignal==1.3.2 + # via aiohttp +anndata==0.11.4 + # via + # cellxgene-census + # scanpy + # somacore + # tiledbsoma + # transcriptformer +antlr4-python3-runtime==4.9.3 + # via + # hydra-core + # omegaconf +array-api-compat==1.12.0 + # via anndata +attrs==25.3.0 + # via + # aiohttp + # somacore + # tiledbsoma +boto3==1.38.27 + # via transcriptformer +botocore==1.38.27 + # via + # aiobotocore + # boto3 + # s3transfer +cellxgene-census==1.17.0 + # via transcriptformer +certifi==2025.6.15 + # via requests +charset-normalizer==3.4.2 + # via requests +contourpy==1.3.2 + # via matplotlib +cycler==0.12.1 + # via matplotlib +filelock==3.18.0 + # via + # torch + # triton +fonttools==4.58.4 + # via matplotlib +frozenlist==1.7.0 + # via + # aiohttp + # aiosignal +fsspec==2025.5.1 + # via + # pytorch-lightning + # s3fs + # torch +h5py==3.14.0 + # via + # anndata + # scanpy + # transcriptformer +hydra-core==1.3.2 + # via transcriptformer +idna==3.10 + # via + # requests + # yarl +iniconfig==2.1.0 + # via pytest +jinja2==3.1.6 + # via torch +jmespath==1.0.1 + # via + # aiobotocore + # boto3 + # botocore +joblib==1.5.1 + # via + # pynndescent + # scanpy + # scikit-learn +kiwisolver==1.4.8 + # via matplotlib +legacy-api-wrap==1.4.1 + # via scanpy +lightning-utilities==0.14.3 + # via + # pytorch-lightning + # torchmetrics +llvmlite==0.44.0 + # via + # numba + # pynndescent +markupsafe==3.0.2 + # via jinja2 +matplotlib==3.10.3 + # via + # scanpy + # seaborn +more-itertools==10.7.0 + # via tiledbsoma +mpmath==1.3.0 + # via sympy +multidict==6.6.0 + # via + # aiobotocore + # aiohttp + # yarl +natsort==8.4.0 + # via + # anndata + # scanpy +networkx==3.5 + # via + # scanpy + # torch +numba==0.61.2 + # via + # pynndescent + # scanpy + # umap-learn +numpy==2.2.6 + # via + # anndata + # cellxgene-census + # contourpy + # h5py + # matplotlib + # numba + # pandas + # patsy + # scanpy + # scikit-learn + # scipy + # seaborn + # shapely + # somacore + # statsmodels + # tiledbsoma + # torchmetrics + # transcriptformer + # umap-learn +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.575.51 + # via pynvml +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +omegaconf==2.3.0 + # via hydra-core +packaging==25.0 + # via + # anndata + # hydra-core + # lightning-utilities + # matplotlib + # 
pytest + # pytorch-lightning + # scanpy + # statsmodels + # torchmetrics +pandas==2.3.0 + # via + # anndata + # scanpy + # seaborn + # somacore + # statsmodels + # tiledbsoma + # transcriptformer +patsy==1.0.1 + # via + # scanpy + # statsmodels +pillow==11.2.1 + # via matplotlib +pluggy==1.6.0 + # via pytest +propcache==0.3.2 + # via + # aiohttp + # yarl +psutil==7.0.0 + # via transcriptformer +pyarrow==20.0.0 + # via + # somacore + # tiledbsoma +pyarrow-hotfix==0.7 + # via somacore +pygments==2.19.2 + # via pytest +pynndescent==0.5.13 + # via + # scanpy + # umap-learn +pynvml==12.0.0 + # via transcriptformer +pyparsing==3.2.3 + # via matplotlib +pytest==8.4.1 + # via transcriptformer +python-dateutil==2.9.0.post0 + # via + # aiobotocore + # botocore + # matplotlib + # pandas +pytorch-lightning==2.5.2 + # via transcriptformer +pytz==2025.2 + # via pandas +pyyaml==6.0.2 + # via + # omegaconf + # pytorch-lightning +requests==2.32.4 + # via cellxgene-census +s3fs==2025.5.1 + # via cellxgene-census +s3transfer==0.13.0 + # via boto3 +scanpy==1.11.2 + # via + # tiledbsoma + # transcriptformer +scikit-learn==1.7.0 + # via + # pynndescent + # scanpy + # umap-learn +scipy==1.16.0 + # via + # anndata + # pynndescent + # scanpy + # scikit-learn + # somacore + # statsmodels + # tiledbsoma + # transcriptformer + # umap-learn +seaborn==0.13.2 + # via scanpy +session-info2==0.1.2 + # via scanpy +setuptools==80.9.0 + # via lightning-utilities +shapely==2.1.1 + # via somacore +six==1.17.0 + # via python-dateutil +somacore==1.0.28 + # via tiledbsoma +statsmodels==0.14.4 + # via scanpy +sympy==1.13.1 + # via torch +threadpoolctl==3.6.0 + # via scikit-learn +tiledbsoma==1.17.0 + # via cellxgene-census +timeout-decorator==0.5.0 + # via transcriptformer +torch==2.5.1 + # via + # pytorch-lightning + # torchmetrics + # transcriptformer +torchmetrics==1.7.3 + # via pytorch-lightning +tqdm==4.67.1 + # via + # pytorch-lightning + # scanpy + # umap-learn +transcriptformer==0.3.0 + # via -r requirements.in +triton==3.1.0 + # via torch +typing-extensions==4.14.0 + # via + # cellxgene-census + # lightning-utilities + # pytorch-lightning + # scanpy + # somacore + # tiledbsoma + # torch +tzdata==2025.2 + # via pandas +umap-learn==0.5.7 + # via scanpy +urllib3==2.5.0 + # via + # botocore + # requests +wrapt==1.17.2 + # via aiobotocore +yarl==1.20.1 + # via aiohttp diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index f2cd1e40..76b41023 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -1,9 +1,11 @@ import anndata as ad import sys import mlflow.pyfunc -from tempfile import NamedTemporaryFile +import tempfile import os import pandas as pd +import zipfile +import tarfile ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -34,19 +36,45 @@ print(adata, flush=True) -print("\n>>> Writing temporary H5AD file...", flush=True) +if os.path.isdir(par["model"]): + print("\n>>> Using model directory...", flush=True) + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("\n>>> Extracting model from .zip...", flush=True) + print(f".zip path: '{par['model']}'", flush=True) + with zipfile.ZipFile(par["model"], "r") as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith( + ".tar.gz" + ): + print("\n>>> Extracting model from .tar.gz...", flush=True) + print(f".tar.gz path: '{par['model']}'", flush=True) + with tarfile.open(par["model"], "r:gz") as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError( + "The 'model' argument should be a directory a .zip file or a .tar.gz file" + ) + +print("\n>>> Loading model...", flush=True) +model = mlflow.pyfunc.load_model(model_dir) +print(model, flush=True) + +print("\n>>> Writing temporary input H5AD file...", flush=True) input_adata = ad.AnnData(X = adata.X.copy(), var = adata.var.filter(items=["feature_id"]).rename(columns = {"feature_id": "ensembl_id"})) input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing print(input_adata, flush=True) -h5ad_file = NamedTemporaryFile(suffix=".h5ad", delete=False) +h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) input_adata.write(h5ad_file.name) del input_adata -print("\n>>> Loading model...", flush=True) -model = mlflow.pyfunc.load_model(par["model"]) -print(model, flush=True) - print("\n>>> Running model...", flush=True) input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) embedding = model.predict(input_df) @@ -71,6 +99,8 @@ output.write_h5ad(par["output"], compression="gzip") print("\n>>> Cleaning up temporary files...", flush=True) +if model_temp is not None: + model_temp.cleanup() h5ad_file.close() os.unlink(h5ad_file.name) From b84a40aec116be12dd1b4069cd617ea8ac1485b4 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 9 Jul 2025 10:16:46 +0200 Subject: [PATCH 06/21] Add transcriptformer_mlflow to benchmark workflow --- scripts/run_benchmark/run_full_local.sh | 2 +- scripts/run_benchmark/run_test_local.sh | 2 +- src/workflows/run_benchmark/config.vsh.yaml | 1 + src/workflows/run_benchmark/main.nf | 3 +++ 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 20e434b3..b60940c9 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -26,7 +26,7 @@ input_states: resources/datasets/**/state.yaml rename_keys: 'input_dataset:output_dataset;input_solution:output_solution' output_state: "state.yaml" publish_dir: "$publish_dir" -settings: '{"methods_exclude": ["uce", "scgpt_finetuned"]}' +settings: '{"methods_exclude": ["uce", "scgpt_finetuned", "transcriptformer_mlflow"]}' HERE # run the benchmark diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh index 85e39583..4b7bf15e 100755 --- a/scripts/run_benchmark/run_test_local.sh +++ b/scripts/run_benchmark/run_test_local.sh @@ -21,7 +21,7 @@ input_states: resources_test/task_batch_integration/**/state.yaml rename_keys: 
'input_dataset:output_dataset;input_solution:output_solution' output_state: "state.yaml" publish_dir: "$publish_dir" -settings: '{"methods_exclude": ["uce", "scgpt_finetuned"]}' +settings: '{"methods_exclude": ["uce", "scgpt_finetuned", "transcriptformer_mlflow"]}' HERE nextflow run . \ diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 09905ad0..d9fe9504 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -106,6 +106,7 @@ dependencies: - name: methods/scimilarity - name: methods/scprint - name: methods/scvi + - name: methods/transcriptformer_mlflow - name: methods/uce # metrics - name: metrics/asw_batch diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 6196f749..104485bd 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -40,6 +40,9 @@ methods = [ ), scprint, scvi, + transcriptformer_mlflow.run( + args: [model: file("s3://openproblems-work/cache/transcriptformer-mlflow-model.zip")] + ), uce.run( args: [model: file("s3://openproblems-work/cache/uce-model-v5.zip")] ) From 74be8558a5d2d31701acf338c908615f03d54d8e Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 9 Jul 2025 11:38:05 +0200 Subject: [PATCH 07/21] Install openproblems package for transcriptformer --- src/methods/transcriptformer_mlflow/config.vsh.yaml | 2 ++ src/methods/transcriptformer_mlflow/script.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index ba708fa1..9cb3544f 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -57,6 +57,8 @@ engines: run: uv pip install -r /requirements.txt - type: docker run: uv pip install mlflow==3.1.0 + - type: docker + run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems runners: - type: executable diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index 76b41023..c5c37575 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -38,6 +38,7 @@ if os.path.isdir(par["model"]): print("\n>>> Using model directory...", flush=True) + print(f"Directory path: '{par['model']}'", flush=True) model_temp = None model_dir = par["model"] else: From 8693f35176d8245b61ae18c76c1fc5b8181ee51a Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 9 Jul 2025 12:16:12 +0200 Subject: [PATCH 08/21] Style transcriptformer_mlflow script --- src/methods/transcriptformer_mlflow/script.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index c5c37575..b16806d3 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -1,11 +1,12 @@ -import anndata as ad +import os import sys -import mlflow.pyfunc +import tarfile import tempfile -import os -import pandas as pd import zipfile -import tarfile + +import anndata as ad +import mlflow.pyfunc +import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -19,10 +20,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata from exit_codes import exit_non_applicable +from read_anndata_partial import read_anndata -print(f"====== TranscriptFormer (MLflow model) ======", flush=True) +print("====== TranscriptFormer (MLflow model) ======", flush=True) print("\n>>> Reading input files...", flush=True) print(f"Input H5AD file: '{par['input']}'", flush=True) @@ -31,7 +32,7 @@ if adata.uns["dataset_organism"] != "homo_sapiens": exit_non_applicable( f"Transcriptformer can only be used with human data " - f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + f'(dataset_organism == "{adata.uns["dataset_organism"]}")' ) print(adata, flush=True) @@ -50,9 +51,7 @@ print(f".zip path: '{par['model']}'", flush=True) with zipfile.ZipFile(par["model"], "r") as zip_file: zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith( - ".tar.gz" - ): + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): print("\n>>> Extracting model from .tar.gz...", flush=True) print(f".tar.gz path: '{par['model']}'", flush=True) with tarfile.open(par["model"], "r:gz") as tar_file: @@ -68,8 +67,13 @@ print(model, flush=True) print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData(X = adata.X.copy(), var = adata.var.filter(items=["feature_id"]).rename(columns = {"feature_id": "ensembl_id"})) -input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing +input_adata = ad.AnnData( + X=adata.X.copy(), + var=adata.var.filter(items=["feature_id"]).rename( + columns={"feature_id": "ensembl_id"} + ), +) +input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing print(input_adata, flush=True) h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) From b7dae0b4b7b98a17a3966ded2ffaaa8335379a20 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 29 Jul 2025 10:45:10 +0200 Subject: [PATCH 09/21] Increase transcriptformer memory label --- src/methods/transcriptformer_mlflow/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index 9cb3544f..2d144c23 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -64,4 +64,4 @@ runners: - type: executable - type: nextflow directives: - label: [hightime, midmem, midcpu, gpu] + label: [hightime, highmem, midcpu, gpu] From 5d3c6e0f213f5e2e025173fe19cf1a0354857a74 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 13 Aug 2025 10:59:28 +0200 Subject: [PATCH 10/21] Add scvi_mlflow method --- src/methods/scvi_mlflow/config.vsh.yaml | 61 +++ src/methods/scvi_mlflow/requirements.txt | 459 ++++++++++++++++++++ src/methods/scvi_mlflow/script.py | 114 +++++ src/workflows/run_benchmark/config.vsh.yaml | 1 + src/workflows/run_benchmark/main.nf | 3 + 5 files changed, 638 insertions(+) create mode 100644 src/methods/scvi_mlflow/config.vsh.yaml create mode 100644 src/methods/scvi_mlflow/requirements.txt create mode 100644 src/methods/scvi_mlflow/script.py diff --git a/src/methods/scvi_mlflow/config.vsh.yaml b/src/methods/scvi_mlflow/config.vsh.yaml new file mode 100644 index 00000000..d50a6e62 --- /dev/null +++ b/src/methods/scvi_mlflow/config.vsh.yaml @@ -0,0 +1,61 @@ +__merge__: 
../../api/base_method.yaml + +name: scvi_mlflow +label: scVI (MLflow model) +summary: scVI combines a variational autoencoder with a hierarchical Bayesian model (MLflow model) +description: | + scVI combines a variational autoencoder with a hierarchical Bayesian model. + It uses the negative binomial distribution to describe gene expression of + each cell, conditioned on unobserved factors and the batch variable. + + This version uses a pre-trained MLflow model. +references: + doi: + - 10.1038/s41592-018-0229-2 +links: + repository: https://github.com/scverse/scvi-tools + documentation: https://docs.scvi-tools.org/en/stable/user_guide/models/scvi.html + +info: + method_types: [embedding] + preferred_normalization: counts + +arguments: + - name: --model + type: file + description: | + An MLflow model URI for the scVI model. If it is a .zip or + .tar.gz file it will be extracted to a temporary directory. + required: true + +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + - path: /src/utils/exit_codes.py + - path: requirements.txt + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1 + setup: + - type: docker + add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh + run: sh /uv-installer.sh && rm /uv-installer.sh + env: PATH="/root/.local/bin/:$PATH" + - type: docker + run: uv venv --python 3.11 /opt/venv + - type: docker + env: + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + add: requirements.txt /requirements.txt + run: uv pip install -r /requirements.txt && uv pip install mlflow==3.1.0 + - type: docker + run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, highmem, midcpu, gpu] diff --git a/src/methods/scvi_mlflow/requirements.txt b/src/methods/scvi_mlflow/requirements.txt new file mode 100644 index 00000000..c3c79df5 --- /dev/null +++ b/src/methods/scvi_mlflow/requirements.txt @@ -0,0 +1,459 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o /tmp/tmp6b02zuzi/requirements_initial.txt +absl-py==2.3.1 + # via + # chex + # ml-collections + # optax + # orbax-checkpoint +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.15 + # via fsspec +aiosignal==1.4.0 + # via aiohttp +alembic==1.16.4 + # via mlflow +anndata==0.10.8 + # via + # -r requirements.in + # mudata + # scvi-tools +annotated-types==0.7.0 + # via pydantic +anyio==4.10.0 + # via starlette +array-api-compat==1.12.0 + # via anndata +attrs==25.3.0 + # via aiohttp +blinker==1.9.0 + # via flask +cachetools==5.5.2 + # via + # google-auth + # mlflow-skinny +certifi==2025.8.3 + # via requests +charset-normalizer==3.4.3 + # via requests +chex==0.1.90 + # via optax +click==8.2.1 + # via + # flask + # mlflow-skinny + # uvicorn +cloudpickle==3.1.1 + # via mlflow-skinny +contourpy==1.3.3 + # via matplotlib +cycler==0.12.1 + # via matplotlib +databricks-sdk==0.62.0 + # via mlflow-skinny +docker==7.1.0 + # via mlflow +docrep==0.3.2 + # via scvi-tools +etils==1.13.0 + # via orbax-checkpoint +fastapi==0.116.1 + # via mlflow-skinny +filelock==3.18.0 + # via + # torch + # triton +flask==3.1.1 + # via mlflow +flax==0.10.4 + # via scvi-tools +fonttools==4.59.0 + # via matplotlib +frozenlist==1.7.0 + # via + # aiohttp + # aiosignal +fsspec==2025.7.0 + # via + # etils + # lightning + # pytorch-lightning + # torch +gitdb==4.0.12 + # via gitpython +gitpython==3.1.45 + # via 
mlflow-skinny +google-auth==2.40.3 + # via databricks-sdk +graphene==3.4.3 + # via mlflow +graphql-core==3.2.6 + # via + # graphene + # graphql-relay +graphql-relay==3.2.0 + # via graphene +greenlet==3.2.4 + # via sqlalchemy +gunicorn==23.0.0 + # via mlflow +h11==0.16.0 + # via uvicorn +h5py==3.14.0 + # via + # anndata + # scvi-tools +humanize==4.12.3 + # via orbax-checkpoint +idna==3.10 + # via + # anyio + # requests + # yarl +importlib-metadata==8.7.0 + # via + # mlflow-skinny + # opentelemetry-api +importlib-resources==6.5.2 + # via etils +itsdangerous==2.2.0 + # via flask +jax==0.4.33 + # via + # -r requirements.in + # chex + # flax + # numpyro + # optax + # orbax-checkpoint + # scvi-tools +jaxlib==0.4.33 + # via + # -r requirements.in + # chex + # jax + # numpyro + # optax + # orbax-checkpoint + # scvi-tools +jinja2==3.1.6 + # via + # flask + # torch +joblib==1.5.1 + # via scikit-learn +kiwisolver==1.4.9 + # via matplotlib +lightning==2.5.2 + # via scvi-tools +lightning-utilities==0.15.2 + # via + # lightning + # pytorch-lightning + # torchmetrics +mako==1.3.10 + # via alembic +markdown-it-py==4.0.0 + # via rich +markupsafe==3.0.2 + # via + # flask + # jinja2 + # mako + # werkzeug +matplotlib==3.10.5 + # via mlflow +mdurl==0.1.2 + # via markdown-it-py +ml-collections==1.1.0 + # via scvi-tools +ml-dtypes==0.5.3 + # via + # jax + # jaxlib + # tensorstore +mlflow==3.1.0 + # via -r requirements.in +mlflow-skinny==3.1.0 + # via mlflow +mpmath==1.3.0 + # via sympy +msgpack==1.1.1 + # via + # flax + # orbax-checkpoint +mudata==0.3.2 + # via scvi-tools +multidict==6.6.4 + # via + # aiohttp + # yarl +multipledispatch==1.0.0 + # via numpyro +natsort==8.4.0 + # via anndata +nest-asyncio==1.6.0 + # via orbax-checkpoint +networkx==3.5 + # via torch +numpy==1.26.4 + # via + # anndata + # chex + # contourpy + # flax + # h5py + # jax + # jaxlib + # matplotlib + # ml-dtypes + # mlflow + # numpyro + # optax + # orbax-checkpoint + # pandas + # pyro-ppl + # scikit-learn + # scipy + # scvi-tools + # tensorstore + # torchmetrics + # treescope +numpyro==0.19.0 + # via scvi-tools +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opentelemetry-api==1.36.0 + # via + # mlflow-skinny + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-sdk==1.36.0 + # via mlflow-skinny +opentelemetry-semantic-conventions==0.57b0 + # via opentelemetry-sdk +opt-einsum==3.4.0 + # via + # jax + # pyro-ppl +optax==0.2.5 + # via + # flax + # scvi-tools +orbax-checkpoint==0.6.4 + # via flax +packaging==25.0 + # via + # anndata + # gunicorn + # lightning + # lightning-utilities + # matplotlib + # mlflow-skinny + # pytorch-lightning + # torchmetrics +pandas==2.2.3 + # via + # -r requirements.in + # anndata + # mlflow + # scvi-tools +pillow==11.3.0 + # via matplotlib +propcache==0.3.2 + # via + # aiohttp + # yarl +protobuf==6.31.1 + # via + # mlflow-skinny + # orbax-checkpoint +pyarrow==20.0.0 + # 
via mlflow +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +pydantic==2.11.7 + # via + # fastapi + # mlflow-skinny +pydantic-core==2.33.2 + # via pydantic +pygments==2.19.2 + # via rich +pyparsing==3.2.3 + # via matplotlib +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.9.1 + # via scvi-tools +python-dateutil==2.9.0.post0 + # via + # graphene + # matplotlib + # pandas +pytorch-lightning==2.5.2 + # via lightning +pytz==2025.2 + # via pandas +pyyaml==6.0.2 + # via + # flax + # lightning + # ml-collections + # mlflow-skinny + # orbax-checkpoint + # pytorch-lightning +requests==2.32.4 + # via + # databricks-sdk + # docker + # mlflow-skinny +rich==14.1.0 + # via + # flax + # scvi-tools +rsa==4.9.1 + # via google-auth +scikit-learn==1.7.1 + # via + # mlflow + # scvi-tools +scipy==1.16.1 + # via + # anndata + # jax + # jaxlib + # mlflow + # scikit-learn + # scvi-tools +scvi-tools==1.1.6.post2 + # via -r requirements.in +setuptools==80.9.0 + # via lightning-utilities +six==1.17.0 + # via + # docrep + # python-dateutil +smmap==5.0.2 + # via gitdb +sniffio==1.3.1 + # via anyio +sqlalchemy==2.0.43 + # via + # alembic + # mlflow +sqlparse==0.5.3 + # via mlflow-skinny +starlette==0.47.2 + # via fastapi +sympy==1.13.1 + # via torch +tensorstore==0.1.76 + # via + # flax + # orbax-checkpoint +threadpoolctl==3.6.0 + # via scikit-learn +toolz==1.0.0 + # via chex +torch==2.5.1 + # via + # -r requirements.in + # lightning + # pyro-ppl + # pytorch-lightning + # scvi-tools + # torchmetrics +torchmetrics==1.8.1 + # via + # lightning + # pytorch-lightning + # scvi-tools +tqdm==4.67.1 + # via + # lightning + # numpyro + # pyro-ppl + # pytorch-lightning + # scvi-tools +treescope==0.1.10 + # via flax +triton==3.1.0 + # via torch +typing-extensions==4.14.1 + # via + # aiosignal + # alembic + # anyio + # chex + # etils + # fastapi + # flax + # graphene + # lightning + # lightning-utilities + # mlflow-skinny + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # orbax-checkpoint + # pydantic + # pydantic-core + # pytorch-lightning + # sqlalchemy + # starlette + # torch + # typing-inspection +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via pandas +urllib3==2.5.0 + # via + # docker + # requests +uvicorn==0.35.0 + # via mlflow-skinny +werkzeug==3.1.3 + # via flask +yarl==1.20.1 + # via aiohttp +zipp==3.23.0 + # via + # etils + # importlib-metadata diff --git a/src/methods/scvi_mlflow/script.py b/src/methods/scvi_mlflow/script.py new file mode 100644 index 00000000..04ff94d5 --- /dev/null +++ b/src/methods/scvi_mlflow/script.py @@ -0,0 +1,114 @@ +import os +import sys +import tarfile +import tempfile +import zipfile + +import anndata as ad +import mlflow.pyfunc +import pandas as pd + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
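+#
+# A minimal sketch of the MLflow pyfunc contract this script relies on.
+# mlflow.pyfunc.load_model() and Model.predict(..., params=...) are standard
+# MLflow APIs; the "input_uri" column and the params keys are conventions of
+# this particular wrapper (they mirror the calls further down in this script),
+# and the paths are hypothetical:
+#
+#   import mlflow.pyfunc
+#   import pandas as pd
+#
+#   model = mlflow.pyfunc.load_model("path/to/model")
+#   embedding = model.predict(
+#       pd.DataFrame({"input_uri": ["input.h5ad"]}),
+#       params={"organism": "human", "return_dist": True, "batch_keys": "batch"},
+#   )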
+par = { + "input": "resources_test/.../input.h5ad", + "output": "output.h5ad", + "model": "resources_test/.../model", +} +meta = {"name": "scvi_mlflow"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from exit_codes import exit_non_applicable +from read_anndata_partial import read_anndata + +print("====== scVI (MLflow model) ======", flush=True) + +print("\n>>> Reading input files...", flush=True) +print(f"Input H5AD file: '{par['input']}'", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] == "homo_sapiens": + organism = "human" +elif adata.uns["dataset_organism"] == "mus_musculus": + organism = "mouse" +else: + exit_non_applicable( + f"scVI (MLflow) can only be used with human or mouse data " + f'(dataset_organism == "{adata.uns["dataset_organism"]}")' + ) + +print(adata, flush=True) + +if os.path.isdir(par["model"]): + print("\n>>> Using model directory...", flush=True) + print(f"Directory path: '{par['model']}'", flush=True) + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("\n>>> Extracting model from .zip...", flush=True) + print(f".zip path: '{par['model']}'", flush=True) + with zipfile.ZipFile(par["model"], "r") as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): + print("\n>>> Extracting model from .tar.gz...", flush=True) + print(f".tar.gz path: '{par['model']}'", flush=True) + with tarfile.open(par["model"], "r:gz") as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError( + "The 'model' argument should be a directory a .zip file or a .tar.gz file" + ) + +print("\n>>> Loading model...", flush=True) +model = mlflow.pyfunc.load_model(model_dir) +print(model, flush=True) + +print("\n>>> Writing temporary input H5AD file...", flush=True) +input_adata = ad.AnnData(X=adata.X.copy()) +input_adata.var_names = adata.var["feature_id"].values +input_adata.obs["batch"] = adata.obs["batch"].values +print(input_adata, flush=True) + +h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) +print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) +input_adata.write(h5ad_file.name) +del input_adata + +print("\n>>> Running model...", flush=True) +input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) +input_params = {"organism": organism, "return_dist": True, "batch_keys": "batch"} +embedding = model.predict(input_df, params=input_params) + +print("\n>>> Storing output...", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": embedding, + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("\n>>> Writing output to file...", flush=True) +print(f"Output H5AD file: '{par['output']}'", flush=True) +output.write_h5ad(par["output"], compression="gzip") + +print("\n>>> Cleaning up temporary files...", flush=True) +if model_temp is not None: + model_temp.cleanup() +h5ad_file.close() +os.unlink(h5ad_file.name) + +print("\n>>> Done!", flush=True) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index d9fe9504..af6f51b7 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ 
b/src/workflows/run_benchmark/config.vsh.yaml @@ -106,6 +106,7 @@ dependencies: - name: methods/scimilarity - name: methods/scprint - name: methods/scvi + - name: methods/scvi_mlflow - name: methods/transcriptformer_mlflow - name: methods/uce # metrics diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 104485bd..2db049f6 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -40,6 +40,9 @@ methods = [ ), scprint, scvi, + scvi_mlflow.run( + args: [model: file("s3://openproblems-work/cache/scvi-mlflow-model.zip")] + ), transcriptformer_mlflow.run( args: [model: file("s3://openproblems-work/cache/transcriptformer-mlflow-model.zip")] ), From 0243f8d042082969c4a7571a95ee2b401d5059ce Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 13 Aug 2025 11:32:57 +0200 Subject: [PATCH 11/21] Add geneformer_mlflow method --- src/methods/geneformer_mlflow/config.vsh.yaml | 65 +++ .../geneformer_mlflow/requirements.txt | 540 ++++++++++++++++++ src/methods/geneformer_mlflow/script.py | 112 ++++ src/workflows/run_benchmark/config.vsh.yaml | 1 + src/workflows/run_benchmark/main.nf | 5 +- 5 files changed, 722 insertions(+), 1 deletion(-) create mode 100644 src/methods/geneformer_mlflow/config.vsh.yaml create mode 100644 src/methods/geneformer_mlflow/requirements.txt create mode 100644 src/methods/geneformer_mlflow/script.py diff --git a/src/methods/geneformer_mlflow/config.vsh.yaml b/src/methods/geneformer_mlflow/config.vsh.yaml new file mode 100644 index 00000000..b9d08eda --- /dev/null +++ b/src/methods/geneformer_mlflow/config.vsh.yaml @@ -0,0 +1,65 @@ +__merge__: ../../api/base_method.yaml + +name: geneformer_mlflow +label: Geneformer (MLflow model) +summary: Geneformer is a foundation transformer model pretrained on a large-scale corpus of single cell transcriptomes +description: | + Geneformer is a foundation transformer model pretrained on a large-scale + corpus of single cell transcriptomes to enable context-aware predictions in + network biology. For this task, Geneformer is used to create a batch-corrected + cell embedding. + + Here, we use a version packaged as an MLflow model. +references: + doi: + - 10.1038/s41586-023-06139-9 + - 10.1101/2024.08.16.608180 +links: + documentation: https://geneformer.readthedocs.io/en/latest/index.html + repository: https://huggingface.co/ctheodoris/Geneformer + +info: + method_types: [embedding] + preferred_normalization: counts + +arguments: + - name: --model + type: file + description: | + An MLflow model URI for the Geneformer model. If it is a .zip or + .tar.gz file it will be extracted to a temporary directory. 
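+    # For example, the benchmark workflow passes a cached archive for this
+    # argument: s3://openproblems-work/cache/geneformer-mlflow-model.zip
+    # (see src/workflows/run_benchmark/main.nf).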
+ required: true + +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + - path: /src/utils/exit_codes.py + - path: requirements.txt + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1 + setup: + - type: docker + add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh + run: sh /uv-installer.sh && rm /uv-installer.sh + env: PATH="/root/.local/bin/:$PATH" + - type: docker + run: uv venv --python 3.11 /opt/venv + - type: docker + env: + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + add: requirements.txt /requirements.txt + run: uv pip install -r /requirements.txt + - type: docker + run: uv pip install mlflow==3.1.0 + - type: docker + run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, highmem, midcpu, gpu] diff --git a/src/methods/geneformer_mlflow/requirements.txt b/src/methods/geneformer_mlflow/requirements.txt new file mode 100644 index 00000000..21bec26b --- /dev/null +++ b/src/methods/geneformer_mlflow/requirements.txt @@ -0,0 +1,540 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --output-file=/tmp/tmpmz65ifid/requirements_pip_final.txt requirements.in +# +absl-py==2.3.1 + # via tensorboard +accelerate==1.10.0 + # via peft +accumulation-tree==0.6.4 + # via tdigest +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.15 + # via fsspec +aiosignal==1.4.0 + # via aiohttp +alembic==1.16.4 + # via + # mlflow + # optuna +anndata==0.10.9 + # via + # -r requirements.in + # geneformer + # scanpy +annotated-types==0.7.0 + # via pydantic +antlr4-python3-runtime==4.9.3 + # via omegaconf +anyio==4.10.0 + # via starlette +array-api-compat==1.12.0 + # via anndata +attrs==25.3.0 + # via + # aiohttp + # jsonschema + # referencing +blinker==1.9.0 + # via flask +cachetools==5.5.2 + # via + # google-auth + # mlflow-skinny +certifi==2025.8.3 + # via requests +charset-normalizer==3.4.3 + # via requests +click==8.2.1 + # via + # flask + # loompy + # mlflow-skinny + # ray + # uvicorn +cloudpickle==3.1.1 + # via mlflow-skinny +colorlog==6.9.0 + # via optuna +contourpy==1.3.3 + # via matplotlib +cycler==0.12.1 + # via matplotlib +databricks-sdk==0.62.0 + # via mlflow-skinny +datasets==4.0.0 + # via geneformer +dill==0.3.8 + # via + # datasets + # multiprocess +docker==7.1.0 + # via mlflow +fastapi==0.116.1 + # via mlflow-skinny +filelock==3.18.0 + # via + # datasets + # huggingface-hub + # ray + # torch + # transformers +flask==3.1.1 + # via mlflow +fonttools==4.59.0 + # via matplotlib +frozenlist==1.7.0 + # via + # aiohttp + # aiosignal +fsspec[http]==2025.3.0 + # via + # datasets + # huggingface-hub + # torch +geneformer @ git+https://huggingface.co/ctheodoris/Geneformer@69e6887 + # via -r requirements.in +gitdb==4.0.12 + # via gitpython +gitpython==3.1.45 + # via mlflow-skinny +google-auth==2.40.3 + # via databricks-sdk +graphene==3.4.3 + # via mlflow +graphql-core==3.2.6 + # via + # graphene + # graphql-relay +graphql-relay==3.2.0 + # via graphene +greenlet==3.2.4 + # via sqlalchemy +grpcio==1.74.0 + # via tensorboard +gunicorn==23.0.0 + # via mlflow +h11==0.16.0 + # via uvicorn +h5py==3.14.0 + # via + # anndata + # loompy + # scanpy +hf-xet==1.1.7 + # via huggingface-hub +huggingface-hub==0.34.4 + # via + # accelerate + # datasets + # peft + # tokenizers + # transformers +idna==3.10 + # via + # anyio + # requests + # 
yarl +importlib-metadata==8.7.0 + # via + # mlflow-skinny + # opentelemetry-api +itsdangerous==2.2.0 + # via flask +jinja2==3.1.6 + # via + # flask + # torch +joblib==1.5.1 + # via + # pynndescent + # scanpy + # scikit-learn +jsonschema==4.25.0 + # via ray +jsonschema-specifications==2025.4.1 + # via jsonschema +kiwisolver==1.4.9 + # via matplotlib +legacy-api-wrap==1.4.1 + # via scanpy +llvmlite==0.44.0 + # via + # numba + # pynndescent +loompy==3.0.8 + # via geneformer +mako==1.3.10 + # via alembic +markdown==3.8.2 + # via tensorboard +markupsafe==3.0.2 + # via + # flask + # jinja2 + # mako + # werkzeug +matplotlib==3.10.5 + # via + # geneformer + # mlflow + # scanpy + # seaborn +mlflow==3.1.0 + # via -r requirements.in +mlflow-skinny==3.1.0 + # via mlflow +mpmath==1.3.0 + # via sympy +msgpack==1.1.1 + # via ray +multidict==6.6.4 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via datasets +natsort==8.4.0 + # via + # anndata + # scanpy +networkx==3.5 + # via + # scanpy + # torch +numba==0.61.2 + # via + # loompy + # pynndescent + # scanpy + # umap-learn +numpy==2.2.6 + # via + # accelerate + # anndata + # contourpy + # datasets + # geneformer + # h5py + # loompy + # matplotlib + # mlflow + # numba + # numpy-groupies + # optuna + # pandas + # patsy + # peft + # scanpy + # scikit-learn + # scipy + # seaborn + # statsmodels + # tensorboard + # transformers + # umap-learn +numpy-groupies==0.11.3 + # via loompy +nvidia-cublas-cu12==12.8.4.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.8.90 + # via torch +nvidia-cuda-nvrtc-cu12==12.8.93 + # via torch +nvidia-cuda-runtime-cu12==12.8.90 + # via torch +nvidia-cudnn-cu12==9.10.2.21 + # via torch +nvidia-cufft-cu12==11.3.3.83 + # via torch +nvidia-cufile-cu12==1.13.1.3 + # via torch +nvidia-curand-cu12==10.3.9.90 + # via torch +nvidia-cusolver-cu12==11.7.3.90 + # via torch +nvidia-cusparse-cu12==12.5.8.93 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.7.1 + # via torch +nvidia-nccl-cu12==2.27.3 + # via torch +nvidia-nvjitlink-cu12==12.8.93 + # via + # nvidia-cufft-cu12 + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.8.90 + # via torch +omegaconf==2.3.0 + # via -r requirements.in +opentelemetry-api==1.36.0 + # via + # mlflow-skinny + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-sdk==1.36.0 + # via mlflow-skinny +opentelemetry-semantic-conventions==0.57b0 + # via opentelemetry-sdk +optuna==4.4.0 + # via + # geneformer + # optuna-integration +optuna-integration==4.4.0 + # via geneformer +packaging==25.0 + # via + # accelerate + # anndata + # datasets + # geneformer + # gunicorn + # huggingface-hub + # matplotlib + # mlflow-skinny + # optuna + # peft + # ray + # scanpy + # statsmodels + # tensorboard + # transformers +pandas==2.3.1 + # via + # anndata + # datasets + # geneformer + # mlflow + # scanpy + # seaborn + # statsmodels +patsy==1.0.1 + # via + # scanpy + # statsmodels +peft==0.17.0 + # via geneformer +pillow==11.3.0 + # via + # matplotlib + # tensorboard +propcache==0.3.2 + # via + # aiohttp + # yarl +protobuf==6.31.1 + # via + # mlflow-skinny + # ray + # tensorboard +psutil==7.0.0 + # via + # accelerate + # peft +pyarrow==20.0.0 + # via + # datasets + # geneformer + # mlflow +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +pydantic==2.11.7 + # via + # fastapi + # mlflow-skinny +pydantic-core==2.33.2 + # via pydantic +pynndescent==0.5.13 + # via + # scanpy + # 
umap-learn +pyparsing==3.2.3 + # via matplotlib +python-dateutil==2.9.0.post0 + # via + # graphene + # matplotlib + # pandas +pytz==2025.2 + # via + # geneformer + # pandas +pyudorandom==1.0.0 + # via tdigest +pyyaml==6.0.2 + # via + # accelerate + # datasets + # huggingface-hub + # mlflow-skinny + # omegaconf + # optuna + # peft + # ray + # transformers +ray==2.48.0 + # via geneformer +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications +regex==2025.7.34 + # via transformers +requests==2.32.4 + # via + # databricks-sdk + # datasets + # docker + # huggingface-hub + # mlflow-skinny + # ray + # transformers +rpds-py==0.27.0 + # via + # jsonschema + # referencing +rsa==4.9.1 + # via google-auth +safetensors==0.6.2 + # via + # accelerate + # peft + # transformers +scanpy==1.11.4 + # via geneformer +scikit-learn==1.7.1 + # via + # geneformer + # mlflow + # pynndescent + # scanpy + # umap-learn +scipy==1.16.1 + # via + # anndata + # geneformer + # loompy + # mlflow + # pynndescent + # scanpy + # scikit-learn + # statsmodels + # umap-learn +seaborn==0.13.2 + # via + # geneformer + # scanpy +session-info2==0.2 + # via scanpy +six==1.17.0 + # via python-dateutil +smmap==5.0.2 + # via gitdb +sniffio==1.3.1 + # via anyio +sqlalchemy==2.0.43 + # via + # alembic + # mlflow + # optuna +sqlparse==0.5.3 + # via mlflow-skinny +starlette==0.47.2 + # via fastapi +statsmodels==0.14.5 + # via + # geneformer + # scanpy +sympy==1.14.0 + # via torch +tdigest==0.5.2.2 + # via geneformer +tensorboard==2.20.0 + # via geneformer +tensorboard-data-server==0.7.2 + # via tensorboard +threadpoolctl==3.6.0 + # via scikit-learn +tokenizers==0.21.4 + # via transformers +torch==2.8.0 + # via + # accelerate + # geneformer + # peft +tqdm==4.67.1 + # via + # datasets + # geneformer + # huggingface-hub + # optuna + # peft + # scanpy + # transformers + # umap-learn +transformers==4.49.0 + # via + # -r requirements.in + # geneformer + # peft +triton==3.4.0 + # via torch +typing-extensions==4.14.1 + # via + # aiosignal + # alembic + # anyio + # fastapi + # graphene + # huggingface-hub + # mlflow-skinny + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # referencing + # scanpy + # sqlalchemy + # starlette + # torch + # typing-inspection +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via pandas +umap-learn==0.5.9.post2 + # via scanpy +urllib3==2.5.0 + # via + # docker + # requests +uvicorn==0.35.0 + # via mlflow-skinny +werkzeug==3.1.3 + # via + # flask + # tensorboard +xxhash==3.5.0 + # via datasets +yarl==1.20.1 + # via aiohttp +zipp==3.23.0 + # via importlib-metadata + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/src/methods/geneformer_mlflow/script.py b/src/methods/geneformer_mlflow/script.py new file mode 100644 index 00000000..800ab80b --- /dev/null +++ b/src/methods/geneformer_mlflow/script.py @@ -0,0 +1,112 @@ +import os +import sys +import tarfile +import tempfile +import zipfile + +import anndata as ad +import mlflow.pyfunc +import pandas as pd + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
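+#
+# A hedged sketch of sanity-checking the embedding this component writes
+# ("output.h5ad" is a hypothetical path; anndata is already a dependency):
+#
+#   import anndata as ad
+#
+#   out = ad.read_h5ad("output.h5ad")
+#   assert out.obsm["X_emb"].shape[0] == out.n_obs  # one embedding row per cell
+#   assert out.uns["method_id"] == "geneformer_mlflow"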
+par = {
+    "input": "resources_test/.../input.h5ad",
+    "output": "output.h5ad",
+    "model": "resources_test/.../model",
+}
+meta = {"name": "geneformer_mlflow"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from exit_codes import exit_non_applicable
+from read_anndata_partial import read_anndata
+
+print("====== Geneformer (MLflow model) ======", flush=True)
+
+print("\n>>> Reading input files...", flush=True)
+print(f"Input H5AD file: '{par['input']}'", flush=True)
+adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns")
+
+if adata.uns["dataset_organism"] != "homo_sapiens":
+    exit_non_applicable(
+        f"Geneformer (MLflow) can only be used with human data "
+        f'(dataset_organism == "{adata.uns["dataset_organism"]}")'
+    )
+
+print(adata, flush=True)
+
+if os.path.isdir(par["model"]):
+    print("\n>>> Using model directory...", flush=True)
+    print(f"Directory path: '{par['model']}'", flush=True)
+    model_temp = None
+    model_dir = par["model"]
+else:
+    model_temp = tempfile.TemporaryDirectory()
+    model_dir = model_temp.name
+
+    if zipfile.is_zipfile(par["model"]):
+        print("\n>>> Extracting model from .zip...", flush=True)
+        print(f".zip path: '{par['model']}'", flush=True)
+        with zipfile.ZipFile(par["model"], "r") as zip_file:
+            zip_file.extractall(model_dir)
+    elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"):
+        print("\n>>> Extracting model from .tar.gz...", flush=True)
+        print(f".tar.gz path: '{par['model']}'", flush=True)
+        with tarfile.open(par["model"], "r:gz") as tar_file:
+            tar_file.extractall(model_dir)
+        model_dir = os.path.join(model_dir, os.listdir(model_dir)[0])
+    else:
+        raise ValueError(
+            "The 'model' argument should be a directory, a .zip file, or a .tar.gz file"
+        )
+
+print("\n>>> Loading model...", flush=True)
+model = mlflow.pyfunc.load_model(model_dir)
+print(model, flush=True)
+
+print("\n>>> Writing temporary input H5AD file...", flush=True)
+input_adata = ad.AnnData(
+    X=adata.X.copy(),
+    var=adata.var.filter(items=["feature_id"]).rename(
+        columns={"feature_id": "ensembl_id"}
+    ),
+)
+print(input_adata, flush=True)
+
+h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
+print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
+input_adata.write(h5ad_file.name)
+del input_adata
+
+print("\n>>> Running model...", flush=True)
+input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
+embedding = model.predict(input_df)
+
+print("\n>>> Storing output...", flush=True)
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    obsm={
+        "X_emb": embedding,
+    },
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "normalization_id": adata.uns["normalization_id"],
+        "method_id": meta["name"],
+    },
+)
+print(output)
+
+print("\n>>> Writing output to file...", flush=True)
+print(f"Output H5AD file: '{par['output']}'", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
+
+print("\n>>> Cleaning up temporary files...", flush=True)
+if model_temp is not None:
+    model_temp.cleanup()
+h5ad_file.close()
+os.unlink(h5ad_file.name)
+
+print("\n>>> Done!", flush=True)
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index af6f51b7..34db6276 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -93,6 +93,7 @@ dependencies:
   - name: methods/bbknn
   - name: methods/combat
   - name: methods/geneformer
+  - name: methods/geneformer_mlflow
   - name: methods/harmony
   - name: methods/harmonypy
   - name: methods/liger
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index 2db049f6..44ff5ed5 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -21,6 +21,9 @@ methods = [
   bbknn,
   combat,
   geneformer,
+  geneformer_mlflow.run(
+    args: [model: file("s3://openproblems-work/cache/geneformer-mlflow-model.zip")]
+  ),
   harmony,
   harmonypy,
   liger,
@@ -61,7 +64,7 @@ metrics = [
   hvg_overlap,
   isolated_label_asw,
   isolated_label_f1,
-  kbet,
+  // kbet,
   kbet_pg,
   kbet_pg_label,
   lisi,

From 04d8a5da3cee43307223ab88b03aa36f27efb497 Mon Sep 17 00:00:00 2001
From: Luke Zappia
Date: Wed, 13 Aug 2025 12:37:20 +0200
Subject: [PATCH 12/21] Add scgpt_mlflow method

---
 src/methods/scgpt_mlflow/config.vsh.yaml    |  62 ++
 src/methods/scgpt_mlflow/requirements.txt   | 684 ++++++++++++++++++++
 src/methods/scgpt_mlflow/script.py          | 111 ++++
 src/workflows/run_benchmark/config.vsh.yaml |   1 +
 src/workflows/run_benchmark/main.nf         |   3 +
 5 files changed, 861 insertions(+)
 create mode 100644 src/methods/scgpt_mlflow/config.vsh.yaml
 create mode 100644 src/methods/scgpt_mlflow/requirements.txt
 create mode 100644 src/methods/scgpt_mlflow/script.py

diff --git a/src/methods/scgpt_mlflow/config.vsh.yaml b/src/methods/scgpt_mlflow/config.vsh.yaml
new file mode 100644
index 00000000..b8455165
--- /dev/null
+++ b/src/methods/scgpt_mlflow/config.vsh.yaml
@@ -0,0 +1,62 @@
+__merge__: ../../api/base_method.yaml
+
+name: scgpt_mlflow
+label: scGPT (MLflow model)
+summary: A foundation model for single-cell biology
+description: |
+  scGPT is a foundation model for single-cell biology based on a generative
+  pre-trained transformer and trained on a repository of over 33 million cells.
+
+  Here, we use a version packaged as an MLflow model.
+references:
+  doi:
+    - 10.1038/s41592-024-02201-0
+links:
+  documentation: https://scgpt.readthedocs.io/en/latest/
+  repository: https://github.com/bowang-lab/scGPT
+
+info:
+  method_types: [embedding]
+  preferred_normalization: counts
+
+arguments:
+  - name: --model
+    type: file
+    description: |
+      An MLflow model URI for the scGPT model. If it is a .zip or
+      .tar.gz file, it will be extracted to a temporary directory.
+ required: true + +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + - path: /src/utils/exit_codes.py + - path: requirements.txt + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1 + setup: + - type: docker + add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh + run: sh /uv-installer.sh && rm /uv-installer.sh + env: PATH="/root/.local/bin/:$PATH" + - type: docker + run: uv venv --python 3.11 /opt/venv + - type: docker + env: + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + add: requirements.txt /requirements.txt + run: uv pip install -r /requirements.txt + - type: docker + run: uv pip install mlflow==3.1.0 + - type: docker + run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, highmem, midcpu, gpu] diff --git a/src/methods/scgpt_mlflow/requirements.txt b/src/methods/scgpt_mlflow/requirements.txt new file mode 100644 index 00000000..2ad53dc3 --- /dev/null +++ b/src/methods/scgpt_mlflow/requirements.txt @@ -0,0 +1,684 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o /tmp/tmp7yfkiop2/requirements_initial.txt +absl-py==2.3.1 + # via + # chex + # ml-collections + # optax + # orbax + # orbax-checkpoint +aiofiles==24.1.0 + # via orbax-checkpoint +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.15 + # via + # datasets + # fsspec +aiosignal==1.4.0 + # via aiohttp +alembic==1.16.4 + # via mlflow +anndata==0.10.9 + # via + # -r requirements.in + # mudata + # scanpy + # scib + # scvi-tools +annotated-types==0.7.0 + # via pydantic +anyio==4.10.0 + # via starlette +array-api-compat==1.12.0 + # via anndata +asttokens==3.0.0 + # via stack-data +async-timeout==5.0.1 + # via aiohttp +attrs==25.3.0 + # via aiohttp +blinker==1.9.0 + # via flask +cached-property==2.0.1 + # via orbax +cachetools==5.5.2 + # via + # google-auth + # mlflow-skinny +cell-gears==0.0.2 + # via scgpt +certifi==2025.8.3 + # via requests +charset-normalizer==3.4.3 + # via requests +chex==0.1.90 + # via + # optax + # scvi-tools +click==8.2.1 + # via + # flask + # mlflow-skinny + # uvicorn +cloudpickle==3.1.1 + # via mlflow-skinny +contourpy==1.3.2 + # via matplotlib +cycler==0.12.1 + # via matplotlib +databricks-sdk==0.62.0 + # via mlflow-skinny +datasets==2.14.4 + # via scgpt +dcor==0.6 + # via cell-gears +decorator==5.2.1 + # via ipython +deprecated==1.2.18 + # via scib +dill==0.3.7 + # via + # datasets + # multiprocess +docker==7.1.0 + # via mlflow +docrep==0.3.2 + # via scvi-tools +et-xmlfile==2.0.0 + # via openpyxl +etils==1.13.0 + # via + # orbax + # orbax-checkpoint +exceptiongroup==1.3.0 + # via + # anndata + # anyio + # ipython +executing==2.2.0 + # via stack-data +fastapi==0.116.1 + # via mlflow-skinny +filelock==3.18.0 + # via + # huggingface-hub + # torch + # triton +flask==3.1.1 + # via mlflow +flax==0.10.7 + # via scvi-tools +fonttools==4.59.0 + # via matplotlib +frozenlist==1.7.0 + # via + # aiohttp + # aiosignal +fsspec==2025.7.0 + # via + # datasets + # etils + # huggingface-hub + # pytorch-lightning + # torch +gitdb==4.0.12 + # via gitpython +gitpython==3.1.45 + # via mlflow-skinny +google-auth==2.40.3 + # via databricks-sdk +graphene==3.4.3 + # via mlflow +graphql-core==3.2.6 + # via + # graphene + # graphql-relay +graphql-relay==3.2.0 + # via graphene +greenlet==3.2.4 + # via sqlalchemy +gunicorn==23.0.0 + # via mlflow +h11==0.16.0 
+ # via uvicorn +h5py==3.14.0 + # via + # anndata + # scanpy + # scib + # scvi-tools +hf-xet==1.1.7 + # via huggingface-hub +huggingface-hub==0.34.4 + # via datasets +humanize==4.12.3 + # via orbax-checkpoint +idna==3.10 + # via + # anyio + # requests + # yarl +igraph==0.11.9 + # via + # leidenalg + # scib +importlib-metadata==8.7.0 + # via + # mlflow-skinny + # opentelemetry-api +importlib-resources==6.5.2 + # via + # etils + # orbax +ipython==8.27.0 + # via -r requirements.in +itsdangerous==2.2.0 + # via flask +jax==0.6.2 + # via + # chex + # flax + # numpyro + # optax + # orbax + # orbax-checkpoint + # scvi-tools +jaxlib==0.6.2 + # via + # chex + # jax + # numpyro + # optax + # orbax + # scvi-tools +jedi==0.19.2 + # via ipython +jinja2==3.1.6 + # via + # flask + # torch +joblib==1.5.1 + # via + # dcor + # pynndescent + # scanpy + # scikit-learn +kiwisolver==1.4.9 + # via matplotlib +legacy-api-wrap==1.4.1 + # via scanpy +leidenalg==0.10.2 + # via + # scgpt + # scib +lightning-utilities==0.15.2 + # via + # pytorch-lightning + # torchmetrics +llvmlite==0.44.0 + # via + # numba + # pynndescent + # scib +mako==1.3.10 + # via alembic +markdown-it-py==4.0.0 + # via rich +markupsafe==3.0.2 + # via + # flask + # jinja2 + # mako + # werkzeug +matplotlib==3.10.5 + # via + # mlflow + # scanpy + # scib + # seaborn +matplotlib-inline==0.1.7 + # via ipython +mdurl==0.1.2 + # via markdown-it-py +ml-collections==1.1.0 + # via scvi-tools +ml-dtypes==0.5.3 + # via + # jax + # jaxlib + # tensorstore +mlflow==3.1.0 + # via -r requirements.in +mlflow-skinny==3.1.0 + # via mlflow +mpmath==1.3.0 + # via sympy +msgpack==1.1.1 + # via + # flax + # orbax + # orbax-checkpoint +mudata==0.3.2 + # via scvi-tools +multidict==6.6.4 + # via + # aiohttp + # yarl +multipledispatch==1.0.0 + # via numpyro +multiprocess==0.70.15 + # via datasets +natsort==8.4.0 + # via + # anndata + # scanpy +nest-asyncio==1.6.0 + # via + # orbax + # orbax-checkpoint +networkx==3.4.2 + # via + # cell-gears + # scanpy + # torch +numba==0.61.2 + # via + # dcor + # pynndescent + # scanpy + # scgpt + # scib + # umap-learn +numpy==1.26.4 + # via + # anndata + # cell-gears + # chex + # contourpy + # datasets + # dcor + # h5py + # jax + # jaxlib + # matplotlib + # ml-dtypes + # mlflow + # numba + # numpyro + # optax + # orbax + # orbax-checkpoint + # pandas + # patsy + # pyro-ppl + # pytorch-lightning + # scanpy + # scib + # scikit-learn + # scikit-misc + # scipy + # scvi-tools + # seaborn + # statsmodels + # tensorstore + # torchmetrics + # torchtext + # treescope + # umap-learn +numpyro==0.19.0 + # via scvi-tools +nvidia-cublas-cu12==12.1.3.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==8.9.2.26 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.18.1 + # via torch +nvidia-nvjitlink-cu12==12.9.86 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch +openpyxl==3.1.5 + # via scvi-tools +opentelemetry-api==1.36.0 + # via + # mlflow-skinny + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-sdk==1.36.0 + # via mlflow-skinny +opentelemetry-semantic-conventions==0.57b0 + # via opentelemetry-sdk 
+opt-einsum==3.4.0 + # via + # jax + # pyro-ppl +optax==0.2.5 + # via + # flax + # scvi-tools +orbax==0.1.7 + # via scgpt +orbax-checkpoint==0.11.21 + # via flax +packaging==25.0 + # via + # anndata + # datasets + # gunicorn + # huggingface-hub + # lightning-utilities + # matplotlib + # mlflow-skinny + # pytorch-lightning + # scanpy + # statsmodels + # torchmetrics +pandas==2.3.1 + # via + # anndata + # cell-gears + # datasets + # mlflow + # scanpy + # scgpt + # scib + # scvi-tools + # seaborn + # statsmodels +parso==0.8.4 + # via jedi +patsy==1.0.1 + # via + # scanpy + # statsmodels +pexpect==4.9.0 + # via ipython +pillow==11.3.0 + # via matplotlib +prompt-toolkit==3.0.51 + # via ipython +propcache==0.3.2 + # via + # aiohttp + # yarl +protobuf==6.31.1 + # via + # mlflow-skinny + # orbax-checkpoint +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.3 + # via stack-data +pyarrow==20.0.0 + # via + # datasets + # mlflow +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +pydantic==2.11.7 + # via + # fastapi + # mlflow-skinny +pydantic-core==2.33.2 + # via pydantic +pydot==4.0.1 + # via scib +pygments==2.19.2 + # via + # ipython + # rich +pynndescent==0.5.13 + # via + # scanpy + # umap-learn +pyparsing==3.2.3 + # via + # matplotlib + # pydot +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.9.1 + # via scvi-tools +python-dateutil==2.9.0.post0 + # via + # graphene + # matplotlib + # pandas +pytorch-lightning==1.9.5 + # via scvi-tools +pytz==2025.2 + # via pandas +pyyaml==6.0.2 + # via + # datasets + # flax + # huggingface-hub + # ml-collections + # mlflow-skinny + # orbax + # orbax-checkpoint + # pytorch-lightning +requests==2.32.4 + # via + # databricks-sdk + # datasets + # docker + # huggingface-hub + # mlflow-skinny + # torchdata + # torchtext +rich==14.1.0 + # via + # flax + # scvi-tools +rsa==4.9.1 + # via google-auth +scanpy==1.11.4 + # via + # cell-gears + # scgpt + # scib +scgpt==0.2.1 + # via -r requirements.in +scib==1.1.7 + # via scgpt +scikit-learn==1.7.1 + # via + # cell-gears + # mlflow + # pynndescent + # scanpy + # scib + # scvi-tools + # umap-learn +scikit-misc==0.5.1 + # via + # scgpt + # scib +scipy==1.12.0 + # via + # -r requirements.in + # anndata + # dcor + # jax + # jaxlib + # mlflow + # pynndescent + # scanpy + # scib + # scikit-learn + # scvi-tools + # statsmodels + # umap-learn +scvi-tools==0.20.3 + # via scgpt +seaborn==0.13.2 + # via + # scanpy + # scib +session-info2==0.2 + # via scanpy +setuptools==80.9.0 + # via lightning-utilities +simplejson==3.20.1 + # via orbax-checkpoint +six==1.17.0 + # via + # docrep + # python-dateutil +smmap==5.0.2 + # via gitdb +sniffio==1.3.1 + # via anyio +sqlalchemy==2.0.43 + # via + # alembic + # mlflow +sqlparse==0.5.3 + # via mlflow-skinny +stack-data==0.6.3 + # via ipython +starlette==0.47.2 + # via fastapi +statsmodels==0.14.5 + # via scanpy +sympy==1.14.0 + # via torch +tensorstore==0.1.76 + # via + # flax + # orbax + # orbax-checkpoint +texttable==1.7.0 + # via igraph +threadpoolctl==3.6.0 + # via scikit-learn +tomli==2.2.1 + # via alembic +toolz==1.0.0 + # via chex +torch==2.1.2 + # via + # cell-gears + # pyro-ppl + # pytorch-lightning + # scgpt + # scvi-tools + # torchdata + # torchmetrics + # torchtext +torchdata==0.7.1 + # via torchtext +torchmetrics==1.8.1 + # via + # pytorch-lightning + # scvi-tools +torchtext==0.16.2 + # via scgpt +tqdm==4.67.1 + # via + # cell-gears + # datasets + # huggingface-hub + # numpyro + # pyro-ppl + # pytorch-lightning + # scanpy + # scvi-tools + # 
torchtext + # umap-learn +traitlets==5.14.3 + # via + # ipython + # matplotlib-inline +treescope==0.1.10 + # via flax +triton==2.1.0 + # via torch +typing-extensions==4.14.1 + # via + # aiosignal + # alembic + # anyio + # chex + # etils + # exceptiongroup + # fastapi + # flax + # graphene + # huggingface-hub + # ipython + # lightning-utilities + # mlflow-skinny + # multidict + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # orbax + # orbax-checkpoint + # pydantic + # pydantic-core + # pytorch-lightning + # scanpy + # scgpt + # sqlalchemy + # starlette + # torch + # typing-inspection + # uvicorn +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via pandas +umap-learn==0.5.9.post2 + # via + # scanpy + # scgpt + # scib +urllib3==2.5.0 + # via + # docker + # requests + # torchdata +uvicorn==0.35.0 + # via mlflow-skinny +wcwidth==0.2.13 + # via prompt-toolkit +werkzeug==3.1.3 + # via flask +wrapt==1.17.3 + # via deprecated +xxhash==3.5.0 + # via datasets +yarl==1.20.1 + # via aiohttp +zipp==3.23.0 + # via + # etils + # importlib-metadata diff --git a/src/methods/scgpt_mlflow/script.py b/src/methods/scgpt_mlflow/script.py new file mode 100644 index 00000000..7c70c6a0 --- /dev/null +++ b/src/methods/scgpt_mlflow/script.py @@ -0,0 +1,111 @@ +import os +import sys +import tarfile +import tempfile +import zipfile + +import anndata as ad +import mlflow.pyfunc +import pandas as pd + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + "input": "resources_test/.../input.h5ad", + "output": "output.h5ad", + "model": "resources_test/.../model", +} +meta = {"name": "scGPT_mlflow"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from exit_codes import exit_non_applicable +from read_anndata_partial import read_anndata + +print("====== scGPT (MLflow model) ======", flush=True) + +print("\n>>> Reading input files...", flush=True) +print(f"Input H5AD file: '{par['input']}'", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + exit_non_applicable( + f"scGPT (MLflow) can only be used with human data " + f'(dataset_organism == "{adata.uns["dataset_organism"]}")' + ) + +print(adata, flush=True) + +if os.path.isdir(par["model"]): + print("\n>>> Using model directory...", flush=True) + print(f"Directory path: '{par['model']}'", flush=True) + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("\n>>> Extracting model from .zip...", flush=True) + print(f".zip path: '{par['model']}'", flush=True) + with zipfile.ZipFile(par["model"], "r") as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): + print("\n>>> Extracting model from .tar.gz...", flush=True) + print(f".tar.gz path: '{par['model']}'", flush=True) + with tarfile.open(par["model"], "r:gz") as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError( + "The 'model' argument should be a directory a .zip file or a .tar.gz file" + ) + +print("\n>>> Loading model...", flush=True) +model = mlflow.pyfunc.load_model(model_dir) +print(model, flush=True) + +print("\n>>> Writing temporary input H5AD file...", 
flush=True)
+input_adata = ad.AnnData(
+    X=adata.X.copy(),
+    var=adata.var.filter(items=["feature_name"]),
+)
+print(input_adata, flush=True)
+
+h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
+print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
+input_adata.write(h5ad_file.name)
+del input_adata
+
+print("\n>>> Running model...", flush=True)
+input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
+input_params = {"gene_col": "feature_name"}
+embedding = model.predict(input_df, params=input_params)
+
+print("\n>>> Storing output...", flush=True)
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    obsm={
+        "X_emb": embedding,
+    },
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "normalization_id": adata.uns["normalization_id"],
+        "method_id": meta["name"],
+    },
+)
+print(output)
+
+print("\n>>> Writing output to file...", flush=True)
+print(f"Output H5AD file: '{par['output']}'", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
+
+print("\n>>> Cleaning up temporary files...", flush=True)
+if model_temp is not None:
+    model_temp.cleanup()
+h5ad_file.close()
+os.unlink(h5ad_file.name)
+
+print("\n>>> Done!", flush=True)
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 34db6276..f5da4fc0 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -103,6 +103,7 @@ dependencies:
   - name: methods/scanorama
   - name: methods/scanvi
   - name: methods/scgpt_finetuned
+  - name: methods/scgpt_mlflow
   - name: methods/scgpt_zeroshot
   - name: methods/scimilarity
   - name: methods/scprint
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index 44ff5ed5..0e2f656e 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -35,6 +35,9 @@ methods = [
   scgpt_finetuned.run(
     args: [model: file("s3://openproblems-work/cache/scGPT_human.zip")]
   ),
+  scgpt_mlflow.run(
+    args: [model: file("s3://openproblems-work/cache/scgpt-mlflow-model.zip")]
+  ),
   scgpt_zeroshot.run(
     args: [model: file("s3://openproblems-work/cache/scGPT_human.zip")]
   ),

From 5c46c4c6c272f503002e9bbd8508d5fd6582df1e Mon Sep 17 00:00:00 2001
From: Luke Zappia
Date: Wed, 13 Aug 2025 13:41:46 +0200
Subject: [PATCH 13/21] Add uce_mlflow method

---
 src/methods/uce_mlflow/config.vsh.yaml      |  63 ++++
 src/methods/uce_mlflow/requirements.txt     | 366 ++++++++++++++++++++
 src/methods/uce_mlflow/script.py            | 110 ++++++
 src/workflows/run_benchmark/config.vsh.yaml |   1 +
 src/workflows/run_benchmark/main.nf         |   3 +
 5 files changed, 543 insertions(+)
 create mode 100644 src/methods/uce_mlflow/config.vsh.yaml
 create mode 100644 src/methods/uce_mlflow/requirements.txt
 create mode 100644 src/methods/uce_mlflow/script.py

diff --git a/src/methods/uce_mlflow/config.vsh.yaml b/src/methods/uce_mlflow/config.vsh.yaml
new file mode 100644
index 00000000..354cbd63
--- /dev/null
+++ b/src/methods/uce_mlflow/config.vsh.yaml
@@ -0,0 +1,63 @@
+__merge__: ../../api/base_method.yaml
+
+name: uce_mlflow
+label: UCE (MLflow model)
+summary: UCE offers a unified biological latent space that can represent any cell
+description: |
+  Universal Cell Embedding (UCE) is a single-cell foundation model that offers a
+  unified biological latent space that can represent any cell, regardless of
+  tissue or species.
+
+  Here, we use a version packaged as an MLflow model.
+references: + doi: + - 10.1101/2023.11.28.568918 +links: + documentation: https://github.com/snap-stanford/UCE/blob/main/README.md + repository: https://github.com/snap-stanford/UCE + +info: + method_types: [embedding] + preferred_normalization: counts + +arguments: + - name: --model + type: file + description: | + An MLflow model URI for the UCE model. If it is a .zip or + .tar.gz file it will be extracted to a temporary directory. + required: true + +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + - path: /src/utils/exit_codes.py + - path: requirements.txt + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1 + setup: + - type: docker + add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh + run: sh /uv-installer.sh && rm /uv-installer.sh + env: PATH="/root/.local/bin/:$PATH" + - type: docker + run: uv venv --python 3.11 /opt/venv + - type: docker + env: + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + add: requirements.txt /requirements.txt + run: uv pip install -r /requirements.txt + - type: docker + run: uv pip install mlflow==3.1.0 + - type: docker + run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, highmem, midcpu, gpu] diff --git a/src/methods/uce_mlflow/requirements.txt b/src/methods/uce_mlflow/requirements.txt new file mode 100644 index 00000000..b2f4227b --- /dev/null +++ b/src/methods/uce_mlflow/requirements.txt @@ -0,0 +1,366 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o /tmp/tmpg2ov1w_7/requirements_initial.txt +accelerate==0.34.2 + # via -r requirements.in +alembic==1.16.4 + # via mlflow +anndata==0.10.9 + # via + # -r requirements.in + # scanpy +annotated-types==0.7.0 + # via pydantic +antlr4-python3-runtime==4.9.3 + # via omegaconf +anyio==4.10.0 + # via starlette +array-api-compat==1.12.0 + # via anndata +blinker==1.9.0 + # via flask +cachetools==5.5.2 + # via + # google-auth + # mlflow-skinny +certifi==2025.8.3 + # via requests +charset-normalizer==3.4.3 + # via requests +click==8.2.1 + # via + # flask + # mlflow-skinny + # uvicorn +cloudpickle==3.1.1 + # via mlflow-skinny +contourpy==1.3.3 + # via matplotlib +cycler==0.12.1 + # via matplotlib +databricks-sdk==0.62.0 + # via mlflow-skinny +docker==7.1.0 + # via mlflow +fastapi==0.116.1 + # via mlflow-skinny +filelock==3.18.0 + # via + # huggingface-hub + # torch + # triton +flask==3.1.1 + # via mlflow +fonttools==4.59.0 + # via matplotlib +fsspec==2025.7.0 + # via + # huggingface-hub + # torch +gitdb==4.0.12 + # via gitpython +gitpython==3.1.45 + # via mlflow-skinny +google-auth==2.40.3 + # via databricks-sdk +graphene==3.4.3 + # via mlflow +graphql-core==3.2.6 + # via + # graphene + # graphql-relay +graphql-relay==3.2.0 + # via graphene +greenlet==3.2.4 + # via sqlalchemy +gunicorn==23.0.0 + # via mlflow +h11==0.16.0 + # via uvicorn +h5py==3.14.0 + # via + # anndata + # scanpy +hf-xet==1.1.7 + # via huggingface-hub +huggingface-hub==0.34.4 + # via accelerate +idna==3.10 + # via + # anyio + # requests +importlib-metadata==8.7.0 + # via + # mlflow-skinny + # opentelemetry-api +itsdangerous==2.2.0 + # via flask +jinja2==3.1.6 + # via + # flask + # torch +joblib==1.5.1 + # via + # pynndescent + # scanpy + # scikit-learn +kiwisolver==1.4.9 + # via matplotlib +legacy-api-wrap==1.4.1 + # via scanpy +llvmlite==0.44.0 + # via + # numba + # 
pynndescent +mako==1.3.10 + # via alembic +markupsafe==3.0.2 + # via + # flask + # jinja2 + # mako + # werkzeug +matplotlib==3.10.5 + # via + # mlflow + # scanpy + # seaborn +mlflow==3.1.0 + # via -r requirements.in +mlflow-skinny==3.1.0 + # via mlflow +mpmath==1.3.0 + # via sympy +natsort==8.4.0 + # via + # anndata + # scanpy +networkx==3.5 + # via + # scanpy + # torch +numba==0.61.2 + # via + # pynndescent + # scanpy + # umap-learn +numpy==1.26.4 + # via + # -r requirements.in + # accelerate + # anndata + # contourpy + # h5py + # matplotlib + # mlflow + # numba + # pandas + # patsy + # scanpy + # scikit-learn + # scipy + # seaborn + # statsmodels + # umap-learn +nvidia-cublas-cu12==12.1.3.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.20.5 + # via torch +nvidia-nvjitlink-cu12==12.9.86 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch +omegaconf==2.3.0 + # via -r requirements.in +opentelemetry-api==1.36.0 + # via + # mlflow-skinny + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-sdk==1.36.0 + # via mlflow-skinny +opentelemetry-semantic-conventions==0.57b0 + # via opentelemetry-sdk +packaging==25.0 + # via + # accelerate + # anndata + # gunicorn + # huggingface-hub + # matplotlib + # mlflow-skinny + # scanpy + # statsmodels +pandas==2.2.3 + # via + # -r requirements.in + # anndata + # mlflow + # scanpy + # seaborn + # statsmodels +patsy==1.0.1 + # via + # scanpy + # statsmodels +pillow==11.3.0 + # via matplotlib +protobuf==6.31.1 + # via mlflow-skinny +psutil==7.0.0 + # via accelerate +pyarrow==20.0.0 + # via mlflow +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +pydantic==2.11.7 + # via + # fastapi + # mlflow-skinny +pydantic-core==2.33.2 + # via pydantic +pynndescent==0.5.13 + # via + # scanpy + # umap-learn +pyparsing==3.2.3 + # via matplotlib +python-dateutil==2.9.0.post0 + # via + # graphene + # matplotlib + # pandas +pytz==2025.2 + # via pandas +pyyaml==6.0.2 + # via + # accelerate + # huggingface-hub + # mlflow-skinny + # omegaconf +requests==2.32.4 + # via + # databricks-sdk + # docker + # huggingface-hub + # mlflow-skinny +rsa==4.9.1 + # via google-auth +safetensors==0.6.2 + # via accelerate +scanpy==1.10.2 + # via -r requirements.in +scikit-learn==1.7.1 + # via + # mlflow + # pynndescent + # scanpy + # umap-learn +scipy==1.14.1 + # via + # -r requirements.in + # anndata + # mlflow + # pynndescent + # scanpy + # scikit-learn + # statsmodels + # umap-learn +seaborn==0.13.2 + # via scanpy +session-info==1.0.1 + # via scanpy +six==1.17.0 + # via python-dateutil +smmap==5.0.2 + # via gitdb +sniffio==1.3.1 + # via anyio +sqlalchemy==2.0.43 + # via + # alembic + # mlflow +sqlparse==0.5.3 + # via mlflow-skinny +starlette==0.47.2 + # via fastapi +statsmodels==0.14.5 + # via scanpy +stdlib-list==0.11.1 + # via session-info +sympy==1.14.0 + # via torch +threadpoolctl==3.6.0 + # via scikit-learn +torch==2.4.1 + # via + # -r requirements.in + # accelerate +tqdm==4.66.5 + # via + # -r requirements.in + # huggingface-hub + # scanpy + # 
umap-learn +triton==3.0.0 + # via torch +typing-extensions==4.14.1 + # via + # alembic + # anyio + # fastapi + # graphene + # huggingface-hub + # mlflow-skinny + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # sqlalchemy + # starlette + # torch + # typing-inspection +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via pandas +umap-learn==0.5.9.post2 + # via scanpy +urllib3==1.26.6 + # via + # -r requirements.in + # docker + # requests +uvicorn==0.35.0 + # via mlflow-skinny +werkzeug==3.1.3 + # via flask +zipp==3.23.0 + # via importlib-metadata diff --git a/src/methods/uce_mlflow/script.py b/src/methods/uce_mlflow/script.py new file mode 100644 index 00000000..c61b2a68 --- /dev/null +++ b/src/methods/uce_mlflow/script.py @@ -0,0 +1,110 @@ +import os +import sys +import tarfile +import tempfile +import zipfile + +import anndata as ad +import mlflow.pyfunc +import pandas as pd + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + "input": "resources_test/.../input.h5ad", + "output": "output.h5ad", + "model": "resources_test/.../model", +} +meta = {"name": "uce_mlflow"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from exit_codes import exit_non_applicable +from read_anndata_partial import read_anndata + +print("====== UCE (MLflow model) ======", flush=True) + +print("\n>>> Reading input files...", flush=True) +print(f"Input H5AD file: '{par['input']}'", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + exit_non_applicable( + f"UCE (MLflow) can only be used with human data " + f'(dataset_organism == "{adata.uns["dataset_organism"]}")' + ) + +print(adata, flush=True) + +if os.path.isdir(par["model"]): + print("\n>>> Using model directory...", flush=True) + print(f"Directory path: '{par['model']}'", flush=True) + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("\n>>> Extracting model from .zip...", flush=True) + print(f".zip path: '{par['model']}'", flush=True) + with zipfile.ZipFile(par["model"], "r") as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): + print("\n>>> Extracting model from .tar.gz...", flush=True) + print(f".tar.gz path: '{par['model']}'", flush=True) + with tarfile.open(par["model"], "r:gz") as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError( + "The 'model' argument should be a directory a .zip file or a .tar.gz file" + ) + +print("\n>>> Loading model...", flush=True) +model = mlflow.pyfunc.load_model(model_dir) +print(model, flush=True) + +print("\n>>> Writing temporary input H5AD file...", flush=True) +input_adata = ad.AnnData( + X=adata.X.copy(), + var=adata.var.filter(items=["feature_name"]), +) +print(input_adata, flush=True) + +h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) +print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) +input_adata.write(h5ad_file.name) +del input_adata + +print("\n>>> Running model...", flush=True) +input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) +embedding = model.predict(input_df) + 
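+# The MLflow pyfunc wrapper reads the H5AD at 'input_uri' and is expected to
+# return one embedding row per cell in that file. An illustrative sanity
+# check (an assumption about the return value, not a documented model
+# contract) would be:
+#
+#     assert embedding.shape[0] == adata.n_obs
+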
+print("\n>>> Storing output...", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": embedding, + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("\n>>> Writing output to file...", flush=True) +print(f"Output H5AD file: '{par['output']}'", flush=True) +output.write_h5ad(par["output"], compression="gzip") + +print("\n>>> Cleaning up temporary files...", flush=True) +if model_temp is not None: + model_temp.cleanup() +h5ad_file.close() +os.unlink(h5ad_file.name) + +print("\n>>> Done!", flush=True) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index f5da4fc0..f7d472d0 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -111,6 +111,7 @@ dependencies: - name: methods/scvi_mlflow - name: methods/transcriptformer_mlflow - name: methods/uce + - name: methods/uce_mlflow # metrics - name: metrics/asw_batch - name: metrics/asw_label diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 0e2f656e..1fbebb45 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -54,6 +54,9 @@ methods = [ ), uce.run( args: [model: file("s3://openproblems-work/cache/uce-model-v5.zip")] + ), + uce_mlflow.run( + args: [model: file("s3://openproblems-work/cache/uce-mlflow-model.zip")] ) ] From 9c82ff4986979fb47606c9d4502764521a0b345b Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 13 Aug 2025 14:23:23 +0200 Subject: [PATCH 14/21] Add unpack_directory() utils helper --- src/methods/geneformer_mlflow/config.vsh.yaml | 1 + src/methods/geneformer_mlflow/script.py | 27 ++---------- src/methods/scgpt_mlflow/script.py | 27 ++---------- src/methods/scvi_mlflow/script.py | 27 ++---------- .../transcriptformer_mlflow/config.vsh.yaml | 1 + src/methods/transcriptformer_mlflow/script.py | 27 ++---------- src/methods/uce_mlflow/config.vsh.yaml | 1 + src/methods/uce_mlflow/script.py | 27 ++---------- src/utils/unpack.py | 43 +++++++++++++++++++ 9 files changed, 61 insertions(+), 120 deletions(-) create mode 100644 src/utils/unpack.py diff --git a/src/methods/geneformer_mlflow/config.vsh.yaml b/src/methods/geneformer_mlflow/config.vsh.yaml index b9d08eda..acacc638 100644 --- a/src/methods/geneformer_mlflow/config.vsh.yaml +++ b/src/methods/geneformer_mlflow/config.vsh.yaml @@ -35,6 +35,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: /src/utils/unpack.py - path: requirements.txt engines: diff --git a/src/methods/geneformer_mlflow/script.py b/src/methods/geneformer_mlflow/script.py index 800ab80b..a6860e9f 100644 --- a/src/methods/geneformer_mlflow/script.py +++ b/src/methods/geneformer_mlflow/script.py @@ -22,6 +22,7 @@ sys.path.append(meta["resources_dir"]) from exit_codes import exit_non_applicable from read_anndata_partial import read_anndata +from unpack import unpack_directory print("====== Geneformer (MLflow model) ======", flush=True) @@ -37,30 +38,8 @@ print(adata, flush=True) -if os.path.isdir(par["model"]): - print("\n>>> Using model directory...", flush=True) - print(f"Directory path: '{par['model']}'", flush=True) - model_temp = None - model_dir = par["model"] -else: - model_temp = tempfile.TemporaryDirectory() - model_dir = model_temp.name - - if zipfile.is_zipfile(par["model"]): - print("\n>>> Extracting 
model from .zip...", flush=True) - print(f".zip path: '{par['model']}'", flush=True) - with zipfile.ZipFile(par["model"], "r") as zip_file: - zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): - print("\n>>> Extracting model from .tar.gz...", flush=True) - print(f".tar.gz path: '{par['model']}'", flush=True) - with tarfile.open(par["model"], "r:gz") as tar_file: - tar_file.extractall(model_dir) - model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) - else: - raise ValueError( - "The 'model' argument should be a directory a .zip file or a .tar.gz file" - ) +print("\n>>> Unpacking model...", flush=True) +model_dir, model_temp = unpack_directory(par["model"]) print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(model_dir) diff --git a/src/methods/scgpt_mlflow/script.py b/src/methods/scgpt_mlflow/script.py index 7c70c6a0..fdb6ca3a 100644 --- a/src/methods/scgpt_mlflow/script.py +++ b/src/methods/scgpt_mlflow/script.py @@ -22,6 +22,7 @@ sys.path.append(meta["resources_dir"]) from exit_codes import exit_non_applicable from read_anndata_partial import read_anndata +from unpack import unpack_directory print("====== scGPT (MLflow model) ======", flush=True) @@ -37,30 +38,8 @@ print(adata, flush=True) -if os.path.isdir(par["model"]): - print("\n>>> Using model directory...", flush=True) - print(f"Directory path: '{par['model']}'", flush=True) - model_temp = None - model_dir = par["model"] -else: - model_temp = tempfile.TemporaryDirectory() - model_dir = model_temp.name - - if zipfile.is_zipfile(par["model"]): - print("\n>>> Extracting model from .zip...", flush=True) - print(f".zip path: '{par['model']}'", flush=True) - with zipfile.ZipFile(par["model"], "r") as zip_file: - zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): - print("\n>>> Extracting model from .tar.gz...", flush=True) - print(f".tar.gz path: '{par['model']}'", flush=True) - with tarfile.open(par["model"], "r:gz") as tar_file: - tar_file.extractall(model_dir) - model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) - else: - raise ValueError( - "The 'model' argument should be a directory a .zip file or a .tar.gz file" - ) +print("\n>>> Unpacking model...", flush=True) +model_dir, model_temp = unpack_directory(par["model"]) print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(model_dir) diff --git a/src/methods/scvi_mlflow/script.py b/src/methods/scvi_mlflow/script.py index 04ff94d5..7fe71b46 100644 --- a/src/methods/scvi_mlflow/script.py +++ b/src/methods/scvi_mlflow/script.py @@ -22,6 +22,7 @@ sys.path.append(meta["resources_dir"]) from exit_codes import exit_non_applicable from read_anndata_partial import read_anndata +from unpack import unpack_directory print("====== scVI (MLflow model) ======", flush=True) @@ -41,30 +42,8 @@ print(adata, flush=True) -if os.path.isdir(par["model"]): - print("\n>>> Using model directory...", flush=True) - print(f"Directory path: '{par['model']}'", flush=True) - model_temp = None - model_dir = par["model"] -else: - model_temp = tempfile.TemporaryDirectory() - model_dir = model_temp.name - - if zipfile.is_zipfile(par["model"]): - print("\n>>> Extracting model from .zip...", flush=True) - print(f".zip path: '{par['model']}'", flush=True) - with zipfile.ZipFile(par["model"], "r") as zip_file: - zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): - print("\n>>> 
Extracting model from .tar.gz...", flush=True) - print(f".tar.gz path: '{par['model']}'", flush=True) - with tarfile.open(par["model"], "r:gz") as tar_file: - tar_file.extractall(model_dir) - model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) - else: - raise ValueError( - "The 'model' argument should be a directory a .zip file or a .tar.gz file" - ) +print("\n>>> Unpacking model...", flush=True) +model_dir, model_temp = unpack_directory(par["model"]) print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(model_dir) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index 2d144c23..3c017991 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -37,6 +37,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: /src/utils/unpack.py - path: requirements.txt engines: diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index b16806d3..9c675ba5 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -22,6 +22,7 @@ sys.path.append(meta["resources_dir"]) from exit_codes import exit_non_applicable from read_anndata_partial import read_anndata +from unpack import unpack_directory print("====== TranscriptFormer (MLflow model) ======", flush=True) @@ -37,30 +38,8 @@ print(adata, flush=True) -if os.path.isdir(par["model"]): - print("\n>>> Using model directory...", flush=True) - print(f"Directory path: '{par['model']}'", flush=True) - model_temp = None - model_dir = par["model"] -else: - model_temp = tempfile.TemporaryDirectory() - model_dir = model_temp.name - - if zipfile.is_zipfile(par["model"]): - print("\n>>> Extracting model from .zip...", flush=True) - print(f".zip path: '{par['model']}'", flush=True) - with zipfile.ZipFile(par["model"], "r") as zip_file: - zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): - print("\n>>> Extracting model from .tar.gz...", flush=True) - print(f".tar.gz path: '{par['model']}'", flush=True) - with tarfile.open(par["model"], "r:gz") as tar_file: - tar_file.extractall(model_dir) - model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) - else: - raise ValueError( - "The 'model' argument should be a directory a .zip file or a .tar.gz file" - ) +print("\n>>> Unpacking model...", flush=True) +model_dir, model_temp = unpack_directory(par["model"]) print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(model_dir) diff --git a/src/methods/uce_mlflow/config.vsh.yaml b/src/methods/uce_mlflow/config.vsh.yaml index 354cbd63..a5e6b77c 100644 --- a/src/methods/uce_mlflow/config.vsh.yaml +++ b/src/methods/uce_mlflow/config.vsh.yaml @@ -33,6 +33,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: /src/utils/unpack.py - path: requirements.txt engines: diff --git a/src/methods/uce_mlflow/script.py b/src/methods/uce_mlflow/script.py index c61b2a68..eb594544 100644 --- a/src/methods/uce_mlflow/script.py +++ b/src/methods/uce_mlflow/script.py @@ -22,6 +22,7 @@ sys.path.append(meta["resources_dir"]) from exit_codes import exit_non_applicable from read_anndata_partial import read_anndata +from unpack import unpack_directory print("====== UCE (MLflow model) ======", flush=True) @@ -37,30 +38,8 @@ 
print(adata, flush=True) -if os.path.isdir(par["model"]): - print("\n>>> Using model directory...", flush=True) - print(f"Directory path: '{par['model']}'", flush=True) - model_temp = None - model_dir = par["model"] -else: - model_temp = tempfile.TemporaryDirectory() - model_dir = model_temp.name - - if zipfile.is_zipfile(par["model"]): - print("\n>>> Extracting model from .zip...", flush=True) - print(f".zip path: '{par['model']}'", flush=True) - with zipfile.ZipFile(par["model"], "r") as zip_file: - zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): - print("\n>>> Extracting model from .tar.gz...", flush=True) - print(f".tar.gz path: '{par['model']}'", flush=True) - with tarfile.open(par["model"], "r:gz") as tar_file: - tar_file.extractall(model_dir) - model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) - else: - raise ValueError( - "The 'model' argument should be a directory a .zip file or a .tar.gz file" - ) +print("\n>>> Unpacking model...", flush=True) +model_dir, model_temp = unpack_directory(par["model"]) print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(model_dir) diff --git a/src/utils/unpack.py b/src/utils/unpack.py new file mode 100644 index 00000000..443aa39f --- /dev/null +++ b/src/utils/unpack.py @@ -0,0 +1,43 @@ +import os +import tarfile +import tempfile +import zipfile + +def unpack_directory(directory): + """ + Unpack a directory to a temporary location (if needed) + + Args: + directory (str): Path to a directory, .zip, or .tar.gz file. + + Returns: + tuple: (unpacked_directory (str), temp_directory (TemporaryDirectory or None)) + unpacked_directory: Path to the unpacked directory. + temp_directory: TemporaryDirectory object if a temp dir was created, else None. 
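+
+    Example (illustrative usage):
+        >>> model_dir, model_temp = unpack_directory("model.zip")
+        >>> # ... load the model from model_dir ...
+        >>> if model_temp is not None:
+        ...     model_temp.cleanup()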
+ """ + print(f"Unpacking directory: '{directory}'", flush=True) + + if os.path.isdir(directory): + print(f"Returning provided directory: '{directory}'", flush=True) + temp_directory = None + unpacked_directory = directory + else: + temp_directory = tempfile.TemporaryDirectory() + unpacked_directory = temp_directory.name + + if zipfile.is_zipfile(directory): + print("Extracting .zip...", flush=True) + with zipfile.ZipFile(directory, "r") as zip_file: + zip_file.extractall(unpacked_directory) + elif tarfile.is_tarfile(directory) and directory.endswith(".tar.gz"): + print("Extracting .tar.gz...", flush=True) + with tarfile.open(directory, "r:gz") as tar_file: + tar_file.extractall(unpacked_directory) + unpacked_directory = os.path.join(unpacked_directory, os.listdir(unpacked_directory)[0]) + else: + raise ValueError( + "The 'directory' argument should be a directory, a .zip file or a .tar.gz file" + ) + print(f"Extracted to '{unpacked_directory}'", flush=True) + + return (unpacked_directory, temp_directory) From 2a2a61b17583430d74137d837d487a7fc00c46a6 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 13 Aug 2025 14:31:33 +0200 Subject: [PATCH 15/21] Add unpack helper to scgpt_mlflow --- src/methods/scgpt_mlflow/config.vsh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/methods/scgpt_mlflow/config.vsh.yaml b/src/methods/scgpt_mlflow/config.vsh.yaml index b8455165..a748a1de 100644 --- a/src/methods/scgpt_mlflow/config.vsh.yaml +++ b/src/methods/scgpt_mlflow/config.vsh.yaml @@ -32,6 +32,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: /src/utils/unpack.py - path: requirements.txt engines: From 814365192de61894effe76b2918e7321ee9b48c8 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 13 Aug 2025 14:43:03 +0200 Subject: [PATCH 16/21] Add unpack helper to scvi_mlflow --- src/methods/scvi_mlflow/config.vsh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/methods/scvi_mlflow/config.vsh.yaml b/src/methods/scvi_mlflow/config.vsh.yaml index d50a6e62..61ed6e78 100644 --- a/src/methods/scvi_mlflow/config.vsh.yaml +++ b/src/methods/scvi_mlflow/config.vsh.yaml @@ -33,6 +33,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: /src/utils/unpack.py - path: requirements.txt engines: From 5e49b7c6116847cf85a5ccb70d5f4c46fb6c482a Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Thu, 18 Sep 2025 08:30:33 +0200 Subject: [PATCH 17/21] Update scvi_mlflow Changes to match label projection --- src/methods/scvi_mlflow/script.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/methods/scvi_mlflow/script.py b/src/methods/scvi_mlflow/script.py index 7fe71b46..d8e92bb9 100644 --- a/src/methods/scvi_mlflow/script.py +++ b/src/methods/scvi_mlflow/script.py @@ -45,8 +45,8 @@ print("\n>>> Unpacking model...", flush=True) model_dir, model_temp = unpack_directory(par["model"]) -print("\n>>> Loading model...", flush=True) -model = mlflow.pyfunc.load_model(model_dir) +print(f"\n>>> Loading {organism} model...", flush=True) +model = mlflow.pyfunc.load_model(model_dir, model_config={"organism": organism}) print(model, flush=True) print("\n>>> Writing temporary input H5AD file...", flush=True) @@ -62,8 +62,7 @@ print("\n>>> Running model...", flush=True) input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -input_params = {"organism": organism, "return_dist": True, "batch_keys": "batch"} -embedding = model.predict(input_df, 
params=input_params) +embedding = model.predict(input_df) print("\n>>> Storing output...", flush=True) output = ad.AnnData( From 1884f811b6a49e9b7f13171003053def96252ab5 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Sun, 12 Oct 2025 08:24:16 +0200 Subject: [PATCH 18/21] use helper functions in mlflow methods --- src/methods/geneformer_mlflow/config.vsh.yaml | 19 +- src/methods/geneformer_mlflow/script.py | 46 +++-- src/methods/scgpt_mlflow/config.vsh.yaml | 19 +- src/methods/scgpt_mlflow/script.py | 36 ++-- src/methods/scvi_mlflow/config.vsh.yaml | 17 +- src/methods/scvi_mlflow/script.py | 36 ++-- .../transcriptformer_mlflow/config.vsh.yaml | 19 +- src/methods/transcriptformer_mlflow/script.py | 43 ++--- src/methods/uce_mlflow/config.vsh.yaml | 19 +- src/methods/uce_mlflow/script.py | 34 +--- src/utils/mlflow.py | 174 ++++++++++++++++++ src/utils/mlflow_docker_setup.yaml | 14 ++ 12 files changed, 270 insertions(+), 206 deletions(-) create mode 100644 src/utils/mlflow.py create mode 100644 src/utils/mlflow_docker_setup.yaml diff --git a/src/methods/geneformer_mlflow/config.vsh.yaml b/src/methods/geneformer_mlflow/config.vsh.yaml index acacc638..e1d187cf 100644 --- a/src/methods/geneformer_mlflow/config.vsh.yaml +++ b/src/methods/geneformer_mlflow/config.vsh.yaml @@ -36,28 +36,13 @@ resources: - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py - path: /src/utils/unpack.py + - path: /src/utils/mlflow.py - path: requirements.txt engines: - type: docker image: openproblems/base_pytorch_nvidia:1 - setup: - - type: docker - add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh - run: sh /uv-installer.sh && rm /uv-installer.sh - env: PATH="/root/.local/bin/:$PATH" - - type: docker - run: uv venv --python 3.11 /opt/venv - - type: docker - env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - add: requirements.txt /requirements.txt - run: uv pip install -r /requirements.txt - - type: docker - run: uv pip install mlflow==3.1.0 - - type: docker - run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + __merge__: /src/utils/mlflow_docker_setup.yaml runners: - type: executable diff --git a/src/methods/geneformer_mlflow/script.py b/src/methods/geneformer_mlflow/script.py index a6860e9f..dc710040 100644 --- a/src/methods/geneformer_mlflow/script.py +++ b/src/methods/geneformer_mlflow/script.py @@ -1,12 +1,9 @@ import os import sys -import tarfile -import tempfile -import zipfile import anndata as ad import mlflow.pyfunc -import pandas as pd +import numpy as np ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -20,9 +17,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from exit_codes import exit_non_applicable -from read_anndata_partial import read_anndata -from unpack import unpack_directory +from exit_codes import exit_non_applicable # noqa: E402 +from mlflow import embed # noqa: E402 +from read_anndata_partial import read_anndata # noqa: E402 +from unpack import unpack_directory # noqa: E402 print("====== Geneformer (MLflow model) ======", flush=True) @@ -45,23 +43,25 @@ model = mlflow.pyfunc.load_model(model_dir) print(model, flush=True) -print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData( - X=adata.X.copy(), - var=adata.var.filter(items=["feature_id"]).rename( - columns={"feature_id": "ensembl_id"} - ), -) -print(input_adata, flush=True) +n_processors = meta.get("cpus") or os.cpu_count() +print(f"Available processors: {n_processors}", flush=True) + -h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) -input_adata.write(h5ad_file.name) -del input_adata +def process_geneformer_input(input_adata): + """Add Geneformer-specific fields to input AnnData.""" + input_adata.obs["cell_idx"] = np.arange(input_adata.n_obs) + input_adata.obs["n_counts"] = input_adata.X.sum(axis=1) -print("\n>>> Running model...", flush=True) -input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -embedding = model.predict(input_df) + +print("\n>>> Embedding data...", flush=True) +embedding = embed( + adata, + model, + layers=["counts"], + var={"feature_id": "ensembl_id"}, + model_params={"nproc": n_processors}, + process_adata=process_geneformer_input, +) print("\n>>> Storing output...", flush=True) output = ad.AnnData( @@ -85,7 +85,5 @@ print("\n>>> Cleaning up temporary files...", flush=True) if model_temp is not None: model_temp.cleanup() -h5ad_file.close() -os.unlink(h5ad_file.name) print("\n>>> Done!", flush=True) diff --git a/src/methods/scgpt_mlflow/config.vsh.yaml b/src/methods/scgpt_mlflow/config.vsh.yaml index a748a1de..d684085e 100644 --- a/src/methods/scgpt_mlflow/config.vsh.yaml +++ b/src/methods/scgpt_mlflow/config.vsh.yaml @@ -33,28 +33,13 @@ resources: - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py - path: /src/utils/unpack.py + - path: /src/utils/mlflow.py - path: requirements.txt engines: - type: docker image: openproblems/base_pytorch_nvidia:1 - setup: - - type: docker - add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh - run: sh /uv-installer.sh && rm /uv-installer.sh - env: PATH="/root/.local/bin/:$PATH" - - type: docker - run: uv venv --python 3.11 /opt/venv - - type: docker - env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - add: requirements.txt /requirements.txt - run: uv pip install -r /requirements.txt - - type: docker - run: uv pip install mlflow==3.1.0 - - type: docker - run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + __merge__: /src/utils/mlflow_docker_setup.yaml runners: - type: executable diff --git a/src/methods/scgpt_mlflow/script.py b/src/methods/scgpt_mlflow/script.py index fdb6ca3a..db54fd23 100644 --- a/src/methods/scgpt_mlflow/script.py +++ b/src/methods/scgpt_mlflow/script.py @@ -1,12 +1,7 @@ -import os import sys -import tarfile -import tempfile -import zipfile import anndata as ad import mlflow.pyfunc -import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -20,9 +15,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from exit_codes import exit_non_applicable -from read_anndata_partial import read_anndata -from unpack import unpack_directory +from exit_codes import exit_non_applicable # noqa: E402 +from mlflow import embed # noqa: E402 +from read_anndata_partial import read_anndata # noqa: E402 +from unpack import unpack_directory # noqa: E402 print("====== scGPT (MLflow model) ======", flush=True) @@ -45,22 +41,14 @@ model = mlflow.pyfunc.load_model(model_dir) print(model, flush=True) -print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData( - X=adata.X.copy(), - var=adata.var.filter(items=["feature_name"]), +print("\n>>> Embedding data...", flush=True) +embedding = embed( + adata, + model, + layers=["counts"], + var={"feature_name": "feature_name"}, + model_params={"gene_col": "feature_name"}, ) -print(input_adata, flush=True) - -h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) -input_adata.write(h5ad_file.name) -del input_adata - -print("\n>>> Running model...", flush=True) -input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -input_params = {"gene_col": "feature_name"} -embedding = model.predict(input_df, params=input_params) print("\n>>> Storing output...", flush=True) output = ad.AnnData( @@ -84,7 +72,5 @@ print("\n>>> Cleaning up temporary files...", flush=True) if model_temp is not None: model_temp.cleanup() -h5ad_file.close() -os.unlink(h5ad_file.name) print("\n>>> Done!", flush=True) diff --git a/src/methods/scvi_mlflow/config.vsh.yaml b/src/methods/scvi_mlflow/config.vsh.yaml index 61ed6e78..85b6520f 100644 --- a/src/methods/scvi_mlflow/config.vsh.yaml +++ b/src/methods/scvi_mlflow/config.vsh.yaml @@ -34,26 +34,13 @@ resources: - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py - path: /src/utils/unpack.py + - path: /src/utils/mlflow.py - path: requirements.txt engines: - type: docker image: openproblems/base_pytorch_nvidia:1 - setup: - - type: docker - add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh - run: sh /uv-installer.sh && rm /uv-installer.sh - env: PATH="/root/.local/bin/:$PATH" - - type: docker - run: uv venv --python 3.11 /opt/venv - - type: docker - env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - add: requirements.txt /requirements.txt - run: uv pip install -r /requirements.txt && uv pip install mlflow==3.1.0 - - type: docker - run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + __merge__: /src/utils/mlflow_docker_setup.yaml runners: - type: executable diff --git a/src/methods/scvi_mlflow/script.py b/src/methods/scvi_mlflow/script.py index d8e92bb9..0c27a11a 100644 --- a/src/methods/scvi_mlflow/script.py +++ b/src/methods/scvi_mlflow/script.py @@ -1,12 +1,7 @@ -import os import sys -import tarfile -import tempfile -import zipfile import anndata as ad import mlflow.pyfunc -import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -20,9 +15,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from exit_codes import exit_non_applicable -from read_anndata_partial import read_anndata -from unpack import unpack_directory +from exit_codes import exit_non_applicable # noqa: E402 +from mlflow import embed # noqa: E402 +from read_anndata_partial import read_anndata # noqa: E402 +from unpack import unpack_directory # noqa: E402 print("====== scVI (MLflow model) ======", flush=True) @@ -49,20 +45,14 @@ model = mlflow.pyfunc.load_model(model_dir, model_config={"organism": organism}) print(model, flush=True) -print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData(X=adata.X.copy()) -input_adata.var_names = adata.var["feature_id"].values -input_adata.obs["batch"] = adata.obs["batch"].values -print(input_adata, flush=True) - -h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) -input_adata.write(h5ad_file.name) -del input_adata - -print("\n>>> Running model...", flush=True) -input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -embedding = model.predict(input_df) +print("\n>>> Embedding data...", flush=True) +embedding = embed( + adata, + model, + layers=["counts"], + obs=["batch"], + var={"feature_id": "feature_id"} +) print("\n>>> Storing output...", flush=True) output = ad.AnnData( @@ -86,7 +76,5 @@ print("\n>>> Cleaning up temporary files...", flush=True) if model_temp is not None: model_temp.cleanup() -h5ad_file.close() -os.unlink(h5ad_file.name) print("\n>>> Done!", flush=True) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index 3c017991..453ba275 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -38,28 +38,13 @@ resources: - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py - path: /src/utils/unpack.py + - path: /src/utils/mlflow.py - path: requirements.txt engines: - type: docker image: openproblems/base_pytorch_nvidia:1 - setup: - - type: docker - add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh - run: sh /uv-installer.sh && rm /uv-installer.sh - env: PATH="/root/.local/bin/:$PATH" - - type: docker - run: uv venv --python 3.11 /opt/venv - - type: docker - env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - add: requirements.txt /requirements.txt - run: uv pip install -r /requirements.txt - - type: docker - run: uv pip install mlflow==3.1.0 - - type: docker - run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + __merge__: /src/utils/mlflow_docker_setup.yaml runners: - type: executable diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index 9c675ba5..0ddacee8 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -1,12 +1,7 @@ -import os import sys -import tarfile -import tempfile -import zipfile import anndata as ad import mlflow.pyfunc -import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -20,9 +15,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from exit_codes import exit_non_applicable -from read_anndata_partial import read_anndata -from unpack import unpack_directory +from exit_codes import exit_non_applicable # noqa: E402 +from mlflow import embed # noqa: E402 +from read_anndata_partial import read_anndata # noqa: E402 +from unpack import unpack_directory # noqa: E402 print("====== TranscriptFormer (MLflow model) ======", flush=True) @@ -45,23 +41,20 @@ model = mlflow.pyfunc.load_model(model_dir) print(model, flush=True) -print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData( - X=adata.X.copy(), - var=adata.var.filter(items=["feature_id"]).rename( - columns={"feature_id": "ensembl_id"} - ), -) -input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing -print(input_adata, flush=True) -h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) -input_adata.write(h5ad_file.name) -del input_adata -print("\n>>> Running model...", flush=True) -input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -embedding = model.predict(input_df) +def process_transcriptformer_input(input_adata): + """Add TranscriptFormer-specific fields to input AnnData.""" + input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing + + +print("\n>>> Embedding data...", flush=True) +embedding = embed( + adata, + model, + layers=["counts"], + var={"feature_id": "ensembl_id"}, + process_adata=process_transcriptformer_input, +) print("\n>>> Storing output...", flush=True) output = ad.AnnData( @@ -85,7 +78,5 @@ print("\n>>> Cleaning up temporary files...", flush=True) if model_temp is not None: model_temp.cleanup() -h5ad_file.close() -os.unlink(h5ad_file.name) print("\n>>> Done!", flush=True) diff --git a/src/methods/uce_mlflow/config.vsh.yaml b/src/methods/uce_mlflow/config.vsh.yaml index a5e6b77c..564bc7de 100644 --- a/src/methods/uce_mlflow/config.vsh.yaml +++ b/src/methods/uce_mlflow/config.vsh.yaml @@ -34,28 +34,13 @@ resources: - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py - path: /src/utils/unpack.py + - path: /src/utils/mlflow.py - path: requirements.txt engines: - type: docker image: openproblems/base_pytorch_nvidia:1 - setup: - - type: docker - add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh - run: sh /uv-installer.sh && rm /uv-installer.sh - env: PATH="/root/.local/bin/:$PATH" - - type: docker - run: uv venv --python 3.11 /opt/venv - - type: docker - env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - add: requirements.txt /requirements.txt - run: uv pip install -r /requirements.txt - - type: docker - run: uv pip install mlflow==3.1.0 - - type: docker - run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems + __merge__: /src/utils/mlflow_docker_setup.yaml runners: - type: executable diff --git a/src/methods/uce_mlflow/script.py b/src/methods/uce_mlflow/script.py index eb594544..6e6fffb6 100644 --- a/src/methods/uce_mlflow/script.py +++ b/src/methods/uce_mlflow/script.py @@ -1,12 +1,7 @@ -import os import sys -import tarfile -import tempfile -import zipfile import anndata as ad import mlflow.pyfunc -import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -20,9 +15,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from exit_codes import exit_non_applicable -from read_anndata_partial import read_anndata -from unpack import unpack_directory +from exit_codes import exit_non_applicable # noqa: E402 +from mlflow import embed # noqa: E402 +from read_anndata_partial import read_anndata # noqa: E402 +from unpack import unpack_directory # noqa: E402 print("====== UCE (MLflow model) ======", flush=True) @@ -45,21 +41,13 @@ model = mlflow.pyfunc.load_model(model_dir) print(model, flush=True) -print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData( - X=adata.X.copy(), - var=adata.var.filter(items=["feature_name"]), +print("\n>>> Embedding data...", flush=True) +embedding = embed( + adata, + model, + layers=["counts"], + var={"feature_name": "feature_name"}, ) -print(input_adata, flush=True) - -h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) -input_adata.write(h5ad_file.name) -del input_adata - -print("\n>>> Running model...", flush=True) -input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -embedding = model.predict(input_df) print("\n>>> Storing output...", flush=True) output = ad.AnnData( @@ -83,7 +71,5 @@ print("\n>>> Cleaning up temporary files...", flush=True) if model_temp is not None: model_temp.cleanup() -h5ad_file.close() -os.unlink(h5ad_file.name) print("\n>>> Done!", flush=True) diff --git a/src/utils/mlflow.py b/src/utils/mlflow.py new file mode 100644 index 00000000..447614e6 --- /dev/null +++ b/src/utils/mlflow.py @@ -0,0 +1,174 @@ +""" +Common utilities for MLflow-based methods. +""" +import os +import tempfile + +import anndata as ad +import pandas as pd +import sklearn.neighbors + + +def create_temp_h5ad( + adata, layers=None, obs=None, var=None, obsm=None, varm=None, uns=None +): + """ + Create a temporary H5AD file with specified data from an AnnData object. 
+
+    Args:
+        adata: Input AnnData object
+        layers: List of layer names; the first listed layer is used as X (e.g., ["counts"])
+        obs: List of obs column names to include (e.g., ["batch"])
+        var: Dict mapping var column names to new names (e.g., {"feature_id": "ensembl_id"})
+        obsm: List of obsm keys to include
+        varm: List of varm keys to include
+        uns: List of uns keys to include
+
+    Returns:
+        tuple: (h5ad_file, input_adata) where h5ad_file is the NamedTemporaryFile and
+            input_adata is the created AnnData object
+    """
+    # Use the first listed layer as X, otherwise fall back to adata.X
+    if layers and len(layers) > 0:
+        X = adata.layers[layers[0]].copy()
+    else:
+        X = adata.X.copy()
+
+    # Create new AnnData
+    input_adata = ad.AnnData(X=X)
+
+    # Set var_names
+    input_adata.var_names = adata.var_names
+
+    # Add obs columns
+    if obs:
+        for obs_key in obs:
+            if obs_key in adata.obs:
+                input_adata.obs[obs_key] = adata.obs[obs_key].values
+
+    # Add var columns (with optional renaming)
+    if var:
+        for old_name, new_name in var.items():
+            if old_name in adata.var:
+                input_adata.var[new_name] = adata.var[old_name].values
+
+    # Add obsm
+    if obsm:
+        for obsm_key in obsm:
+            if obsm_key in adata.obsm:
+                input_adata.obsm[obsm_key] = adata.obsm[obsm_key].copy()
+
+    # Add varm
+    if varm:
+        for varm_key in varm:
+            if varm_key in adata.varm:
+                input_adata.varm[varm_key] = adata.varm[varm_key].copy()
+
+    # Add uns
+    if uns:
+        for uns_key in uns:
+            if uns_key in adata.uns:
+                input_adata.uns[uns_key] = adata.uns[uns_key]
+
+    # Write to temp file
+    h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
+    input_adata.write(h5ad_file.name)
+
+    return h5ad_file, input_adata
+
+
+def embed(adata, model, layers=None, obs=None, var=None, model_params=None, process_adata=None):
+    """
+    Embed data using an MLflow model.
+
+    Args:
+        adata: Input AnnData object to embed
+        model: Loaded MLflow model
+        layers: List of layer names; the first listed layer is used as X (e.g., ["counts"])
+        obs: List of obs column names to include (e.g., ["batch"])
+        var: Dict mapping var column names to new names (e.g., {"feature_id": "ensembl_id"})
+        model_params: Optional dict of parameters to pass to model.predict()
+        process_adata: Optional function applied to the input AnnData before the model runs (e.g., to add defaults)
+
+    Returns:
+        np.ndarray: Embeddings for the input data
+    """
+    print("Writing temporary input H5AD file...", flush=True)
+    h5ad_file, input_adata = create_temp_h5ad(adata, layers=layers, obs=obs, var=var)
+
+    # Apply any post-processing to input_adata
+    if process_adata:
+        process_adata(input_adata)
+
+    print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
+    print(input_adata, flush=True)
+
+    # Re-write the temporary file so the processed AnnData is what the model reads
+    input_adata.write(h5ad_file.name)
+
+    print("Running model...", flush=True)
+    input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
+    if model_params:
+        embedding = model.predict(input_df, params=model_params)
+    else:
+        embedding = model.predict(input_df)
+
+    # Clean up
+    h5ad_file.close()
+    os.unlink(h5ad_file.name)
+
+    return embedding
+
+
+def embed_and_classify(
+    train_adata,
+    test_adata,
+    model,
+    layers=None,
+    obs=None,
+    var=None,
+    model_params=None,
+    process_adata=None,
+    n_neighbors=5,
+):
+    """
+    Generic pipeline for embedding data and training a kNN classifier.
+
+    Args:
+        train_adata: Training AnnData object with labels
+        test_adata: Test AnnData object to predict
+        model: Loaded MLflow model
+        layers: List of layer names; the first listed layer is used as X (e.g., ["counts"])
+        obs: List of obs column names to include (e.g., ["batch"])
+        var: Dict mapping var column names to new names (e.g., {"feature_id": "ensembl_id"})
+        model_params: Optional dict of parameters to pass to model.predict()
+        process_adata: Optional function applied to the input AnnData before the model runs (e.g., to add defaults)
+        n_neighbors: Number of neighbors for kNN classifier
+
+    Returns:
+        np.ndarray: Predicted labels for test data
+    """
+    # Embed training data
+    print("\n>>> Embedding training data...", flush=True)
+    embedding_train = embed(
+        train_adata, model, layers=layers, obs=obs, var=var,
+        model_params=model_params, process_adata=process_adata
+    )
+
+    # Train kNN classifier
+    print("\n>>> Training kNN classifier...", flush=True)
+    classifier = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
+    classifier.fit(embedding_train, train_adata.obs["label"].astype(str))
+
+    # Embed test data
+    print("\n>>> Embedding test data...", flush=True)
+    embedding_test = embed(
+        test_adata, model, layers=layers, obs=obs, var=var,
+        model_params=model_params, process_adata=process_adata
+    )
+
+    # Classify
+    print("\n>>> Classifying test data...", flush=True)
+    predictions = classifier.predict(embedding_test)
+
+    return predictions
diff --git a/src/utils/mlflow_docker_setup.yaml b/src/utils/mlflow_docker_setup.yaml
new file mode 100644
index 00000000..aa03e9a7
--- /dev/null
+++ b/src/utils/mlflow_docker_setup.yaml
@@ -0,0 +1,14 @@
+- type: docker
+  add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh
+  run: sh /uv-installer.sh && rm /uv-installer.sh
+  env: PATH="/root/.local/bin/:$PATH"
+- type: docker
+  run: uv venv --python 3.11 /opt/venv
+- type: docker
+  env:
+    - VIRTUAL_ENV=/opt/venv
+    - PATH="/opt/venv/bin:$PATH"
+  add: requirements.txt /requirements.txt
+  run: uv pip install -r /requirements.txt && uv pip install mlflow==3.1.0
+- type: docker
+  run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems

From 28bb336ec5e099adbd2189181c78a99cc21c698d Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Sun, 12 Oct 2025 08:58:49 +0200
Subject: [PATCH 19/21] Use biggpu label for UCE

---
 src/methods/uce_mlflow/config.vsh.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/methods/uce_mlflow/config.vsh.yaml b/src/methods/uce_mlflow/config.vsh.yaml
index 564bc7de..96ccbd8b 100644
--- a/src/methods/uce_mlflow/config.vsh.yaml
+++ b/src/methods/uce_mlflow/config.vsh.yaml
@@ -46,4 +46,4 @@ runners:
   - type: executable
   - type: nextflow
     directives:
-      label: [hightime, highmem, midcpu, gpu]
+      label: [hightime, highmem, midcpu, biggpu]

From e480211b78b00ec84c167cbbe9783e24f165a66b Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Sun, 12 Oct 2025 08:59:01 +0200
Subject: [PATCH 20/21] Disable old methods

---
 src/workflows/run_benchmark/config.vsh.yaml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index f7d472d0..7872823e 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -92,7 +92,6 @@ dependencies:
   - name: methods/batchelor_mnn_correct
   - name: methods/bbknn
   - name: methods/combat
-  - name: methods/geneformer
   - name: methods/geneformer_mlflow
   - name: methods/harmony
   - name: methods/harmonypy
@@ -102,9 +101,7 @@
   - name: methods/scalex
   - name: methods/scanorama
   - name: methods/scanvi
-  - name: methods/scgpt_finetuned
   - name: methods/scgpt_mlflow
-  - name: methods/scgpt_zeroshot
   - name: methods/scimilarity
   - name: methods/scprint
   - name: methods/scvi
@@ -112,6 +109,10 @@
   - name: methods/transcriptformer_mlflow
   - name: methods/uce
   - name: methods/uce_mlflow
+  # outdated methods
+  # - name: methods/geneformer
+  # - name: methods/scgpt_finetuned
+  # - name: methods/scgpt_zeroshot
   # metrics
   - name: metrics/asw_batch
   - name: metrics/asw_label

From 329d25e2b182636234d82ee00a9948e997af75d7 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Sun, 19 Oct 2025 22:53:18 +0200
Subject: [PATCH 21/21] Fix workflow after disabling old methods

Signed-off-by: Robrecht Cannoodt
---
 src/workflows/run_benchmark/config.vsh.yaml | 4 ++--
 src/workflows/run_benchmark/main.nf         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 7872823e..d9ae23f4 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -102,6 +102,8 @@ dependencies:
   - name: methods/scanorama
   - name: methods/scanvi
   - name: methods/scgpt_mlflow
+  - name: methods/scgpt_finetuned
+  - name: methods/scgpt_zeroshot
   - name: methods/scimilarity
   - name: methods/scprint
   - name: methods/scvi
@@ -111,8 +113,6 @@
   - name: methods/uce_mlflow
   # outdated methods
   # - name: methods/geneformer
-  # - name: methods/scgpt_finetuned
-  # - name: methods/scgpt_zeroshot
   # metrics
   - name: metrics/asw_batch
   - name: metrics/asw_label

diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index 1fbebb45..88f83327 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -20,7 +20,7 @@ methods = [
   batchelor_mnn_correct,
   bbknn,
   combat,
-  geneformer,
+  // geneformer,
   geneformer_mlflow.run(
     args: [model: file("s3://openproblems-work/cache/geneformer-mlflow-model.zip")]
   ),
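
Usage note: a minimal sketch of how the shared helpers introduced in src/utils/mlflow.py
(patch 18) might be called outside of viash, for example when testing a model locally.
The model directory and H5AD paths below are hypothetical. The helper module is loaded
via importlib under a separate name ("mlflow_utils" is arbitrary) so that it does not
shadow the installed mlflow package; the method scripts above instead append
meta["resources_dir"] to sys.path and import it directly. The embed() and
embed_and_classify() calls follow the signatures defined in the diff above.

    import importlib.util

    import anndata as ad
    import mlflow.pyfunc

    # Load src/utils/mlflow.py under a different module name so it does not
    # collide with the installed mlflow package
    spec = importlib.util.spec_from_file_location("mlflow_utils", "src/utils/mlflow.py")
    helpers = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(helpers)

    # Hypothetical inputs: an MLflow model directory and an H5AD with a "counts" layer
    model = mlflow.pyfunc.load_model("path/to/mlflow_model")
    adata = ad.read_h5ad("dataset.h5ad")

    # Embed the dataset, exposing adata.var["feature_id"] to the model as "ensembl_id"
    embedding = helpers.embed(
        adata,
        model,
        layers=["counts"],
        var={"feature_id": "ensembl_id"},
    )
    adata.obsm["X_emb"] = embedding

    # Embed train/test data and predict labels with a kNN classifier;
    # train.obs must contain a "label" column
    train = ad.read_h5ad("train.h5ad")
    test = ad.read_h5ad("test.h5ad")
    labels = helpers.embed_and_classify(train, test, model, layers=["counts"], n_neighbors=5)

embed_and_classify() is not exercised by the diffs in this series; per its docstring it
is presumably intended for label-prediction-style tasks where train_adata.obs["label"]
is available.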