From 479437adba35d20952eabdc0f874e8ccb61591ae Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Jul 2025 13:02:05 +0200 Subject: [PATCH 1/9] Add transcriptformer_mlflow method component --- .../transcriptformer_mlflow/config.vsh.yaml | 60 +++++++++++++++ src/methods/transcriptformer_mlflow/script.py | 76 +++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 src/methods/transcriptformer_mlflow/config.vsh.yaml create mode 100644 src/methods/transcriptformer_mlflow/script.py diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml new file mode 100644 index 00000000..e693428a --- /dev/null +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -0,0 +1,60 @@ +__merge__: ../../api/base_method.yaml + +name: transcriptformer_mlflow +label: TranscriptFormer (MLflow model) +summary: "Context-aware representations of single-cell transcriptomes by jointly modeling genes and transcripts" +description: | + TranscriptFormer is designed to learn rich, context-aware representations of + single-cell transcriptomes while jointly modeling genes and transcripts using + a novel generative architecture. + + It is a family of generative foundation models representing a cross-species + generative cell atlas trained on up to 112 million cells spanning 1.53 billion + years of evolution across 12 species. + + Here, we use a version packaged as an MLflow model. +references: + doi: + - 10.1101/2025.04.25.650731 +links: + documentation: https://github.com/czi-ai/transcriptformer#readme + repository: https://github.com/czi-ai/transcriptformer + +info: + method_types: [embedding] + preferred_normalization: counts + +arguments: + - name: --model + type: file + description: | + An MLflow model URL for the transcriptformer model + required: true + +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + - path: /src/utils/exit_codes.py + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1 + setup: + - type: docker + add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh + run: sh /uv-installer.sh && rm /uv-installer.sh + env: PATH="/root/.local/bin/:$PATH" + - type: docker + run: uv venv --python 3.11 /opt/venv + - type: docker + env: + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + run: uv pip install mlflow==3.0.0 "transcriptformer>=0.3.0" + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, midmem, midcpu, gpu] diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py new file mode 100644 index 00000000..db8bda4d --- /dev/null +++ b/src/methods/transcriptformer_mlflow/script.py @@ -0,0 +1,76 @@ +import anndata as ad +import sys +import mlflow.pyfunc +from tempfile import NamedTemporaryFile +import os +import pandas as pd + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
+par = { + "input": "resources_test/.../input.h5ad", + "output": "output.h5ad", + "model": "resources_test/.../model", +} +meta = {"name": "transcriptformer_mlflow"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata +from exit_codes import exit_non_applicable + +print(f"====== TranscriptFormer (MLflow model) ======", flush=True) + +print("\n>>> Reading input files...", flush=True) +print(f"Input H5AD file: '{par['input']}'", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + exit_non_applicable( + f"Transcriptformer can only be used with human data " + f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + ) + +print(adata, flush=True) + +print("\n>>> Writing temporary H5AD file...", flush=True) +input_adata = ad.AnnData(X = adata.X.copy(), var = adata.var.filter(items=["feature_id"]).rename(columns = {"feature_id": "ensembl_id"})) +input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing +print(input_adata, flush=True) +h5ad_file = NamedTemporaryFile(suffix=".h5ad", delete=False) +print(f"Temporary H5AD file: '{h5ad_file}'", flush=True) +input_adata.write(h5ad_file.name) +del input_adata + +print("\n>>> Loading model...", flush=True) +model = mlflow.pyfunc.load_model(par["model"]) + +print("\n>>> Running model...", flush=True) +input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) +model.predict(input_df) + +print("\n>>> Storing output...", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + # obsm={ + # "X_emb": embedded.X, + # }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("\n>>> Writing output to file...", flush=True) +print(f"Output H5AD file: '{par['output']}'", flush=True) +output.write_h5ad(par["output"], compression="gzip") + +print("\n>>> Cleaning up temporary files...", flush=True) +h5ad_file.close() +os.unlink(h5ad_file.name) + +print("\n>>> Done!", flush=True) From 95ced9ba05092d9f83b14a61db9c538f6307149c Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Jul 2025 16:58:37 +0200 Subject: [PATCH 2/9] Adjust transcriptformer_mlflow dependencies --- src/methods/transcriptformer_mlflow/config.vsh.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index e693428a..ee874547 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -49,9 +49,9 @@ engines: run: uv venv --python 3.11 /opt/venv - type: docker env: - - VIRTUAL_ENV=/opt/venv - - PATH="/opt/venv/bin:$PATH" - run: uv pip install mlflow==3.0.0 "transcriptformer>=0.3.0" + - VIRTUAL_ENV=/opt/venv + - PATH="/opt/venv/bin:$PATH" + run: uv pip install mlflow==3.1.0 pandas==2.3.0 scanpy==1.11.2 scipy==1.16.0 "transcriptformer>=0.3.0" runners: - type: executable From 6e8dce2e7006a5bdc80593a69dc8972ef34dd73b Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Jul 2025 17:00:50 +0200 Subject: [PATCH 3/9] Output embedding in transcriptformer_mlflow --- src/methods/transcriptformer_mlflow/script.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index db8bda4d..6c62f3f4 100644 --- 
a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -39,7 +39,7 @@ input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing print(input_adata, flush=True) h5ad_file = NamedTemporaryFile(suffix=".h5ad", delete=False) -print(f"Temporary H5AD file: '{h5ad_file}'", flush=True) +print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) input_adata.write(h5ad_file.name) del input_adata @@ -48,15 +48,15 @@ print("\n>>> Running model...", flush=True) input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) -model.predict(input_df) +embedding = model.predict(input_df) print("\n>>> Storing output...", flush=True) output = ad.AnnData( obs=adata.obs[[]], var=adata.var[[]], - # obsm={ - # "X_emb": embedded.X, - # }, + obsm={ + "X_emb": embedding, + }, uns={ "dataset_id": adata.uns["dataset_id"], "normalization_id": adata.uns["normalization_id"], From 440a18b8c5ea0446cf839f9009f56538194b66c3 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Jul 2025 17:26:59 +0200 Subject: [PATCH 4/9] Install transcriptformer requirements from file --- src/methods/transcriptformer_mlflow/config.vsh.yaml | 6 +++++- src/methods/transcriptformer_mlflow/script.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index ee874547..c1524f87 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -36,6 +36,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py + - path: requirements.txt engines: - type: docker @@ -51,7 +52,10 @@ engines: env: - VIRTUAL_ENV=/opt/venv - PATH="/opt/venv/bin:$PATH" - run: uv pip install mlflow==3.1.0 pandas==2.3.0 scanpy==1.11.2 scipy==1.16.0 "transcriptformer>=0.3.0" + add: requirements.txt /requirements.txt + run: uv pip install -r /requirements.txt + - type: docker + run: uv pip install mlflow==3.1.0 runners: - type: executable diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index 6c62f3f4..f2cd1e40 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -45,6 +45,7 @@ print("\n>>> Loading model...", flush=True) model = mlflow.pyfunc.load_model(par["model"]) +print(model, flush=True) print("\n>>> Running model...", flush=True) input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) From 5332e00cd7a71b26f5044c803dfd94931d28b941 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 9 Jul 2025 10:16:16 +0200 Subject: [PATCH 5/9] Add extracting model archive to transcriptformer --- .../transcriptformer_mlflow/config.vsh.yaml | 3 +- .../transcriptformer_mlflow/requirements.txt | 338 ++++++++++++++++++ src/methods/transcriptformer_mlflow/script.py | 44 ++- 3 files changed, 377 insertions(+), 8 deletions(-) create mode 100644 src/methods/transcriptformer_mlflow/requirements.txt diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index c1524f87..ba708fa1 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -28,7 +28,8 @@ arguments: - name: --model type: file description: | - An MLflow model URL for the transcriptformer model + An MLflow model URI for the transcriptformer model. 
If it is a .zip or + .tar.gz file it will be extracted to a temporary directory. required: true resources: diff --git a/src/methods/transcriptformer_mlflow/requirements.txt b/src/methods/transcriptformer_mlflow/requirements.txt new file mode 100644 index 00000000..70d923d1 --- /dev/null +++ b/src/methods/transcriptformer_mlflow/requirements.txt @@ -0,0 +1,338 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt +aiobotocore==2.23.0 + # via s3fs +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.13 + # via + # aiobotocore + # fsspec + # s3fs +aioitertools==0.12.0 + # via aiobotocore +aiosignal==1.3.2 + # via aiohttp +anndata==0.11.4 + # via + # cellxgene-census + # scanpy + # somacore + # tiledbsoma + # transcriptformer +antlr4-python3-runtime==4.9.3 + # via + # hydra-core + # omegaconf +array-api-compat==1.12.0 + # via anndata +attrs==25.3.0 + # via + # aiohttp + # somacore + # tiledbsoma +boto3==1.38.27 + # via transcriptformer +botocore==1.38.27 + # via + # aiobotocore + # boto3 + # s3transfer +cellxgene-census==1.17.0 + # via transcriptformer +certifi==2025.6.15 + # via requests +charset-normalizer==3.4.2 + # via requests +contourpy==1.3.2 + # via matplotlib +cycler==0.12.1 + # via matplotlib +filelock==3.18.0 + # via + # torch + # triton +fonttools==4.58.4 + # via matplotlib +frozenlist==1.7.0 + # via + # aiohttp + # aiosignal +fsspec==2025.5.1 + # via + # pytorch-lightning + # s3fs + # torch +h5py==3.14.0 + # via + # anndata + # scanpy + # transcriptformer +hydra-core==1.3.2 + # via transcriptformer +idna==3.10 + # via + # requests + # yarl +iniconfig==2.1.0 + # via pytest +jinja2==3.1.6 + # via torch +jmespath==1.0.1 + # via + # aiobotocore + # boto3 + # botocore +joblib==1.5.1 + # via + # pynndescent + # scanpy + # scikit-learn +kiwisolver==1.4.8 + # via matplotlib +legacy-api-wrap==1.4.1 + # via scanpy +lightning-utilities==0.14.3 + # via + # pytorch-lightning + # torchmetrics +llvmlite==0.44.0 + # via + # numba + # pynndescent +markupsafe==3.0.2 + # via jinja2 +matplotlib==3.10.3 + # via + # scanpy + # seaborn +more-itertools==10.7.0 + # via tiledbsoma +mpmath==1.3.0 + # via sympy +multidict==6.6.0 + # via + # aiobotocore + # aiohttp + # yarl +natsort==8.4.0 + # via + # anndata + # scanpy +networkx==3.5 + # via + # scanpy + # torch +numba==0.61.2 + # via + # pynndescent + # scanpy + # umap-learn +numpy==2.2.6 + # via + # anndata + # cellxgene-census + # contourpy + # h5py + # matplotlib + # numba + # pandas + # patsy + # scanpy + # scikit-learn + # scipy + # seaborn + # shapely + # somacore + # statsmodels + # tiledbsoma + # torchmetrics + # transcriptformer + # umap-learn +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.575.51 + # via pynvml +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +omegaconf==2.3.0 + # via hydra-core +packaging==25.0 + # via + # anndata + # hydra-core + # lightning-utilities + # matplotlib + # 
pytest + # pytorch-lightning + # scanpy + # statsmodels + # torchmetrics +pandas==2.3.0 + # via + # anndata + # scanpy + # seaborn + # somacore + # statsmodels + # tiledbsoma + # transcriptformer +patsy==1.0.1 + # via + # scanpy + # statsmodels +pillow==11.2.1 + # via matplotlib +pluggy==1.6.0 + # via pytest +propcache==0.3.2 + # via + # aiohttp + # yarl +psutil==7.0.0 + # via transcriptformer +pyarrow==20.0.0 + # via + # somacore + # tiledbsoma +pyarrow-hotfix==0.7 + # via somacore +pygments==2.19.2 + # via pytest +pynndescent==0.5.13 + # via + # scanpy + # umap-learn +pynvml==12.0.0 + # via transcriptformer +pyparsing==3.2.3 + # via matplotlib +pytest==8.4.1 + # via transcriptformer +python-dateutil==2.9.0.post0 + # via + # aiobotocore + # botocore + # matplotlib + # pandas +pytorch-lightning==2.5.2 + # via transcriptformer +pytz==2025.2 + # via pandas +pyyaml==6.0.2 + # via + # omegaconf + # pytorch-lightning +requests==2.32.4 + # via cellxgene-census +s3fs==2025.5.1 + # via cellxgene-census +s3transfer==0.13.0 + # via boto3 +scanpy==1.11.2 + # via + # tiledbsoma + # transcriptformer +scikit-learn==1.7.0 + # via + # pynndescent + # scanpy + # umap-learn +scipy==1.16.0 + # via + # anndata + # pynndescent + # scanpy + # scikit-learn + # somacore + # statsmodels + # tiledbsoma + # transcriptformer + # umap-learn +seaborn==0.13.2 + # via scanpy +session-info2==0.1.2 + # via scanpy +setuptools==80.9.0 + # via lightning-utilities +shapely==2.1.1 + # via somacore +six==1.17.0 + # via python-dateutil +somacore==1.0.28 + # via tiledbsoma +statsmodels==0.14.4 + # via scanpy +sympy==1.13.1 + # via torch +threadpoolctl==3.6.0 + # via scikit-learn +tiledbsoma==1.17.0 + # via cellxgene-census +timeout-decorator==0.5.0 + # via transcriptformer +torch==2.5.1 + # via + # pytorch-lightning + # torchmetrics + # transcriptformer +torchmetrics==1.7.3 + # via pytorch-lightning +tqdm==4.67.1 + # via + # pytorch-lightning + # scanpy + # umap-learn +transcriptformer==0.3.0 + # via -r requirements.in +triton==3.1.0 + # via torch +typing-extensions==4.14.0 + # via + # cellxgene-census + # lightning-utilities + # pytorch-lightning + # scanpy + # somacore + # tiledbsoma + # torch +tzdata==2025.2 + # via pandas +umap-learn==0.5.7 + # via scanpy +urllib3==2.5.0 + # via + # botocore + # requests +wrapt==1.17.2 + # via aiobotocore +yarl==1.20.1 + # via aiohttp diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index f2cd1e40..76b41023 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -1,9 +1,11 @@ import anndata as ad import sys import mlflow.pyfunc -from tempfile import NamedTemporaryFile +import tempfile import os import pandas as pd +import zipfile +import tarfile ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -34,19 +36,45 @@ print(adata, flush=True) -print("\n>>> Writing temporary H5AD file...", flush=True) +if os.path.isdir(par["model"]): + print("\n>>> Using model directory...", flush=True) + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("\n>>> Extracting model from .zip...", flush=True) + print(f".zip path: '{par['model']}'", flush=True) + with zipfile.ZipFile(par["model"], "r") as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith( + ".tar.gz" + ): + print("\n>>> Extracting model from .tar.gz...", flush=True) + print(f".tar.gz path: '{par['model']}'", flush=True) + with tarfile.open(par["model"], "r:gz") as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError( + "The 'model' argument should be a directory a .zip file or a .tar.gz file" + ) + +print("\n>>> Loading model...", flush=True) +model = mlflow.pyfunc.load_model(model_dir) +print(model, flush=True) + +print("\n>>> Writing temporary input H5AD file...", flush=True) input_adata = ad.AnnData(X = adata.X.copy(), var = adata.var.filter(items=["feature_id"]).rename(columns = {"feature_id": "ensembl_id"})) input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing print(input_adata, flush=True) -h5ad_file = NamedTemporaryFile(suffix=".h5ad", delete=False) +h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) input_adata.write(h5ad_file.name) del input_adata -print("\n>>> Loading model...", flush=True) -model = mlflow.pyfunc.load_model(par["model"]) -print(model, flush=True) - print("\n>>> Running model...", flush=True) input_df = pd.DataFrame({"input_uri": [h5ad_file.name]}) embedding = model.predict(input_df) @@ -71,6 +99,8 @@ output.write_h5ad(par["output"], compression="gzip") print("\n>>> Cleaning up temporary files...", flush=True) +if model_temp is not None: + model_temp.cleanup() h5ad_file.close() os.unlink(h5ad_file.name) From b84a40aec116be12dd1b4069cd617ea8ac1485b4 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 9 Jul 2025 10:16:46 +0200 Subject: [PATCH 6/9] Add transcriptformer_mlflow to benchmark workflow --- scripts/run_benchmark/run_full_local.sh | 2 +- scripts/run_benchmark/run_test_local.sh | 2 +- src/workflows/run_benchmark/config.vsh.yaml | 1 + src/workflows/run_benchmark/main.nf | 3 +++ 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 20e434b3..b60940c9 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -26,7 +26,7 @@ input_states: resources/datasets/**/state.yaml rename_keys: 'input_dataset:output_dataset;input_solution:output_solution' output_state: "state.yaml" publish_dir: "$publish_dir" -settings: '{"methods_exclude": ["uce", "scgpt_finetuned"]}' +settings: '{"methods_exclude": ["uce", "scgpt_finetuned", "transcriptformer_mlflow"]}' HERE # run the benchmark diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh index 85e39583..4b7bf15e 100755 --- a/scripts/run_benchmark/run_test_local.sh +++ b/scripts/run_benchmark/run_test_local.sh @@ -21,7 +21,7 @@ input_states: resources_test/task_batch_integration/**/state.yaml rename_keys: 
'input_dataset:output_dataset;input_solution:output_solution' output_state: "state.yaml" publish_dir: "$publish_dir" -settings: '{"methods_exclude": ["uce", "scgpt_finetuned"]}' +settings: '{"methods_exclude": ["uce", "scgpt_finetuned", "transcriptformer_mlflow"]}' HERE nextflow run . \ diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 09905ad0..d9fe9504 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -106,6 +106,7 @@ dependencies: - name: methods/scimilarity - name: methods/scprint - name: methods/scvi + - name: methods/transcriptformer_mlflow - name: methods/uce # metrics - name: metrics/asw_batch diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 6196f749..104485bd 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -40,6 +40,9 @@ methods = [ ), scprint, scvi, + transcriptformer_mlflow.run( + args: [model: file("s3://openproblems-work/cache/transcriptformer-mlflow-model.zip")] + ), uce.run( args: [model: file("s3://openproblems-work/cache/uce-model-v5.zip")] ) From 74be8558a5d2d31701acf338c908615f03d54d8e Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 9 Jul 2025 11:38:05 +0200 Subject: [PATCH 7/9] Install openproblems package for transcriptformer --- src/methods/transcriptformer_mlflow/config.vsh.yaml | 2 ++ src/methods/transcriptformer_mlflow/script.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index ba708fa1..9cb3544f 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -57,6 +57,8 @@ engines: run: uv pip install -r /requirements.txt - type: docker run: uv pip install mlflow==3.1.0 + - type: docker + run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems runners: - type: executable diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index 76b41023..c5c37575 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -38,6 +38,7 @@ if os.path.isdir(par["model"]): print("\n>>> Using model directory...", flush=True) + print(f"Directory path: '{par['model']}'", flush=True) model_temp = None model_dir = par["model"] else: From 8693f35176d8245b61ae18c76c1fc5b8181ee51a Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 9 Jul 2025 12:16:12 +0200 Subject: [PATCH 8/9] Style transcriptformer_mlflow script --- src/methods/transcriptformer_mlflow/script.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py index c5c37575..b16806d3 100644 --- a/src/methods/transcriptformer_mlflow/script.py +++ b/src/methods/transcriptformer_mlflow/script.py @@ -1,11 +1,12 @@ -import anndata as ad +import os import sys -import mlflow.pyfunc +import tarfile import tempfile -import os -import pandas as pd import zipfile -import tarfile + +import anndata as ad +import mlflow.pyfunc +import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -19,10 +20,10 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata from exit_codes import exit_non_applicable +from read_anndata_partial import read_anndata -print(f"====== TranscriptFormer (MLflow model) ======", flush=True) +print("====== TranscriptFormer (MLflow model) ======", flush=True) print("\n>>> Reading input files...", flush=True) print(f"Input H5AD file: '{par['input']}'", flush=True) @@ -31,7 +32,7 @@ if adata.uns["dataset_organism"] != "homo_sapiens": exit_non_applicable( f"Transcriptformer can only be used with human data " - f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + f'(dataset_organism == "{adata.uns["dataset_organism"]}")' ) print(adata, flush=True) @@ -50,9 +51,7 @@ print(f".zip path: '{par['model']}'", flush=True) with zipfile.ZipFile(par["model"], "r") as zip_file: zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith( - ".tar.gz" - ): + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): print("\n>>> Extracting model from .tar.gz...", flush=True) print(f".tar.gz path: '{par['model']}'", flush=True) with tarfile.open(par["model"], "r:gz") as tar_file: @@ -68,8 +67,13 @@ print(model, flush=True) print("\n>>> Writing temporary input H5AD file...", flush=True) -input_adata = ad.AnnData(X = adata.X.copy(), var = adata.var.filter(items=["feature_id"]).rename(columns = {"feature_id": "ensembl_id"})) -input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing +input_adata = ad.AnnData( + X=adata.X.copy(), + var=adata.var.filter(items=["feature_id"]).rename( + columns={"feature_id": "ensembl_id"} + ), +) +input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing print(input_adata, flush=True) h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True) From b7dae0b4b7b98a17a3966ded2ffaaa8335379a20 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 29 Jul 2025 10:45:10 +0200 Subject: [PATCH 9/9] Increase transcriptformer memory label --- src/methods/transcriptformer_mlflow/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml index 9cb3544f..2d144c23 100644 --- a/src/methods/transcriptformer_mlflow/config.vsh.yaml +++ b/src/methods/transcriptformer_mlflow/config.vsh.yaml @@ -64,4 +64,4 @@ runners: - type: executable - type: nextflow directives: - label: [hightime, midmem, midcpu, gpu] + label: [hightime, highmem, midcpu, gpu]
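
A minimal sketch of the inference flow these patches wire together, for reference only and not part of the patch series itself. It assumes an already-extracted local MLflow model directory and a prepared H5AD file whose var table has an "ensembl_id" column, mirroring what script.py sets up before calling the model; the paths below are placeholders.

    import anndata as ad
    import mlflow.pyfunc
    import pandas as pd

    # Placeholders -- substitute a real extracted MLflow model directory and a
    # prepared H5AD file (raw counts in X, var column "ensembl_id").
    model_dir = "path/to/transcriptformer-mlflow-model"
    h5ad_path = "path/to/prepared_input.h5ad"

    # Load the packaged model; a local directory is a valid MLflow model URI.
    model = mlflow.pyfunc.load_model(model_dir)

    # As in script.py, the wrapper consumes a DataFrame with an "input_uri"
    # column pointing at the H5AD file and predict() returns the cell embedding.
    embedding = model.predict(pd.DataFrame({"input_uri": [h5ad_path]}))

    # Attach the embedding for downstream use, as the component does when it
    # builds its output AnnData (obsm["X_emb"]).
    adata = ad.read_h5ad(h5ad_path)
    adata.obsm["X_emb"] = embedding
    print(adata)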