From e765a2d89c0ec2d865cb6e98f7e248e3cb233e42 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 25 Jun 2025 10:34:40 +0200 Subject: [PATCH 1/8] Add scgpt_czbenchmarks component --- .../scgpt_czbenchmarks/config.vsh.yaml | 57 +++++++++++++++++++ src/methods/scgpt_czbenchmarks/script.py | 55 ++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 src/methods/scgpt_czbenchmarks/config.vsh.yaml create mode 100644 src/methods/scgpt_czbenchmarks/script.py diff --git a/src/methods/scgpt_czbenchmarks/config.vsh.yaml b/src/methods/scgpt_czbenchmarks/config.vsh.yaml new file mode 100644 index 00000000..ecb2a1c3 --- /dev/null +++ b/src/methods/scgpt_czbenchmarks/config.vsh.yaml @@ -0,0 +1,57 @@ +__merge__: ../../api/base_method.yaml + +name: scgpt_czbenchmarks +label: scGPT (CZ Benchmarks) +summary: "A foundation model for single-cell biology (CZ Benchmarks implementation)" +description: | + scGPT is a foundation model for single-cell biology based on a generative + pre-trained transformer and trained on a repository of over 33 million cells. + + Here, we use zero-shot output from a pre-trained model to get an integrated + embedding for the batch integration task. +references: + doi: + - 10.1038/s41592-024-02201-0 +links: + documentation: https://scgpt.readthedocs.io/en/latest/ + repository: https://github.com/chanzuckerberg/cz-benchmarks/tree/main/docker/scgpt + +info: + method_types: [embedding] + preferred_normalization: counts + +# arguments: +# - name: --model_name +# type: string +# description: String giving the name of the scGPT model to use +# choices: ["scGPT_human", "scGPT_CP"] +# default: "scGPT_human" +# - name: --model +# type: file +# description: | +# Path to the directory containing the scGPT model specified by model_name +# or a .zip/.tar.gz archive to extract. If not given the model will be +# downloaded. +# required: false +# - name: --n_hvg +# type: integer +# default: 3000 +# description: Number of highly variable genes to use. + +resources: + - type: python_script + path: script.py + - path: https://raw.githubusercontent.com/chanzuckerberg/cz-benchmarks/refs/heads/main/docker/scgpt/model.py + dest: model.py + - path: /src/utils/read_anndata_partial.py + - path: /src/utils/exit_codes.py + +engines: + - type: docker + image: public.ecr.aws/czi-virtual-cells/cz-benchmarks-models-public:scgpt + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, midmem, midcpu, gpu] diff --git a/src/methods/scgpt_czbenchmarks/script.py b/src/methods/scgpt_czbenchmarks/script.py new file mode 100644 index 00000000..6c287a8a --- /dev/null +++ b/src/methods/scgpt_czbenchmarks/script.py @@ -0,0 +1,55 @@ +import sys + +import anndata as ad +import scgpt +import czbenchmarks + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
+par = { + "input": "resources_test/.../input.h5ad", + "output": "output.h5ad", +} +meta = {"name": "scgpt_czbenchmarks"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from model import ScGPT +from read_anndata_partial import read_anndata +from exit_codes import exit_non_applicable + +print(f"====== scGPT version {scgpt.__version__} (czbenchmarks {czbenchmarks.__version__}) ======", flush=True) + +print("\n>>> Reading input files...", flush=True) +print(f"Input H5AD file: '{par['input']}'", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + exit_non_applicable( + f"scGPT can only be used with human data " + f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + ) + +print(adata, flush=True) + +print("\n>>> Storing output...", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + # obsm={ + # "X_emb": embedded.X, + # }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("\n>>> Writing output to file...", flush=True) +print(f"Output H5AD file: '{par['output']}'", flush=True) +output.write_h5ad(par["output"], compression="gzip") + +print("\n>>> Done!", flush=True) From ecef8200beeb7163fcbeefe33601a5a54771f56f Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 25 Jun 2025 11:06:14 +0200 Subject: [PATCH 2/8] Install python-is-python3 for scgpt_czbenchmarks --- src/methods/scgpt_czbenchmarks/config.vsh.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/methods/scgpt_czbenchmarks/config.vsh.yaml b/src/methods/scgpt_czbenchmarks/config.vsh.yaml index ecb2a1c3..c07a8bc8 100644 --- a/src/methods/scgpt_czbenchmarks/config.vsh.yaml +++ b/src/methods/scgpt_czbenchmarks/config.vsh.yaml @@ -49,6 +49,10 @@ resources: engines: - type: docker image: public.ecr.aws/czi-virtual-cells/cz-benchmarks-models-public:scgpt + setup: + - type: apt + packages: + - python-is-python3 runners: - type: executable From 1aae8999d415583b256bb058dbd4be7adb24eb79 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 25 Jun 2025 11:09:19 +0200 Subject: [PATCH 3/8] Use included model script instead of resource --- src/methods/scgpt_czbenchmarks/config.vsh.yaml | 2 -- src/methods/scgpt_czbenchmarks/script.py | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/methods/scgpt_czbenchmarks/config.vsh.yaml b/src/methods/scgpt_czbenchmarks/config.vsh.yaml index c07a8bc8..e674ce99 100644 --- a/src/methods/scgpt_czbenchmarks/config.vsh.yaml +++ b/src/methods/scgpt_czbenchmarks/config.vsh.yaml @@ -41,8 +41,6 @@ info: resources: - type: python_script path: script.py - - path: https://raw.githubusercontent.com/chanzuckerberg/cz-benchmarks/refs/heads/main/docker/scgpt/model.py - dest: model.py - path: /src/utils/read_anndata_partial.py - path: /src/utils/exit_codes.py diff --git a/src/methods/scgpt_czbenchmarks/script.py b/src/methods/scgpt_czbenchmarks/script.py index 6c287a8a..63db65c6 100644 --- a/src/methods/scgpt_czbenchmarks/script.py +++ b/src/methods/scgpt_czbenchmarks/script.py @@ -15,11 +15,13 @@ ## VIASH END sys.path.append(meta["resources_dir"]) -from model import ScGPT from read_anndata_partial import read_anndata from exit_codes import exit_non_applicable -print(f"====== scGPT version {scgpt.__version__} (czbenchmarks {czbenchmarks.__version__}) ======", flush=True) +sys.path.append("/app") +from model import ScGPT + +print(f"====== 
scGPT version {scgpt.__version__} (czbenchmarks) ======", flush=True) print("\n>>> Reading input files...", flush=True) print(f"Input H5AD file: '{par['input']}'", flush=True) From 57af280968205ea1e423ee46a573efddf66ea525 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 25 Jun 2025 13:32:30 +0200 Subject: [PATCH 4/8] Implement scgpt_czbenchmarks --- src/methods/scgpt_czbenchmarks/script.py | 58 ++++++++++++++++++------ 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/src/methods/scgpt_czbenchmarks/script.py b/src/methods/scgpt_czbenchmarks/script.py index 63db65c6..5c453a6d 100644 --- a/src/methods/scgpt_czbenchmarks/script.py +++ b/src/methods/scgpt_czbenchmarks/script.py @@ -1,8 +1,12 @@ import sys +import os import anndata as ad import scgpt -import czbenchmarks + +from czbenchmarks.datasets.single_cell import SingleCellDataset +from czbenchmarks.datasets.types import Organism, DataType +from czbenchmarks.models.types import ModelType ## VIASH START # Note: this section is auto-generated by viash at runtime. To edit it, make changes @@ -23,28 +27,54 @@ print(f"====== scGPT version {scgpt.__version__} (czbenchmarks) ======", flush=True) -print("\n>>> Reading input files...", flush=True) -print(f"Input H5AD file: '{par['input']}'", flush=True) -adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") +# Check organism and exit if needed +adata_uns = read_anndata(par["input"], uns="uns") -if adata.uns["dataset_organism"] != "homo_sapiens": +if adata_uns.uns["dataset_organism"] != "homo_sapiens": exit_non_applicable( f"scGPT can only be used with human data " - f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + f"(dataset_organism == \"{adata_uns.uns['dataset_organism']}\")" ) -print(adata, flush=True) +del adata_uns + +print("\n>>> Creating input dataset..", flush=True) +dataset = SingleCellDataset(path = par["input"], organism = Organism.HUMAN) +print(dataset) +dataset.load_data() +dataset.adata.X = dataset.adata.layers["counts"].copy() +print(dataset.adata, flush=True) + +print("\n>>> Running scGPT..", flush=True) +model = ScGPT() +# Run these steps manually instead of using model.run() to avoid reloading data +print("Validating data...", flush=True) +dataset.validate() +model.validate_dataset(dataset) +print("Data validated successfully", flush=True) + +print("Downloading model weights...", flush=True) +if not os.path.exists("/weights/human"): + os.makedirs("/weights/human") +model.download_model_weights(dataset) +print("Model weights downloaded successfully", flush=True) + +print("Running model...", flush=True) +model.run_model(dataset) +print("Model ran successfully", flush=True) + +embedding = dataset.get_output(ModelType.SCGPT, DataType.EMBEDDING) print("\n>>> Storing output...", flush=True) output = ad.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - # obsm={ - # "X_emb": embedded.X, - # }, + obs=dataset.adata.obs[[]], + var=dataset.adata.var[[]], + obsm={ + "X_emb": embedding, + }, uns={ - "dataset_id": adata.uns["dataset_id"], - "normalization_id": adata.uns["normalization_id"], + "dataset_id": dataset.adata.uns["dataset_id"], + "normalization_id": dataset.adata.uns["normalization_id"], "method_id": meta["name"], }, ) From 072c8dd05fad6c52cfb303365e63fb39c06ffa8b Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 25 Jun 2025 14:51:52 +0200 Subject: [PATCH 5/8] Add scgpt_czbenchmarks to run_benchmark workflow --- src/workflows/run_benchmark/config.vsh.yaml | 1 + src/workflows/run_benchmark/main.nf | 1 + 2 files 
changed, 2 insertions(+) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 09905ad0..c3e987db 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -101,6 +101,7 @@ dependencies: - name: methods/scalex - name: methods/scanorama - name: methods/scanvi + - name: methods/scgpt_czbenchmarks - name: methods/scgpt_finetuned - name: methods/scgpt_zeroshot - name: methods/scimilarity diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 6196f749..68321f50 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -29,6 +29,7 @@ methods = [ scalex, scanorama, scanvi, + scgpt_czbenchmarks scgpt_finetuned.run( args: [model: file("s3://openproblems-work/cache/scGPT_human.zip")] ), From c51d694c22755805d327066e46efb6ab6e3d4eb1 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 25 Jun 2025 14:55:09 +0200 Subject: [PATCH 6/8] Remove commented arguments from scgpt_czbenchmarks --- src/methods/scgpt_czbenchmarks/config.vsh.yaml | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/methods/scgpt_czbenchmarks/config.vsh.yaml b/src/methods/scgpt_czbenchmarks/config.vsh.yaml index e674ce99..945f988c 100644 --- a/src/methods/scgpt_czbenchmarks/config.vsh.yaml +++ b/src/methods/scgpt_czbenchmarks/config.vsh.yaml @@ -20,24 +20,6 @@ info: method_types: [embedding] preferred_normalization: counts -# arguments: -# - name: --model_name -# type: string -# description: String giving the name of the scGPT model to use -# choices: ["scGPT_human", "scGPT_CP"] -# default: "scGPT_human" -# - name: --model -# type: file -# description: | -# Path to the directory containing the scGPT model specified by model_name -# or a .zip/.tar.gz archive to extract. If not given the model will be -# downloaded. -# required: false -# - name: --n_hvg -# type: integer -# default: 3000 -# description: Number of highly variable genes to use. - resources: - type: python_script path: script.py From b8e50f82d5555d51b8cc0da486879ad7d39d54bd Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 25 Jun 2025 16:49:16 +0200 Subject: [PATCH 7/8] Bump viash version --- _viash.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_viash.yaml b/_viash.yaml index 5b612d43..626b71f8 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -1,4 +1,4 @@ -viash_version: 0.9.0 +viash_version: 0.9.4 name: task_batch_integration organization: openproblems-bio From 67662affaea5c06c1a60731c13c2529cb017f48a Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Thu, 26 Jun 2025 17:35:04 +0200 Subject: [PATCH 8/8] Fix comma in run_benchmark Co-authored-by: Robrecht Cannoodt --- src/workflows/run_benchmark/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 68321f50..c4356578 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -29,7 +29,7 @@ methods = [ scalex, scanorama, scanvi, - scgpt_czbenchmarks + scgpt_czbenchmarks, scgpt_finetuned.run( args: [model: file("s3://openproblems-work/cache/scGPT_human.zip")] ),
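Note on trying the new component: the core zero-shot flow that the final `script.py` arrives at can also be exercised by hand inside the cz-benchmarks scGPT container image used by the Docker engine above. The sketch below is a minimal outline based only on the calls that appear in the patches; the input path is a placeholder, and it assumes the image's bundled `/app/model.py` wrapper, a writable `/weights` directory, and an H5AD file that stores raw counts in a `counts` layer.

```python
import os
import sys

from czbenchmarks.datasets.single_cell import SingleCellDataset
from czbenchmarks.datasets.types import Organism, DataType
from czbenchmarks.models.types import ModelType

# The ScGPT wrapper class ships inside the container image, not with czbenchmarks itself
sys.path.append("/app")
from model import ScGPT

# Placeholder input: any human H5AD with raw counts stored in layers["counts"]
dataset = SingleCellDataset(path="input.h5ad", organism=Organism.HUMAN)
dataset.load_data()
dataset.adata.X = dataset.adata.layers["counts"].copy()  # the model expects counts in X

model = ScGPT()

# Same manual steps as in script.py, avoiding model.run() so the data is not reloaded
dataset.validate()
model.validate_dataset(dataset)

os.makedirs("/weights/human", exist_ok=True)  # script.py creates this before downloading
model.download_model_weights(dataset)
model.run_model(dataset)

embedding = dataset.get_output(ModelType.SCGPT, DataType.EMBEDDING)
print(embedding.shape)  # one row per cell in the input
```

The embedding returned by `get_output` is what `script.py` stores in `obsm["X_emb"]` of the output AnnData.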