From 0ed1b06988fce28ab9bf59ee12df4f941f8d1e03 Mon Sep 17 00:00:00 2001 From: "Daniel E. Schaffer" Date: Sun, 19 Oct 2025 19:16:05 -0400 Subject: [PATCH 1/3] Split Scanorama --- .../config.vsh.yaml | 6 +- .../script.py | 8 +-- .../scanorama_integrate/config.vsh.yaml | 42 +++++++++++++ src/methods/scanorama_integrate/script.py | 59 +++++++++++++++++++ src/workflows/run_benchmark/config.vsh.yaml | 3 +- src/workflows/run_benchmark/main.nf | 3 +- 6 files changed, 110 insertions(+), 11 deletions(-) rename src/methods/{scanorama => scanorama_correct}/config.vsh.yaml (94%) rename src/methods/{scanorama => scanorama_correct}/script.py (92%) create mode 100644 src/methods/scanorama_integrate/config.vsh.yaml create mode 100644 src/methods/scanorama_integrate/script.py diff --git a/src/methods/scanorama/config.vsh.yaml b/src/methods/scanorama_correct/config.vsh.yaml similarity index 94% rename from src/methods/scanorama/config.vsh.yaml rename to src/methods/scanorama_correct/config.vsh.yaml index cb7c2f44..c346d96a 100644 --- a/src/methods/scanorama/config.vsh.yaml +++ b/src/methods/scanorama_correct/config.vsh.yaml @@ -1,6 +1,6 @@ __merge__: /src/api/comp_method.yaml -name: scanorama -label: Scanorama +name: scanorama_correct +label: Scanorama-Correct summary: Efficient integration of heterogeneous single-cell transcriptomes using Scanorama description: | Scanorama enables batch-correction and integration of heterogeneous scRNA-seq datasets. 
@@ -17,7 +17,7 @@ links: repository: https://github.com/brianhie/scanorama documentation: https://github.com/brianhie/scanorama#readme info: - method_types: [feature, embedding] + method_types: [feature] preferred_normalization: log_cp10k resources: - type: python_script diff --git a/src/methods/scanorama/script.py b/src/methods/scanorama_correct/script.py similarity index 92% rename from src/methods/scanorama/script.py rename to src/methods/scanorama_correct/script.py index 2ddb91df..e0831869 100644 --- a/src/methods/scanorama/script.py +++ b/src/methods/scanorama_correct/script.py @@ -8,8 +8,7 @@ 'output': 'output.h5ad', } meta = { - 'name': 'foo', - 'config': 'bar' + 'name': 'scanorama-correct', } ## VIASH END @@ -57,7 +56,7 @@ def merge_adata(*adata_list, **kwargs): batch_categories = adata.obs['batch'].cat.categories for i in batch_categories: split.append(adata[adata.obs['batch'] == i].copy()) -corrected = scanorama.correct_scanpy(split, return_dimred=True) +corrected = scanorama.correct_scanpy(split, return_dimred=False) corrected = merge_adata(*corrected, batch_key='batch', batch_categories=batch_categories, index_unique=None) print("Store output", flush=True) @@ -71,9 +70,6 @@ def merge_adata(*adata_list, **kwargs): }, layers={ 'corrected_counts': corrected.X, - }, - obsm={ - 'X_emb': corrected.obsm["X_scanorama"], } ) diff --git a/src/methods/scanorama_integrate/config.vsh.yaml b/src/methods/scanorama_integrate/config.vsh.yaml new file mode 100644 index 00000000..a53ab6c5 --- /dev/null +++ b/src/methods/scanorama_integrate/config.vsh.yaml @@ -0,0 +1,42 @@ +__merge__: /src/api/comp_method.yaml +name: scanorama_integrate +label: Scanorama-Integrate +summary: Efficient integration of heterogeneous single-cell transcriptomes using Scanorama +description: | + Scanorama enables batch-correction and integration of heterogeneous scRNA-seq datasets. 
+ It is designed to be used in scRNA-seq pipelines downstream of noise-reduction methods, + including those for imputation and highly-variable gene filtering. The results from + Scanorama integration and batch correction can then be used as input to other tools + for scRNA-seq clustering, visualization, and analysis. +references: + # Hie, B., Bryson, B. & Berger, B. Efficient integration of heterogeneous single-cell + # transcriptomes using Scanorama. Nat Biotechnol 37, 685–691 (2019). + # https://doi.org/10.1038/s41587-019-0113-3 + doi: 10.1038/s41587-019-0113-3 +links: + repository: https://github.com/brianhie/scanorama + documentation: https://github.com/brianhie/scanorama#readme +info: + method_types: [embedding] + preferred_normalization: log_cp10k +arguments: + - name: --dimred + type: integer + default: 100 + description: Embedding dimension +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py +engines: + - type: docker + image: openproblems/base_python:1 + setup: + - type: python + pypi: + - scanorama +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, highmem, lowcpu] diff --git a/src/methods/scanorama_integrate/script.py b/src/methods/scanorama_integrate/script.py new file mode 100644 index 00000000..bbe6ca86 --- /dev/null +++ b/src/methods/scanorama_integrate/script.py @@ -0,0 +1,59 @@ +import sys +import anndata as ad +import scanorama +import numpy as np + +## VIASH START +par = { + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', + 'output': 'output.h5ad', + 'dimred': 100 +} +meta = { + 'name': 'scanorama-integrate', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +print('Run scanorama', flush=True) +split = [] +batch_categories = 
adata.obs['batch'].cat.categories +for b in batch_categories: + split.append(adata[adata.obs['batch'] == b].copy()) +scanorama.integrate_scanpy(split, dimred=par["dimred"]) + +#From https://colab.research.google.com/drive/1CebA3Ow4jXITK0dW5el320KVTX_szhxG +result = np.zeros((adata.shape[0], split[0].obsm["X_scanorama"].shape[1])) +for i, b in enumerate(batch_categories): + result[adata.obs['batch'] == b] = split[i].obsm["X_scanorama"] + + +print("Store output", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': meta['name'], + }, + obsm={ + 'X_emb': result + }, + shape=adata.shape, +) + +print("Write output to file", flush=True) +output.write(par['output'], compression='gzip') diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 09905ad0..ba479227 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -99,7 +99,8 @@ dependencies: - name: methods/mnnpy - name: methods/pyliger - name: methods/scalex - - name: methods/scanorama + - name: methods/scanorama_correct + - name: methods/scanorama_integrate - name: methods/scanvi - name: methods/scgpt_finetuned - name: methods/scgpt_zeroshot diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 6196f749..541e95b7 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -27,7 +27,8 @@ methods = [ mnnpy, pyliger, scalex, - scanorama, + scanorama_correct, + scanorama_integrate, scanvi, scgpt_finetuned.run( args: [model: file("s3://openproblems-work/cache/scGPT_human.zip")] From c768daf5892b6c724166a78111e0dfa303b3eb6f Mon Sep 17 00:00:00 2001 From: "Daniel E. 
Schaffer" Date: Sun, 19 Oct 2025 19:26:39 -0400 Subject: [PATCH 2/3] Specify change --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 890c4eb7..4e4e55c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,12 @@ * Update scPRINT to use latest stable version (PR #70) * Fix kbet dependencies to numpy<2 and scipy<=1.13 (PR #78). +* Split Scanorama into two methods/scores + - Split scanorama into embedding (integrate) and count-correction (correct) modes, instead of running both together. + This makes clear what the reported score(s) are describing, and also corrects the misleadingly low score that + the combined method receives. The scores for each componenet are in line with their scores from v1, where the modes + were seperated. + # task_batch_integration 2.0.0 A major update to the OpenProblems framework, switching from a Python-based framework to a Viash + Nextflow-based framework. This update features the same concepts as the previous version, but with a new implementation that is more flexible, scalable, and maintainable. From db5f0669e6be3feaa5c293551498d05978c84f73 Mon Sep 17 00:00:00 2001 From: "Daniel E. Schaffer" Date: Sun, 19 Oct 2025 19:28:41 -0400 Subject: [PATCH 3/3] typos --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e4e55c4..80aa7b38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,10 +25,10 @@ * Fix kbet dependencies to numpy<2 and scipy<=1.13 (PR #78). * Split Scanorama into two methods/scores - - Split scanorama into embedding (integrate) and count-correction (correct) modes, instead of running both together. + - Split Scanorama into embedding (integrate) and count-correction (correct) modes, instead of running both together. This makes clear what the reported score(s) are describing, and also corrects the misleadingly low score that - the combined method receives. 
The scores for each componenet are in line with their scores from v1, where the modes - were seperated. + the combined method receives. The scores for each component are in line with their scores from v1, where the modes + were separated. # task_batch_integration 2.0.0