diff --git a/CHANGELOG.md b/CHANGELOG.md index 890c4eb7..80aa7b38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,12 @@ * Update scPRINT to use latest stable version (PR #70) * Fix kbet dependencies to numpy<2 and scipy<=1.13 (PR #78). +* Split Scanorama into two methods/scores + - Split Scanorama into embedding (integrate) and count-correction (correct) modes, instead of running both together. + This makes clear what the reported score(s) are describing, and also corrects the misleadingly low score that + the combined method receives. The scores for each component are in line with their scores from v1, where the modes + were separated. + # task_batch_integration 2.0.0 A major update to the OpenProblems framework, switching from a Python-based framework to a Viash + Nextflow-based framework. This update features the same concepts as the previous version, but with a new implementation that is more flexible, scalable, and maintainable. diff --git a/src/methods/scanorama/config.vsh.yaml b/src/methods/scanorama_correct/config.vsh.yaml similarity index 94% rename from src/methods/scanorama/config.vsh.yaml rename to src/methods/scanorama_correct/config.vsh.yaml index cb7c2f44..c346d96a 100644 --- a/src/methods/scanorama/config.vsh.yaml +++ b/src/methods/scanorama_correct/config.vsh.yaml @@ -1,6 +1,6 @@ __merge__: /src/api/comp_method.yaml -name: scanorama -label: Scanorama +name: scanorama_correct +label: Scanorama-Correct summary: Efficient integration of heterogeneous single-cell transcriptomes using Scanorama description: | Scanorama enables batch-correction and integration of heterogeneous scRNA-seq datasets. 
@@ -17,7 +17,7 @@ links: repository: https://github.com/brianhie/scanorama documentation: https://github.com/brianhie/scanorama#readme info: - method_types: [feature, embedding] + method_types: [feature] preferred_normalization: log_cp10k resources: - type: python_script diff --git a/src/methods/scanorama/script.py b/src/methods/scanorama_correct/script.py similarity index 92% rename from src/methods/scanorama/script.py rename to src/methods/scanorama_correct/script.py index 2ddb91df..e0831869 100644 --- a/src/methods/scanorama/script.py +++ b/src/methods/scanorama_correct/script.py @@ -8,8 +8,7 @@ 'output': 'output.h5ad', } meta = { - 'name': 'foo', - 'config': 'bar' + 'name': 'scanorama-correct', } ## VIASH END @@ -57,7 +56,7 @@ def merge_adata(*adata_list, **kwargs): batch_categories = adata.obs['batch'].cat.categories for i in batch_categories: split.append(adata[adata.obs['batch'] == i].copy()) -corrected = scanorama.correct_scanpy(split, return_dimred=True) +corrected = scanorama.correct_scanpy(split, return_dimred=False) corrected = merge_adata(*corrected, batch_key='batch', batch_categories=batch_categories, index_unique=None) print("Store output", flush=True) @@ -71,9 +70,6 @@ def merge_adata(*adata_list, **kwargs): }, layers={ 'corrected_counts': corrected.X, - }, - obsm={ - 'X_emb': corrected.obsm["X_scanorama"], } ) diff --git a/src/methods/scanorama_integrate/config.vsh.yaml b/src/methods/scanorama_integrate/config.vsh.yaml new file mode 100644 index 00000000..a53ab6c5 --- /dev/null +++ b/src/methods/scanorama_integrate/config.vsh.yaml @@ -0,0 +1,42 @@ +__merge__: /src/api/comp_method.yaml +name: scanorama_integrate +label: Scanorama-Integrate +summary: Efficient integration of heterogeneous single-cell transcriptomes using Scanorama +description: | + Scanorama enables batch-correction and integration of heterogeneous scRNA-seq datasets. 
+  It is designed to be used in scRNA-seq pipelines downstream of noise-reduction methods,
+  including those for imputation and highly-variable gene filtering. The results from
+  Scanorama integration and batch correction can then be used as input to other tools
+  for scRNA-seq clustering, visualization, and analysis.
+references:
+  # Hie, B., Bryson, B. & Berger, B. Efficient integration of heterogeneous single-cell
+  # transcriptomes using Scanorama. Nat Biotechnol 37, 685–691 (2019).
+  # https://doi.org/10.1038/s41587-019-0113-3
+  doi: 10.1038/s41587-019-0113-3
+links:
+  repository: https://github.com/brianhie/scanorama
+  documentation: https://github.com/brianhie/scanorama#readme
+info:
+  method_types: [embedding]
+  preferred_normalization: log_cp10k
+arguments:
+  - name: --dimred
+    type: integer
+    default: 100
+    description: Embedding dimension
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/read_anndata_partial.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    setup:
+      - type: python
+        pypi:
+          - scanorama
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [hightime, highmem, lowcpu]
diff --git a/src/methods/scanorama_integrate/script.py b/src/methods/scanorama_integrate/script.py
new file mode 100644
index 00000000..bbe6ca86
--- /dev/null
+++ b/src/methods/scanorama_integrate/script.py
@@ -0,0 +1,72 @@
+"""Embed cells with Scanorama's integrate mode.
+
+Splits the normalized input dataset by batch, runs
+``scanorama.integrate_scanpy`` on the per-batch subsets and stores the
+resulting embedding in ``obsm['X_emb']`` of the output AnnData.
+"""
+import sys
+import anndata as ad
+import scanorama
+import numpy as np
+
+## VIASH START
+# Development placeholders for running the script directly.
+par = {
+    'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad',
+    'output': 'output.h5ad',
+    'dimred': 100
+}
+meta = {
+    'name': 'scanorama-integrate',
+    # Directory holding read_anndata_partial.py (listed under resources in config.vsh.yaml).
+    'resources_dir': 'src/utils',
+}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+
+print('Read input', flush=True)
+adata = read_anndata(
+    par['input'],
+    X='layers/normalized',
+    obs='obs',
+    var='var',
+    uns='uns'
+)
+
+print('Run scanorama', flush=True)
+batch = adata.obs['batch']
+# Skip categories without cells so that no empty subset is handed to scanorama.
+batch_categories = batch.cat.remove_unused_categories().cat.categories
+batch_masks = [(batch == b).to_numpy() for b in batch_categories]
+split = [adata[mask].copy() for mask in batch_masks]
+# No return value is used: the embedding is read back from each subset's
+# obsm["X_scanorama"] below.
+scanorama.integrate_scanpy(split, dimred=par["dimred"])
+
+# Put the per-batch embeddings back into the original cell order.
+# From https://colab.research.google.com/drive/1CebA3Ow4jXITK0dW5el320KVTX_szhxG
+result = np.zeros((adata.shape[0], split[0].obsm["X_scanorama"].shape[1]))
+for mask, batch_adata in zip(batch_masks, split):
+    result[mask] = batch_adata.obsm["X_scanorama"]
+
+
+print("Store output", flush=True)
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    uns={
+        'dataset_id': adata.uns['dataset_id'],
+        'normalization_id': adata.uns['normalization_id'],
+        'method_id': meta['name'],
+    },
+    obsm={
+        'X_emb': result
+    },
+    shape=adata.shape,
+)
+
+print("Write output to file", flush=True)
+output.write(par['output'], compression='gzip')
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 09905ad0..ba479227 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -99,7 +99,8 @@ dependencies:
   - name: methods/mnnpy
   - name: methods/pyliger
   - name: methods/scalex
-  - name: methods/scanorama
+  - name: methods/scanorama_correct
+  - name: methods/scanorama_integrate
   - name: methods/scanvi
   - name: methods/scgpt_finetuned
   - name: methods/scgpt_zeroshot
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index 6196f749..541e95b7 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -27,7 +27,8 @@ methods = [
   mnnpy,
   pyliger,
   scalex,
-  scanorama,
+  scanorama_correct,
+  scanorama_integrate,
   scanvi,
   scgpt_finetuned.run(
     args: [model: file("s3://openproblems-work/cache/scGPT_human.zip")]