From 6a06835ba86133331ed57bf1880921b87a5eb083 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 13 Oct 2022 13:20:34 -0400 Subject: [PATCH 001/266] share xgboost code with scarches_xgb --- .../label_projection/methods/scvi_tools.py | 68 +++++-------------- .../tasks/label_projection/methods/xgboost.py | 19 +++++- 2 files changed, 32 insertions(+), 55 deletions(-) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 1fce0cbb88..1a1ea42223 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -1,5 +1,6 @@ from ....tools.decorators import method from ....tools.utils import check_version +from .xgboost import _xgboost from typing import Optional import functools @@ -95,6 +96,7 @@ def _scanvi_scarches( n_layers=None, prediction_method="scanvi", ): + import numpy as np import scvi if test: @@ -106,11 +108,14 @@ def _scanvi_scarches( n_layers = n_layers or 2 n_hidden = n_hidden or 128 + unlabeled_category = "Unknown" + # new obs labels to mask test set + adata.obs["scanvi_labels"] = np.where( + adata.obs["is_train"], adata.obs["labels"], unlabeled_category + ) adata_train = adata[adata.obs["is_train"]].copy() - adata_train.obs["scanvi_labels"] = adata_train.obs["labels"].copy() adata_test = adata[~adata.obs["is_train"]].copy() - adata_test.obs["scanvi_labels"] = "Unknown" scvi.model.SCVI.setup_anndata( adata_train, batch_key="batch", labels_key="scanvi_labels" ) @@ -135,7 +140,9 @@ def _scanvi_scarches( train_kwargs["limit_train_batches"] = 10 train_kwargs["limit_val_batches"] = 10 scvi_model.train(**train_kwargs) - model = scvi.model.SCANVI.from_scvi_model(scvi_model, unlabeled_category="Unknown") + model = scvi.model.SCANVI.from_scvi_model( + scvi_model, unlabeled_category=unlabeled_category + ) model.train(**train_kwargs) query_model = scvi.model.SCANVI.load_query_data(adata_test, model) @@ -149,7 +156,7 @@ def _scanvi_scarches( if prediction_method == "scanvi": preds = _pred_scanvi(adata, query_model) elif prediction_method == "xgboost": - preds = _pred_xgb(adata, adata_train, adata_test, query_model, test=test) + preds = _pred_xgb(adata, query_model, test=test) return preds @@ -166,58 +173,15 @@ def _pred_scanvi(adata, query_model): # note: could extend test option def _pred_xgb( adata, - adata_train, - adata_test, query_model, - label_col="labels", test=False, num_round: Optional[int] = None, ): - import numpy as np - import xgboost as xgb - - df = _classif_df(adata_train, query_model, label_col) - - df["labels_int"] = df["labels"].cat.codes - categories = df["labels"].cat.categories - - # X_train = df.drop(columns="labels") - X_train = df.drop(columns=["labels", "labels_int"]) - # y_train = df["labels"].astype("category") - y_train = df["labels_int"].astype(int) - - X_test = query_model.get_latent_representation(adata_test) - - if test: - num_round = num_round or 2 - else: - num_round = num_round or 5 - - xgbc = xgb.XGBClassifier(tree_method="hist", objective="multi:softprob") - - xgbc.fit(X_train, y_train) - - # adata_test.obs["preds_test"] = xgbc.predict(X_test) - adata_test.obs["preds_test"] = categories[xgbc.predict(X_test)] - - preds = [ - adata_test.obs["preds_test"][idx] if idx in adata_test.obs_names else np.nan - for idx in adata.obs_names - ] - - return preds - - -def _classif_df(adata, trained_model, label_col): - import pandas as pd - - emb_data = trained_model.get_latent_representation(adata) - - df = 
pd.DataFrame(data=emb_data, index=adata.obs_names) - - df["labels"] = adata.obs[label_col] - - return df + adata.obsm["X_emb"] = query_model.get_latent_representation(adata) + adata = _xgboost( + adata, test=test, obsm="X_emb", num_round=num_round, tree_method="hist" + ) + return adata.obs["labels_pred"] @_scanvi_method(method_name="scANVI (All genes)") diff --git a/openproblems/tasks/label_projection/methods/xgboost.py b/openproblems/tasks/label_projection/methods/xgboost.py index b489f98bce..03d59a5136 100644 --- a/openproblems/tasks/label_projection/methods/xgboost.py +++ b/openproblems/tasks/label_projection/methods/xgboost.py @@ -16,7 +16,13 @@ ) -def _xgboost(adata, test: bool = False, num_round: Optional[int] = None): +def _xgboost( + adata, + test: bool = False, + obsm: Optional[str] = None, + num_round: Optional[int] = None, + **kwargs, +): import xgboost as xgb if test: @@ -30,12 +36,19 @@ def _xgboost(adata, test: bool = False, num_round: Optional[int] = None): adata_train = adata[adata.obs["is_train"]] adata_test = adata[~adata.obs["is_train"]].copy() - xg_train = xgb.DMatrix(adata_train.X, label=adata_train.obs["labels_int"]) - xg_test = xgb.DMatrix(adata_test.X, label=adata_test.obs["labels_int"]) + xg_train = xgb.DMatrix( + adata_train.obsm[obsm] if obsm else adata_train.X, + label=adata_train.obs["labels_int"], + ) + xg_test = xgb.DMatrix( + adata_test.obsm[obsm] if obsm else adata_test.X, + label=adata_test.obs["labels_int"], + ) param = dict( objective="multi:softmax", num_class=len(categories), + **kwargs, ) watchlist = [(xg_train, "train")] From ffd9f268caf5f2696c9f6dca17a3be1733b64e39 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 14 Oct 2022 10:12:34 -0400 Subject: [PATCH 002/266] Bump styfle/cancel-workflow-action from 0.10.1 to 0.11.0 (#624) Bumps [styfle/cancel-workflow-action](https://github.com/styfle/cancel-workflow-action) from 0.10.1 to 0.11.0. - [Release notes](https://github.com/styfle/cancel-workflow-action/releases) - [Commits](https://github.com/styfle/cancel-workflow-action/compare/0.10.1...0.11.0) --- updated-dependencies: - dependency-name: styfle/cancel-workflow-action dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- .github/workflows/pre-commit.yml | 2 +- .github/workflows/run_tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index b38848967e..0929ca3594 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -23,7 +23,7 @@ jobs: steps: - name: Cancel Previous Runs - uses: styfle/cancel-workflow-action@0.10.1 + uses: styfle/cancel-workflow-action@0.11.0 with: access_token: ${{ github.token }} diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 217af3fb4e..d735871766 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -21,7 +21,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Cancel Previous Runs - uses: styfle/cancel-workflow-action@0.10.1 + uses: styfle/cancel-workflow-action@0.11.0 with: access_token: ${{ github.token }} From b7724aecfcc18b6d14f53c69649a7c25d40bf46a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 14 Oct 2022 15:10:10 -0400 Subject: [PATCH 003/266] Update coverage requirement from ==6.4.* to >=6.4,<6.6 (#609) Updates the requirements on [coverage](https://github.com/nedbat/coveragepy) to permit the latest version. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/6.4...6.5.0) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:development ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6012e3335f..8b4bf96f3e 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ "pytest==7.1.*", "pytest-cov>=3.0,<4.1", "black==22.8.0", - "coverage==6.4.*", + "coverage>=6.4,<6.6", "codecov==2.1.*", "parameterized==0.8.*", "requests==2.28.*", From 0cdb935d3e51912cd925780d71ec6470107dab67 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 14 Oct 2022 18:22:10 -0400 Subject: [PATCH 004/266] Update forecast to 8.18 (#626) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-r-extras/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index ebfd924607..9ab5d03759 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -15,7 +15,7 @@ downlit@0.4.2 dplyr@1.0.10 e1071@1.7-11 ellipsis@0.3.2 -forecast@8.17.0 +forecast@8.18 hardhat@1.1.0 here@1.0.1 hexbin@1.28.2 From 8bc1bc177ca2ac24cb8e5eac7e7f304ecc613d0d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Oct 2022 09:58:31 -0400 Subject: [PATCH 005/266] Update keras requirement from <2.6,>=2.4 to >=2.4,<2.11 in /docker/openproblems-python-tf2.4 (#608) * Update keras requirement in /docker/openproblems-python-tf2.4 Updates the requirements on [keras](https://github.com/keras-team/keras) to permit the latest version. - [Release notes](https://github.com/keras-team/keras/releases) - [Commits](https://github.com/keras-team/keras/compare/2.4.0...v2.10.0) --- updated-dependencies: - dependency-name: keras dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * Update requirements.txt Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-python-tf2.4/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-python-tf2.4/requirements.txt b/docker/openproblems-python-tf2.4/requirements.txt index 9dedbff275..31a56c2ea2 100644 --- a/docker/openproblems-python-tf2.4/requirements.txt +++ b/docker/openproblems-python-tf2.4/requirements.txt @@ -1,4 +1,4 @@ dca==0.3.* -keras>=2.4,<2.6 # pinned in dca +keras>=2.4,<2.11 pyyaml==6.0 # pinned in #431 tensorflow-cpu==2.4.* # pinned in dca From 5b380f266457caf2823c12c5264f2b12754bc46a Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 18 Oct 2022 10:40:38 -0400 Subject: [PATCH 006/266] sort matching rows after merge (#629) --- .../_cell_cell_communication/_common/metrics/odds_ratio.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py index 0beaab2335..74f67252dd 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py +++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py @@ -8,10 +8,11 @@ def odds_ratio(adata, merge_keys, top_n=100): # Join benchmark (assumed truth) and ccc results # Get /w ccc_target and a response [0, 1] column - gt = adata.uns["ccc_target"].merge( - adata.uns["ccc_pred"], on=merge_keys, how="right" + gt = ( + adata.uns["ccc_target"] + .merge(adata.uns["ccc_pred"], on=merge_keys, how="inner") + .sort_values("score", ascending=False) ) - gt = gt[gt["response"].notna()] # assign the top rank interactions to 1 a = np.zeros(len(gt["score"])) From a56a08cb7e3c49a4acc643d12d8a6ffab407d157 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 18 Oct 2022 16:42:28 +0200 Subject: [PATCH 007/266] Add dimensions to dimred datasets (#628) Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- .../datasets/mouse_blood_olssen_labelled.py | 3 ++- .../datasets/mouse_hspc_nestorowa2016.py | 3 ++- .../tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olssen_labelled.py b/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olssen_labelled.py index bb3b3ccb94..563d3278af 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olssen_labelled.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olssen_labelled.py @@ -7,7 +7,8 @@ data_url=load_olsson_2016_mouse_blood.metadata["data_url"], data_reference=load_olsson_2016_mouse_blood.metadata["data_reference"], dataset_summary="Myeloid lineage differentiation from mouse blood. " - "Sequenced by SMARTseq in 2016 by Olsson et al.", + "Sequenced by SMARTseq in 2016 by Olsson et al. 
" + "660 cells x 112815 features with 4 cell type labels", ) def olsson_2016_mouse_blood(test=False): return load_olsson_2016_mouse_blood(test=test) diff --git a/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py b/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py index b010474aa0..5aa7f68404 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py @@ -7,7 +7,8 @@ data_url=load_mouse_hspc_nestorowa2016.metadata["data_url"], data_reference=load_mouse_hspc_nestorowa2016.metadata["data_reference"], dataset_summary="1.6k hematopoietic stem and progenitor cells from mouse bone " - "marrow. Sequenced by Smart-seq2.", + "marrow. Sequenced by Smart-seq2." + "1920 cells x 43258 features with 3 cell type labels", ) def mouse_hspc_nestorowa2016(test=False): return load_mouse_hspc_nestorowa2016(test=test) diff --git a/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py b/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py index 3b94989deb..42e3558c1e 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py @@ -9,6 +9,7 @@ dataset_summary=( "5k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. " "Sequenced on 10X v3 chemistry in July 2019 by 10X Genomics." + "5247 cells x 20822 features with no cell type labels" ), ) def tenx_5k_pbmc(test=False): From e75c4a3e56baa272b0147998d8f1fb92a4aaf7ed Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Oct 2022 13:34:52 -0400 Subject: [PATCH 008/266] Bump jax from 0.3.20 to 0.3.23 in /docker/openproblems-python-scvi (#622) Bumps [jax](https://github.com/google/jax) from 0.3.20 to 0.3.23. - [Release notes](https://github.com/google/jax/releases) - [Changelog](https://github.com/google/jax/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/jax/compare/jax-v0.3.20...jax-v0.3.23) --- updated-dependencies: - dependency-name: jax dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-python-scvi/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-python-scvi/requirements.txt b/docker/openproblems-python-scvi/requirements.txt index cd95e5cd0d..e3fd4746a6 100644 --- a/docker/openproblems-python-scvi/requirements.txt +++ b/docker/openproblems-python-scvi/requirements.txt @@ -1,4 +1,4 @@ -jax==0.3.20 +jax==0.3.23 jaxlib==0.3.20 scikit-misc==0.1.* scvi-tools~=0.17 # pinned in #313 From d7cf502f8f9803181c897bb1cec6ac59eede3f89 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:01:19 -0400 Subject: [PATCH 009/266] Update tabulate requirement from <0.9 to <0.10 (#630) Updates the requirements on [tabulate](https://github.com/astanin/python-tabulate) to permit the latest version. 
- [Release notes](https://github.com/astanin/python-tabulate/releases) - [Changelog](https://github.com/astanin/python-tabulate/blob/master/CHANGELOG) - [Commits](https://github.com/astanin/python-tabulate/compare/v0.3...v0.9.0) --- updated-dependencies: - dependency-name: tabulate dependency-type: direct:development ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8b4bf96f3e..1b51740536 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ "anndata2ri==1.1.*", ] -evaluate_requires = ["snakemake>=7.8,<7.15", "tabulate<0.9"] +evaluate_requires = ["snakemake>=7.8,<7.15", "tabulate<0.10"] process_requires = ["numpyencoder==0.3.*"] From fd28407b3bd56869b13fc9288a57a72a1d1f2c19 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 20 Oct 2022 09:42:22 -0400 Subject: [PATCH 010/266] Update lifecycle to 1.0.3 (#631) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-r-extras/r_requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index 9ab5d03759..d612dc1c17 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -22,7 +22,7 @@ hexbin@1.28.2 htmltools@0.5.3 htmlwidgets@1.5.4 igraph@1.3.5 -lifecycle@1.0.2 +lifecycle@1.0.3 Matrix@1.5-1 pkgdown@2.0.6 pkgload@1.3.0 @@ -32,7 +32,7 @@ ragg@1.2.2 Rcpp@1.0.9 RcppTOML@0.1.7 reticulate@1.26 -rlang@1.0.5 +rlang@1.0.6 rliger@1.0.0 rmarkdown@2.2 RSQLite@2.2.4 From 0d7afb2d6f091b646e2d1dec79da21ca81246f2d Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Thu, 20 Oct 2022 11:13:59 -0400 Subject: [PATCH 011/266] Revert "Update tabulate requirement from <0.9 to <0.10 (#630)" (#640) This reverts commit d7cf502f8f9803181c897bb1cec6ac59eede3f89. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1b51740536..8b4bf96f3e 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ "anndata2ri==1.1.*", ] -evaluate_requires = ["snakemake>=7.8,<7.15", "tabulate<0.10"] +evaluate_requires = ["snakemake>=7.8,<7.15", "tabulate<0.9"] process_requires = ["numpyencoder==0.3.*"] From 58966921311fe6049f1719db14d9b1336a04fdc1 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Thu, 20 Oct 2022 15:46:14 -0400 Subject: [PATCH 012/266] bump louvain to 0.8 (#639) --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8b4bf96f3e..25a13c74bb 100644 --- a/setup.py +++ b/setup.py @@ -10,8 +10,8 @@ "scprep>=1.2.1", "scipy>=1.8,<1.10", "scanpy>=1.6", - "louvain==0.7.*", - "python-igraph<0.10", + "louvain==0.8.*", + "python-igraph==0.10.*", "decorator<5.0", # pinned in #324 "memory-profiler==0.60", "colorama==0.4.*", From 71d8d0997b1d9a75f1de4e9a5a57cbeae4d697bb Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 21 Oct 2022 13:56:04 -0400 Subject: [PATCH 013/266] use codecov github action (#642) --- .github/workflows/run_tests.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index d735871766..d083cdb761 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -198,8 +198,11 @@ jobs: run: pytest --cov=openproblems --cov-report=xml -vv --durations=15 --tb=native - name: Upload coverage - continue-on-error: ${{ github.repository != 'openproblems-bio/openproblems' }} - run: codecov --no-color --required --flags unittests + uses: codecov/codecov-action@v3 + with: + flags: unittests + fail_ci_if_error: ${{ github.repository == 'openproblems-bio/openproblems' }} + verbose: true - name: Upload check results on fail if: failure() From b460ecb183328c857cbbf653488f522a4034a61c Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 21 Oct 2022 13:56:15 -0400 Subject: [PATCH 014/266] add baseline methods (#615) * add baseline methods * fix import errors * bugfixes * convert coo to csr * add celltype-specific random integration * bugfixes * import DR baseline * use 500d PCA instead of raw features * handle adata.shape < 500 * set adata.uns['neighbors'] * uni not uni_neighbors * partition by celltype * use gene symbols from resource * fix imports * just scramble graph node ids * bugfix graph shuffling * use random_features in randomize_graph --- .../batch_integration_embed/api.py | 2 +- .../methods/__init__.py | 4 + .../methods/baseline.py | 63 ++++++++++++ .../batch_integration_feature/api.py | 5 +- .../methods/__init__.py | 4 + .../methods/baseline.py | 58 +++++++++++ .../batch_integration_graph/api.py | 9 +- .../methods/__init__.py | 4 + .../methods/baseline.py | 98 +++++++++++++++++++ .../_cell_cell_communication/_common/api.py | 2 +- .../_common/methods/__init__.py | 2 + .../_common/methods/baseline.py | 47 +++++++++ .../methods/__init__.py | 2 + .../methods/__init__.py | 2 + openproblems/tasks/denoising/api.py | 2 +- .../tasks/denoising/methods/__init__.py | 3 +- .../methods/{no_denoising.py => baseline.py} | 16 +++ .../tasks/dimensionality_reduction/api.py | 5 +- .../methods/__init__.py | 2 + .../methods/baseline.py | 43 ++++++++ openproblems/tasks/label_projection/api.py 
| 6 +- .../label_projection/methods/__init__.py | 1 + .../label_projection/methods/baseline.py | 15 +++ .../tasks/multimodal_data_integration/api.py | 2 +- .../methods/__init__.py | 2 + .../methods/baseline.py | 44 +++++++++ .../tasks/regulatory_effect_prediction/api.py | 2 +- .../methods/__init__.py | 2 + .../methods/baseline.py | 34 +++++++ .../tasks/spatial_decomposition/api.py | 2 +- .../spatial_decomposition/methods/__init__.py | 3 +- .../spatial_decomposition/methods/baseline.py | 40 ++++++++ .../spatial_decomposition/methods/random.py | 26 ----- openproblems/tools/decorators.py | 4 + test/test_1_methods.py | 4 +- 35 files changed, 517 insertions(+), 43 deletions(-) create mode 100644 openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/methods/baseline.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py create mode 100644 openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py rename openproblems/tasks/denoising/methods/{no_denoising.py => baseline.py} (51%) create mode 100644 openproblems/tasks/dimensionality_reduction/methods/baseline.py create mode 100644 openproblems/tasks/multimodal_data_integration/methods/baseline.py create mode 100644 openproblems/tasks/regulatory_effect_prediction/methods/baseline.py create mode 100644 openproblems/tasks/spatial_decomposition/methods/baseline.py delete mode 100644 openproblems/tasks/spatial_decomposition/methods/random.py diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py index e7e47528b7..c867a03157 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py @@ -16,7 +16,7 @@ def check_dataset(adata): return True -def check_method(adata): +def check_method(adata, is_baseline=False): """Check that method output fits expected API.""" assert "X_emb" in adata.obsm return True diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py index dffc02a1f9..2f2cce98a6 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py @@ -28,3 +28,7 @@ from ...batch_integration_graph.methods.scanvi import scanvi_hvg_unscaled from ...batch_integration_graph.methods.scvi import scvi_full_unscaled from ...batch_integration_graph.methods.scvi import scvi_hvg_unscaled +from .baseline import batch_random_integration +from .baseline import celltype_random_integration +from .baseline import no_integration +from .baseline import random_integration diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py new file mode 100644 index 0000000000..a7a0aa3c21 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py @@ -0,0 +1,63 @@ +from .....tools.decorators import method +from .....tools.utils import check_version +from ...batch_integration_graph.methods.baseline import _randomize_features + + +@method( + method_name="No Integration", + paper_name="No Integration (baseline)", + 
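# Baselines have no companion publication; the OpenProblems project site
# serves as the paper reference here and in the other baseline methods below.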
paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def no_integration(adata, test=False): + adata.obsm["X_emb"] = adata.obsm["X_uni_pca"] + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="Random Integration", + paper_name="Random Integration (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def random_integration(adata, test=False): + adata.obsm["X_emb"] = _randomize_features(adata.obsm["X_uni_pca"]) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="Random Integration by Celltype", + paper_name="Random Integration by Celltype (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def celltype_random_integration(adata, test=False): + adata.obsm["X_emb"] = _randomize_features( + adata.obsm["X_uni_pca"], partition=adata.obs["labels"] + ) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="Random Integration by Batch", + paper_name="Random Integration by Batch (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def batch_random_integration(adata, test=False): + adata.obsm["X_emb"] = _randomize_features( + adata.obsm["X_uni_pca"], partition=adata.obs["batch"] + ) + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py index 330a3a28b4..adf42b38c2 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py @@ -18,10 +18,11 @@ def check_dataset(adata): return True -def check_method(adata): +def check_method(adata, is_baseline=False): """Check that method output fits expected API.""" assert "log_normalized" in adata.layers - assert adata.layers["log_normalized"] is not adata.X + if not is_baseline: + assert adata.layers["log_normalized"] is not adata.X return True diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/__init__.py index f1243781f9..7d4cb12759 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/methods/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/__init__.py @@ -28,6 +28,10 @@ from ...batch_integration_graph.methods.scanorama import scanorama_feature_full_unscaled from ...batch_integration_graph.methods.scanorama import scanorama_feature_hvg_scaled from ...batch_integration_graph.methods.scanorama import scanorama_feature_hvg_unscaled +from .baseline import batch_random_integration +from .baseline import celltype_random_integration +from .baseline import no_integration +from .baseline import random_integration # from ...batch_integration_graph.methods.scgen import scgen_full_scaled # from ...batch_integration_graph.methods.scgen import scgen_full_unscaled diff --git 
a/openproblems/tasks/_batch_integration/batch_integration_feature/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/baseline.py new file mode 100644 index 0000000000..97053a2d16 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/baseline.py @@ -0,0 +1,58 @@ +from .....tools.decorators import method +from .....tools.utils import check_version +from ...batch_integration_graph.methods.baseline import _randomize_features + + +@method( + method_name="No Integration", + paper_name="No Integration (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def no_integration(adata, test=False): + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="Random Integration", + paper_name="Random Integration (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def random_integration(adata, test=False): + adata.X = _randomize_features(adata.X) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="Random Integration by Celltype", + paper_name="Random Integration by Celltype (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def celltype_random_integration(adata, test=False): + adata.X = _randomize_features(adata.X, partition=adata.obs["labels"]) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="Random Integration by Batch", + paper_name="Random Integration by Batch (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def batch_random_integration(adata, test=False): + adata.X = _randomize_features(adata.X, partition=adata.obs["batch"]) + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/api.py b/openproblems/tasks/_batch_integration/batch_integration_graph/api.py index dd45a42aed..123860d6f2 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/api.py @@ -11,14 +11,21 @@ def check_dataset(adata): assert "X_uni_pca" in adata.obsm assert "batch" in adata.obs assert "labels" in adata.obs + assert "uni" in adata.uns + assert adata.uns["uni"]["connectivities_key"] == "uni_connectivities" + assert adata.uns["uni"]["distances_key"] == "uni_distances" assert "uni_connectivities" in adata.obsp + assert "uni_distances" in adata.obsp assert "log_normalized" in adata.layers return True -def check_method(adata): +def check_method(adata, is_baseline=False): """Check that method output fits expected API.""" + assert "neighbors" in adata.uns + assert adata.uns["neighbors"]["connectivities_key"] == "connectivities" + assert adata.uns["neighbors"]["distances_key"] == "distances" assert "connectivities" in adata.obsp assert "distances" in adata.obsp return True diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py index 
308a6462c6..e360162f21 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py @@ -1,3 +1,7 @@ +from .baseline import batch_random_integration +from .baseline import celltype_random_integration +from .baseline import no_integration +from .baseline import random_integration from .bbknn import bbknn_full_scaled from .bbknn import bbknn_full_unscaled from .bbknn import bbknn_hvg_scaled diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py new file mode 100644 index 0000000000..b54f1cb018 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py @@ -0,0 +1,98 @@ +from .....tools.decorators import method +from .....tools.utils import check_version + +import numpy as np + + +def _set_uns(adata): + adata.uns["neighbors"] = adata.uns["uni"] + adata.uns["neighbors"]["connectivities_key"] = "connectivities" + adata.uns["neighbors"]["distances_key"] = "distances" + + +@method( + method_name="No Integration", + paper_name="No Integration (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def no_integration(adata, test=False): + adata.obsp["connectivities"] = adata.obsp["uni_connectivities"] + adata.obsp["distances"] = adata.obsp["uni_distances"] + _set_uns(adata) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +def _randomize_features(X, partition=None): + X_out = X.copy() + if partition is None: + partition = np.full(X.shape[0], 0) + else: + partition = np.asarray(partition) + for partition_name in np.unique(partition): + partition_idx = np.argwhere(partition == partition_name).flatten() + X_out[partition_idx] = X[np.random.permutation(partition_idx)] + return X_out + + +def _randomize_graph(adata, partition=None): + distances, connectivities = ( + adata.obsp["uni_distances"], + adata.obsp["uni_connectivities"], + ) + new_idx = _randomize_features(np.arange(distances.shape[0]), partition=partition) + adata.obsp["distances"] = distances[new_idx][:, new_idx] + adata.obsp["connectivities"] = connectivities[new_idx][:, new_idx] + _set_uns(adata) + return adata + + +@method( + method_name="Random Integration", + paper_name="Random Integration (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def random_integration(adata, test=False): + adata = _randomize_graph(adata) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="Random Integration by Celltype", + paper_name="Random Integration by Celltype (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def celltype_random_integration(adata, test=False): + adata = _randomize_graph( + adata, + partition=adata.obs["labels"].to_numpy(), + ) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="Random Integration by Batch", + paper_name="Random Integration by Batch (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + 
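# Negative control: node identities of the precomputed kNN graph are permuted
# only within each batch (via _randomize_features, e.g. with
# partition=["a", "a", "b", "b"] indices 0-1 are shuffled among themselves and
# 2-3 among themselves), so the graph is random with respect to cell identity
# while batch structure is preserved.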
is_baseline=True, +) +def batch_random_integration(adata, test=False): + adata = _randomize_graph( + adata, + partition=adata.obs["batch"].to_numpy(), + ) + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/_cell_cell_communication/_common/api.py b/openproblems/tasks/_cell_cell_communication/_common/api.py index f0fdd504df..6ef59191fe 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/api.py +++ b/openproblems/tasks/_cell_cell_communication/_common/api.py @@ -136,7 +136,7 @@ def check_dataset(adata, merge_keys): return True -def check_method(adata, merge_keys): +def check_method(adata, merge_keys, is_baseline=False): """Check that method output fits expected API.""" assert "ccc_pred" in adata.uns diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/__init__.py b/openproblems/tasks/_cell_cell_communication/_common/methods/__init__.py index 0db596d2be..ab4c4f2b6d 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/methods/__init__.py +++ b/openproblems/tasks/_cell_cell_communication/_common/methods/__init__.py @@ -1,3 +1,5 @@ +from .baseline import random_events +from .baseline import true_events from .liana import cellphonedb from .liana import connectome from .liana import liana diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py b/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py new file mode 100644 index 0000000000..582e71b93b --- /dev/null +++ b/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py @@ -0,0 +1,47 @@ +from .....tools.decorators import method +from .....tools.utils import check_version + +import numpy as np +import pandas as pd + + +@method( + method_name="Random Events", + paper_name="Random Events (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def random_events(adata, test=False, n_events=1000): + adata.uns["ccc_pred"] = pd.DataFrame( + { + "ligand": np.random.choice( + adata.uns["ligand_receptor_resource"]["ligand_genesymbol"], n_events + ), + "receptor": np.random.choice( + adata.uns["ligand_receptor_resource"]["receptor_genesymbol"], n_events + ), + "source": np.random.choice(adata.obs["label"].cat.categories, n_events), + "target": np.random.choice(adata.obs["label"].cat.categories, n_events), + "score": np.random.uniform(0, 1, n_events), + } + ) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="True Events", + paper_name="True Events (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def true_events(adata, test=False): + adata.uns["ccc_pred"] = adata.uns["ccc_target"].rename( + {"response": "score"}, axis=1 + ) + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/methods/__init__.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/methods/__init__.py index 2475b922e7..d8e04875fc 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/methods/__init__.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/methods/__init__.py @@ -3,4 +3,6 @@ from ..._common.methods import liana from 
..._common.methods import logfc from ..._common.methods import natmi +from ..._common.methods import random_events from ..._common.methods import sca +from ..._common.methods import true_events diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/methods/__init__.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/methods/__init__.py index 2475b922e7..d8e04875fc 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/methods/__init__.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/methods/__init__.py @@ -3,4 +3,6 @@ from ..._common.methods import liana from ..._common.methods import logfc from ..._common.methods import natmi +from ..._common.methods import random_events from ..._common.methods import sca +from ..._common.methods import true_events diff --git a/openproblems/tasks/denoising/api.py b/openproblems/tasks/denoising/api.py index ec52e4fb7f..816d7eec06 100644 --- a/openproblems/tasks/denoising/api.py +++ b/openproblems/tasks/denoising/api.py @@ -18,7 +18,7 @@ def check_dataset(adata): return True -def check_method(adata): +def check_method(adata, is_baseline=False): """Check that method output fits expected API.""" assert "denoised" in adata.obsm assert isinstance(adata.obsm["denoised"], np.ndarray) diff --git a/openproblems/tasks/denoising/methods/__init__.py b/openproblems/tasks/denoising/methods/__init__.py index 611ee9f2dc..1bfa91d5c6 100644 --- a/openproblems/tasks/denoising/methods/__init__.py +++ b/openproblems/tasks/denoising/methods/__init__.py @@ -1,7 +1,8 @@ from .alra import alra +from .baseline import no_denoising +from .baseline import perfect_denoising from .dca import dca from .knn_smoothing import knn_smoothing from .magic import knn_naive from .magic import magic from .magic import magic_approx -from .no_denoising import no_denoising diff --git a/openproblems/tasks/denoising/methods/no_denoising.py b/openproblems/tasks/denoising/methods/baseline.py similarity index 51% rename from openproblems/tasks/denoising/methods/no_denoising.py rename to openproblems/tasks/denoising/methods/baseline.py index 40a369af49..b31f0781fa 100644 --- a/openproblems/tasks/denoising/methods/no_denoising.py +++ b/openproblems/tasks/denoising/methods/baseline.py @@ -8,9 +8,25 @@ paper_url="https://doi.org/10.1101/786269", paper_year=2019, code_url="https://github.com/czbiohub/molecular-cross-validation", + is_baseline=True, ) def no_denoising(adata, test=False): """Do nothing.""" adata.obsm["denoised"] = adata.obsm["train"].toarray() adata.uns["method_code_version"] = check_version("openproblems") return adata + + +@method( + method_name="Perfect denoising", + paper_name="Molecular Cross-Validation for Single-Cell RNA-seq", + paper_url="https://doi.org/10.1101/786269", + paper_year=2019, + code_url="https://github.com/czbiohub/molecular-cross-validation", + is_baseline=True, +) +def perfect_denoising(adata, test=False): + """Cheat.""" + adata.obsm["denoised"] = adata.obsm["test"].toarray() + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/dimensionality_reduction/api.py b/openproblems/tasks/dimensionality_reduction/api.py index 5b5634836e..c58f025a1c 100644 --- a/openproblems/tasks/dimensionality_reduction/api.py +++ b/openproblems/tasks/dimensionality_reduction/api.py @@ -10,10 +10,11 @@ def check_dataset(adata): return True -def check_method(adata): +def 
check_method(adata, is_baseline=False): """Check that method output fits expected API.""" assert "X_emb" in adata.obsm - assert adata.obsm["X_emb"].shape[1] == 2 + if not is_baseline: + assert adata.obsm["X_emb"].shape[1] == 2 assert np.all(np.isfinite(adata.obsm["X_emb"])) return True diff --git a/openproblems/tasks/dimensionality_reduction/methods/__init__.py b/openproblems/tasks/dimensionality_reduction/methods/__init__.py index 4240a96951..1ddab56615 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/methods/__init__.py @@ -1,3 +1,5 @@ +from .baseline import high_dim_pca +from .baseline import random_features from .densmap import densmap_logCPM_1kHVG from .densmap import densmap_pca_logCPM_1kHVG from .neuralee import neuralee_default diff --git a/openproblems/tasks/dimensionality_reduction/methods/baseline.py b/openproblems/tasks/dimensionality_reduction/methods/baseline.py new file mode 100644 index 0000000000..c2a19bc80f --- /dev/null +++ b/openproblems/tasks/dimensionality_reduction/methods/baseline.py @@ -0,0 +1,43 @@ +from ....tools.decorators import method +from ....tools.utils import check_version +from typing import Optional + +import numpy as np + + +@method( + method_name="Random Features", + paper_name="Random Features (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def random_features(adata, test=False): + adata.obsm["X_emb"] = np.random.normal(0, 1, (adata.shape[0], 2)) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="High-dimensional PCA", + paper_name="High-dimensional PCA (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def high_dim_pca(adata, n_comps: Optional[int] = None, test=False): + # We wanted to use all features, but output must be dense + # so this is a close approximation + import scanpy as sc + + if test: + n_comps = n_comps or 50 + else: # pragma: nocover + n_comps = n_comps or 500 + + sc.pp.pca(adata, n_comps=min(min(adata.shape), n_comps)) + adata.obsm["X_emb"] = adata.obsm["X_pca"] + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/label_projection/api.py b/openproblems/tasks/label_projection/api.py index 6e9cc6517d..2fc5be81d6 100644 --- a/openproblems/tasks/label_projection/api.py +++ b/openproblems/tasks/label_projection/api.py @@ -11,14 +11,14 @@ def check_dataset(adata): assert "batch" in adata.obs assert "is_train" in adata.obs assert np.issubdtype(adata.obs["is_train"].dtype, bool) - assert pd.api.types.is_categorical(adata.obs["batch"]) - assert pd.api.types.is_categorical(adata.obs["labels"]) + assert pd.api.types.is_categorical_dtype(adata.obs["batch"]) + assert pd.api.types.is_categorical_dtype(adata.obs["labels"]) assert np.sum(adata.obs["is_train"]) > 0 assert np.sum(~adata.obs["is_train"]) > 0 return True -def check_method(adata): +def check_method(adata, is_baseline=False): """Check that method output fits expected API.""" assert "labels_pred" in adata.obs assert np.issubdtype(adata.obs["is_train"].dtype, bool) diff --git a/openproblems/tasks/label_projection/methods/__init__.py b/openproblems/tasks/label_projection/methods/__init__.py index 40d35a403c..e6d932e2ba 100644 --- 
a/openproblems/tasks/label_projection/methods/__init__.py +++ b/openproblems/tasks/label_projection/methods/__init__.py @@ -1,5 +1,6 @@ from .baseline import majority_vote from .baseline import random_labels +from .baseline import true_labels from .knn_classifier import knn_classifier_log_cpm from .knn_classifier import knn_classifier_scran from .logistic_regression import logistic_regression_log_cpm diff --git a/openproblems/tasks/label_projection/methods/baseline.py b/openproblems/tasks/label_projection/methods/baseline.py index 6bc0b8a1c5..fd4d292815 100644 --- a/openproblems/tasks/label_projection/methods/baseline.py +++ b/openproblems/tasks/label_projection/methods/baseline.py @@ -26,6 +26,7 @@ def majority_vote(adata, test=False): paper_url="https://openproblems.bio", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, ) def random_labels(adata, test=False): label_distribution = adata.obs.labels[adata.obs.is_train].value_counts() @@ -40,3 +41,17 @@ def random_labels(adata, test=False): adata.uns["method_code_version"] = check_version("openproblems") return adata + + +@method( + method_name="True Labels", + paper_name="True Labels (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def true_labels(adata, test=False): + adata.obs["labels_pred"] = adata.obs["labels"] + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/multimodal_data_integration/api.py b/openproblems/tasks/multimodal_data_integration/api.py index ba04842f0d..afc9d38add 100644 --- a/openproblems/tasks/multimodal_data_integration/api.py +++ b/openproblems/tasks/multimodal_data_integration/api.py @@ -14,7 +14,7 @@ def check_dataset(adata): return True -def check_method(adata): +def check_method(adata, is_baseline=False): """Check that method output fits expected API.""" assert "aligned" in adata.obsm assert "mode2_aligned" in adata.obsm diff --git a/openproblems/tasks/multimodal_data_integration/methods/__init__.py b/openproblems/tasks/multimodal_data_integration/methods/__init__.py index 5c7d74eab8..ec3fcfa409 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/__init__.py +++ b/openproblems/tasks/multimodal_data_integration/methods/__init__.py @@ -1,3 +1,5 @@ +from .baseline import random_features +from .baseline import true_features from .harmonic_alignment import harmonic_alignment_log_scran_pooling from .harmonic_alignment import harmonic_alignment_sqrt_cpm from .mnn import mnn_log_cpm diff --git a/openproblems/tasks/multimodal_data_integration/methods/baseline.py b/openproblems/tasks/multimodal_data_integration/methods/baseline.py new file mode 100644 index 0000000000..8419e90efb --- /dev/null +++ b/openproblems/tasks/multimodal_data_integration/methods/baseline.py @@ -0,0 +1,44 @@ +from ....tools.decorators import method +from ....tools.normalize import log_cpm +from ....tools.utils import check_version + +import numpy as np +import sklearn.decomposition + + +@method( + method_name="Random Features", + paper_name="Random Features (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def random_features(adata, test=False, n_svd=20): + n_svd = min([n_svd, min(adata.X.shape) - 1, min(adata.obsm["mode2"].shape) - 1]) + adata = log_cpm(adata) + X_pca = 
sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) + adata.obsm["aligned"] = X_pca[np.random.permutation(np.arange(adata.shape[0]))] + adata.obsm["mode2_aligned"] = X_pca[ + np.random.permutation(np.arange(adata.shape[0])) + ] + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="True Features", + paper_name="True Features (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def true_features(adata, test=False, n_svd=20): + n_svd = min([n_svd, min(adata.X.shape) - 1, min(adata.obsm["mode2"].shape) - 1]) + adata = log_cpm(adata) + X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) + adata.obsm["aligned"] = X_pca + adata.obsm["mode2_aligned"] = X_pca + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/regulatory_effect_prediction/api.py b/openproblems/tasks/regulatory_effect_prediction/api.py index 28a5dcd262..266d67bc76 100644 --- a/openproblems/tasks/regulatory_effect_prediction/api.py +++ b/openproblems/tasks/regulatory_effect_prediction/api.py @@ -26,7 +26,7 @@ def check_dataset(adata): return True -def check_method(adata): +def check_method(adata, is_baseline=False): """Check that method output fits expected API.""" assert "gene_score" in adata.obsm assert adata.obsm["gene_score"].shape == adata.X.shape diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/__init__.py b/openproblems/tasks/regulatory_effect_prediction/methods/__init__.py index 953d36fd49..20a47dc3d3 100644 --- a/openproblems/tasks/regulatory_effect_prediction/methods/__init__.py +++ b/openproblems/tasks/regulatory_effect_prediction/methods/__init__.py @@ -1 +1,3 @@ +from .baseline import random_scores +from .baseline import true_scores from .beta import beta diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/baseline.py b/openproblems/tasks/regulatory_effect_prediction/methods/baseline.py new file mode 100644 index 0000000000..eb5afbfd71 --- /dev/null +++ b/openproblems/tasks/regulatory_effect_prediction/methods/baseline.py @@ -0,0 +1,34 @@ +from ....tools.decorators import method +from ....tools.utils import check_version + +import numpy as np + + +@method( + method_name="Random Scores", + paper_name="Random Scores (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def random_scores(adata, test=False): + adata.obsm["gene_score"] = adata.X[ + np.random.permutation(np.arange(adata.X.shape[0])) + ] + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="True Scores", + paper_name="True Scores (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def true_scores(adata, test=False): + adata.obsm["gene_score"] = adata.X + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/spatial_decomposition/api.py b/openproblems/tasks/spatial_decomposition/api.py index 3fa833ac03..deb6a0187b 100644 --- a/openproblems/tasks/spatial_decomposition/api.py +++ b/openproblems/tasks/spatial_decomposition/api.py @@ -32,7 +32,7 @@ def check_dataset(adata: AnnData): return True -def check_method(adata: AnnData): +def check_method(adata: 
AnnData, is_baseline=False): """Check that method output fits expected API.""" assert np.all(adata.obs["modality"] == "sp") assert "proportions_pred" in adata.obsm diff --git a/openproblems/tasks/spatial_decomposition/methods/__init__.py b/openproblems/tasks/spatial_decomposition/methods/__init__.py index bb1e6f57c8..2610c91831 100644 --- a/openproblems/tasks/spatial_decomposition/methods/__init__.py +++ b/openproblems/tasks/spatial_decomposition/methods/__init__.py @@ -1,3 +1,5 @@ +from .baseline import random_proportions +from .baseline import true_proportions from .cell2location import cell2location_amortised_detection_alpha_20 from .cell2location import cell2location_detection_alpha_20 from .cell2location import cell2location_detection_alpha_20_nb @@ -5,7 +7,6 @@ from .destvi import destvi from .nmfreg import nmfreg from .nnls import nnls_scipy -from .random import random_proportion_assignment from .rctd import rctd from .seuratv3 import seuratv3 from .stereoscope import stereoscope diff --git a/openproblems/tasks/spatial_decomposition/methods/baseline.py b/openproblems/tasks/spatial_decomposition/methods/baseline.py new file mode 100644 index 0000000000..6c048326c5 --- /dev/null +++ b/openproblems/tasks/spatial_decomposition/methods/baseline.py @@ -0,0 +1,40 @@ +from ....tools.decorators import method +from ....tools.utils import check_version +from ..utils import split_sc_and_sp + +import numpy as np + + +@method( + method_name="Random Proportions", + paper_name="Random Proportions (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def random_proportions(adata, test=False): + adata_sc, adata = split_sc_and_sp(adata) + label_distribution = adata_sc.obs["label"].value_counts() + adata.obsm["proportions_pred"] = np.random.dirichlet( + label_distribution, + size=adata.shape[0], + ) + + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="True Proportions", + paper_name="True Proportions (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def true_proportions(adata, test=False): + _, adata = split_sc_and_sp(adata) + adata.obsm["proportions_pred"] = adata.obsm["proportions_true"] + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/spatial_decomposition/methods/random.py b/openproblems/tasks/spatial_decomposition/methods/random.py deleted file mode 100644 index bb3f930e78..0000000000 --- a/openproblems/tasks/spatial_decomposition/methods/random.py +++ /dev/null @@ -1,26 +0,0 @@ -from ....tools.decorators import method -from ....tools.utils import check_version -from ..utils import split_sc_and_sp - -import numpy as np - - -@method( - method_name="Random assignment (baseline)", - paper_name="Open Problems for Single Cell Analysis", - paper_url="https://openproblems.bio", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", -) -def random_proportion_assignment(adata, test=False): - _, adata = split_sc_and_sp(adata) - n_types = adata.obsm["proportions_true"].shape[1] - props = np.random.dirichlet( - np.ones(n_types), - size=adata.shape[0], - ) - - adata.obsm["proportions_pred"] = props - adata.uns["method_code_version"] = check_version("openproblems") - - return adata diff --git a/openproblems/tools/decorators.py 
b/openproblems/tools/decorators.py index 2e102eca4a..7912799d19 100644 --- a/openproblems/tools/decorators.py +++ b/openproblems/tools/decorators.py @@ -54,6 +54,7 @@ def method( code_url, code_version=None, image="openproblems", + is_baseline=False, ): """Decorate a method function. @@ -71,6 +72,8 @@ def method( Link to the code base providing the canonical implementation image : str, optional (default: "openproblems") Name of the Docker image to be used for this method + is_baseline : bool, optional (default: False) + If True, this method serves as a baseline for the task """ def decorator(func): @@ -86,6 +89,7 @@ def apply_method(*args, **kwargs): paper_year=paper_year, code_url=code_url, image=image, + is_baseline=is_baseline, ) apply_method = _backport_code_version(apply_method, code_version) return apply_method diff --git a/test/test_1_methods.py b/test/test_1_methods.py index e765724f9d..a1f49ea3ca 100644 --- a/test/test_1_methods.py +++ b/test/test_1_methods.py @@ -41,7 +41,7 @@ def test_method(task_name, method_name, image): ) adata = method(adata, test=True) assert isinstance(adata, anndata.AnnData) - assert task.api.check_method(adata) + assert task.api.check_method(adata, is_baseline=method.metadata["is_baseline"]) if "method_code_version" not in adata.uns: openproblems.utils.future_warning( "Setting code_version in the method decorator is deprecated. " @@ -68,6 +68,7 @@ def test_method_metadata(method): "paper_year", "code_url", "image", + "is_baseline", ]: assert attr in method.metadata @@ -80,3 +81,4 @@ def test_method_metadata(method): assert utils.asserts.assert_url_accessible(method.metadata["paper_url"]) assert isinstance(method.metadata["code_url"], str) assert utils.asserts.assert_url_accessible(method.metadata["code_url"]) + assert isinstance(method.metadata["is_baseline"], bool) From 7455e35cbee06267e6a5f977e020a816f98168f5 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 21 Oct 2022 17:03:39 -0400 Subject: [PATCH 015/266] add SCALEX to batch_integration (#637) * add SCALEX to batch_integration * fix typo * don't filter in test mode * don't filter if returned * add hvg * choose the first batch to decode * set outdir to devnull * set outdir to /tmp * Remove accidental comment reordering * Remove unintended comment order change * pre-commit * add scalex_hvg to batch_integration_feature Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../requirements.txt | 1 + .../methods/__init__.py | 2 + .../batch_integration_embed/methods/scalex.py | 27 ++++++ .../methods/__init__.py | 2 + .../methods/scalex.py | 27 ++++++ .../methods/__init__.py | 2 + .../batch_integration_graph/methods/scalex.py | 86 +++++++++++++++++++ 7 files changed, 147 insertions(+) create mode 100644 openproblems/tasks/_batch_integration/batch_integration_embed/methods/scalex.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/methods/scalex.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt index 71b8137fa1..bf43b0957b 100644 --- a/docker/openproblems-python-extras/requirements.txt +++ b/docker/openproblems-python-extras/requirements.txt @@ -9,6 +9,7 @@ magic-impute==3.0.* phate==1.0.* pybedtools==0.9.* pyensembl==2.0.* +scalex==1.0.0 scvi-tools==0.16.* tangram-sc==1.0.* tensorflow-cpu==2.9.* diff 
--git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py index 2f2cce98a6..3fd0c826ff 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py @@ -32,3 +32,5 @@ from .baseline import celltype_random_integration from .baseline import no_integration from .baseline import random_integration +from .scalex import scalex_full +from .scalex import scalex_hvg diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/scalex.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/scalex.py new file mode 100644 index 0000000000..f4580c90cf --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/scalex.py @@ -0,0 +1,27 @@ +from ...batch_integration_graph.methods.scalex import _scalex +from ...batch_integration_graph.methods.scalex import _scalex_method +from typing import Optional + + +@_scalex_method(method_name="SCALEX (full)") +def scalex_full(adata, test: bool = False, max_iteration: Optional[int] = None): + return _scalex( + adata, + test=test, + max_iteration=max_iteration, + compute_neighbors=False, + compute_features=False, + n_top_features=0, + ) + + +@_scalex_method(method_name="SCALEX (hvg)") +def scalex_hvg(adata, test: bool = False, max_iteration: Optional[int] = None): + return _scalex( + adata, + test=test, + max_iteration=max_iteration, + compute_neighbors=False, + compute_features=False, + n_top_features=2000, + ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/__init__.py index 7d4cb12759..6454d1a617 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/methods/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/__init__.py @@ -32,6 +32,8 @@ from .baseline import celltype_random_integration from .baseline import no_integration from .baseline import random_integration +from .scalex import scalex_full +from .scalex import scalex_hvg # from ...batch_integration_graph.methods.scgen import scgen_full_scaled # from ...batch_integration_graph.methods.scgen import scgen_full_unscaled diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/methods/scalex.py b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/scalex.py new file mode 100644 index 0000000000..1e6e425c46 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/scalex.py @@ -0,0 +1,27 @@ +from ...batch_integration_graph.methods.scalex import _scalex +from ...batch_integration_graph.methods.scalex import _scalex_method +from typing import Optional + + +@_scalex_method(method_name="SCALEX (full)") +def scalex_full(adata, test: bool = False, max_iteration: Optional[int] = None): + return _scalex( + adata, + test=test, + max_iteration=max_iteration, + compute_neighbors=False, + compute_features=True, + n_top_features=0, + ) + + +@_scalex_method(method_name="SCALEX (hvg)") +def scalex_hvg(adata, test: bool = False, max_iteration: Optional[int] = None): + return _scalex( + adata, + test=test, + max_iteration=max_iteration, + compute_neighbors=False, + compute_features=True, + n_top_features=2000, + ) diff --git 
a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py index e360162f21..89519752f9 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py @@ -33,6 +33,8 @@ from .mnn import mnn_full_unscaled from .mnn import mnn_hvg_scaled from .mnn import mnn_hvg_unscaled +from .scalex import scalex_full +from .scalex import scalex_hvg # from .saucie_embed import saucie_embed_full_scaled # from .saucie_embed import saucie_embed_full_unscaled diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py new file mode 100644 index 0000000000..d5a2c0998d --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py @@ -0,0 +1,86 @@ +from .....tools.decorators import method +from .....tools.utils import check_version +from typing import Optional + +import functools +import tempfile + +_scalex_method = functools.partial( + method, + paper_name="Online single-cell data integration through projecting heterogeneous " + "datasets into a common cell-embedding space", + paper_url="https://doi.org/10.1038/s41467-022-33758-z", + paper_year=2022, + code_url="https://github.com/jsxlei/SCALEX", + image="openproblems-python-extras", +) + + +def _scalex( + adata, + test: bool = False, + n_top_features: int = 0, + max_iteration: Optional[int] = None, + min_features: Optional[int] = None, + min_cells: Optional[int] = None, + compute_neighbors: bool = False, + compute_features: bool = False, +): + import scalex + import scanpy as sc + + if test: + max_iteration = max_iteration or 2 + else: # pragma: nocover + max_iteration = max_iteration or 30000 + + if test or compute_features: + min_features = min_features or 1 + else: # pragma: nocover + min_features = min_features or 600 + + min_cells = min_cells or 1 + + with tempfile.TemporaryDirectory() as outdir: + adata = scalex.SCALEX( + adata, + batch_key="batch", + ignore_umap=True, + impute=adata.obs["batch"].cat.categories[0] if compute_features else False, + max_iteration=max_iteration, + min_features=min_features, + min_cells=min_cells, + n_top_features=n_top_features, + outdir=outdir, + ) + adata.obsm["X_emb"] = adata.obsm["latent"] + if compute_features: + adata.X = adata.layers["impute"] + if compute_neighbors: + sc.pp.neighbors(adata, use_rep="X_emb") + adata.uns["method_code_version"] = check_version("scalex") + return adata + + +@_scalex_method(method_name="SCALEX (full)") +def scalex_full(adata, test: bool = False, max_iteration: Optional[int] = None): + return _scalex( + adata, + test=test, + max_iteration=max_iteration, + compute_neighbors=True, + compute_features=False, + n_top_features=0, + ) + + +@_scalex_method(method_name="SCALEX (hvg)") +def scalex_hvg(adata, test: bool = False, max_iteration: Optional[int] = None): + return _scalex( + adata, + test=test, + max_iteration=max_iteration, + compute_neighbors=True, + compute_features=False, + n_top_features=2000, + ) From 21eca6df9adfde109f8d488cb7b7a9433c9e115b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 24 Oct 2022 08:55:46 -0400 Subject: [PATCH 016/266] Update conos to 1.5.0 (#632) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante 
<84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-r-extras/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index d612dc1c17..68dc5863b7 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -6,7 +6,7 @@ bioc::scuttle@1.6.3 bslib@0.4.0 caret@6.0-93 cli@3.4.1 -conos@1.4.9 +conos@1.5.0 crayon@1.5.2 dbplyr@2.2.1 devtools@2.4.5 From c43a9ecf97b54e7723f66a38c96617ada2427ca3 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Mon, 24 Oct 2022 10:43:56 -0400 Subject: [PATCH 017/266] Normalize method results to baseline scores (#643) * add baseline methods * fix import errors * bugfixes * use baseline methods to standardise score range from 0 to 1 * normalize only to baselines unless missing * fix comments * enumerate * consistent use of .keys() * fix max - min * store raw metric results separately --- workflow/parse_nextflow.py | 68 ++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py index bc9ef3a21a..471a6dc110 100644 --- a/workflow/parse_nextflow.py +++ b/workflow/parse_nextflow.py @@ -150,28 +150,72 @@ def parse_method_versions(results_path, results): return results -def compute_ranking(task_name, dataset_results): - """Rank all methods on a specific dataset.""" - rankings = np.zeros(len(dataset_results)) +def normalize_scores(task_name, dataset_results): + """Normalize method scores to [0, 1] based on baseline method scores.""" + for method_name in dataset_results: + # store original unnormalized results + dataset_results[method_name]["metrics_raw"] = dataset_results[method_name][ + "metrics" + ] metric_names = list(dataset_results.values())[0]["metrics"].keys() for metric_name in metric_names: metric = openproblems.api.utils.get_function(task_name, "metrics", metric_name) - sorted_order = np.argsort( + metric_scores = np.array( [ dataset_results[method_name]["metrics"][metric_name] for method_name in dataset_results ] ) - if metric.metadata["maximize"]: - sorted_order = sorted_order[::-1] - rankings += np.argsort(sorted_order) + baseline_methods = [ + method_name + for method_name in dataset_results + if openproblems.api.utils.get_function( + task_name, "methods", method_name + ).metadata["is_baseline"] + ] + if len(baseline_methods) < 2: + # just use all methods as a fallback + baseline_methods = dataset_results.keys() + baseline_scores = np.array( + [ + dataset_results[method_name]["metrics"][metric_name] + for method_name in baseline_methods + ] + ) + metric_scores -= baseline_scores.min() + baseline_range = baseline_scores.max() - baseline_scores.min() + metric_scores /= np.where(baseline_range != 0, baseline_range, 1) + if not metric.metadata["maximize"]: + metric_scores = 1 - metric_scores + for method_name, score in zip(dataset_results, metric_scores): + dataset_results[method_name]["metrics"][metric_name] = score + return dataset_results + + +def drop_baselines(task_name, dataset_results): + """Remove baseline methods from dataset results.""" + for method_name in dataset_results.keys(): + method = openproblems.api.utils.get_function(task_name, "methods", method_name) + if method.metadata["is_baseline"]: + del dataset_results[method_name] + return dataset_results + +def 
compute_ranking(dataset_results): + """Rank all methods on a specific dataset.""" + metric_sums = np.zeros(len(dataset_results)) + metric_names = list(dataset_results.values())[0]["metrics"].keys() method_names = list(dataset_results.keys()) + for metric_name in metric_names: + metric_scores = [ + dataset_results[method_name]["metrics"][metric_name] + for method_name in method_names + ] + metric_sums += metric_scores + final_ranking = { method_names[method_idx]: rank + 1 - for method_idx, rank in zip( - np.argsort(rankings), np.arange(len(dataset_results)) - ) + for rank, method_idx in enumerate(np.argsort(metric_sums)[::-1]) } return final_ranking @@ -186,7 +230,9 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results): headers=dict(names=["Rank"], fixed=["Name", "Paper", "Website", "Code"]), results=list(), ) - ranking = compute_ranking(task_name, dataset_results) + dataset_results = normalize_scores(task_name, dataset_results) + dataset_results = drop_baselines(task_name, dataset_results) + ranking = compute_ranking(dataset_results) metric_names = set() for method_name, rank in ranking.items(): method_results = dataset_results[method_name] From 038c706bc73584c4f398f2d24e5a0d4b570cd516 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 24 Oct 2022 11:03:01 -0400 Subject: [PATCH 018/266] Use secret token if available --- .github/workflows/run_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index d083cdb761..cf1d43602e 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -200,6 +200,7 @@ jobs: - name: Upload coverage uses: codecov/codecov-action@v3 with: + token: ${{ secrets.CODECOV_TOKEN }} flags: unittests fail_ci_if_error: ${{ github.repository == 'openproblems-bio/openproblems' }} verbose: true From 55a4da98c5c046906115bc05a419ec1e7cd3324c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Oct 2022 13:15:07 -0400 Subject: [PATCH 019/266] Bump jaxlib from 0.3.20 to 0.3.22 in /docker/openproblems-python-scvi (#634) Bumps [jaxlib](https://github.com/google/jax) from 0.3.20 to 0.3.22. - [Release notes](https://github.com/google/jax/releases) - [Changelog](https://github.com/google/jax/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/jax/compare/jaxlib-v0.3.20...jaxlib-v0.3.22) --- updated-dependencies: - dependency-name: jaxlib dependency-type: direct:production update-type: version-update:semver-patch ... 
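
A minimal, self-contained sketch of how the baseline normalization and ranking introduced in PATCH 017 above combine (illustrative only — the scores, method names, and two-baseline setup are made up, and this snippet is not part of any patch in this series):

```python
import numpy as np

# Hypothetical raw metric values for a metric where higher is better.
baseline_scores = np.array([0.2, 0.9])      # e.g. a random and a "perfect" baseline
method_scores = np.array([0.5, 0.8, 0.95])  # three non-baseline methods

# Min-max scale against the baselines only, guarding against a zero range,
# as in normalize_scores() above.
lo = baseline_scores.min()
rng = baseline_scores.max() - lo
normalized = (method_scores - lo) / np.where(rng != 0, rng, 1)

# Rank by normalized score, best first, as in compute_ranking() above.
ranking = {
    f"method_{i}": rank + 1 for rank, i in enumerate(np.argsort(normalized)[::-1])
}
print(normalized)  # approx. [0.429 0.857 1.071]; a score may exceed 1 when a
                   # method beats the best baseline
print(ranking)     # {'method_2': 1, 'method_1': 2, 'method_0': 3}
```

Note that `compute_ranking()` sums the normalized scores across all metrics before sorting; with a single metric, as here, the sum reduces to the score itself.
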
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-python-scvi/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-python-scvi/requirements.txt b/docker/openproblems-python-scvi/requirements.txt index e3fd4746a6..87aa041993 100644 --- a/docker/openproblems-python-scvi/requirements.txt +++ b/docker/openproblems-python-scvi/requirements.txt @@ -1,5 +1,5 @@ jax==0.3.23 -jaxlib==0.3.20 +jaxlib==0.3.22 scikit-misc==0.1.* scvi-tools~=0.17 # pinned in #313 xgboost==1.6.* From 6aa453b836f1f62cb52c3a53ebb2e4c9483853b1 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 24 Oct 2022 16:34:41 -0400 Subject: [PATCH 020/266] bugfix --- workflow/parse_nextflow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py index 471a6dc110..5e8d6bb881 100644 --- a/workflow/parse_nextflow.py +++ b/workflow/parse_nextflow.py @@ -194,7 +194,8 @@ def normalize_scores(task_name, dataset_results): def drop_baselines(task_name, dataset_results): """Remove baseline methods from dataset results.""" - for method_name in dataset_results.keys(): + method_names = list(dataset_results.keys()) + for method_name in method_names: method = openproblems.api.utils.get_function(task_name, "methods", method_name) if method.metadata["is_baseline"]: del dataset_results[method_name] From c608d0e1e6c49da7c7b1a1e35f391f5d5ab9cd70 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 25 Oct 2022 09:18:37 -0400 Subject: [PATCH 021/266] Bump black from 22.8.0 to 22.10.0 (#641) * Bump black from 22.8.0 to 22.10.0 Bumps [black](https://github.com/psf/black) from 22.8.0 to 22.10.0. - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/22.8.0...22.10.0) --- updated-dependencies: - dependency-name: black dependency-type: direct:development update-type: version-update:semver-minor ... 
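
For context on the one-line `list(...)` bugfix in PATCH 020 above: deleting entries from a dict while iterating over its live key view raises at runtime. A minimal sketch of the failure mode, using hypothetical method names (illustrative only, not part of this series):

```python
results = {"true_scores": 1.0, "logistic_regression": 0.9}

# Iterating the live view breaks as soon as an entry is deleted:
#     for name in results.keys():
#         del results[name]  # RuntimeError: dictionary changed size during iteration

# Snapshotting the keys first, as drop_baselines() now does, is safe:
for name in list(results.keys()):
    if name == "true_scores":
        del results[name]

print(results)  # {'logistic_regression': 0.9}
```
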
Signed-off-by: dependabot[bot] * Update .pre-commit-config.yaml Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eca5a31bdb..85cc5bd5b2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ repos: hooks: - id: isort - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 22.10.0 hooks: - id: black args: ['--target-version', 'py36'] diff --git a/setup.py b/setup.py index 25a13c74bb..1b2b9ed983 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ test_requires = [ "pytest==7.1.*", "pytest-cov>=3.0,<4.1", - "black==22.8.0", + "black==22.10.0", "coverage>=6.4,<6.6", "codecov==2.1.*", "parameterized==0.8.*", From be641ef79a566826a7a453a77ed8ad5c30bbfcc4 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 25 Oct 2022 09:19:36 -0400 Subject: [PATCH 022/266] continue on error if log missing --- .github/workflows/run_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index cf1d43602e..c27506820a 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -424,6 +424,7 @@ jobs: if: always() run: | mv /mnt/openproblems-nextflow/cwd/${{ env.BRANCH }}/.nextflow.log /tmp/nextflow.log + continue-on-error: true - name: Upload nextflow log if: always() From 79bc4971bc54f62737b349a917de386fea4710b5 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 25 Oct 2022 09:33:24 -0400 Subject: [PATCH 023/266] link to .py in Implementation (https://github.com/openproblems-bio/website/issues/40) --- workflow/parse_nextflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py index 5e8d6bb881..5942515e71 100644 --- a/workflow/parse_nextflow.py +++ b/workflow/parse_nextflow.py @@ -245,7 +245,7 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results): "Year": method.metadata["paper_year"], "Library": method.metadata["code_url"], "Implementation": "https://github.com/openproblems-bio/openproblems/" - f"blob/main/{method.__module__.replace('.', '/')}", + f"blob/main/{method.__module__.replace('.', '/')}.py", "Version": method_results["code_version"], "Runtime (min)": parse_time_to_min(method_results["realtime"]), "CPU (%)": float(method_results["%cpu"].replace("%", "")), From c4d4a888e45c62757a96768ccd10751240ca21e4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 25 Oct 2022 14:34:23 -0400 Subject: [PATCH 024/266] Update sagemaker requirement in /docker/openproblems (#645) Updates the requirements on [sagemaker](https://github.com/aws/sagemaker-python-sdk) to permit the latest version. - [Release notes](https://github.com/aws/sagemaker-python-sdk/releases) - [Changelog](https://github.com/aws/sagemaker-python-sdk/blob/master/CHANGELOG.md) - [Commits](https://github.com/aws/sagemaker-python-sdk/compare/v2.112.0...v2.113.0) --- updated-dependencies: - dependency-name: sagemaker dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems/requirements.txt b/docker/openproblems/requirements.txt index 709868274c..4467681a91 100644 --- a/docker/openproblems/requirements.txt +++ b/docker/openproblems/requirements.txt @@ -2,6 +2,6 @@ boto3==1.24.* cmake==3.22.* # pinned in #607 jupyter==1.0.* pip -sagemaker==2.112.* +sagemaker==2.113.* setuptools wheel From 5e353bd45d13b0982a6c89f4474f57194b2cef27 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 26 Oct 2022 09:09:07 -0400 Subject: [PATCH 025/266] Update lintr to 3.0.2 (#636) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-github-actions/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-github-actions/r_requirements.txt b/docker/openproblems-github-actions/r_requirements.txt index 7693ca4226..58e1751318 100644 --- a/docker/openproblems-github-actions/r_requirements.txt +++ b/docker/openproblems-github-actions/r_requirements.txt @@ -1,6 +1,6 @@ backports@1.4.1 docopt@0.7.1 git2r@0.30.1 -lintr@3.0.1 +lintr@3.0.2 styler@1.7.0 tibble@3.1.8 From 6807382ef61925ae7f59d5142cdb75216da9fea5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 26 Oct 2022 11:32:29 -0400 Subject: [PATCH 026/266] Update snakemake requirement from <7.15,>=7.8 to >=7.8,<7.17 (#650) Updates the requirements on [snakemake](https://github.com/snakemake/snakemake) to permit the latest version. - [Release notes](https://github.com/snakemake/snakemake/releases) - [Changelog](https://github.com/snakemake/snakemake/blob/main/CHANGELOG.md) - [Commits](https://github.com/snakemake/snakemake/compare/v7.8.0...v7.16.1) --- updated-dependencies: - dependency-name: snakemake dependency-type: direct:development ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1b2b9ed983..798bd4c99a 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ "anndata2ri==1.1.*", ] -evaluate_requires = ["snakemake>=7.8,<7.15", "tabulate<0.9"] +evaluate_requires = ["snakemake>=7.8,<7.17", "tabulate<0.9"] process_requires = ["numpyencoder==0.3.*"] From ce6d662e39b93be53c3a1e1311d90cb9c3ef5d46 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 26 Oct 2022 15:05:13 -0400 Subject: [PATCH 027/266] Update boto3 requirement in /docker/openproblems (#651) Updates the requirements on [boto3](https://github.com/boto/boto3) to permit the latest version. - [Release notes](https://github.com/boto/boto3/releases) - [Changelog](https://github.com/boto/boto3/blob/develop/CHANGELOG.rst) - [Commits](https://github.com/boto/boto3/compare/1.24.0...1.25.1) --- updated-dependencies: - dependency-name: boto3 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems/requirements.txt b/docker/openproblems/requirements.txt index 4467681a91..c37625d334 100644 --- a/docker/openproblems/requirements.txt +++ b/docker/openproblems/requirements.txt @@ -1,4 +1,4 @@ -boto3==1.24.* +boto3==1.25.* cmake==3.22.* # pinned in #607 jupyter==1.0.* pip From fb991d4cca4759b7f0965b0bd9e81f6661fef2dc Mon Sep 17 00:00:00 2001 From: Nikolay Markov Date: Wed, 26 Oct 2022 18:53:07 -0500 Subject: [PATCH 028/266] Load Tabula Muris Senis raw count data #633 (#635) * Use cellXgene api to load raw counts for Tabula Muris Senis #633 * pre-commit * Add requests to requirements * Address pull-request comments #633 * pre-commit * Make empty organ_list checks explicit * Fix hasattr * Fix handling empty organ_list * Change tabula muris senis batch from mouse.id to donor_id * Download only 1-assay 1-tissue datasets, remove csv * Subsample genes & filter cells for test mode Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- openproblems/data/tabula_muris_senis.py | 138 +++++++++++------- .../tabula_muris_senis_data_objects.csv | 40 ----- .../datasets/tabula_muris_senis.py | 2 +- setup.py | 1 + 4 files changed, 84 insertions(+), 97 deletions(-) delete mode 100644 openproblems/data/tabula_muris_senis_data_objects/tabula_muris_senis_data_objects.csv diff --git a/openproblems/data/tabula_muris_senis.py b/openproblems/data/tabula_muris_senis.py index 28f709ea88..46ae6df957 100644 --- a/openproblems/data/tabula_muris_senis.py +++ b/openproblems/data/tabula_muris_senis.py @@ -1,62 +1,74 @@ from . import utils +import anndata as ad import os -import pandas as pd +import requests import scanpy as sc import scprep import tempfile +COLLECTION_ID = "0b9d8a04-bb9d-44da-aa27-705bb65b54eb" +DOMAIN = "cellxgene.cziscience.com" +API_BASE = f"https://api.{DOMAIN}" +METHOD_ALIASES = {"10x 3' v2": "droplet", "Smart-seq2": "facs"} -def get_filenames_and_urls(url_df, method_list=None, organ_list=None): - """Takes in dataframe and returns corresponding filename(s) and url(s). - Takes in dataframe (with sample information stored), a list of methods, - and a list of organs. - Returns filenames and figshare URLs associated with inputs. - If method_list or organ_list are None, do not filter based on that argument. - """ - subset_df = url_df.copy() - # If method_list specified, filter based on methods in list. - if method_list: - subset_df = subset_df.loc[subset_df.method.isin(method_list)] - # If organ_list specified, filter based on organs in list. - if organ_list: - subset_df = subset_df.loc[subset_df.organ.isin(organ_list)] +def check_unknown_organs(datasets, organ_list): + known_organs = set([t["label"] for d in datasets for t in d["tissue"]]) + unknown_organs = set(organ_list) - known_organs + if unknown_organs: + raise ValueError( + f"Unknown organs provided in `organ_list': {', '.join(unknown_organs)}. 
" + f"Known organs are {', '.join(known_organs)}" + ) - return subset_df +def matching_dataset(dataset, method_list, organ_list): + # if dataset has multiple methods, skip it + if len(dataset["assay"]) > 1: + return False -def make_anndata_from_filename_and_url(filename, url, test=False): - """Takes in filename and url pair. Returns corresponding anndata object.""" - with tempfile.TemporaryDirectory() as tempdir: - filepath = os.path.join(tempdir, filename) - scprep.io.download.download_url(url, filepath) - adata = sc.read_h5ad(filepath) - utils.filter_genes_cells(adata) + # if dataset has multiple tissues, skip it + if len(dataset["tissue"]) > 1: + return False - if test: - sc.pp.subsample(adata, n_obs=100) - adata = adata[:, :1000] - utils.filter_genes_cells(adata) + method = dataset["assay"][0]["label"] + method = METHOD_ALIASES[method] - return adata + # if organ_list is not empty, check for specific tissue + if len(organ_list) > 0 and dataset["tissue"][0]["label"] not in organ_list: + return False + # if method_list is not empty, check for specific method + if len(method_list) > 0 and method not in method_list: + return False -def make_anndata_list(subset_df, test): - """Makes anndata from filename/url pair. Adds to list of anndatas. + return True - Input dataframe that contains filenames and urls to make anndatas from. - Returns a list of anndata objects. - """ - adata_list = [] - for i in range(len(subset_df)): - row = subset_df.iloc[i] - adata_list.append( - make_anndata_from_filename_and_url(row.filename, row.figshare_url) - ) - if test: - return adata_list[0] - return adata_list + +def load_raw_counts(dataset): + dataset_id = dataset["id"] + assets_path = ( + f"/curation/v1/collections/{COLLECTION_ID}/datasets/{dataset_id}/assets" + ) + url = f"{API_BASE}{assets_path}" + res = requests.get(url=url) + assets = res.json() + assets = [asset for asset in assets if asset["filetype"] == "H5AD"] + assert len(assets) == 1 + asset = assets[0] + + filename = f"{COLLECTION_ID}_{dataset_id}_{asset['filename']}" + with tempfile.TemporaryDirectory() as tempdir: + filepath = os.path.join(tempdir, filename) + scprep.io.download.download_url(asset["presigned_url"], filepath) + adata = sc.read_h5ad(filepath) + + utils.filter_genes_cells(adata) + # If `raw` exists, raw counts are there + if getattr(adata, "raw", None) is not None: + return adata.raw.to_adata() + return adata @utils.loader( @@ -73,22 +85,36 @@ def load_tabula_muris_senis(test=False, method_list=None, organ_list=None): and droplet-fat anndata sets. (no facs-fat dataset available) """ - # df containing figshare links, method of collection, and organ for each - # tabula muris dataset - url_df = pd.read_csv( - os.path.join( - os.path.dirname(__file__), - "tabula_muris_senis_data_objects", - "tabula_muris_senis_data_objects.csv", - ), - header=0, - ) + if method_list is None: + method_list = [] + if organ_list is None: + organ_list = [] + method_list = [x.lower() for x in method_list] + organ_list = [x.lower() for x in organ_list] + + unknown_methods = set(method_list) - set(["facs", "droplet"]) + if unknown_methods: + raise ValueError( + f"Unknown methods provided in `method_list': {','.join(unknown_methods)}. 
" + "Known methods are `facs' and `droplet'" + ) + + datasets_path = f"/curation/v1/collections/{COLLECTION_ID}" + url = f"{API_BASE}{datasets_path}" + res = requests.get(url=url) + datasets = res.json()["datasets"] + check_unknown_organs(datasets, organ_list) + + adata_list = [] + for dataset in datasets: + if matching_dataset(dataset, method_list, organ_list): + adata_list.append(load_raw_counts(dataset)) + + assert len(adata_list) > 0 + adata = ad.concat(adata_list, join="outer") - subset_df = get_filenames_and_urls(url_df, method_list, organ_list) - adata_list = make_anndata_list(subset_df, test) - adata = adata_list[0].concatenate(adata_list[1:]) if test: - sc.pp.subsample(adata, n_obs=500) + adata = utils.subsample_even(adata, n_obs=500, even_obs="method") adata = adata[:, :1000] utils.filter_genes_cells(adata) return adata diff --git a/openproblems/data/tabula_muris_senis_data_objects/tabula_muris_senis_data_objects.csv b/openproblems/data/tabula_muris_senis_data_objects/tabula_muris_senis_data_objects.csv deleted file mode 100644 index 5a95df87f0..0000000000 --- a/openproblems/data/tabula_muris_senis_data_objects/tabula_muris_senis_data_objects.csv +++ /dev/null @@ -1,40 +0,0 @@ -filename,figshare_url,method,organ -tabula-muris-senis-facs-processed-official-annotations-Aorta.h5ad,https://ndownloader.figshare.com/files/23872460,facs,aorta -tabula-muris-senis-facs-processed-official-annotations-Kidney.h5ad,https://ndownloader.figshare.com/files/23872484,facs,kidney -tabula-muris-senis-facs-processed-official-annotations-Diaphragm.h5ad,https://ndownloader.figshare.com/files/23872487,facs,diaphragm -tabula-muris-senis-facs-processed-official-annotations-BAT.h5ad,https://ndownloader.figshare.com/files/23872493,facs,BAT -tabula-muris-senis-droplet-processed-official-annotations-Large_Intestine.h5ad,https://ndownloader.figshare.com/files/23872502,droplet,large_intestine -tabula-muris-senis-facs-processed-official-annotations-Spleen.h5ad,https://ndownloader.figshare.com/files/23872511,facs,spleen -tabula-muris-senis-facs-processed-official-annotations-Limb_Muscle.h5ad,https://ndownloader.figshare.com/files/23872517,facs,limb_muscle -tabula-muris-senis-facs-processed-official-annotations-Liver.h5ad,https://ndownloader.figshare.com/files/23872526,facs,liver -tabula-muris-senis-facs-processed-official-annotations-MAT.h5ad,https://ndownloader.figshare.com/files/23872544,facs,MAT -tabula-muris-senis-facs-processed-official-annotations-Thymus.h5ad,https://ndownloader.figshare.com/files/23872559,facs,thymus -tabula-muris-senis-facs-processed-official-annotations-Trachea.h5ad,https://ndownloader.figshare.com/files/23872568,facs,trachea -tabula-muris-senis-droplet-processed-official-annotations-Pancreas.h5ad,https://ndownloader.figshare.com/files/23872580,droplet,pancreas -tabula-muris-senis-facs-processed-official-annotations-GAT.h5ad,https://ndownloader.figshare.com/files/23872583,facs,GAT -tabula-muris-senis-facs-processed-official-annotations-SCAT.h5ad,https://ndownloader.figshare.com/files/23872601,facs,SCAT -tabula-muris-senis-facs-processed-official-annotations-Bladder.h5ad,https://ndownloader.figshare.com/files/23872610,facs,bladder -tabula-muris-senis-facs-processed-official-annotations-Lung.h5ad,https://ndownloader.figshare.com/files/23872619,facs,lung -tabula-muris-senis-facs-processed-official-annotations-Mammary_Gland.h5ad,https://ndownloader.figshare.com/files/23872637,facs,mammary_gland 
-tabula-muris-senis-facs-processed-official-annotations-Pancreas.h5ad,https://ndownloader.figshare.com/files/23872643,facs,pancreas -tabula-muris-senis-droplet-processed-official-annotations-Trachea.h5ad,https://ndownloader.figshare.com/files/23872655,droplet,trachea -tabula-muris-senis-facs-processed-official-annotations-Skin.h5ad,https://ndownloader.figshare.com/files/23872667,facs,skin -tabula-muris-senis-droplet-processed-official-annotations-Skin.h5ad,https://ndownloader.figshare.com/files/23872676,droplet,skin -tabula-muris-senis-facs-processed-official-annotations-Tongue.h5ad,https://ndownloader.figshare.com/files/23872703,facs,tongue -tabula-muris-senis-droplet-processed-official-annotations-Fat.h5ad,https://ndownloader.figshare.com/files/23872715,droplet,fat -tabula-muris-senis-droplet-processed-official-annotations-Thymus.h5ad,https://ndownloader.figshare.com/files/23872745,droplet,thymus -tabula-muris-senis-droplet-processed-official-annotations-Liver.h5ad,https://ndownloader.figshare.com/files/23872763,droplet,liver -tabula-muris-senis-facs-processed-official-annotations-Brain_Non-Myeloid.h5ad,https://ndownloader.figshare.com/files/23872787,facs,brain_non-myeloid -tabula-muris-senis-droplet-processed-official-annotations-Heart_and_Aorta.h5ad,https://ndownloader.figshare.com/files/23872799,droplet,heart_and_aorta -tabula-muris-senis-facs-processed-official-annotations-Heart.h5ad,https://ndownloader.figshare.com/files/23872838,facs,heart -tabula-muris-senis-droplet-processed-official-annotations-Mammary_Gland.h5ad,https://ndownloader.figshare.com/files/23872862,droplet,mammary_gland -tabula-muris-senis-facs-processed-official-annotations-Brain_Myeloid.h5ad,https://ndownloader.figshare.com/files/23872886,facs,brain_myeloid -tabula-muris-senis-droplet-processed-official-annotations-Bladder.h5ad,https://ndownloader.figshare.com/files/23872916,droplet,bladder -tabula-muris-senis-facs-processed-official-annotations-Large_Intestine.h5ad,https://ndownloader.figshare.com/files/23872931,facs,large_intestine -tabula-muris-senis-facs-processed-official-annotations-Marrow.h5ad,https://ndownloader.figshare.com/files/23872976,facs,marrow -tabula-muris-senis-droplet-processed-official-annotations-Lung.h5ad,https://ndownloader.figshare.com/files/23873012,droplet,lung -tabula-muris-senis-droplet-processed-official-annotations-Kidney.h5ad,https://ndownloader.figshare.com/files/23873024,droplet,kidney -tabula-muris-senis-droplet-processed-official-annotations-Limb_Muscle.h5ad,https://ndownloader.figshare.com/files/23873036,droplet,limb_muscle -tabula-muris-senis-droplet-processed-official-annotations-Spleen.h5ad,https://ndownloader.figshare.com/files/23873054,droplet,spleen -tabula-muris-senis-droplet-processed-official-annotations-Tongue.h5ad,https://ndownloader.figshare.com/files/23873081,droplet,tongue -tabula-muris-senis-droplet-processed-official-annotations-Marrow.h5ad,https://ndownloader.figshare.com/files/23873090,droplet,marrow diff --git a/openproblems/tasks/label_projection/datasets/tabula_muris_senis.py b/openproblems/tasks/label_projection/datasets/tabula_muris_senis.py index 60780fafa1..ec9274a4cc 100644 --- a/openproblems/tasks/label_projection/datasets/tabula_muris_senis.py +++ b/openproblems/tasks/label_projection/datasets/tabula_muris_senis.py @@ -16,7 +16,7 @@ def tabula_muris_senis_lung_random(test=False): test=test, organ_list=["lung"], method_list=["droplet"] ) adata.obs["labels"] = adata.obs["free_annotation"] - adata.obs["batch"] = adata.obs["mouse.id"] + adata.obs["batch"] 
= adata.obs["donor_id"] adata.obs["is_train"] = np.random.choice( [True, False], adata.shape[0], replace=True, p=[0.8, 0.2] ) diff --git a/setup.py b/setup.py index 798bd4c99a..012acde47c 100644 --- a/setup.py +++ b/setup.py @@ -17,6 +17,7 @@ "colorama==0.4.*", "packaging==21.3", "umap-learn==0.5.*", + "requests==2.28.*", ] r_requires = [ From 235b251a55743b615166b7cd760d65d89ab64204 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 26 Oct 2022 19:54:12 -0400 Subject: [PATCH 029/266] Update styler to 1.8.0 (#652) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-github-actions/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-github-actions/r_requirements.txt b/docker/openproblems-github-actions/r_requirements.txt index 58e1751318..68a2f54962 100644 --- a/docker/openproblems-github-actions/r_requirements.txt +++ b/docker/openproblems-github-actions/r_requirements.txt @@ -2,5 +2,5 @@ backports@1.4.1 docopt@0.7.1 git2r@0.30.1 lintr@3.0.2 -styler@1.7.0 +styler@1.8.0 tibble@3.1.8 From f2512dd3073ad94e4a549eb5f1383a70aa16317d Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Thu, 27 Oct 2022 08:28:11 -0400 Subject: [PATCH 030/266] make merge_keys available in adata.uns (#648) * make merge_keys available in adata.uns * use np testing to fix array assertion --- openproblems/tasks/_cell_cell_communication/_common/api.py | 5 +++++ .../datasets/tnbc_wu2021.py | 1 + .../datasets/allen_brain_atlas.py | 1 + 3 files changed, 7 insertions(+) diff --git a/openproblems/tasks/_cell_cell_communication/_common/api.py b/openproblems/tasks/_cell_cell_communication/_common/api.py index 6ef59191fe..9135f1a8eb 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/api.py +++ b/openproblems/tasks/_cell_cell_communication/_common/api.py @@ -61,6 +61,9 @@ def check_dataset(adata, merge_keys): assert "label" in adata.obs assert "ccc_target" in adata.uns + assert "merge_keys" in adata.uns + np.testing.assert_array_equal(adata.uns["merge_keys"], merge_keys) + # check target organism assert "target_organism" in adata.uns assert isinstance(adata.uns["target_organism"], numbers.Integral) @@ -186,6 +189,8 @@ def sample_dataset(merge_keys): """Create a simple dataset to use for testing methods in this task.""" adata = load_sample_data() + adata.uns["merge_keys"] = merge_keys + # keep only the top 10 most variable sc.pp.highly_variable_genes(adata, n_top_genes=len(SAMPLE_RECEPTOR_NAMES)) adata = adata[:, adata.var["highly_variable"]] diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/datasets/tnbc_wu2021.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/datasets/tnbc_wu2021.py index 275a24edb2..77401ad9e6 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/datasets/tnbc_wu2021.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/datasets/tnbc_wu2021.py @@ -21,6 +21,7 @@ def tnbc_data(test=False): adata = map_gene_symbols( adata, pathlib.Path(__file__).parent.joinpath("tnbc_wu2021_gene_symbols.csv") ) + adata.uns["merge_keys"] = ["ligand", "target"] adata.uns["ligand_receptor_resource"] = ligand_receptor_resource( adata.uns["target_organism"] ) diff --git 
a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py index f46e17c1f6..59a4a12bd8 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py @@ -15,6 +15,7 @@ ) def mouse_brain_atlas(test=False): adata = load_mouse_brain_atlas(test=test) + adata.uns["merge_keys"] = ["source", "target"] adata.uns["ligand_receptor_resource"] = ligand_receptor_resource( adata.uns["target_organism"] ) From fc6b3280d10ac743bb8c094b0d92bfad74deacde Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Thu, 27 Oct 2022 14:52:17 -0400 Subject: [PATCH 031/266] use tabula muris senis in addition to pancreas (#612) * add tabula muris senis * remove broken obs key --- openproblems/data/tabula_muris_senis.py | 3 ++ .../datasets/__init__.py | 3 ++ .../datasets/tabula_muris_senis.py | 54 +++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 openproblems/tasks/spatial_decomposition/datasets/tabula_muris_senis.py diff --git a/openproblems/data/tabula_muris_senis.py b/openproblems/data/tabula_muris_senis.py index 46ae6df957..93e1ac5016 100644 --- a/openproblems/data/tabula_muris_senis.py +++ b/openproblems/data/tabula_muris_senis.py @@ -113,6 +113,9 @@ def load_tabula_muris_senis(test=False, method_list=None, organ_list=None): assert len(adata_list) > 0 adata = ad.concat(adata_list, join="outer") + # this obs key causes write errors + del adata.obs["is_primary_data"] + if test: adata = utils.subsample_even(adata, n_obs=500, even_obs="method") adata = adata[:, :1000] diff --git a/openproblems/tasks/spatial_decomposition/datasets/__init__.py b/openproblems/tasks/spatial_decomposition/datasets/__init__.py index 7b405366a9..1a6f6a7474 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/__init__.py +++ b/openproblems/tasks/spatial_decomposition/datasets/__init__.py @@ -2,3 +2,6 @@ from .pancreas import pancreas_alpha_0_5 from .pancreas import pancreas_alpha_1 from .pancreas import pancreas_alpha_5 +from .tabula_muris_senis import tabula_muris_senis_alpha_0_5 +from .tabula_muris_senis import tabula_muris_senis_alpha_1 +from .tabula_muris_senis import tabula_muris_senis_alpha_5 diff --git a/openproblems/tasks/spatial_decomposition/datasets/tabula_muris_senis.py b/openproblems/tasks/spatial_decomposition/datasets/tabula_muris_senis.py new file mode 100644 index 0000000000..47b7f0eeda --- /dev/null +++ b/openproblems/tasks/spatial_decomposition/datasets/tabula_muris_senis.py @@ -0,0 +1,54 @@ +from ....data.tabula_muris_senis import load_tabula_muris_senis +from ....data.utils import filter_genes_cells +from ....tools.decorators import dataset +from .utils import generate_synthetic_dataset + +import functools + + +def _tabula_muris_senis(alpha, test, n_obs): + adata = load_tabula_muris_senis( + test=test, organ_list=["lung"], method_list=["droplet"] + ) + adata = adata[adata.obs["age"] == "30m"].copy() + adata.obs["label"] = adata.obs["free_annotation"] + + merged_adata = generate_synthetic_dataset( + adata, n_obs=n_obs, alpha=alpha, test=test + ) + filter_genes_cells(merged_adata) + return merged_adata + + +_tabula_muris_senis_dataset = functools.partial( + dataset, + 
data_url=load_tabula_muris_senis.metadata["data_url"], + data_reference=load_tabula_muris_senis.metadata["data_reference"], +) + + +@_tabula_muris_senis_dataset( + "Tabula muris senis (alpha=1)", + dataset_summary="Mouse lung cells aggregated from single-cell" + " (Dirichlet alpha=1)", +) +def tabula_muris_senis_alpha_1(test=False, n_obs=100): + return _tabula_muris_senis(alpha=1, test=test, n_obs=n_obs) + + +@_tabula_muris_senis_dataset( + "Tabula muris senis (alpha=5)", + dataset_summary="Mouse lung cells aggregated from single-cell" + " (Dirichlet alpha=5)", +) +def tabula_muris_senis_alpha_5(test=False, n_obs=100): + return _tabula_muris_senis(alpha=5, test=test, n_obs=n_obs) + + +@_tabula_muris_senis_dataset( + "Tabula muris senis (alpha=0.5)", + dataset_summary="Mouse lung cells aggregated from single-cell" + " (Dirichlet alpha=0.5)", +) +def tabula_muris_senis_alpha_0_5(test=False, n_obs=100): + return _tabula_muris_senis(alpha=0.5, test=test, n_obs=n_obs) From 814f12f4186670423ddba9d6417bc160846aa2b6 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Thu, 27 Oct 2022 14:52:27 -0400 Subject: [PATCH 032/266] use just one batch of pancreas data (#649) * use just one batch of pancreas data * keep_techs or [...] --- openproblems/data/pancreas.py | 40 +++-------- .../datasets/pancreas.py | 68 ++++++++++--------- 2 files changed, 45 insertions(+), 63 deletions(-) diff --git a/openproblems/data/pancreas.py b/openproblems/data/pancreas.py index 91288961ee..a67943a1fc 100644 --- a/openproblems/data/pancreas.py +++ b/openproblems/data/pancreas.py @@ -1,6 +1,5 @@ from . import utils -import anndata as ad import numpy as np import os import scanpy as sc @@ -12,17 +11,17 @@ @utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/s41592-021-01336-8") -def load_pancreas(test=False, integer_only=False): +def load_pancreas(test=False, keep_techs=None): """Download pancreas data from figshare.""" if test: # load full data first, cached if available - adata = load_pancreas(test=False, integer_only=integer_only) + adata = load_pancreas( + test=False, + keep_techs=keep_techs or ["celseq", "inDrop4", "smarter"], + ) keep_celltypes = adata.obs["celltype"].dtype.categories[[0, 3]] - keep_techs = adata.obs["tech"].dtype.categories[[0, -3, -2]] - keep_tech_idx = adata.obs["tech"].isin(keep_techs) - keep_celltype_idx = adata.obs["celltype"].isin(keep_celltypes) - adata = adata[keep_tech_idx & keep_celltype_idx].copy() + adata = adata[adata.obs["celltype"].isin(keep_celltypes)].copy() # Subsample pancreas data adata = adata[:, :500].copy() @@ -52,36 +51,15 @@ def load_pancreas(test=False, integer_only=False): scprep.io.download.download_url(URL, filepath) adata = sc.read(filepath) + if keep_techs is not None: + adata = adata[adata.obs["tech"].isin(keep_techs)].copy() + # NOTE: adata.X contains log-normalized data, so we're moving it adata.layers["log_normalized"] = adata.X adata.X = adata.layers["counts"] del adata.layers["counts"] - if integer_only: - adata = _get_pancreas_integer(adata) - # Ensure there are no cells or genes with 0 counts utils.filter_genes_cells(adata) return adata - - -def _get_pancreas_integer(adata: ad.AnnData): - """Transform counts to integer. - - For some platforms the pancreas data set only have processed counts. - Here we grab those with integer counts. 
- See https://github.com/theislab/scib-reproducibility/tree/main/notebooks/data_preprocessing/pancreas # noqa: E501 - """ - is_int = ["smartseq2"] - is_int += ["inDrop{}".format(x) for x in range(1, 5)] - - keep = np.zeros(len(adata)).astype(bool) - - for tech in is_int: - idx = adata.obs.tech.values == tech - keep = keep | idx - - adata = adata[keep, :].copy() - - return adata diff --git a/openproblems/tasks/spatial_decomposition/datasets/pancreas.py b/openproblems/tasks/spatial_decomposition/datasets/pancreas.py index 7c27ebf758..7263b55007 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/pancreas.py +++ b/openproblems/tasks/spatial_decomposition/datasets/pancreas.py @@ -2,54 +2,58 @@ from ....data.utils import filter_genes_cells from ....tools.decorators import dataset from .utils import generate_synthetic_dataset +from typing import List +from typing import Optional +import functools import scanpy as sc - -@dataset( - "Pancreas (alpha=1)", +_pancreas_dataset = functools.partial( + dataset, data_url=load_pancreas.metadata["data_url"], data_reference=load_pancreas.metadata["data_reference"], - dataset_summary="Human pancreas cells aggregated from single-cell" - " (Dirichlet alpha=1)", ) -def pancreas_alpha_1(test=False, n_obs=100): - adata = load_pancreas(test=test, integer_only=True) +_DATASET_SUMMARY = ( + "Human pancreas cells aggregated from single-cell (Dirichlet alpha={})" +) + + +def _pancreas_synthetic( + alpha: float, + test: bool = False, + n_obs: int = 100, + keep_techs: Optional[List[str]] = None, +): + adata = load_pancreas(test=test, keep_techs=keep_techs or ["inDrop3"]) sc.pp.filter_genes(adata, min_counts=10) adata.obs["label"] = adata.obs["celltype"] - merged_adata = generate_synthetic_dataset(adata, n_obs=n_obs, alpha=1, test=test) + merged_adata = generate_synthetic_dataset( + adata, n_obs=n_obs, alpha=alpha, test=test + ) filter_genes_cells(merged_adata) return merged_adata -@dataset( - "Pancreas (alpha=5)", - data_url=load_pancreas.metadata["data_url"], - data_reference=load_pancreas.metadata["data_reference"], - dataset_summary="Human pancreas cells aggregated from single-cell" - " (Dirichlet alpha=5)", +@_pancreas_dataset( + "Pancreas (alpha=1)", + dataset_summary=_DATASET_SUMMARY.format(1), ) -def pancreas_alpha_5(test=False, n_obs=100): - adata = load_pancreas(test=test, integer_only=True) - adata.obs["label"] = adata.obs["celltype"] +def pancreas_alpha_1(test=False, n_obs=100, keep_techs: Optional[List[str]] = None): + return _pancreas_synthetic(test=test, n_obs=n_obs, alpha=1, keep_techs=keep_techs) - merged_adata = generate_synthetic_dataset(adata, n_obs=n_obs, alpha=5) - filter_genes_cells(merged_adata) - return merged_adata +@_pancreas_dataset( + "Pancreas (alpha=5)", + dataset_summary=_DATASET_SUMMARY.format(5), +) +def pancreas_alpha_5(test=False, n_obs=100, keep_techs: Optional[List[str]] = None): + return _pancreas_synthetic(test=test, n_obs=n_obs, alpha=5, keep_techs=keep_techs) -@dataset( + +@_pancreas_dataset( "Pancreas (alpha=0.5)", - data_url=load_pancreas.metadata["data_url"], - data_reference=load_pancreas.metadata["data_reference"], - dataset_summary="Human pancreas cells aggregated from single-cell" - " (Dirichlet alpha=0.5)", + dataset_summary=_DATASET_SUMMARY.format(0.5), ) -def pancreas_alpha_0_5(test=False, n_obs=100): - adata = load_pancreas(test=test, integer_only=True) - adata.obs["label"] = adata.obs["celltype"] - - merged_adata = generate_synthetic_dataset(adata, n_obs=n_obs, alpha=0.5) - filter_genes_cells(merged_adata) - 
return merged_adata +def pancreas_alpha_0_5(test=False, n_obs=100, keep_techs: Optional[List[str]] = None): + return _pancreas_synthetic(test=test, n_obs=n_obs, alpha=0.5, keep_techs=keep_techs) From 5af4ab14b52faf869d5ed378caa2dc98194b5df1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 27 Oct 2022 14:52:34 -0400 Subject: [PATCH 033/266] Update e1071 to 1.7-12 (#653) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-r-extras/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index 68dc5863b7..919c8c0278 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -13,7 +13,7 @@ devtools@2.4.5 dmcable/spacexr@eeb02a2 # master downlit@0.4.2 dplyr@1.0.10 -e1071@1.7-11 +e1071@1.7-12 ellipsis@0.3.2 forecast@8.18 hardhat@1.1.0 From b38682038c74d401b5825f38797c97d9af5a52c7 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 27 Oct 2022 14:53:14 -0400 Subject: [PATCH 034/266] Add openproblems-cli test instructions (#557) * add openproblems-cli test instructions * fix typo * include aws cli in instructions * make KEY_NAME more obvious * document describe and shutdown * use jq to get status * split aws instructions into separate md * Add AWS profile to EC2 instructions, fix launching command * Update EC2.md * Reference docker * lint * ubuntu requires su * ubuntu requires su Co-authored-by: Scott Gigante Co-authored-by: Nikolay Markov Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- CONTRIBUTING.md | 35 +++++++++++++-- EC2.md | 110 +++++++++++++++++++++++++++++++++++++++++++++++ docker/README.md | 14 +++--- 3 files changed, 150 insertions(+), 9 deletions(-) create mode 100644 EC2.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4442a45674..152b0224e7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -304,13 +304,42 @@ ease of use, we provide a collection of common normalization functions in stored in `adata.X` is automatically stored in `adata.layers["counts"]` for later reference in the case the a metric needs to access the unnormalized data. +#### Testing method performance + To test the performance of a dataset, method, or metric, you can use the command-line -interface: +interface `openproblems-cli test`. -```shell -openproblems-cli test --help +First, you must launch a Docker image containing the relevant dependencies for the +dataset/method/metric you wish to test. You can then run `openproblems-cli test` with +any/all of `--dataset`, `--method`, and `--metric` as desired. E.g., + +```bash +cd openproblems +docker run \ + -v $(pwd):/usr/src/singlecellopenproblems -v /tmp:/tmp \ + -it singlecellopenproblems/openproblems-python-extras bash +openproblems-cli test \ + --task label_projection \ + --dataset zebrafish_labels \ + --method logistic_regression_log_cpm \ + --metric f1 ``` +which will print the benchmark score for the method evaluated by the metric on the +dataset you chose. + +Notes: + +* If you have updated Docker images to run your method, you must first rebuild the + images -- see the [Docker README](docker/README.md) for details. +* If your dataset/method/metric cannot be run on the same docker image, you may wish to + `load`, `run`, and `evaluate` separately. 
You can do this using each of these commands
+  independently; however, this workflow is not documented.
+* These commands are not guaranteed to work with Apple silicon (M1 chip).
+* If your local machine cannot run the test due to memory constraints or OS
+  incompatibility, you may use your AWS credentials to launch a VM for testing purposes.
+  See the [EC2 README](./EC2.md) for details.
+
 ### Adding a new task
 
 The task directory structure is as follows
diff --git a/EC2.md b/EC2.md
new file mode 100644
index 0000000000..a28cce7238
--- /dev/null
+++ b/EC2.md
@@ -0,0 +1,110 @@
+# AWS EC2 Usage Instructions
+
+The following instructions give a step-by-step guide to launching an AWS virtual machine
+with all the required prerequisites to run `openproblems`.
+
+## Code of conduct
+
+**Please be respectful of our finite resources!**
+
+* The use of the `openproblems` AWS account is a privilege, not a right.
+* This privilege is given solely for the purposes of testing methods with
+  `openproblems-cli test`.
+* Developers who have their own compute resources should use them; please help us
+  conserve our resources for those who need them.
+* If developers are found to be using resources irresponsibly, we may have to revoke
+  this privilege.
+
+## Requirements
+
+* a Unix-based OS (Mac or Linux), though you should be
+able to amend the commands for use on Windows (or consider [Windows Subsystem for
+Linux](https://docs.microsoft.com/en-us/windows/wsl/install)).
+* The [AWS CLI](https://aws.amazon.com/cli/)
+* [jq](https://stedolan.github.io/jq/download/)
+
+## Instructions
+
+The following instructions are for `bash`; other shell users may need to modify commands
+slightly.
+
+First, if you have received openproblems AWS credentials, configure AWS to use them.
+Note: `openproblems` uses `us-west-2` as default region. If you have other AWS accounts,
+you can configure AWS with multiple accounts by using the `AWS_PROFILE` environment
+variable.
+
+```shell
+export AWS_PROFILE=openproblems
+aws configure
+```
+
+Second, create a key pair (only do this once):
+
+```shell
+KEY_NAME="my_openproblems_key" # name this whatever you like, but it must be unique
+aws ec2 create-key-pair --key-name $KEY_NAME --key-format pem \
+--query "KeyMaterial" --output text > ${KEY_NAME}.pem
+chmod 400 ${KEY_NAME}.pem
+```
+
+Now, create an instance with your key pair:
+
+```shell
+OWNER_NAME="this_is_your_name"
+AWS_EC2_INSTANCE_TYPE="t2.micro"
+INSTANCE_ID=$(
+aws ec2 run-instances --count 1 --image-id ami-01219569b1bbf9fb2 \
+  --instance-type $AWS_EC2_INSTANCE_TYPE --key-name $KEY_NAME \
+  --security-group-ids sg-002d2b9db29bb43dd \
+  --tag-specifications "ResourceType=instance,Tags=[{Key=owner,Value=${OWNER_NAME}}]" |
+  jq '.["Instances"][0]["InstanceId"]' |
+  tr -d '"'
+)
+```
+
+Get the public DNS address for your instance:
+
+```shell
+sleep 30 # wait for boot
+PUBLIC_DNS_NAME=$(
+aws ec2 describe-instances --instance-ids $INSTANCE_ID |
+  jq '.["Reservations"][0]["Instances"][0]["PublicDnsName"]' |
+  tr -d '"'
+)
+```
+
+Now you can SSH into your instance:
+
+```shell
+# check the status of your instance
+aws ec2 describe-instance-status --instance-ids ${INSTANCE_ID} | \
+jq '.["InstanceStatuses"][0]["SystemStatus"]'
+ssh -i ${KEY_NAME}.pem ubuntu@${PUBLIC_DNS_NAME}
+```
+
+The instance will by default contain all dependencies to use `openproblems`. 
You can +run `openproblems` with + +```shell +git clone https://github.com/openproblems-bio/openproblems +cd openproblems +sudo docker run \ + -v $(pwd):/usr/src/singlecellopenproblems -v /tmp:/tmp \ + -it singlecellopenproblems/openproblems bash +openproblems-cli --help +``` + +For more information on using the CLI, see +[CONTRIBUTING.md](CONTRIBUTING.md#testing-method-performance). + +When you are done, make sure to shut down your instance: + +```shell +aws ec2 terminate-instances --instance-ids ${INSTANCE_ID} +``` + +Finally, make sure you don't have any instances left running: + +```shell +aws ec2 describe-instances --filters "Name=tag:owner,Values=${OWNER_NAME}" +``` diff --git a/docker/README.md b/docker/README.md index d5949faf26..77a87a7101 100644 --- a/docker/README.md +++ b/docker/README.md @@ -125,7 +125,7 @@ example, to install the `openproblems` base container, you can run the following docker build -f docker/openproblems/Dockerfile -t singlecellopenproblems/openproblems . ``` -or to update all available Docker images: +or to update all available Docker images, updating only when necessary: ```shell cd workflow && snakemake -j 10 docker @@ -215,14 +215,16 @@ You can then run commands within a docker container using `docker run`. Consult [Docker documentation](https://docs.docker.com/engine/reference/commandline/run/) to learn more about the `run` command. -**Using `IMAGE ID`** - ```shell -docker run -it 90a9110c7d69 /bin/bash +cd openproblems +docker run \ + -v $(pwd):/usr/src/singlecellopenproblems -v /tmp:/tmp \ + -it singlecellopenproblems/openproblems-python-extras bash ``` -**Using `RESPOSITORY:TAG`** +You may also specify the docker image by its ID, rather than its name: ```shell -docker run -it singlecellopenproblems/openproblems-python-extras:latest /bin/bash +cd openproblems +docker run -v $(pwd):/usr/src/singlecellopenproblems -v /tmp:/tmp -it 90a9110c7d69 bash ``` From 17bc62ce5c9ccdeb1cb8b469abe04040dcbe74e4 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 28 Oct 2022 09:44:14 -0400 Subject: [PATCH 035/266] bump scIB to 77ab015 --- docker/openproblems-python-batch-integration/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-python-batch-integration/requirements.txt b/docker/openproblems-python-batch-integration/requirements.txt index 6f1fe411b3..9c92302df9 100644 --- a/docker/openproblems-python-batch-integration/requirements.txt +++ b/docker/openproblems-python-batch-integration/requirements.txt @@ -1,6 +1,6 @@ annoy==1.17.1 bbknn==1.5.* git+https://github.com/scottgigante-immunai/mnnpy@eb4c551 # branch: patch-2 -git+https://github.com/theislab/scib@a35e300 +git+https://github.com/theislab/scib@77ab015 scanorama==1.7.0 scvi-tools~=0.16 # pinned in #313 From 3124cb338d78417c70619e3b10a31fa360dbade2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 28 Oct 2022 12:13:29 -0400 Subject: [PATCH 036/266] Update rpy2 requirement from <3.5.5 to <3.5.6 (#654) Updates the requirements on [rpy2](https://github.com/rpy2/rpy2) to permit the latest version. - [Release notes](https://github.com/rpy2/rpy2/releases) - [Changelog](https://github.com/rpy2/rpy2/blob/master/NEWS) - [Commits](https://github.com/rpy2/rpy2/commits) --- updated-dependencies: - dependency-name: rpy2 dependency-type: direct:development ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 012acde47c..2fee0e2b0e 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ ] r_requires = [ - "rpy2<3.5.5", + "rpy2<3.5.6", "anndata2ri==1.1.*", ] From dfa0034bb7027cbe7dd4e858bf778274931b7b82 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 28 Oct 2022 16:35:55 -0400 Subject: [PATCH 037/266] Update ragg to 1.2.4 (#657) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-r-extras/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index 919c8c0278..4ad4270e67 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -28,7 +28,7 @@ pkgdown@2.0.6 pkgload@1.3.0 profvis@0.3.7 proxy@0.4-27 -ragg@1.2.2 +ragg@1.2.4 Rcpp@1.0.9 RcppTOML@0.1.7 reticulate@1.26 From b8e394fcb583d96013d6558621a92af1abd349a5 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 31 Oct 2022 09:58:30 -0400 Subject: [PATCH 038/266] don't fail job if hash fails --- openproblems/api/hash.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/openproblems/api/hash.py b/openproblems/api/hash.py index 89eac0d8fa..c2bfcfc027 100644 --- a/openproblems/api/hash.py +++ b/openproblems/api/hash.py @@ -4,8 +4,10 @@ import importlib import json import os +import random import scprep import subprocess +import warnings _MODULE = type(os) @@ -72,17 +74,24 @@ def docker_labels_from_api(image_name, tag="latest"): def docker_hash(image_name): """Get the docker image hash associated with an image.""" try: - return _run( - [ - "docker", - "inspect", - "-f='{{ index .Config.Labels \"bio.openproblems.hash\"}}'", - image_name, - ] + try: + return _run( + [ + "docker", + "inspect", + "-f='{{ index .Config.Labels \"bio.openproblems.hash\"}}'", + image_name, + ] + ) + except (RuntimeError, FileNotFoundError): # pragma: nocover + # docker is unavailable or the image is not locally available; use the API + return docker_labels_from_api(image_name)["bio.openproblems.hash"] + except Exception: # pragma: nocover + warnings.warn( + "Failed to access docker or the docker API; docker image hash failed. " + f"All jobs using {image_name} will not be cached." ) - except (RuntimeError, FileNotFoundError): # pragma: nocover - # docker is unavailable or the image is not locally available; use the API - return docker_labels_from_api(image_name)["bio.openproblems.hash"] + return random.getrandbits(256) def get_context(obj, context=None): From fa5dd646b9538825d0819efef906fcbacbb1bd2d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 31 Oct 2022 10:04:30 -0400 Subject: [PATCH 039/266] Update sagemaker requirement in /docker/openproblems (#658) Updates the requirements on [sagemaker](https://github.com/aws/sagemaker-python-sdk) to permit the latest version. 
- [Release notes](https://github.com/aws/sagemaker-python-sdk/releases) - [Changelog](https://github.com/aws/sagemaker-python-sdk/blob/master/CHANGELOG.md) - [Commits](https://github.com/aws/sagemaker-python-sdk/compare/v2.113.0...v2.116.0) --- updated-dependencies: - dependency-name: sagemaker dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems/requirements.txt b/docker/openproblems/requirements.txt index c37625d334..6b3cba6dba 100644 --- a/docker/openproblems/requirements.txt +++ b/docker/openproblems/requirements.txt @@ -2,6 +2,6 @@ boto3==1.25.* cmake==3.22.* # pinned in #607 jupyter==1.0.* pip -sagemaker==2.113.* +sagemaker==2.116.* setuptools wheel From 93fe7446b55dfb0cd076e15ce7a2c92e1d90a1e0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 31 Oct 2022 18:35:12 -0400 Subject: [PATCH 040/266] Update RcppAnnoy to 0.0.20 (#659) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-r-base/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-base/r_requirements.txt b/docker/openproblems-r-base/r_requirements.txt index 1b94692a0c..13f85978bb 100644 --- a/docker/openproblems-r-base/r_requirements.txt +++ b/docker/openproblems-r-base/r_requirements.txt @@ -1,3 +1,3 @@ bioc::scran@1.24.1 IRkernel@1.3 -RcppAnnoy@0.0.19 +RcppAnnoy@0.0.20 From 5a49d1410729e1c7f6a5cd396ab913688e803e40 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 1 Nov 2022 09:15:11 -0400 Subject: [PATCH 041/266] bugfix int -> str --- openproblems/api/hash.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/api/hash.py b/openproblems/api/hash.py index c2bfcfc027..bb1af2dd7c 100644 --- a/openproblems/api/hash.py +++ b/openproblems/api/hash.py @@ -91,7 +91,7 @@ def docker_hash(image_name): "Failed to access docker or the docker API; docker image hash failed. " f"All jobs using {image_name} will not be cached." 
) - return random.getrandbits(256) + return str(random.getrandbits(256)) def get_context(obj, context=None): From 3b87a28fe266cc4c176e7c170a3275d131e748e0 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 1 Nov 2022 09:16:35 -0400 Subject: [PATCH 042/266] scale 1.0.2 --- .../requirements.txt | 2 +- .../batch_integration_graph/methods/scalex.py | 24 +++++++++---------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt index bf43b0957b..c51a5d771a 100644 --- a/docker/openproblems-python-extras/requirements.txt +++ b/docker/openproblems-python-extras/requirements.txt @@ -9,7 +9,7 @@ magic-impute==3.0.* phate==1.0.* pybedtools==0.9.* pyensembl==2.0.* -scalex==1.0.0 +scalex==1.0.2 scvi-tools==0.16.* tangram-sc==1.0.* tensorflow-cpu==2.9.* diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py index d5a2c0998d..36843b81ed 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py @@ -3,7 +3,6 @@ from typing import Optional import functools -import tempfile _scalex_method = functools.partial( method, @@ -41,18 +40,17 @@ def _scalex( min_cells = min_cells or 1 - with tempfile.TemporaryDirectory() as outdir: - adata = scalex.SCALEX( - adata, - batch_key="batch", - ignore_umap=True, - impute=adata.obs["batch"].cat.categories[0] if compute_features else False, - max_iteration=max_iteration, - min_features=min_features, - min_cells=min_cells, - n_top_features=n_top_features, - outdir=outdir, - ) + adata = scalex.SCALEX( + adata, + batch_key="batch", + ignore_umap=True, + impute=adata.obs["batch"].cat.categories[0] if compute_features else False, + max_iteration=max_iteration, + min_features=min_features, + min_cells=min_cells, + n_top_features=n_top_features, + outdir=None, + ) adata.obsm["X_emb"] = adata.obsm["latent"] if compute_features: adata.X = adata.layers["impute"] From b5329c4a61833b1c1f8e72ceefea0b4e221dac03 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 1 Nov 2022 13:01:52 -0400 Subject: [PATCH 043/266] Update torch requirement in /docker/openproblems-r-pytorch (#661) Updates the requirements on [torch](https://github.com/pytorch/pytorch) to permit the latest version. - [Release notes](https://github.com/pytorch/pytorch/releases) - [Changelog](https://github.com/pytorch/pytorch/blob/master/RELEASE.md) - [Commits](https://github.com/pytorch/pytorch/compare/v1.12.0-rc1...v1.13.0) --- updated-dependencies: - dependency-name: torch dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-r-pytorch/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-pytorch/requirements.txt b/docker/openproblems-r-pytorch/requirements.txt index 7121777e82..e7c8df42b7 100644 --- a/docker/openproblems-r-pytorch/requirements.txt +++ b/docker/openproblems-r-pytorch/requirements.txt @@ -1,3 +1,3 @@ git+https://github.com/theislab/scib@v1.0.2 harmony-pytorch==0.1.* -torch==1.12.* +torch==1.13.* From 2a1c8fb1399fca413ff9f016869428c669d51bab Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 1 Nov 2022 17:53:07 -0400 Subject: [PATCH 044/266] Revert "bump louvain to 0.8 (#639)" (#666) This reverts commit 58966921311fe6049f1719db14d9b1336a04fdc1. --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 2fee0e2b0e..a6a1c213f4 100644 --- a/setup.py +++ b/setup.py @@ -10,8 +10,8 @@ "scprep>=1.2.1", "scipy>=1.8,<1.10", "scanpy>=1.6", - "louvain==0.8.*", - "python-igraph==0.10.*", + "louvain==0.7.*", + "python-igraph<0.10", "decorator<5.0", # pinned in #324 "memory-profiler==0.60", "colorama==0.4.*", From c1f01ce8be78767c3f577c114977e383e3bbddc3 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Wed, 2 Nov 2022 09:45:48 -0400 Subject: [PATCH 045/266] additional labels tests (#665) --- test/test_5_cli.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_5_cli.py b/test/test_5_cli.py index 389b35d35c..17d7924fa7 100644 --- a/test/test_5_cli.py +++ b/test/test_5_cli.py @@ -149,6 +149,12 @@ def test_hash(task, function_type, function_name): def test_hash_docker_api(): assert docker_labels_from_api("circleci/python", tag="3.8-bullseye") is None + labels = docker_labels_from_api("singlecellopenproblems/openproblems", tag="latest") + assert "bio.openproblems.build" in labels + assert "bio.openproblems.hash" in labels + assert isinstance(labels["bio.openproblems.build"], str) + assert isinstance(labels["bio.openproblems.hash"], str) + assert labels["bio.openproblems.build"] in ["github_actions", "local"] @parameterized.parameterized.expand( From e481460fa99ba79ddc93aa1077815349611dce39 Mon Sep 17 00:00:00 2001 From: Daniel Strobl <50872326+danielStrobl@users.noreply.github.com> Date: Wed, 2 Nov 2022 15:01:48 +0100 Subject: [PATCH 046/266] add short metric descriptions to README (#647) * add short metric descriptions to README * pre-commit * move metrics in subtask readmes * pre-commit * reformated graph metrics * change format of READMEs in all batch integration subtasks * pre-commit * line breaks * more line breaks * pre-commit Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: LuckyMD --- .../batch_integration_embed/README.md | 78 ++++++++----------- .../batch_integration_feature/README.md | 26 ++++--- .../batch_integration_graph/README.md | 25 +++++- 3 files changed, 70 insertions(+), 59 deletions(-) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/README.md b/openproblems/tasks/_batch_integration/batch_integration_embed/README.md index 8ca0f3b096..eb39b62e10 100644 --- 
a/openproblems/tasks/_batch_integration/batch_integration_embed/README.md
+++ b/openproblems/tasks/_batch_integration/batch_integration_embed/README.md
@@ -2,6 +2,8 @@

 # Batch integration embedding

+## The task
+
 This is a sub-task of the overall batch integration task. Batch (or data) integration
 integrates datasets across batches that arise from various biological and technical
 sources. Methods that integrate batches typically have three different types of output:
@@ -18,6 +20,36 @@ This sub-task was taken from a
 [benchmarking study of data integration
 methods](https://www.biorxiv.org/content/10.1101/2020.05.22.111161v2).

+## The metrics
+
+Metrics for batch integration (embed) measure how well batches are mixed while
+biological signals are preserved. They are divided into batch correction and biological
+variance conservation metrics.
+
+### Batch correction
+
+* **kBET**: kBET determines whether the label composition of a k nearest neighborhood of
+a cell is similar to the expected (global) label composition
+(Buettner et al., Nat Meth 2019). The test is repeated for a random subset of cells,
+and the results are summarized as a rejection rate over all tested neighborhoods.
+* **Silhouette batch score**: The absolute silhouette width is computed over batch
+labels per cell. As 0 indicates that batches are well mixed and any deviation from
+0 indicates a batch effect, we use 1-abs(ASW) to map the score to the scale [0;1].
+* **Principal component regression (PC regression)**: This compares the explained
+variance by batch before and after integration. It returns a score between 0 and 1
+(scaled=True) with 0 if the variance contribution hasn’t changed. The larger the score,
+the more different the variance contributions are before and after integration.
+
+### Biological variance conservation
+
+* **Cell cycle score**: The cell-cycle conservation score evaluates how well the
+cell-cycle effect can be captured before and after integration.
+* **Isolated label silhouette**: This score evaluates the compactness for the label(s)
+that is(are) shared by fewest batches. It indicates how well rare cell types can be
+preserved after integration.
+* **Cell type ASW**: The absolute silhouette width is computed on cell identity labels,
+measuring their compactness.
+
 ## API

 WARNING: other than most tasks, `adata.X` should contain log-normalized data.
@@ -59,49 +91,3 @@ Metrics can compare:

 To reuse metrics functions from `scIB`,
 [`metrics._utils._get_split`](metrics/_utils.py) separates the combined anndata into an
 integrated and an unintegrated anndata object.
-
-## Metrics
-
-In the following, we will give a short description of the implemented metrics. We split
-by metrics capturing batch correction meaning the removal of batch effects and metrics
-describing biological conservation, meaning how well the biological differences between
-cell states are conserved.
-
-### Batch correction metrics
-
-#### kBET
-
-The kBET algorithm (v.0.99.6, release 4c9dafa) determines whether the label composition
-of a k nearest neighborhood of a cell is similar to the expected (global) label
-composition (Buettner et al., Nat Meth 2019). The test is repeated for a random subset
-of cells, and the results are summarized as a rejection rate over all tested
-neighborhoods.
-
-#### Silhouette batch score
-
-We consider the absolute silhouette width, s(i), on
-batch labels per cell i. Here, 0 indicates that batches are well mixed, and any
-deviation from 0 indicates a batch effect.
-
-#### Principal component regression
-
-Compare the explained variance by before and after integration. Returns a score between
-0 and 1 (scaled=True) with 0 if the variance contribution hasn’t changed. The larger the
-score, the more different the variance contributions are before and after integration.
-
-### Biological conservation metrics
-
-#### Cell cycle score
-
-The cell-cycle conservation score evaluates how well the cell-cycle effect can be
-captured before and after integration.
-
-#### Isolated label silhouette
-
-This score evaluates the compactness for the label(s) that is(are) shared by fewest
-batches. This indicates how well rare cell types can be preserved after integration.
-
-#### Cell type ASW
-
-For the bio-conservation score, the ASW is computed on cell identity labels, measuring
-their compactness
diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md
index 76ce3f1e33..022f1aed65 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md
+++ b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md
@@ -2,12 +2,15 @@

 # Batch integration feature

+## The task
+
 This is a sub-task of the overall batch integration task. Batch (or data) integration
 integrates datasets across batches that arise from various biological and technical
 sources. Methods that integrate batches typically have three different types of output:
 a corrected feature matrix, a joint embedding across batches, and/or an integrated
 cell-cell similarity graph (e.g., a kNN graph). This sub-task focuses on all methods
-that can output feature matrices. Other sub-tasks for batch integration can be found for:
+that can output feature matrices. Other sub-tasks for batch integration can be found
+for:

 * [graphs](../batch_integration_graph/), and
 * [embeddings](../batch_integration_embed/)
@@ -15,6 +18,17 @@ This sub-task was taken from a
 [benchmarking study of data integration
 methods](https://www.biorxiv.org/content/10.1101/2020.05.22.111161v2).

+## The metrics
+
+Metrics for batch integration (feature) measure how well feature-level information is
+batch corrected. This is only done by capturing biological variance conservation.
+Further metrics for batch correction and biological variance conservation that are
+calculated on lower dimensional feature spaces extrapolated from corrected feature
+outputs can be found in the batch integration embed and graph tasks.
+
+* **HVG conservation**: This metric computes the average percentage of overlapping
+highly variable genes per batch before and after integration.
+
 ## API

 WARNING: other than most tasks, `adata.X` should contain log-normalized data.
@@ -39,7 +53,8 @@ that can be installed without package conflicts.

 For R methods, the `openproblems-r-extras` container is used.

-Most methods in the current task are run in four different scenarios that include scaling
+Most methods in the current task are run in four different scenarios that include
+scaling
 and highly variable gene selection:

 * `full_unscaled`
@@ -53,10 +68,3 @@ Metrics for this task compare:

 To reuse metrics functions from `scIB`,
 [`metrics._utils._get_split`](metrics/_utils.py) separates the combined anndata into an
 integrated and an unintegrated anndata object.
-
-## Metrics
-
-### HVG conservation
-
-Metric that computes the average percentage of overlapping highly variable genes per
-batch before and after integration.
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/README.md b/openproblems/tasks/_batch_integration/batch_integration_graph/README.md
index 97a30d8432..e3ab961389 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/README.md
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/README.md
@@ -21,10 +21,27 @@ methods](https://www.biorxiv.org/content/10.1101/2020.05.22.111161v2).

 ## The metrics

-Metrics for batch integration (graph) aim to TODO
-
-* **TODO**: TODO
-* **TODO**: TODO
+Metrics for batch integration (graph) measure how well batches are mixed while
+biological signals are preserved. They are divided into batch correction and biological
+variance conservation metrics.
+
+### Batch correction
+
+* **Graph connectivity**: The graph connectivity metric assesses whether the kNN graph
+representation, G, of the integrated data connects all cells with the same cell identity
+label.
+
+### Biological variance conservation
+
+* **Adjusted rand index (ARI)**: The Rand index compares the overlap of two clusterings;
+it considers correct clustering overlaps as well as correct disagreements
+between two clusterings.
+* **Iso label F1 score**: Isolated cell labels are identified as the labels present in
+the least number of batches in the integration task. The score evaluates how well these
+isolated labels separate from other cell identities based on clustering.
+* **Normalized mutual information (NMI)**: NMI compares the overlap of two clusterings.
+We used NMI to compare the cell-type labels with Louvain clusters computed on the
+integrated dataset (see the sketch below).
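+
+For intuition, a minimal sketch of these two clustering-comparison scores (this is
+illustrative only, not the scIB implementation used by the benchmark; `labels` and
+`clusters` are assumed to be arrays of cell-type annotations and of cluster
+assignments computed on the integrated graph):
+
+```python
+from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
+
+# labels: ground-truth cell identity annotations
+# clusters: e.g. Louvain clusters computed on the integrated kNN graph
+ari = adjusted_rand_score(labels, clusters)
+nmi = normalized_mutual_info_score(labels, clusters)
+```
+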
## API From 9382c3eefcb60ff041d38eaa631fa0d7aa1e28c5 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 2 Nov 2022 10:12:34 -0400 Subject: [PATCH 047/266] explicitly push prod images on tag --- .github/workflows/run_tests.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index c27506820a..e9a209d854 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -390,6 +390,21 @@ jobs: docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} done + - name: Upload Docker images for full benchmark + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + if: >- + startsWith(github.ref, 'refs/tags') || + startsWith(github.ref, 'refs/heads/test_benchmark') + run: | + ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" + for image in $(cd docker && ls -1d */ | tr -d '/'); do + docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} + docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} + done + - name: Run test benchmark env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} From 358f454ba1822bebda2ea5698696fe9878db88cc Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 2 Nov 2022 12:42:33 -0400 Subject: [PATCH 048/266] don't run full benchmark if tester fails --- .github/workflows/run_tests.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index e9a209d854..57d4147467 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -449,13 +449,19 @@ jobs: path: /tmp/nextflow.log run_full_benchmark: - needs: run_test_benchmark + needs: + - run_test_benchmark + - run_tester runs-on: ubuntu-latest if: >- always() && (needs.run_test_benchmark.result == 'success' || needs.run_test_benchmark.result == 'skipped') && !endsWith(github.event.head_commit.message, '# ci skip') && github.event_name == 'push' && + ( + needs.run_tester.result == 'success' || + startsWith(github.ref, 'refs/heads/test_benchmark') + ) && ( startsWith(github.ref, 'refs/tags') || startsWith(github.ref, 'refs/heads/test_benchmark') From e3f56463dd589e3710237ae824413ecb4e3ae285 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 2 Nov 2022 12:43:28 -0400 Subject: [PATCH 049/266] pass branch: prod --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 57d4147467..d698eee8e3 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -480,7 +480,7 @@ jobs: cat < Date: Wed, 2 Nov 2022 13:15:04 -0400 Subject: [PATCH 050/266] retry failed request (#668) * retry failed request * another one --- openproblems/data/tabula_muris_senis.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/openproblems/data/tabula_muris_senis.py b/openproblems/data/tabula_muris_senis.py index 93e1ac5016..b687e3c401 100644 --- a/openproblems/data/tabula_muris_senis.py +++ b/openproblems/data/tabula_muris_senis.py @@ -6,6 +6,7 @@ import scanpy as sc import scprep import tempfile +import time COLLECTION_ID = "0b9d8a04-bb9d-44da-aa27-705bb65b54eb" DOMAIN = "cellxgene.cziscience.com" @@ -13,6 +14,17 @@ METHOD_ALIASES = {"10x 3' v2": "droplet", "Smart-seq2": "facs"} +def _get_json(url, 
retries=5, sleep=0.05, backoff=2): + try: + res = requests.get(url=url, headers={"Content-Type": "application/json"}) + return res.json() + except Exception: # pragma: nocover + if retries > 0: + time.sleep(sleep) + return _get_json(url, retries - 1, sleep * backoff, backoff) + raise + + def check_unknown_organs(datasets, organ_list): known_organs = set([t["label"] for d in datasets for t in d["tissue"]]) unknown_organs = set(organ_list) - known_organs @@ -52,8 +64,7 @@ def load_raw_counts(dataset): f"/curation/v1/collections/{COLLECTION_ID}/datasets/{dataset_id}/assets" ) url = f"{API_BASE}{assets_path}" - res = requests.get(url=url) - assets = res.json() + assets = _get_json(url) assets = [asset for asset in assets if asset["filetype"] == "H5AD"] assert len(assets) == 1 asset = assets[0] @@ -101,8 +112,7 @@ def load_tabula_muris_senis(test=False, method_list=None, organ_list=None): datasets_path = f"/curation/v1/collections/{COLLECTION_ID}" url = f"{API_BASE}{datasets_path}" - res = requests.get(url=url) - datasets = res.json()["datasets"] + datasets = _get_json(url)["datasets"] check_unknown_organs(datasets, organ_list) adata_list = [] From e36b4b6c6badc7661d1ab42f476ddf93d7deadb7 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 2 Nov 2022 17:18:21 -0400 Subject: [PATCH 051/266] retry installs in case of 403 --- scripts/install_renv.R | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/scripts/install_renv.R b/scripts/install_renv.R index b3c6698850..8a2fcaa21e 100644 --- a/scripts/install_renv.R +++ b/scripts/install_renv.R @@ -35,12 +35,27 @@ strip_comments <- function(remote) { gsub("\\s*#.*", "", remote) } +install_with_retries <- function(remotes, attempts = 3, ...) { + result <- NULL + attempt <- 1 + while (is.null(result) && attempt <= attempts - 1) { + attempt <- attempt + 1 + try( + result <- renv::install(remotes, ...) + ) + } + if (is.null(result)) { + # last attempt + renv::install(remotes, ...) + } +} + install_renv <- function(requirements_file, ...) { remotes <- scan(requirements_file, what = character(), sep = "\n") remotes <- sapply(remotes, strip_comments) remotes_installed <- sapply(remotes, check_available) remotes_to_install <- remotes[!remotes_installed] if (length(remotes_to_install) > 0) { - renv::install(remotes_to_install, ...) + install_with_retries(remotes_to_install, ...) } } From 028a95ffdbba81a608a300a66d717b5d9c1ffa6f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Nov 2022 09:35:02 -0400 Subject: [PATCH 052/266] backoff retries --- scripts/install_renv.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/install_renv.R b/scripts/install_renv.R index 8a2fcaa21e..1aac46d418 100644 --- a/scripts/install_renv.R +++ b/scripts/install_renv.R @@ -35,7 +35,11 @@ strip_comments <- function(remote) { gsub("\\s*#.*", "", remote) } -install_with_retries <- function(remotes, attempts = 3, ...) { +install_with_retries <- function(remotes, + attempts = 3, + sleep = 3, + backoff = 2, + ...) { result <- NULL attempt <- 1 while (is.null(result) && attempt <= attempts - 1) { @@ -43,6 +47,8 @@ install_with_retries <- function(remotes, attempts = 3, ...) { try( result <- renv::install(remotes, ...) 
) + Sys.sleep(sleep) + sleep <- sleep * backoff } if (is.null(result)) { # last attempt From 9eb04245a0229e88b12070ae96b23e0e93ddb179 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Nov 2022 09:55:30 -0400 Subject: [PATCH 053/266] bugfix matrix in obs --- .../tasks/spatial_decomposition/datasets/destvi/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py index 89a1d36f2b..a5dc35b03f 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py +++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py @@ -137,7 +137,7 @@ def generate_synthetic_dataset( ) sc_anndata.obs["cell_type"] = cell_types_sc[:, :K_sampled].reshape(-1, 1) sc_anndata.obs["label"] = sc_anndata.obs["cell_type"].astype(str).astype("category") - sc_anndata.obs["n_counts"] = np.sum(sc_anndata.X, axis=1) + sc_anndata.obs["n_counts"] = np.sum(sc_anndata.X, axis=1).A.flatten() sc_anndata.obsm["gamma"] = gamma_sc[:, :K_sampled].reshape(-1, gamma.shape[-1]) sc_anndata.obsm["spatial"] = location_sc[:, :K_sampled].reshape(-1, 2) if n_cells is not None: From a8d6ecf5ced39feddc8b25d6753e8ae5fad02610 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Nov 2022 10:05:38 -0400 Subject: [PATCH 054/266] update scran? --- docker/openproblems-r-base/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-base/r_requirements.txt b/docker/openproblems-r-base/r_requirements.txt index 13f85978bb..4bdb31b529 100644 --- a/docker/openproblems-r-base/r_requirements.txt +++ b/docker/openproblems-r-base/r_requirements.txt @@ -1,3 +1,3 @@ -bioc::scran@1.24.1 +bioc::scran@1.26.0 IRkernel@1.3 RcppAnnoy@0.0.20 From 116a7c1948ef9ead717d80de61475b120c674e68 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Nov 2022 11:35:09 -0400 Subject: [PATCH 055/266] force update batchelor --- docker/openproblems-r-extras/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index 4ad4270e67..7d24ab077b 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -1,4 +1,4 @@ -bioc::batchelor@1.12.3 +bioc::batchelor@1.14.0 bioc::ComplexHeatmap@2.12.1 bioc::scater@1.24.0 bioc::scran@1.24.1 From 3b4b74f70523a725641a620b3f3daa367f999b78 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Nov 2022 12:58:12 -0400 Subject: [PATCH 056/266] force update scuttle --- docker/openproblems-r-extras/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index 7d24ab077b..e77f9cf26c 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -2,7 +2,7 @@ bioc::batchelor@1.14.0 bioc::ComplexHeatmap@2.12.1 bioc::scater@1.24.0 bioc::scran@1.24.1 -bioc::scuttle@1.6.3 +bioc::scuttle@1.8.0 bslib@0.4.0 caret@6.0-93 cli@3.4.1 From 41f54b4475218bb166f555a87c3af96d930c4b18 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Nov 2022 09:55:30 -0400 Subject: [PATCH 057/266] bugfix matrix in obs --- .../tasks/spatial_decomposition/datasets/destvi/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py index 89a1d36f2b..a5dc35b03f 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py +++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py @@ -137,7 +137,7 @@ def generate_synthetic_dataset( ) sc_anndata.obs["cell_type"] = cell_types_sc[:, :K_sampled].reshape(-1, 1) sc_anndata.obs["label"] = sc_anndata.obs["cell_type"].astype(str).astype("category") - sc_anndata.obs["n_counts"] = np.sum(sc_anndata.X, axis=1) + sc_anndata.obs["n_counts"] = np.sum(sc_anndata.X, axis=1).A.flatten() sc_anndata.obsm["gamma"] = gamma_sc[:, :K_sampled].reshape(-1, gamma.shape[-1]) sc_anndata.obsm["spatial"] = location_sc[:, :K_sampled].reshape(-1, 2) if n_cells is not None: From ab8ada992eff924337e4ed855bb075a415fa54cc Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Nov 2022 14:10:36 -0400 Subject: [PATCH 058/266] fix bioc version: --- docker/openproblems-r-base/Dockerfile | 1 + scripts/install_renv.R | 4 ++++ scripts/upgrade_renv.R | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/docker/openproblems-r-base/Dockerfile b/docker/openproblems-r-base/Dockerfile index 5c70677019..6e9a2c1baa 100644 --- a/docker/openproblems-r-base/Dockerfile +++ b/docker/openproblems-r-base/Dockerfile @@ -27,6 +27,7 @@ RUN apt-get update -qq RUN apt-get install -yq --no-install-suggests --no-install-recommends r-base-dev=4.2\* RUN apt-get clean -y && apt-get autoremove -y ENV R_HOME=/usr/lib/R +ENV BIOCVERSION="3.15" # Install R packages RUN R -e "install.packages('renv'); renv::consent(TRUE)" diff --git a/scripts/install_renv.R b/scripts/install_renv.R index b3c6698850..bae096b485 100644 --- a/scripts/install_renv.R +++ b/scripts/install_renv.R @@ -1,3 +1,7 @@ +if (nchar(Sys.getenv("BIOCVERSION")) > 0) { + renv::settings$bioconductor.version(Sys.getenv("BIOCVERSION")) +} + as_integer_version <- function(v) { class(v) <- "list" v[[1]] diff --git a/scripts/upgrade_renv.R b/scripts/upgrade_renv.R index 6c74adb033..fccc679b7d 100644 --- a/scripts/upgrade_renv.R +++ b/scripts/upgrade_renv.R @@ -1,3 +1,7 @@ +if (nchar(Sys.getenv("BIOCVERSION")) > 0) { + renv::settings$bioconductor.version(Sys.getenv("BIOCVERSION")) +} + upgraded_remote_version <- function(remote) { if (remote$Source == "Repository") { out <- paste0(remote$Package, "@", remote$Version) From dfe07755ba8e8ed6abdc579143544b3c45d088ce Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Nov 2022 14:11:43 -0400 Subject: [PATCH 059/266] Revert "force update scuttle" This reverts commit 3b4b74f70523a725641a620b3f3daa367f999b78. --- docker/openproblems-r-extras/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index e77f9cf26c..7d24ab077b 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -2,7 +2,7 @@ bioc::batchelor@1.14.0 bioc::ComplexHeatmap@2.12.1 bioc::scater@1.24.0 bioc::scran@1.24.1 -bioc::scuttle@1.8.0 +bioc::scuttle@1.6.3 bslib@0.4.0 caret@6.0-93 cli@3.4.1 From 36a6aec2ed972deefacd9d9e74b8900def6f8500 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Nov 2022 14:11:51 -0400 Subject: [PATCH 060/266] Revert "force update batchelor" This reverts commit 116a7c1948ef9ead717d80de61475b120c674e68. 
--- docker/openproblems-r-extras/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index 7d24ab077b..4ad4270e67 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -1,4 +1,4 @@ -bioc::batchelor@1.14.0 +bioc::batchelor@1.12.3 bioc::ComplexHeatmap@2.12.1 bioc::scater@1.24.0 bioc::scran@1.24.1 From 3724051190071172ddd123311d02e65b8ee79874 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 3 Nov 2022 14:12:01 -0400 Subject: [PATCH 061/266] Revert "update scran?" This reverts commit a8d6ecf5ced39feddc8b25d6753e8ae5fad02610. --- docker/openproblems-r-base/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-base/r_requirements.txt b/docker/openproblems-r-base/r_requirements.txt index 4bdb31b529..13f85978bb 100644 --- a/docker/openproblems-r-base/r_requirements.txt +++ b/docker/openproblems-r-base/r_requirements.txt @@ -1,3 +1,3 @@ -bioc::scran@1.26.0 +bioc::scran@1.24.1 IRkernel@1.3 RcppAnnoy@0.0.20 From 4fa54644f1be3e98ae1265cf92374254626b1a02 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Nov 2022 09:17:21 -0500 Subject: [PATCH 062/266] convert coo to csr --- openproblems/data/utils.py | 12 ++++++++++++ test/test_3_datasets.py | 1 + 2 files changed, 13 insertions(+) diff --git a/openproblems/data/utils.py b/openproblems/data/utils.py index ce1366ca92..3b038d54d7 100644 --- a/openproblems/data/utils.py +++ b/openproblems/data/utils.py @@ -6,6 +6,7 @@ import logging import os import scanpy as sc +import scipy.sparse log = logging.getLogger("openproblems") @@ -33,6 +34,12 @@ def _cache_path(func, *args, **kwargs): return os.path.join(TEMPDIR, filename) +def _fix_sparse_format(X): + if isinstance(X, scipy.sparse.coo_matrix): + X = X.tocsr() + return X + + def loader(data_url, data_reference): """Decorate a data loader function. 
@@ -66,6 +73,11 @@ def apply_func(*args, **kwargs): adata.uns["_from_cache"] = False if "var_names_all" not in adata.uns: adata.uns["var_names_all"] = adata.var.index.to_numpy() + adata.X = _fix_sparse_format(adata.X) + for layer in adata.layers: + adata.layers[layer] = _fix_sparse_format(adata.layers[layer]) + for obsm in adata.obsm: + adata.obsm[obsm] = _fix_sparse_format(adata.obsm[obsm]) if "counts" not in adata.layers: adata.layers["counts"] = adata.X try: diff --git a/test/test_3_datasets.py b/test/test_3_datasets.py index cc0638baa4..7c1acca22a 100644 --- a/test/test_3_datasets.py +++ b/test/test_3_datasets.py @@ -94,6 +94,7 @@ def test_adata_shape(self): def test_sparse(self): """Ensure output is sparse.""" assert scipy.sparse.issparse(self.adata.X) + assert isinstance(self.adata.X, scipy.sparse.csr_matrix) def test_not_bytes(self): """Ensure output does not contain byte strings.""" From 322b2ad4a4fb892caf15870a0799b5a54893d4c3 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Nov 2022 09:20:20 -0500 Subject: [PATCH 063/266] refactor --- openproblems/data/utils.py | 43 +++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/openproblems/data/utils.py b/openproblems/data/utils.py index 3b038d54d7..042882e96f 100644 --- a/openproblems/data/utils.py +++ b/openproblems/data/utils.py @@ -28,6 +28,10 @@ def _hash_function(func, *args, **kwargs): def _cache_path(func, *args, **kwargs): + try: + os.mkdir(TEMPDIR) + except OSError: + pass if hasattr(func, "__wrapped__"): func = func.__wrapped__ filename = "openproblems_{}.h5ad".format(_hash_function(func, *args, **kwargs)) @@ -40,6 +44,19 @@ def _fix_sparse_format(X): return X +def _fix_adata(adata): + adata.strings_to_categoricals() + if "var_names_all" not in adata.uns: + adata.uns["var_names_all"] = adata.var.index.to_numpy() + adata.X = _fix_sparse_format(adata.X) + for layer in adata.layers: + adata.layers[layer] = _fix_sparse_format(adata.layers[layer]) + for obsm in adata.obsm: + adata.obsm[obsm] = _fix_sparse_format(adata.obsm[obsm]) + if "counts" not in adata.layers: + adata.layers["counts"] = adata.X + + def loader(data_url, data_reference): """Decorate a data loader function. 
@@ -55,35 +72,17 @@ def decorator(func): @functools.wraps(func) def apply_func(*args, **kwargs): filepath = _cache_path(func, *args, **kwargs) + dataset_name = f"{func.__name__}({args}, {kwargs})" if os.path.isfile(filepath): - log.debug( - "Loading cached {}({}, {}) dataset".format( - func.__name__, args, kwargs - ) - ) + log.debug(f"Loading cached {dataset_name} dataset") adata = anndata.read_h5ad(filepath) adata.uns["_from_cache"] = True return adata else: - log.debug( - "Downloading {}({}, {}) dataset".format(func.__name__, args, kwargs) - ) + log.debug(f"Downloading {dataset_name} dataset") adata = func(*args, **kwargs) - adata.strings_to_categoricals() + _fix_adata(adata) adata.uns["_from_cache"] = False - if "var_names_all" not in adata.uns: - adata.uns["var_names_all"] = adata.var.index.to_numpy() - adata.X = _fix_sparse_format(adata.X) - for layer in adata.layers: - adata.layers[layer] = _fix_sparse_format(adata.layers[layer]) - for obsm in adata.obsm: - adata.obsm[obsm] = _fix_sparse_format(adata.obsm[obsm]) - if "counts" not in adata.layers: - adata.layers["counts"] = adata.X - try: - os.mkdir(TEMPDIR) - except OSError: - pass adata.write_h5ad(filepath) return adata From 768c0c8ea2fe235fd609cb07122d8e71c53753db Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 7 Nov 2022 11:24:33 -0500 Subject: [PATCH 064/266] convert everything --- openproblems/data/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/data/utils.py b/openproblems/data/utils.py index 042882e96f..d4e8312f52 100644 --- a/openproblems/data/utils.py +++ b/openproblems/data/utils.py @@ -39,7 +39,7 @@ def _cache_path(func, *args, **kwargs): def _fix_sparse_format(X): - if isinstance(X, scipy.sparse.coo_matrix): + if scipy.sparse.issparse(X) and not isinstance(X, scipy.sparse.csr_matrix): X = X.tocsr() return X From 0b4ed3841bb34c78ffc2b1693470e786193f86fd Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 8 Nov 2022 11:54:41 -0500 Subject: [PATCH 065/266] convert to csr --- openproblems/tasks/_cell_cell_communication/_common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/tasks/_cell_cell_communication/_common/utils.py b/openproblems/tasks/_cell_cell_communication/_common/utils.py index 25c43187ec..25e458bb2e 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/utils.py +++ b/openproblems/tasks/_cell_cell_communication/_common/utils.py @@ -91,7 +91,7 @@ def map_gene_symbols(adata, map_filename: Union[str, pathlib.Path]): ) return anndata.AnnData( - X=scipy.sparse.hstack([adata_one_to_any.X] + many_to_one_X), + X=scipy.sparse.hstack([adata_one_to_any.X] + many_to_one_X).tocsr(), obs=adata.obs, var=pd.DataFrame( index=np.concatenate([adata_one_to_any.var.index, many_to_one_genes]) From 1559d1aafa95a2f1e78a7d2519d9731115a67b36 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 8 Nov 2022 14:26:48 -0500 Subject: [PATCH 066/266] Update pkgload to 1.3.1 (#664) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-r-extras/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index 4ad4270e67..b251088306 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -25,7 +25,7 @@ 
igraph@1.3.5 lifecycle@1.0.3 Matrix@1.5-1 pkgdown@2.0.6 -pkgload@1.3.0 +pkgload@1.3.1 profvis@0.3.7 proxy@0.4-27 ragg@1.2.4 From ef4e1f5c0c43daa80f05f9e8410018b7c3e0add7 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 8 Nov 2022 15:07:23 -0500 Subject: [PATCH 067/266] run pip check (#675) --- test/docker_run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test/docker_run.sh b/test/docker_run.sh index da7108c9c4..c15a73c5e6 100644 --- a/test/docker_run.sh +++ b/test/docker_run.sh @@ -15,6 +15,7 @@ if [ ! -f ~/.install_complete ]; then if echo "$FREEZE" | grep -q annoy; then python3 -m pip install --force "$(echo "$FREEZE" | grep annoy)" fi + python3 -m pip check touch ~/.install_complete fi From 1aca2e6037a986348d84a383944edca8b76b04d2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 9 Nov 2022 09:01:17 -0500 Subject: [PATCH 068/266] Update boto3 requirement in /docker/openproblems (#669) Updates the requirements on [boto3](https://github.com/boto/boto3) to permit the latest version. - [Release notes](https://github.com/boto/boto3/releases) - [Changelog](https://github.com/boto/boto3/blob/develop/CHANGELOG.rst) - [Commits](https://github.com/boto/boto3/compare/1.25.0...1.26.0) --- updated-dependencies: - dependency-name: boto3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems/requirements.txt b/docker/openproblems/requirements.txt index 6b3cba6dba..af970653a2 100644 --- a/docker/openproblems/requirements.txt +++ b/docker/openproblems/requirements.txt @@ -1,4 +1,4 @@ -boto3==1.25.* +boto3==1.26.* cmake==3.22.* # pinned in #607 jupyter==1.0.* pip From cead2be2406610de24527805ed1cf27451128f80 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 9 Nov 2022 09:02:31 -0500 Subject: [PATCH 069/266] tocsr yet again --- openproblems/tasks/_cell_cell_communication/_common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/tasks/_cell_cell_communication/_common/utils.py b/openproblems/tasks/_cell_cell_communication/_common/utils.py index 25e458bb2e..1118087d15 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/utils.py +++ b/openproblems/tasks/_cell_cell_communication/_common/utils.py @@ -99,7 +99,7 @@ def map_gene_symbols(adata, map_filename: Union[str, pathlib.Path]): layers={ layer_name: scipy.sparse.hstack( [adata_one_to_any.layers[layer_name]] + many_to_one_layers[layer_name] - ) + ).tocsr() for layer_name in adata.layers }, uns=adata.uns, From 212b917691f098972be448059938d789a51f7488 Mon Sep 17 00:00:00 2001 From: Daniel Dimitrov <50865230+dbdimitrov@users.noreply.github.com> Date: Wed, 9 Nov 2022 16:44:15 +0100 Subject: [PATCH 070/266] Metric pr auc cell cell comm + aggregation of scores (#627) * add prauc metric * prc /w subsampling and without + odds_ratios reworked added * ascending to odds ratio and clarify further * revert to last commit * add additional metadata to datasets * aggregate methods, rename auprc, odds w props * add aggregation to baseline + improve method names * pre-commit * reformat names, private non-aggr methods * remove true events * add check if 
interactions are unique to merge_keys * remove merge_keys from metrics, format names, assert gt is not duplicated * sample method drop duplicates * test gt for odds * change to inf * remove unneeded if * remove unneeded import * remove uneeded metric files * precommit * remove file * fix unittest issues * simplify baselines * pre-commit check * Remove (sum) from name * better error handling * deduplicate sample method * handle both numerator and denominator * add and aggregations to README, rearrange tests * pre-commit * modify READMEs, explcitly aggregate * pre-commit Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../openproblems-r-extras/r_requirements.txt | 2 +- .../tasks/_cell_cell_communication/README.md | 43 ++-- .../_cell_cell_communication/_common/api.py | 101 ++++++---- .../_common/methods/__init__.py | 18 +- .../_common/methods/baseline.py | 3 + .../_common/methods/liana.py | 187 ++++++++++++++---- .../_common/metrics/__init__.py | 1 + .../_common/metrics/auprc.py | 13 ++ .../_common/metrics/odds_ratio.py | 40 ++-- .../_cell_cell_communication/_common/utils.py | 21 ++ .../README.md | 33 ++-- .../datasets/tnbc_wu2021.py | 9 +- .../methods/__init__.py | 18 +- .../metrics/__init__.py | 3 +- .../metrics/odds_ratio.py | 8 - .../README.md | 16 +- .../datasets/allen_brain_atlas.py | 9 +- .../methods/__init__.py | 18 +- .../metrics/__init__.py | 3 +- .../metrics/odds_ratio.py | 8 - test/test_4_cell_cell_communication.py | 17 +- 21 files changed, 404 insertions(+), 167 deletions(-) create mode 100644 openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py delete mode 100644 openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/metrics/odds_ratio.py delete mode 100644 openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/metrics/odds_ratio.py diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index b251088306..efd4ceeba6 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -36,7 +36,7 @@ rlang@1.0.6 rliger@1.0.0 rmarkdown@2.2 RSQLite@2.2.4 -saezlab/liana@0.1.6 +saezlab/liana@0.1.7 saezlab/OmnipathR@679bb79 # master sass@0.4.2 sctransform@0.3.4 diff --git a/openproblems/tasks/_cell_cell_communication/README.md b/openproblems/tasks/_cell_cell_communication/README.md index c0247e73cb..c51a0b559a 100644 --- a/openproblems/tasks/_cell_cell_communication/README.md +++ b/openproblems/tasks/_cell_cell_communication/README.md @@ -17,7 +17,7 @@ as a foundation for this task. The challenges in evaluating the tools are further exacerbated by the lack of a gold standard to benchmark the performance of CCC methods. In an attempt to address this, Dimitrov et al use alternative data modalities, including -the spatial proximity of cell types and inferred +the spatial proximity of cell types and downstream cytokine activities, to generate an inferred ground truth. However, these modalities are only approximations of biological reality and come with their own assumptions and limitations. In time, the inclusion of more @@ -52,6 +52,9 @@ scenario odds ratios quantify the strength of association between the ability of methods to prioritize interactions and those interactions assigned to the positive class. 
+* **AUPRC**: a single number _[0-1]_ that summarizes the area under the curve where
+x is the recall and y is the precision.
+
 ## API

 ### Datasets
@@ -63,41 +66,53 @@ al](https://doi.org/10.1038/s41467-022-30755-0) for more details.

 `adata.uns["ccc_target"]` should be a Pandas DataFrame containing:

-* `response`: `int`, binary response variable indicating whether an interaction is
-  assumed to have occurred
-
-and at least one of the following columns:
+* `response`: `int`, binary response variable _[0; 1]_ indicating whether an interaction
+  is assumed to have occurred
+
+and at least one of the following columns:

 * `source`: `str`, name of source cell type in interaction
 * `target`: `str`, name of target cell type in interaction
 * `ligand`: `str`, gene symbol of the ligand in an interaction
 * `receptor`: `str`, gene symbol of the receptor in an interaction

-The datasets should also include a
-[NCBI taxonomy ID](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi)
+The datasets should also include a [NCBI taxonomy ID](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi)
 in `adata.uns["target_organism"]` - used to convert the (typically human) prior
 knowledge of the CCC methods to the corresponding gene homologs.

 `adata.X` should contain the raw counts matrix.

-For subtasks including ligands or receptors in the inferred interactions, provide a
-prior-k
-
 ### Methods

 Methods should predict interactions between cell types without using
 `adata.uns["ccc_target"]`. Predicted interactions should be stored in
-`adata.uns["ccc_pred"]` as a Pandas DataFrame containing all of the following columns:
+`adata.uns["ccc_pred"]` as a Pandas DataFrame containing:

 * `score`: `float`, score between `-inf` and `+inf` giving a predicted strength of
   the inferred interaction
+
+and at least two of the following columns:
+
 * `source`: `str`, name of source cell type in interaction
 * `target`: `str`, name of target cell type in interaction
 * `ligand`: `str`, gene symbol of the ligand in an interaction
 * `receptor`: `str`, gene symbol of the receptor in an interaction

-Methods should infer a score for each _intersecting interaction_ in the harmonized
-prior-knowledge resource provided by LIANA. We define _intersecting interactions_ as
-those for which the relevant genes are both present in the dataset and the resource.
+The relevance of these columns is determined by the subtask in question
+via `adata.uns["merge_keys"]`, a list of at least two of the
+aforementioned columns, corresponding to the assumed
+truth in `adata.uns["ccc_target"]`.
+
+Methods should infer a score for each _intersecting interaction_,
+where these represent the intersecting columns between `adata.uns["ccc_pred"]` and
+`adata.uns["ccc_target"]`.
+
+In case `ligand` and/or `receptor` columns are present
+in `adata.uns["ccc_target"]`, we further define _intersecting interactions_ as
+those for which the relevant genes are present in both the dataset and
+the prior-knowledge resource provided by LIANA.
+
+The predictions of any method which do not uniquely map
+to the columns in `adata.uns["merge_keys"]` are to be **aggregated**.
+By default, aggregation is carried out as the `max` and `sum`
+according to columns in the `merge_keys`.
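+
+As a rough sketch of what this aggregation means (illustrative only, not the
+exact implementation; `ccc_pred` is assumed to be the predictions DataFrame
+described above and `merge_keys` the list of key columns):
+
+```python
+# ccc_pred: pandas.DataFrame with columns merge_keys + ["score"]
+# collapse duplicate predictions per merge-key combination,
+# keeping the maximum score across duplicates ...
+pred_max = ccc_pred.groupby(merge_keys, as_index=False)["score"].max()
+# ... or summing the scores instead
+pred_sum = ccc_pred.groupby(merge_keys, as_index=False)["score"].sum()
+```
+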
The prior-knowledge resource is available via the `cell_cell_communication.utils.ligand_receptor_resource` function, which returns a diff --git a/openproblems/tasks/_cell_cell_communication/_common/api.py b/openproblems/tasks/_cell_cell_communication/_common/api.py index 9135f1a8eb..8ebdb66c3d 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/api.py +++ b/openproblems/tasks/_cell_cell_communication/_common/api.py @@ -73,31 +73,57 @@ def check_dataset(adata, merge_keys): assert "response" in adata.uns["ccc_target"] assert np.issubdtype(adata.uns["ccc_target"]["response"].dtype, int) assert np.all(np.isin(adata.uns["ccc_target"]["response"], [0, 1])) + assert any(adata.uns["ccc_target"][merge_keys].duplicated()) is False - # check against resource if "ligand" in merge_keys or "receptor" in merge_keys: assert "ligand_receptor_resource" in adata.uns - assert "receptor_genesymbol" in adata.uns["ligand_receptor_resource"] - assert "ligand_genesymbol" in adata.uns["ligand_receptor_resource"] assert "var_names_all" in adata.uns - assert_is_subset( - flatten_complex_subunits( - adata.uns["ligand_receptor_resource"]["receptor_genesymbol"] - ), - adata.uns["var_names_all"], - "resource receptor names", - "gene names", - 0.1, - ) - assert_is_subset( - flatten_complex_subunits( - adata.uns["ligand_receptor_resource"]["ligand_genesymbol"] - ), - adata.uns["var_names_all"], - "resource ligand names", - "gene names", - 0.1, - ) + + if "receptor" in merge_keys: + assert "receptor" in adata.uns["ccc_target"] + assert "receptor_genesymbol" in adata.uns["ligand_receptor_resource"] + + # verify target receptors are in resource + assert_is_subset( + adata.uns["ccc_target"]["receptor"].unique(), + np.unique(adata.uns["ligand_receptor_resource"]["receptor_genesymbol"]), + "target receptor names", + "resource receptor names", + ) + + # verify resource receptors are in the data + assert_is_subset( + flatten_complex_subunits( + adata.uns["ligand_receptor_resource"]["receptor_genesymbol"] + ), + adata.uns["var_names_all"], + "resource receptor names", + "gene names", + 0.1, + ) + + if "ligand" in merge_keys: + assert "ligand" in adata.uns["ccc_target"] + assert "ligand_genesymbol" in adata.uns["ligand_receptor_resource"] + + # verify target ligands are in resource + assert_is_subset( + adata.uns["ccc_target"]["ligand"].unique(), + np.unique(adata.uns["ligand_receptor_resource"]["ligand_genesymbol"]), + "target ligand names", + "resource ligand names", + ) + + # verify resource ligands are in the data + assert_is_subset( + flatten_complex_subunits( + adata.uns["ligand_receptor_resource"]["ligand_genesymbol"] + ), + adata.uns["var_names_all"], + "resource ligand names", + "gene names", + 0.1, + ) # check merge keys if "source" in merge_keys: @@ -117,25 +143,6 @@ def check_dataset(adata, merge_keys): "cell types", ) - if "receptor" in merge_keys: - # verify target receptors are in resource - assert "receptor" in adata.uns["ccc_target"] - assert_is_subset( - adata.uns["ccc_target"]["receptor"].unique(), - np.unique(adata.uns["ligand_receptor_resource"]["receptor_genesymbol"]), - "target receptor names", - "resource receptor names", - ) - if "ligand" in merge_keys: - # verify target ligands are in resource - assert "ligand" in adata.uns["ccc_target"] - assert_is_subset( - adata.uns["ccc_target"]["ligand"].unique(), - np.unique(adata.uns["ligand_receptor_resource"]["ligand_genesymbol"]), - "target ligand names", - "resource ligand names", - ) - return True @@ -148,6 +155,9 @@ def check_method(adata, 
merge_keys, is_baseline=False):
     assert "score" in adata.uns["ccc_pred"]
     assert np.all(np.isreal(adata.uns["ccc_pred"]["score"]))

+    # Check if a single prediction is returned for every merge_key combo
+    assert (adata.uns["ccc_pred"].groupby(merge_keys).size() == 1).all()
+
     # check merge keys
     if "ligand" in merge_keys:
         assert "ligand" in adata.uns["ccc_pred"]
@@ -204,13 +214,19 @@ def sample_dataset(merge_keys):
     # generate target interactions
     adata.uns["ccc_target"] = pd.DataFrame(
         {
-            "response": np.random.binomial(1, 0.2, 50),
             "ligand": np.random.choice(adata.var.index, 50),
             "receptor": np.random.choice(adata.var.index, 50),
             "source": np.random.choice(list(set(adata.obs.label)), 50),
             "target": np.random.choice(list(set(adata.obs.label)), 50),
         }
     )
+    # drop duplicates
+    adata.uns["ccc_target"] = adata.uns["ccc_target"].drop_duplicates(subset=merge_keys)
+    # ensure positive response class is always present
+    n_rows = adata.uns["ccc_target"].shape[0]
+    response = np.zeros(n_rows, dtype=np.int64)
+    response[0 : int(n_rows * 0.3)] = 1
+    adata.uns["ccc_target"]["response"] = response

     # subset columns
     adata.uns["ccc_target"] = adata.uns["ccc_target"][["response"] + merge_keys]
@@ -271,6 +287,9 @@ def sample_method(adata, merge_keys):
     # subset columns
     df = df[["score"] + merge_keys]

+    # deduplicate
+    df = df.loc[~df[merge_keys].duplicated()]
+
     adata.uns["ccc_pred"] = df
     return adata
diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/__init__.py b/openproblems/tasks/_cell_cell_communication/_common/methods/__init__.py
index ab4c4f2b6d..abd29a6804 100644
--- a/openproblems/tasks/_cell_cell_communication/_common/methods/__init__.py
+++ b/openproblems/tasks/_cell_cell_communication/_common/methods/__init__.py
@@ -1,8 +1,14 @@
 from .baseline import random_events
 from .baseline import true_events
-from .liana import cellphonedb
-from .liana import connectome
-from .liana import liana
-from .liana import logfc
-from .liana import natmi
-from .liana import sca
+from .liana import cellphonedb_max
+from .liana import cellphonedb_sum
+from .liana import connectome_max
+from .liana import connectome_sum
+from .liana import liana_max
+from .liana import liana_sum
+from .liana import logfc_max
+from .liana import logfc_sum
+from .liana import natmi_max
+from .liana import natmi_sum
+from .liana import sca_max
+from .liana import sca_sum
diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py b/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py
index 582e71b93b..2b8054fc17 100644
--- a/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py
+++ b/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py
@@ -27,6 +27,9 @@ def random_events(adata, test=False, n_events=1000):
             "score": np.random.uniform(0, 1, n_events),
         }
     )
+    adata.uns["ccc_pred"] = adata.uns["ccc_pred"].loc[
+        ~adata.uns["ccc_pred"][adata.uns["merge_keys"]].duplicated()
+    ]
     adata.uns["method_code_version"] = check_version("openproblems")

     return adata
diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py
index 636e3ac2aa..dc2bba944c 100644
--- a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py
+++ b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py
@@ -2,6 +2,7 @@
 from .....tools.decorators import method
 from .....tools.normalize import log_cpm
 from .....tools.utils import check_r_version
+from ..utils import 
aggregate_method_scores from ..utils import ligand_receptor_resource import functools @@ -30,13 +31,9 @@ def _p_filt(x, y): ) -@_liana_method( - method_name="LIANA", -) -def liana( +def _liana( adata, score_col="aggregate_rank", - ascending=True, min_expression_prop=0.1, test=False, **kwargs, @@ -58,7 +55,6 @@ def liana( # Format results liana_res["score"] = liana_res[score_col] - liana_res.sort_values("score", ascending=ascending, inplace=True) adata.uns["ccc_pred"] = liana_res adata.uns["method_code_version"] = check_r_version("liana") @@ -67,18 +63,41 @@ def liana( @_liana_method( - method_name="CellPhoneDB", + method_name="LIANA Rank Aggregate (max)", +) +def liana_max(adata, test=False): + adata = _liana(adata, test=test) + adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="max") + + return adata + + +@_liana_method( + method_name="LIANA Rank Aggregate (sum)", +) +def liana_sum(adata, test=False): + adata = _liana(adata, test=test) + adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="sum") + + return adata + + +_cellphonedb_method = functools.partial( + method, paper_name="CellPhoneDB: inferring cell–cell communication from " "combined expression of multi-subunit ligand–receptor complexes", paper_url="https://www.nature.com/articles/s41596-020-0292-x", paper_year=2020, + code_url="https://github.com/saezlab/liana", + image="openproblems-r-extras", ) -def cellphonedb(adata, test=False): - adata = liana( + + +def _cellphonedb(adata, test=False): + adata = _liana( adata, method="cellphonedb", score_col="lr.mean", - ascending=False, test=test, complex_policy="min", ) @@ -86,51 +105,153 @@ def cellphonedb(adata, test=False): adata.uns["ccc_pred"]["score"] = adata.uns["ccc_pred"].apply( lambda x: _p_filt(x.pvalue, x["lr.mean"]), axis=1 ) - adata.uns["ccc_pred"].sort_values("score", ascending=False, inplace=True) return adata -@_liana_method( - method_name="Connectome", +@_cellphonedb_method( + method_name="CellPhoneDB (max)", +) +def cellphonedb_max(adata, test=False): + adata = _cellphonedb(adata, test=test) + adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="max") + + return adata + + +@_cellphonedb_method( + method_name="CellPhoneDB (sum)", +) +def cellphonedb_sum(adata, test=False): + adata = _cellphonedb(adata, test=test) + adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="sum") + + return adata + + +_connectome_method = functools.partial( + method, paper_name="Computation and visualization of cell–cell signaling " "topologies in single-cell systems data using Connectome", paper_url="https://www.nature.com/articles/s41598-022-07959-x", paper_year=2022, + code_url="https://github.com/saezlab/liana", + image="openproblems-r-extras", ) -def connectome(adata, test=False): - return liana( - adata, method="connectome", score_col="weight_sc", ascending=False, test=test - ) -@_liana_method( - method_name="Mean log2FC", +def _connectome(adata, test=False): + return _liana(adata, method="connectome", score_col="weight_sc", test=test) + + +@_connectome_method( + method_name="Connectome (max)", ) -def logfc(adata, test=False): - return liana( - adata, method="logfc", score_col="logfc_comb", ascending=False, test=test - ) +def connectome_max(adata, test=False): + adata = _connectome(adata, test=test) + adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="max") + return adata -@_liana_method( - method_name="NATMI", + +@_connectome_method( + method_name="Connectome (sum)", +) +def connectome_sum(adata, test=False): + adata = _connectome(adata, test=test) + 
adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="sum") + + return adata + + +def _logfc(adata, test=False): + return _liana(adata, method="logfc", score_col="logfc_comb", test=test) + + +@_connectome_method( + method_name="Log2FC (max)", +) +def logfc_max(adata, test=False): + adata = _logfc(adata, test=test) + adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="max") + + return adata + + +@_connectome_method( + method_name="Log2FC (sum)", +) +def logfc_sum(adata, test=False): + adata = _logfc(adata, test=test) + adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="sum") + + return adata + + +_natmi_method = functools.partial( + method, paper_name="Predicting cell-to-cell communication networks using NATMI", paper_url="https://www.nature.com/articles/s41467-020-18873-z", paper_year=2021, + code_url="https://github.com/saezlab/liana", + image="openproblems-r-extras", ) -def natmi(adata, test=False): - return liana( - adata, method="natmi", score_col="edge_specificity", ascending=False, test=test - ) -@_liana_method( - method_name="SingleCellSignalR", +def _natmi(adata, test=False): + return _liana(adata, method="natmi", score_col="edge_specificity", test=test) + + +@_natmi_method( + method_name="NATMI (max)", +) +def natmi_max(adata, test=False): + adata = _natmi(adata, test=test) + adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="max") + + return adata + + +@_natmi_method( + method_name="NATMI (sum)", +) +def natmi_sum(adata, test=False): + adata = _natmi(adata, test=test) + adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="sum") + + return adata + + +_sca_method = functools.partial( + method, paper_name="SingleCellSignalR: inference of intercellular networks " "from single-cell transcriptomics", paper_url="https://academic.oup.com/nar/article/48/10/e55/5810485", paper_year=2021, + code_url="https://github.com/saezlab/liana", + image="openproblems-r-extras", +) + + +def _sca(adata, test=False): + return _liana(adata, method="sca", score_col="LRscore", test=test) + + +@_sca_method( + method_name="SingleCellSignalR (max)", ) -def sca(adata, test=False): - return liana(adata, method="sca", score_col="LRscore", ascending=False, test=test) +def sca_max(adata, test=False): + adata = _sca(adata, test=test) + adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="max") + + return adata + + +@_sca_method( + method_name="SingleCellSignalR (sum)", +) +def sca_sum(adata, test=False): + adata = _sca(adata, test=test) + adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="sum") + + return adata diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/__init__.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/__init__.py index e7de268379..ce716b5cfa 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/metrics/__init__.py +++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/__init__.py @@ -1 +1,2 @@ +from .auprc import auprc from .odds_ratio import odds_ratio diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py new file mode 100644 index 0000000000..430541190b --- /dev/null +++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py @@ -0,0 +1,13 @@ +from .....tools.decorators import metric +from ..utils import join_truth_and_pred +from sklearn.metrics import auc +from sklearn.metrics import precision_recall_curve + + +@metric(metric_name="Precision-recall AUC", maximize=True) 
+def auprc(adata):
+    gt = join_truth_and_pred(adata)
+    precision, recall, _ = precision_recall_curve(
+        gt["response"], gt["score"], pos_label=1
+    )
+    return auc(recall, precision)
diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py
index 74f67252dd..5b4c1dcfe0 100644
--- a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py
+++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py
@@ -1,32 +1,40 @@
 from .....tools.decorators import metric
+from ..utils import join_truth_and_pred

 import numpy as np
-import scipy.stats as stats


 @metric(metric_name="Odds Ratio", maximize=True)
-def odds_ratio(adata, merge_keys, top_n=100):
+def odds_ratio(adata, top_prop=0.05):
     # Join benchmark (assumed truth) and ccc results
     # Get /w ccc_target and a response [0, 1] column
-    gt = (
-        adata.uns["ccc_target"]
-        .merge(adata.uns["ccc_pred"], on=merge_keys, how="inner")
-        .sort_values("score", ascending=False)
-    )
+    gt = join_truth_and_pred(adata)
+    gt = gt.sort_values("score", ascending=False)
+    top_n = int(adata.uns["ccc_target"].shape[0] * top_prop)

     # assign the top rank interactions to 1
     a = np.zeros(len(gt["score"]))
     a[0:top_n] = 1
     gt.loc[:, ["top_n"]] = a

-    # Shape to contingency table
-    table = np.array(gt.pivot_table(index=["top_n", "response"], aggfunc="size"))
-
-    # if positive or negative class is not in top_n
-    if table.shape != (4,):
-        return 1
-
-    # Fisher ET
-    oddsratio, _ = stats.fisher_exact(table.reshape(2, 2))
+    top = gt[gt["top_n"] == 1]
+    tp = np.sum(top.response == 1)
+    fp = np.sum(top.response == 0)
+
+    bot = gt[gt["top_n"] == 0]
+    fn = np.sum(bot.response == 1)
+    tn = np.sum(bot.response == 0)
+
+    numerator = tp * tn
+    denominator = fp * fn
+    if denominator == 0:
+        if numerator == 0:
+            # undefined
+            oddsratio = np.nan
+        else:
+            # perfect score
+            oddsratio = np.inf
+    else:
+        oddsratio = numerator / denominator

     return oddsratio
diff --git a/openproblems/tasks/_cell_cell_communication/_common/utils.py b/openproblems/tasks/_cell_cell_communication/_common/utils.py
index 1118087d15..a33be818db 100644
--- a/openproblems/tasks/_cell_cell_communication/_common/utils.py
+++ b/openproblems/tasks/_cell_cell_communication/_common/utils.py
@@ -105,3 +105,24 @@ def map_gene_symbols(adata, map_filename: Union[str, pathlib.Path]):
         uns=adata.uns,
         obsm=adata.obsm,
     )
+
+
+# Join predictions to target
+def join_truth_and_pred(adata):
+    merge_keys = list(adata.uns["merge_keys"])
+    gt = adata.uns["ccc_target"].merge(adata.uns["ccc_pred"], on=merge_keys, how="left")
+
+    gt.loc[gt["response"].isna(), "response"] = 0
+    gt.loc[gt["score"].isna(), "score"] = np.nanmin(gt["score"]) - np.finfo(float).eps
+
+    return gt
+
+
+def aggregate_method_scores(adata, how):
+    merge_keys = list(adata.uns["merge_keys"])
+    return (
+        adata.uns["ccc_pred"]
+        .groupby(merge_keys)
+        .agg(score=("score", how))
+        .reset_index()
+    )
diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md
index 501284657a..86bed74f71 100644
--- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md
+++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md
@@ -17,7 +17,7 @@ as a foundation for this task.
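To make the rewritten metrics concrete, here is a hedged numeric sketch of both scores on toy inputs. The counts and score vectors are invented, and `toy_odds_ratio` is a stand-in that simply mirrors the branching above; it is not part of the codebase.

```python
import numpy as np
from sklearn.metrics import auc, precision_recall_curve


def toy_odds_ratio(tp, fp, fn, tn):
    # Mirrors the branches above: nan if both products vanish,
    # inf if only the denominator does, else (tp * tn) / (fp * fn).
    numerator, denominator = tp * tn, fp * fn
    if denominator == 0:
        return np.nan if numerator == 0 else np.inf
    return numerator / denominator


assert toy_odds_ratio(tp=8, fp=2, fn=12, tn=78) == 26.0  # (8 * 78) / (2 * 12)
assert np.isinf(toy_odds_ratio(tp=10, fp=0, fn=0, tn=90))  # perfect top set
assert np.isnan(toy_odds_ratio(tp=0, fp=0, fn=10, tn=90))  # empty top set

# The AUPRC metric ranks the same joined table by score and
# integrates precision over recall.
response = np.array([1, 1, 0, 1, 0, 0])
score = np.array([0.9, 0.8, 0.7, 0.4, 0.3, 0.1])
precision, recall, _ = precision_recall_curve(response, score, pos_label=1)
assert 0.0 <= auc(recall, precision) <= 1.0
```

Note that `join_truth_and_pred` fills missing responses with 0 and missing scores with a value just below the observed minimum, so every row of the assumed truth contributes to both metrics.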
The challenges in evaluating the tools are further exacerbated by the lack of a gold standard to benchmark the performance of CCC methods. In an attempt to address this, Dimitrov et al use alternative data modalities, including -the spatial proximity of cell types and inferred +the spatial proximity of cell types and downstream cytokine activities, to generate an inferred ground truth. However, these modalities are only approximations of biological reality and come with their own assumptions and limitations. In time, the inclusion of more @@ -43,6 +43,9 @@ scenario odds ratios quantify the strength of association between the ability of methods to prioritize interactions and those interactions assigned to the positive class. +* **AUPRC**: a single number _[0-1]_ that summarizes the area under the curve where +x is the recall and y is the precision. + ## API ### Datasets @@ -55,7 +58,7 @@ al](https://doi.org/10.1038/s41467-022-30755-0) for more details. `adata.uns["ccc_target"]` should be a Pandas DataFrame containing all the following columns: -* `response`: `int`, binary response variable indicating whether an interaction is +* `response`: `int`, binary response variable _[0; 1]_ indicating whether an interaction is assumed to have occurred * `ligand`: `str`, gene symbol of the ligand in an interaction * `target`: `str`, name of target cell type in interaction @@ -77,16 +80,17 @@ Methods should predict interactions between cell types without using * `ligand`: `str`, gene symbol of the ligand in an interaction * `target`: `str`, name of target cell type in interaction -Methods should infer a score for each _intersecting interaction_ in the harmonized -prior-knowledge resource provided by LIANA. We define _intersecting interactions_ as -those for which the relevant genes are both present in the dataset and the resource. +Methods should infer a `score` for each _intersecting interaction_ +between a `ligand` and a `target`. +We define _intersecting interactions_ as +those for which the `ligand` genes are present in both the dataset and +the prior-knowledge resource provided by LIANA, while a `target` is any +target cell identity label in the dataset. -The prior-knowledge resource is available via the -`cell_cell_communication.utils.ligand_receptor_resource` function, which returns a -DataFrame containing the columns `ligand_genesymbol` and `receptor_genesymbol`, which -correspond to the ligand and receptor genes, respectively. These may contain complexes -with subunits separated with `_`. Hence, **methods should be able to deal with -complex-containing interactions**. +The predictions of any method which do not uniquely map +to the columns in `adata.uns["merge_keys"]` are to be **aggregated**. +By default, aggregation is carried as the `max` and `sum` +according to columns in the `merge_keys`. ## Prior-knowledge @@ -109,6 +113,13 @@ To ensure the consistency between the IDs in the dataset and those in the resource we use a reference map, obtained via BioConductor-v3.15 `org.Hs.eg.db`, and are provided in `tnbc_wu2021_gene_symbols.csv`. +The prior-knowledge resource is available via the +`cell_cell_communication.utils.ligand_receptor_resource` function, which returns a +DataFrame containing the columns `ligand_genesymbol` and `receptor_genesymbol`, which +correspond to the ligand and receptor genes, respectively. These may contain complexes +with subunits separated with `_`. Hence, **methods should be able to deal with +complex-containing interactions**. 
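As a concrete illustration of what dealing with complexes entails, here is a minimal sketch of expanding `_`-separated subunits before matching against the dataset's gene names. The helper name, the example complex, and the toy `var_names` are assumptions for illustration only; in the codebase this role is played by the `flatten_complex_subunits` helper used in `api.py`.

```python
import numpy as np


def expand_complexes(entries):
    # Split "_"-separated complexes (e.g. "ITGA1_ITGB1") into their
    # subunits and return the unique gene symbols.
    return np.unique([gene for entry in entries for gene in entry.split("_")])


resource_receptors = ["EGFR", "ITGA1_ITGB1"]
assert list(expand_complexes(resource_receptors)) == ["EGFR", "ITGA1", "ITGB1"]

# A complex is only scoreable if every one of its subunits is measured.
var_names = {"EGFR", "ITGA1", "ITGB1"}
usable = [r for r in resource_receptors if set(r.split("_")) <= var_names]
assert usable == ["EGFR", "ITGA1_ITGB1"]
```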
+ ### Metrics Metrics should evaluate the concordance between `adata.uns["ccc_target"]` and diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/datasets/tnbc_wu2021.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/datasets/tnbc_wu2021.py index 77401ad9e6..76e5282886 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/datasets/tnbc_wu2021.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/datasets/tnbc_wu2021.py @@ -10,10 +10,11 @@ "Triple negative breast cancer atlas", data_url=load_tnbc_data.metadata["data_url"], data_reference=load_tnbc_data.metadata["data_reference"], - dataset_summary="A single-cell atlas of human breast cancers with inferred " - "cytokine activities as assumed true cell-cell communication. Cytokine " - "activities were estimated by fitting a multivariate linear model with " - "cytokine-focused signatures (see Dimitrov et al., 2022).", + dataset_summary="Human breast cancer atlas (Wu et al., 2021), " + "with cytokine activities, inferred using a multivariate " + "linear model with cytokine-focused signatures, as assumed true " + "cell-cell communication (Dimitrov et al., 2022). " + "42512 cells x 28078 features with 29 cell types from 10 patients", image="openproblems-r-extras", ) def tnbc_data(test=False): diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/methods/__init__.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/methods/__init__.py index d8e04875fc..15e21695aa 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/methods/__init__.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/methods/__init__.py @@ -1,8 +1,14 @@ -from ..._common.methods import cellphonedb -from ..._common.methods import connectome -from ..._common.methods import liana -from ..._common.methods import logfc -from ..._common.methods import natmi +from ..._common.methods import cellphonedb_max +from ..._common.methods import cellphonedb_sum +from ..._common.methods import connectome_max +from ..._common.methods import connectome_sum +from ..._common.methods import liana_max +from ..._common.methods import liana_sum +from ..._common.methods import logfc_max +from ..._common.methods import logfc_sum +from ..._common.methods import natmi_max +from ..._common.methods import natmi_sum from ..._common.methods import random_events -from ..._common.methods import sca +from ..._common.methods import sca_max +from ..._common.methods import sca_sum from ..._common.methods import true_events diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/metrics/__init__.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/metrics/__init__.py index e7de268379..b38d36885f 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/metrics/__init__.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/metrics/__init__.py @@ -1 +1,2 @@ -from .odds_ratio import odds_ratio +from ..._common.metrics.auprc import auprc +from ..._common.metrics.odds_ratio import odds_ratio diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/metrics/odds_ratio.py 
b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/metrics/odds_ratio.py deleted file mode 100644 index e5c113c52d..0000000000 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/metrics/odds_ratio.py +++ /dev/null @@ -1,8 +0,0 @@ -from .....tools.decorators import metric -from ..._common.metrics import odds_ratio as _odds_ratio -from ..api import MERGE_KEYS - - -@metric(**_odds_ratio.metadata) -def odds_ratio(adata): - return _odds_ratio(adata, merge_keys=MERGE_KEYS) diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md index 3d35dde314..a42f94aa9a 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md @@ -17,7 +17,7 @@ as a foundation for this task. The challenges in evaluating the tools are further exacerbated by the lack of a gold standard to benchmark the performance of CCC methods. In an attempt to address this, Dimitrov et al use alternative data modalities, including -the spatial proximity of cell types and inferred +the spatial proximity of cell types and downstream cytokine activities, to generate an inferred ground truth. However, these modalities are only approximations of biological reality and come with their own assumptions and limitations. In time, the inclusion of more @@ -42,6 +42,9 @@ scenario odds ratios quantify the strength of association between the ability of methods to prioritize interactions and those interactions assigned to the positive class. +* **AUPRC**: a single number _[0-1]_ that summarizes the area under the curve where +x is the recall and y is the precision. + ## API ### Datasets @@ -54,7 +57,7 @@ al](https://doi.org/10.1038/s41467-022-30755-0) for more details. `adata.uns["ccc_target"]` should be a Pandas DataFrame containing all of the following columns: -* `response`: `int`, binary response variable indicating whether an interaction is +* `response`: `int`, binary response variable _[0; 1]_ indicating whether an interaction is assumed to have occurred * `source`: `str`, name of source cell type in interaction * `target`: `str`, name of target cell type in interaction @@ -76,6 +79,15 @@ Methods should predict interactions between cell types without using * `source`: `str`, name of source cell type in interaction * `target`: `str`, name of target cell type in interaction +Methods should infer a `score` for each _intersecting interaction_ +between a `source` and a `target`, which correspond to all possible combinations +of the cell identity labels in the dataset. + +The predictions of any method which do not uniquely map +to the columns in `adata.uns["merge_keys"]` are to be **aggregated**. +By default, aggregation is carried as the `max` and `sum` +according to columns in the `merge_keys`. 
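To spell out what "all possible combinations" means here, a minimal sketch follows; the label values are invented.

```python
import itertools

import pandas as pd

# e.g. the unique cell identity labels in adata.obs["label"]
labels = ["B cell", "T cell", "NK cell"]

# Every possible (source, target) pair, including self-interactions
candidates = pd.DataFrame(
    list(itertools.product(labels, repeat=2)), columns=["source", "target"]
)
assert len(candidates) == len(labels) ** 2  # 9 candidate interactions
```

Scores for finer-grained predictions, such as individual ligand-receptor pairs within a source-target pair, are then collapsed onto these combinations by the aggregation described above.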
+ ### Prior-knowledge Resource Each dataset should be supplemented with a prior knowledge resource of diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py index 59a4a12bd8..40fedd1c1b 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py @@ -7,10 +7,11 @@ "Mouse brain atlas", data_url=load_mouse_brain_atlas.metadata["data_url"], data_reference=load_mouse_brain_atlas.metadata["data_reference"], - dataset_summary="A murine brain atlas with inferred spatially-adjacent " - "cell types as assumed benchmark truth. Adjacent cell types are inferred " - "from z-transformed deconvolution proportion correlations. Generated from " - "murine brain 10x Visium slides (see Dimitrov et al., 2022).", + dataset_summary="A murine brain atlas with adjacent cell types as assumed " + "benchmark truth, inferred from deconvolution proportion " + "correlations using matching 10x Visium slides " + "(see Dimitrov et al., 2022)." + " 14249 cells x 34617 features with 23 cell type labels.", image="openproblems-r-extras", ) def mouse_brain_atlas(test=False): diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/methods/__init__.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/methods/__init__.py index d8e04875fc..15e21695aa 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/methods/__init__.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/methods/__init__.py @@ -1,8 +1,14 @@ -from ..._common.methods import cellphonedb -from ..._common.methods import connectome -from ..._common.methods import liana -from ..._common.methods import logfc -from ..._common.methods import natmi +from ..._common.methods import cellphonedb_max +from ..._common.methods import cellphonedb_sum +from ..._common.methods import connectome_max +from ..._common.methods import connectome_sum +from ..._common.methods import liana_max +from ..._common.methods import liana_sum +from ..._common.methods import logfc_max +from ..._common.methods import logfc_sum +from ..._common.methods import natmi_max +from ..._common.methods import natmi_sum from ..._common.methods import random_events -from ..._common.methods import sca +from ..._common.methods import sca_max +from ..._common.methods import sca_sum from ..._common.methods import true_events diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/metrics/__init__.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/metrics/__init__.py index e7de268379..b38d36885f 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/metrics/__init__.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/metrics/__init__.py @@ -1 +1,2 @@ -from .odds_ratio import odds_ratio +from ..._common.metrics.auprc import auprc +from ..._common.metrics.odds_ratio import odds_ratio diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/metrics/odds_ratio.py 
b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/metrics/odds_ratio.py deleted file mode 100644 index e5c113c52d..0000000000 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/metrics/odds_ratio.py +++ /dev/null @@ -1,8 +0,0 @@ -from .....tools.decorators import metric -from ..._common.metrics import odds_ratio as _odds_ratio -from ..api import MERGE_KEYS - - -@metric(**_odds_ratio.metadata) -def odds_ratio(adata): - return _odds_ratio(adata, merge_keys=MERGE_KEYS) diff --git a/test/test_4_cell_cell_communication.py b/test/test_4_cell_cell_communication.py index 192069df79..d44678e2c8 100644 --- a/test/test_4_cell_cell_communication.py +++ b/test/test_4_cell_cell_communication.py @@ -118,12 +118,19 @@ def test_odds_ratio_no_match(): task = openproblems.tasks.cell_cell_communication_source_target metric = task.metrics.odds_ratio - adata = task.api.sample_dataset() - adata = task.api.sample_method(adata) openproblems.log.debug( "Testing {} metric from {} task".format(metric.__name__, task.__name__) ) - adata.uns["ccc_target"]["response"] = np.nan - m = metric(adata) - assert m == 1 + adata = task.api.sample_dataset() + + adata = task.api.sample_method(adata) + m = metric(adata, top_prop=0) # force numerator exception + assert m is np.nan + + m = metric(adata, top_prop=0.5) # check non-exception output + assert np.issubdtype("float64", m) + + adata = task.methods.true_events(adata) + m = metric(adata, top_prop=0.9) # force denominator exception + assert m is np.inf From 94a8855d12ca0b8d4e457e3853f569063ec95ae0 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 9 Nov 2022 10:48:29 -0500 Subject: [PATCH 071/266] precommit --- openproblems/tasks/_cell_cell_communication/README.md | 4 ++-- .../cell_cell_communication_ligand_target/README.md | 4 ++-- .../cell_cell_communication_source_target/README.md | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/openproblems/tasks/_cell_cell_communication/README.md b/openproblems/tasks/_cell_cell_communication/README.md index c51a0b559a..3d40f17c0a 100644 --- a/openproblems/tasks/_cell_cell_communication/README.md +++ b/openproblems/tasks/_cell_cell_communication/README.md @@ -66,8 +66,8 @@ al](https://doi.org/10.1038/s41467-022-30755-0) for more details. `adata.uns["ccc_target"]` should be a Pandas DataFrame containing: -* `response`: `int`, binary response variable _[0; 1]_ indicating whether an interaction is - assumed to have occurred and at least one of the following columns: +* `response`: `int`, binary response variable _[0; 1]_ indicating whether an interaction + is assumed to have occurred and at least one of the following columns: * `source`: `str`, name of source cell type in interaction * `target`: `str`, name of target cell type in interaction diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md index 86bed74f71..f1c19872cf 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md @@ -58,8 +58,8 @@ al](https://doi.org/10.1038/s41467-022-30755-0) for more details. 
`adata.uns["ccc_target"]` should be a Pandas DataFrame containing all the following columns: -* `response`: `int`, binary response variable _[0; 1]_ indicating whether an interaction is - assumed to have occurred +* `response`: `int`, binary response variable _[0; 1]_ indicating whether an interaction + is assumed to have occurred * `ligand`: `str`, gene symbol of the ligand in an interaction * `target`: `str`, name of target cell type in interaction diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md index a42f94aa9a..1d8f0e9ef7 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md @@ -57,8 +57,8 @@ al](https://doi.org/10.1038/s41467-022-30755-0) for more details. `adata.uns["ccc_target"]` should be a Pandas DataFrame containing all of the following columns: -* `response`: `int`, binary response variable _[0; 1]_ indicating whether an interaction is - assumed to have occurred +* `response`: `int`, binary response variable _[0; 1]_ indicating whether an interaction + is assumed to have occurred * `source`: `str`, name of source cell type in interaction * `target`: `str`, name of target cell type in interaction From 8862b73022dc8574a78540089e1b215936d4b6f3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 10 Nov 2022 10:14:36 -0500 Subject: [PATCH 072/266] Update IRkernel to 1.3.1 (#672) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-r-base/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-base/r_requirements.txt b/docker/openproblems-r-base/r_requirements.txt index 13f85978bb..6ee9be204a 100644 --- a/docker/openproblems-r-base/r_requirements.txt +++ b/docker/openproblems-r-base/r_requirements.txt @@ -1,3 +1,3 @@ bioc::scran@1.24.1 -IRkernel@1.3 +IRkernel@1.3.1 RcppAnnoy@0.0.20 From b17377a77b5dbecd4d6b660b2d472d043445dde4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 10 Nov 2022 14:26:12 -0500 Subject: [PATCH 073/266] Update styler to 1.8.1 (#676) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-github-actions/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-github-actions/r_requirements.txt b/docker/openproblems-github-actions/r_requirements.txt index 68a2f54962..23dafb913f 100644 --- a/docker/openproblems-github-actions/r_requirements.txt +++ b/docker/openproblems-github-actions/r_requirements.txt @@ -2,5 +2,5 @@ backports@1.4.1 docopt@0.7.1 git2r@0.30.1 lintr@3.0.2 -styler@1.8.0 +styler@1.8.1 tibble@3.1.8 From 94b2481fba4224879b396e39a9c058d8411f8c76 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 10 Nov 2022 15:40:31 -0500 Subject: [PATCH 074/266] pip check and downgrade leidenalg --- docker/openproblems-python-extras/requirements.txt | 1 + test/docker_run.sh | 1 + workflow/Snakefile | 1 + 3 files changed, 3 insertions(+) diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt index 
c51a5d771a..147d48f380 100644 --- a/docker/openproblems-python-extras/requirements.txt +++ b/docker/openproblems-python-extras/requirements.txt @@ -5,6 +5,7 @@ git+https://github.com/jorvis/Multicore-TSNE@6832575 git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix git+https://github.com/scottgigante-immunai/knn-smoothing@python_package +leidenalg==0.8.10 magic-impute==3.0.* phate==1.0.* pybedtools==0.9.* diff --git a/test/docker_run.sh b/test/docker_run.sh index c15a73c5e6..d2b1582f6b 100644 --- a/test/docker_run.sh +++ b/test/docker_run.sh @@ -9,6 +9,7 @@ export PYTHONPATH="$WORKDIR" if [ ! -f ~/.install_complete ]; then python3 -m pip install --upgrade pip + python3 -m pip check python3 -m pip install --upgrade-strategy=only-if-needed --no-cache-dir --editable "${CODEDIR}" python3 -m pip install --upgrade coverage FREEZE="$(python3 -m pip freeze)" diff --git a/workflow/Snakefile b/workflow/Snakefile index c8378a3cea..b6e83aa25b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -65,6 +65,7 @@ COPY . /usr/src/singlecellopenproblems RUN cd /usr/src/singlecellopenproblems && sudo git clean -fxdq """) + r""" RUN sudo pip install --no-cache-dir --editable /usr/src/singlecellopenproblems +RUN sudo pip check ' > {output}""" ruleorder: update_docker_image > build_docker_image From a5738bfac5afb268ff0f279123bc85d19139fd71 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 11 Nov 2022 10:50:28 -0500 Subject: [PATCH 075/266] break up _metrics into separate functions (#673) * break up _metrics into separate functions When _metrics fails, it is impossible to debug due to many computations being squashed into on NJIT function. This separates each metric into its own function, using NJIT when appropriate. See https://github.com/openproblems-bio/openproblems/actions/runs/3388609861/jobs/5633837315\#step:19:2770 * vectorize lcmc --- .../metrics/nn_ranking.py | 72 +++++++++++++++---- 1 file changed, 59 insertions(+), 13 deletions(-) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py index 1a546790b6..946a4d5095 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py @@ -67,18 +67,9 @@ def _coranking_matrix(R1: np.ndarray, R2: np.ndarray) -> np.ndarray: # pragma: @njit(cache=True, fastmath=True) -def _metrics( - Q: np.ndarray, -) -> Tuple[ - np.ndarray, np.ndarray, np.ndarray, float, np.ndarray, int, float, float -]: # pragma: no cover - Q = Q[1:, 1:] - m = len(Q) +def _trustworthiness(Q: np.ndarray, m: int) -> np.ndarray: # pragma: no cover T = np.zeros(m - 1) # trustworthiness - C = np.zeros(m - 1) # continuity - QNN = np.zeros(m) # Co-k-nearest neighbor size - LCMC = np.zeros(m) # Local Continuity Meta Criterion for k in range(m - 1): Qs = Q[k:, :k] @@ -86,24 +77,79 @@ def _metrics( W = np.arange(Qs.shape[0]).reshape(-1, 1) # 1 - normalized hard-k-intrusions. lower-left region. 
# weighted by rank error (rank - k) - T[k] = 1 - np.sum(Qs * W) / (k + 1) / m / (m - 1 - k) + T[k] = 1 - np.sum(Qs * W) / ((k + 1) * m * (m - 1 - k)) + + return T + + +@njit(cache=True, fastmath=True) +def _continuity(Q: np.ndarray, m: int) -> np.ndarray: # pragma: no cover + + C = np.zeros(m - 1) # continuity + + for k in range(m - 1): Qs = Q[:k, k:] # a row vector of weights. weight = rank error = actual_rank - k W = np.arange(Qs.shape[1]).reshape(1, -1) # 1 - normalized hard-k-extrusions. upper-right region - C[k] = 1 - np.sum(Qs * W) / (k + 1) / m / (m - 1 - k) + C[k] = 1 - np.sum(Qs * W) / ((k + 1) * m * (m - 1 - k)) + + return C + + +@njit(cache=True, fastmath=True) +def _qnn(Q: np.ndarray, m: int) -> np.ndarray: # pragma: no cover + + QNN = np.zeros(m) # Co-k-nearest neighbor size for k in range(m): # Q[0,0] is always m. 0-th nearest neighbor is always the point itself. # Exclude Q[0,0] QNN[k] = np.sum(Q[: k + 1, : k + 1]) / ((k + 1) * m) - LCMC[k] = QNN[k] - (k + 1) / (m - 1) + return QNN + + +def _lcmc(QNN: np.ndarray, m: int) -> np.ndarray: + LCMC = QNN - (np.arange(m) + 1) / (m - 1) + return LCMC + + +def _kmax(LCMC: np.ndarray) -> int: kmax = np.argmax(LCMC) + return kmax # type: ignore + + +def _q_local(QNN: np.ndarray, kmax: int) -> float: Qlocal = np.sum(QNN[: kmax + 1]) / (kmax + 1) + return Qlocal + + +def _q_global(QNN: np.ndarray, kmax: int, m: int) -> float: # skip the last. The last is (m-1)-nearest neighbor, including all samples. Qglobal = np.sum(QNN[kmax:-1]) / (m - kmax - 1) + return Qglobal + + +def _qnn_auc(QNN: np.ndarray) -> float: AUC = np.mean(QNN) + return AUC # type: ignore + + +def _metrics( + Q: np.ndarray, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, float, np.ndarray, int, float, float]: + Q = Q[1:, 1:] + m = len(Q) + + T = _trustworthiness(Q, m) + C = _continuity(Q, m) + QNN = _qnn(Q, m) + LCMC = _lcmc(QNN, m) + kmax = _kmax(LCMC) + Qlocal = _q_local(QNN, kmax) + Qglobal = _q_global(QNN, kmax, m) + AUC = _qnn_auc(QNN) return T, C, QNN, AUC, LCMC, kmax, Qlocal, Qglobal From 372f8c7d1275f2221acc5dea8bbe538d0426d840 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 11 Nov 2022 10:51:19 -0500 Subject: [PATCH 076/266] downgrade leidenalg --- docker/openproblems-python-extras/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt index 147d48f380..c51a5d771a 100644 --- a/docker/openproblems-python-extras/requirements.txt +++ b/docker/openproblems-python-extras/requirements.txt @@ -5,7 +5,6 @@ git+https://github.com/jorvis/Multicore-TSNE@6832575 git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix git+https://github.com/scottgigante-immunai/knn-smoothing@python_package -leidenalg==0.8.10 magic-impute==3.0.* phate==1.0.* pybedtools==0.9.* From bbecf4e9ad90007c2711394e7fbd8e49cbd3e4a1 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Mon, 14 Nov 2022 11:54:15 -0500 Subject: [PATCH 077/266] Update knn_smoothing.py --- openproblems/tasks/denoising/methods/knn_smoothing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/tasks/denoising/methods/knn_smoothing.py b/openproblems/tasks/denoising/methods/knn_smoothing.py index 6d8aa86d4c..dd5da6e941 100644 --- a/openproblems/tasks/denoising/methods/knn_smoothing.py +++ 
b/openproblems/tasks/denoising/methods/knn_smoothing.py @@ -3,7 +3,7 @@ @method( - method_name="KNN smoothing", + method_name="Iterative KNN smoothing", paper_name="K-nearest neighbor smoothing for high-throughput " "single-cell RNA-Seq data", paper_url="https://doi.org/10.1101/217737", From 2af9a4918ed3370859f71774558068961f6d22c6 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Mon, 14 Nov 2022 11:57:35 -0500 Subject: [PATCH 078/266] Update magic.py --- openproblems/tasks/denoising/methods/magic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/tasks/denoising/methods/magic.py b/openproblems/tasks/denoising/methods/magic.py index fa1feae071..53a6f437bb 100644 --- a/openproblems/tasks/denoising/methods/magic.py +++ b/openproblems/tasks/denoising/methods/magic.py @@ -60,7 +60,7 @@ def magic_approx(adata, test=False): @method( - method_name="KNN Smoothing", + method_name="KNN smoothing", paper_name="KNN Smoothing (baseline)", paper_url="https://openproblems.bio", paper_year=2022, From 9b896bda1c69c27bc125da52e92e3079bf98a922 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Mon, 14 Nov 2022 13:38:10 -0500 Subject: [PATCH 079/266] Handle NaNs in nextflow export (#681) * export raw results * py 3.8 * export raw and processed * copy * convert non-finite scores to strings * # publish * lint: * don't push raw files * remove raw on publish # publish * move cleanup to the end # publish * can't request review from PR author * remove all-nan metrics # publish * fix headers # publish * can't change dict during iteration # publish * implementation -> code * code is not its own column * exclude baselines from stub computation --- .github/workflows/process_results.yml | 63 +++++++++++++----------- workflow/parse_nextflow.py | 70 ++++++++++++++++++--------- workflow/workflow_utils.py | 5 +- 3 files changed, 86 insertions(+), 52 deletions(-) diff --git a/.github/workflows/process_results.yml b/.github/workflows/process_results.yml index 2a8bf1868a..5838f52773 100644 --- a/.github/workflows/process_results.yml +++ b/.github/workflows/process_results.yml @@ -36,7 +36,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: ${{ matrix.config.python }} + python-version: "3.8" - name: Install AWS CLI run: | @@ -71,20 +71,29 @@ jobs: python openproblems/workflow/parse_nextflow.py /tmp website/data/results python openproblems/workflow/generate_website_markdown.py website/content/benchmarks - - name: AWS S3 cleanup - if: "github.event_name == 'repository_dispatch'" - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 + - name: Upload results + uses: actions/upload-artifact@main + with: + name: results + path: website/data/results + + - name: Upload markdown + uses: actions/upload-artifact@main + with: + name: markdown + path: website/content/benchmarks + + - name: Remove raw output + if: | + github.event_name == 'repository_dispatch' || + endsWith(github.event.head_commit.message, '# publish') run: | - aws s3 rm --recursive "s3://openproblems-nextflow/work_main" - aws s3 rm --recursive "s3://openproblems-nextflow/cwd_example" - aws s3 cp --recursive "s3://openproblems-nextflow/cwd_main" "s3://openproblems-nextflow/cwd_example" - aws s3 rm --recursive "s3://openproblems-nextflow/cwd_main" + rm website/data/results/*/*.raw.json - name: 
Push to openproblems-bio/website - if: "github.event_name == 'repository_dispatch'" + if: | + github.event_name == 'repository_dispatch' || + endsWith(github.event.head_commit.message, '# publish') shell: bash working-directory: './website' env: @@ -93,29 +102,29 @@ jobs: git push origin "${UPDATE_BRANCH_NAME}" - name: Create Pull Request - if: "github.event_name == 'repository_dispatch'" + if: | + github.event_name == 'repository_dispatch' || + endsWith(github.event.head_commit.message, '# publish') uses: peter-evans/create-pull-request@v4 with: branch: ${{ env.UPDATE_BRANCH_NAME }} delete-branch: true base: main title: '[auto] Update benchmark results' - reviewers: scottgigante, dburkhardt + reviewers: scottgigante-immunai,rcannood,dburkhardt path: './website' token: ${{ secrets.GH_ACTIONS_WEBSITE_PAT }} author: "openproblems-bio " commit-message: "Update benchmark results # ci skip" - - name: Upload results on test - if: "github.event_name != 'repository_dispatch'" - uses: actions/upload-artifact@main - with: - name: results - path: website/data/results - - - name: Upload markdown on test - if: "github.event_name != 'repository_dispatch'" - uses: actions/upload-artifact@main - with: - name: markdown - path: website/content/benchmarks + - name: AWS S3 cleanup + if: "github.event_name == 'repository_dispatch'" + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + run: | + aws s3 rm --recursive "s3://openproblems-nextflow/work_main" + aws s3 rm --recursive "s3://openproblems-nextflow/cwd_example" + aws s3 cp --recursive "s3://openproblems-nextflow/cwd_main" "s3://openproblems-nextflow/cwd_example" + aws s3 rm --recursive "s3://openproblems-nextflow/cwd_main" diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py index 5942515e71..a88fdf5417 100644 --- a/workflow/parse_nextflow.py +++ b/workflow/parse_nextflow.py @@ -1,4 +1,5 @@ import collections +import copy import json import numpy as np import numpyencoder @@ -154,10 +155,10 @@ def normalize_scores(task_name, dataset_results): """Normalize method scores to [0, 1] based on baseline method scores.""" for method_name in dataset_results: # store original unnormalized results - dataset_results[method_name]["metrics_raw"] = dataset_results[method_name][ - "metrics" - ] - metric_names = list(dataset_results.values())[0]["metrics"].keys() + dataset_results[method_name]["metrics_raw"] = copy.copy( + dataset_results[method_name]["metrics"] + ) + metric_names = list(list(dataset_results.values())[0]["metrics"].keys()) for metric_name in metric_names: metric = openproblems.api.utils.get_function(task_name, "metrics", metric_name) metric_scores = np.array( @@ -166,6 +167,10 @@ def normalize_scores(task_name, dataset_results): for method_name in dataset_results ] ) + if np.all(np.isnan(metric_scores)): + for method_name in dataset_results: + del dataset_results[method_name]["metrics"][metric_name] + continue baseline_methods = [ method_name for method_name in dataset_results @@ -182,8 +187,9 @@ def normalize_scores(task_name, dataset_results): for method_name in baseline_methods ] ) - metric_scores -= baseline_scores.min() - baseline_range = baseline_scores.max() - baseline_scores.min() + baseline_min = np.nanmin(baseline_scores) + baseline_range = np.nanmax(baseline_scores) - baseline_min + metric_scores -= baseline_min metric_scores /= np.where(baseline_range != 0, baseline_range, 1) if not metric.metadata["maximize"]: metric_scores 
= 1 - metric_scores @@ -194,6 +200,7 @@ def normalize_scores(task_name, dataset_results): def drop_baselines(task_name, dataset_results): """Remove baseline methods from dataset results.""" + dataset_results = copy.copy(dataset_results) method_names = list(dataset_results.keys()) for method_name in method_names: method = openproblems.api.utils.get_function(task_name, "methods", method_name) @@ -208,10 +215,14 @@ def compute_ranking(dataset_results): metric_names = list(dataset_results.values())[0]["metrics"].keys() method_names = list(dataset_results.keys()) for metric_name in metric_names: - metric_scores = [ - dataset_results[method_name]["metrics"][metric_name] - for method_name in method_names - ] + metric_scores = np.array( + [ + dataset_results[method_name]["metrics"][metric_name] + for method_name in method_names + ] + ) + metric_scores[np.isnan(metric_scores) | np.isneginf(metric_scores)] = 0 + metric_scores[np.isinf(metric_scores)] = 1 metric_sums += metric_scores final_ranking = { @@ -221,18 +232,18 @@ def compute_ranking(dataset_results): return final_ranking -def dataset_results_to_json(task_name, dataset_name, dataset_results): +def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): """Convert the raw dataset results to pretty JSON for web.""" dataset = openproblems.api.utils.get_function(task_name, "datasets", dataset_name) output = dict( name=dataset.metadata["dataset_name"], data_url=dataset.metadata["data_url"], data_reference=dataset.metadata["data_reference"], - headers=dict(names=["Rank"], fixed=["Name", "Paper", "Website", "Code"]), + headers=dict(names=["Rank"], fixed=["Name", "Paper", "Library"]), results=list(), ) - dataset_results = normalize_scores(task_name, dataset_results) - dataset_results = drop_baselines(task_name, dataset_results) + dataset_results_raw = normalize_scores(task_name, dataset_results_raw) + dataset_results = drop_baselines(task_name, dataset_results_raw) ranking = compute_ranking(dataset_results) metric_names = set() for method_name, rank in ranking.items(): @@ -256,6 +267,12 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results): metric = openproblems.api.utils.get_function( task_name, "metrics", metric_name ) + if np.isnan(metric_result): + metric_result = "NaN" + elif np.isneginf(metric_result): + metric_result = "-Inf" + elif np.isinf(metric_result): + metric_result = "Inf" result[metric.metadata["metric_name"]] = metric_result metric_names.add(metric.metadata["metric_name"]) output["results"].append(result) @@ -267,11 +284,11 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results): "CPU (%)", "Name", "Paper", - "Code", "Year", + "Library", ] ) - return output + return output, dataset_results_raw def results_to_json(results, outdir): @@ -279,27 +296,32 @@ def results_to_json(results, outdir): if not os.path.isdir(outdir): os.mkdir(outdir) for task_name, task_results in results.items(): - if workflow_utils.task_is_incomplete( - openproblems.api.utils.str_to_task(task_name) - ): - # don't write results for incomplete tasks - continue for dataset_name, dataset_results in task_results.items(): results_dir = os.path.join(outdir, task_name) if not os.path.isdir(results_dir): os.mkdir(results_dir) filename = os.path.join(results_dir, "{}.json".format(dataset_name)) + filename_raw = os.path.join(results_dir, "{}.raw.json".format(dataset_name)) try: - dataset_results_json = dataset_results_to_json( + dataset_results_json, dataset_results_raw = dataset_results_to_json( task_name, dataset_name, 
dataset_results ) except openproblems.api.utils.NoSuchFunctionError: continue - with open(filename, "w") as handle: + with open(filename_raw, "w") as handle: dump_json( - dataset_results_json, + dataset_results_raw, handle, ) + if not workflow_utils.task_is_incomplete( + openproblems.api.utils.str_to_task(task_name) + ): + # don't write results for incomplete tasks + with open(filename, "w") as handle: + dump_json( + dataset_results_json, + handle, + ) def main(results_path, outdir): diff --git a/workflow/workflow_utils.py b/workflow/workflow_utils.py index b0e7d0cb47..2cdfdb0e5c 100644 --- a/workflow/workflow_utils.py +++ b/workflow/workflow_utils.py @@ -6,7 +6,10 @@ def task_is_incomplete(task): if len(task.DATASETS) < TASK_MIN_DATASETS: return True - if len(task.METHODS) < TASK_MIN_METHODS: + non_baseline_methods = [ + method for method in task.METHODS if not method.metadata["is_baseline"] + ] + if len(non_baseline_methods) < TASK_MIN_METHODS: return True if len(task.METRICS) < TASK_MIN_METRICS: return True From 7af21762cc28d94d69c18755605bb4401dee62af Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 12:03:36 -0500 Subject: [PATCH 080/266] use v1.9 --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index d698eee8e3..e155688fe4 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -417,7 +417,7 @@ jobs: RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" cd /mnt/openproblems-nextflow/cwd/${BRANCH} nextflow run \ - -revision v1.8 \ + -revision v1.9 \ -with-tower \ -ansi-log false -resume \ -profile aws,test \ From a05969e4f0906f5986ecea766940e495b0c93776 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 12:30:46 -0500 Subject: [PATCH 081/266] remove pip check --- test/docker_run.sh | 1 - workflow/Snakefile | 1 - 2 files changed, 2 deletions(-) diff --git a/test/docker_run.sh b/test/docker_run.sh index d2b1582f6b..c15a73c5e6 100644 --- a/test/docker_run.sh +++ b/test/docker_run.sh @@ -9,7 +9,6 @@ export PYTHONPATH="$WORKDIR" if [ ! -f ~/.install_complete ]; then python3 -m pip install --upgrade pip - python3 -m pip check python3 -m pip install --upgrade-strategy=only-if-needed --no-cache-dir --editable "${CODEDIR}" python3 -m pip install --upgrade coverage FREEZE="$(python3 -m pip freeze)" diff --git a/workflow/Snakefile b/workflow/Snakefile index b6e83aa25b..c8378a3cea 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -65,7 +65,6 @@ COPY . /usr/src/singlecellopenproblems RUN cd /usr/src/singlecellopenproblems && sudo git clean -fxdq """) + r""" RUN sudo pip install --no-cache-dir --editable /usr/src/singlecellopenproblems -RUN sudo pip check ' > {output}""" ruleorder: update_docker_image > build_docker_image From 1ee0393a946b051447272f79aeffb46f35c886ee Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 12:50:40 -0500 Subject: [PATCH 082/266] remove pip checl --- test/docker_run.sh | 1 - workflow/Snakefile | 1 - 2 files changed, 2 deletions(-) diff --git a/test/docker_run.sh b/test/docker_run.sh index d2b1582f6b..c15a73c5e6 100644 --- a/test/docker_run.sh +++ b/test/docker_run.sh @@ -9,7 +9,6 @@ export PYTHONPATH="$WORKDIR" if [ ! 
-f ~/.install_complete ]; then python3 -m pip install --upgrade pip - python3 -m pip check python3 -m pip install --upgrade-strategy=only-if-needed --no-cache-dir --editable "${CODEDIR}" python3 -m pip install --upgrade coverage FREEZE="$(python3 -m pip freeze)" diff --git a/workflow/Snakefile b/workflow/Snakefile index b6e83aa25b..c8378a3cea 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -65,7 +65,6 @@ COPY . /usr/src/singlecellopenproblems RUN cd /usr/src/singlecellopenproblems && sudo git clean -fxdq """) + r""" RUN sudo pip install --no-cache-dir --editable /usr/src/singlecellopenproblems -RUN sudo pip check ' > {output}""" ruleorder: update_docker_image > build_docker_image From 365688df64ee23dbd8868d3c6509f2fc882c3761 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 12:51:21 -0500 Subject: [PATCH 083/266] allow 1000 character dataset summary --- test/test_3_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_3_datasets.py b/test/test_3_datasets.py index 7c1acca22a..97666a0ed7 100644 --- a/test/test_3_datasets.py +++ b/test/test_3_datasets.py @@ -13,7 +13,7 @@ import utils.name DATASET_SUMMARY_MINLEN = 40 -DATASET_SUMMARY_MAXLEN = 280 +DATASET_SUMMARY_MAXLEN = 1000 pytestmark = pytest.mark.skipif( len(utils.git.list_modified_tasks()) == 0, reason="No tasks have been modified" From f0ac89e3ac9511975b9e915a6bc8b9c0b30df46e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 12:52:33 -0500 Subject: [PATCH 084/266] fix dataset description spacing --- .../datasets/allen_brain_atlas.py | 4 ++-- .../datasets/mouse_hspc_nestorowa2016.py | 2 +- .../tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py | 2 +- .../tasks/spatial_decomposition/datasets/destvi/generate.py | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py index 40fedd1c1b..009f2bdc90 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py @@ -10,8 +10,8 @@ dataset_summary="A murine brain atlas with adjacent cell types as assumed " "benchmark truth, inferred from deconvolution proportion " "correlations using matching 10x Visium slides " - "(see Dimitrov et al., 2022)." - " 14249 cells x 34617 features with 23 cell type labels.", + "(see Dimitrov et al., 2022). " + "14249 cells x 34617 features with 23 cell type labels.", image="openproblems-r-extras", ) def mouse_brain_atlas(test=False): diff --git a/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py b/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py index 5aa7f68404..a4b7a783cd 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py @@ -7,7 +7,7 @@ data_url=load_mouse_hspc_nestorowa2016.metadata["data_url"], data_reference=load_mouse_hspc_nestorowa2016.metadata["data_reference"], dataset_summary="1.6k hematopoietic stem and progenitor cells from mouse bone " - "marrow. Sequenced by Smart-seq2." + "marrow. Sequenced by Smart-seq2. 
" "1920 cells x 43258 features with 3 cell type labels", ) def mouse_hspc_nestorowa2016(test=False): diff --git a/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py b/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py index 42e3558c1e..758d0dc78b 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py @@ -8,7 +8,7 @@ data_reference=load_tenx_5k_pbmc.metadata["data_reference"], dataset_summary=( "5k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. " - "Sequenced on 10X v3 chemistry in July 2019 by 10X Genomics." + "Sequenced on 10X v3 chemistry in July 2019 by 10X Genomics. " "5247 cells x 20822 features with no cell type labels" ), ) diff --git a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py index 84a408d876..86005d07f2 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py +++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py @@ -7,9 +7,9 @@ data_url="https://github.com/romain-lopez/DestVI-reproducibility/" "blob/master/simulations/make_dataset.py", data_reference="https://doi.org/10.1038/s41587-022-01272-8", - dataset_summary="scRNA-seq is generated based on learn NB parameters" - "from the destVI manuscripts leveraging sparsePCA. Number of cells and" - "cell types present in each spatial spot is computed via combination of" + dataset_summary="scRNA-seq is generated based on learn NB parameters " + "from the destVI manuscripts leveraging sparsePCA. Number of cells and " + "cell types present in each spatial spot is computed via combination of " "kernel-based parametrization of a categorical distribution and the NB model.", image="openproblems-python-extras", ) From 4c7cba8be5443cbbb6af81ce398bcdb9879e36ed Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 13:53:57 -0500 Subject: [PATCH 085/266] use ubuntu-latest --- .github/workflows/run_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index d698eee8e3..82f99b53ab 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -28,7 +28,7 @@ jobs: build_images: needs: cancel_previous_runs - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest if: | !endsWith(github.event.head_commit.message, '# ci skip') && ( @@ -215,7 +215,7 @@ jobs: run_test_benchmark: needs: run_tester - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest if: >- always() && !endsWith(github.event.head_commit.message, '# ci skip') && From db0320ba140789696d4beb0d6f8e63b205398108 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 15:16:03 -0500 Subject: [PATCH 086/266] skip --- .github/workflows/run_tests.yml | 80 ++++++++++++++++----------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 61e9d35c87..e1f7d6b245 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -365,45 +365,45 @@ jobs: snakemake -j $(nproc) docker_pull cd .. - - name: Build Docker images - if: | - !( - startsWith(github.ref, 'refs/heads/main') && - github.repository == 'openproblems-bio/openproblems' - ) - run: | - cd workflow - SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker - cd .. 
- - - name: Upload Docker images - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 - run: | - ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" - aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ - docker login --username AWS --password-stdin $ECR_ENDPOINT - for image in $(cd docker && ls -1d */ | tr -d '/'); do - docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} - docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} - done - - - name: Upload Docker images for full benchmark - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 - if: >- - startsWith(github.ref, 'refs/tags') || - startsWith(github.ref, 'refs/heads/test_benchmark') - run: | - ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" - for image in $(cd docker && ls -1d */ | tr -d '/'); do - docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} - docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} - done +# - name: Build Docker images +# if: | +# !( +# startsWith(github.ref, 'refs/heads/main') && +# github.repository == 'openproblems-bio/openproblems' +# ) +# run: | +# cd workflow +# SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker +# cd .. + +# - name: Upload Docker images +# env: +# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} +# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} +# AWS_DEFAULT_REGION: us-west-2 +# run: | +# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" +# aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ +# docker login --username AWS --password-stdin $ECR_ENDPOINT +# for image in $(cd docker && ls -1d */ | tr -d '/'); do +# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} +# docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} +# done +# +# - name: Upload Docker images for full benchmark +# env: +# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} +# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} +# AWS_DEFAULT_REGION: us-west-2 +# if: >- +# startsWith(github.ref, 'refs/tags') || +# startsWith(github.ref, 'refs/heads/test_benchmark') +# run: | +# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" +# for image in $(cd docker && ls -1d */ | tr -d '/'); do +# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} +# docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} +# done - name: Run test benchmark env: @@ -417,7 +417,7 @@ jobs: RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" cd /mnt/openproblems-nextflow/cwd/${BRANCH} nextflow run \ - -revision v1.9 \ + -revision master \ -with-tower \ -ansi-log false -resume \ -profile aws,test \ From da6cc4bd12c40c3f4c1b01c475411d085a0f42f4 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 15:53:29 -0500 Subject: [PATCH 087/266] update tag --- .github/workflows/run_tests.yml | 82 ++++++++++++++++----------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index e1f7d6b245..69eb675464 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ 
-365,45 +365,45 @@ jobs: snakemake -j $(nproc) docker_pull cd .. -# - name: Build Docker images -# if: | -# !( -# startsWith(github.ref, 'refs/heads/main') && -# github.repository == 'openproblems-bio/openproblems' -# ) -# run: | -# cd workflow -# SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker -# cd .. - -# - name: Upload Docker images -# env: -# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} -# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} -# AWS_DEFAULT_REGION: us-west-2 -# run: | -# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" -# aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ -# docker login --username AWS --password-stdin $ECR_ENDPOINT -# for image in $(cd docker && ls -1d */ | tr -d '/'); do -# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} -# docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} -# done -# -# - name: Upload Docker images for full benchmark -# env: -# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} -# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} -# AWS_DEFAULT_REGION: us-west-2 -# if: >- -# startsWith(github.ref, 'refs/tags') || -# startsWith(github.ref, 'refs/heads/test_benchmark') -# run: | -# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" -# for image in $(cd docker && ls -1d */ | tr -d '/'); do -# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} -# docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} -# done + - name: Build Docker images + if: | + !( + startsWith(github.ref, 'refs/heads/main') && + github.repository == 'openproblems-bio/openproblems' + ) + run: | + cd workflow + SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker + cd .. 
+ + - name: Upload Docker images + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + run: | + ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" + aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ + docker login --username AWS --password-stdin $ECR_ENDPOINT + for image in $(cd docker && ls -1d */ | tr -d '/'); do + docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} + docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} + done + + - name: Upload Docker images for full benchmark + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + if: >- + startsWith(github.ref, 'refs/tags') || + startsWith(github.ref, 'refs/heads/test_benchmark') + run: | + ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" + for image in $(cd docker && ls -1d */ | tr -d '/'); do + docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} + docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} + done - name: Run test benchmark env: @@ -417,7 +417,7 @@ jobs: RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" cd /mnt/openproblems-nextflow/cwd/${BRANCH} nextflow run \ - -revision master \ + -revision v1.9 \ -with-tower \ -ansi-log false -resume \ -profile aws,test \ @@ -473,7 +473,7 @@ jobs: env: TOWER_WATCH_URL: https://tower.nf/orgs/openproblems-bio/workspaces/openproblems-bio/watch TOWER_WORKSPACE_ID: 53907369739130 - TOWER_ACTION_ID: 7jylKuFGbSN65qSA4NfdFY + TOWER_ACTION_ID: bVQhVSNah1JmJfnKkfyjg run: | generate_parameters() { From c7b488923f338a5b20ac814786526ef1f5175b46 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 15:54:08 -0500 Subject: [PATCH 088/266] ubuntu-latest --- .github/workflows/pre-commit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 0929ca3594..47c14ff13e 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -8,7 +8,7 @@ on: jobs: pre-commit: - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest container: image: singlecellopenproblems/openproblems-github-actions:latest From bbf9b58fad4fe4037b4425c106c55d734fd6112b Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 15:59:10 -0500 Subject: [PATCH 089/266] gitlab -> github --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 85cc5bd5b2..57a3fce70c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: rev: v1.5.4 hooks: - id: autopep8 - - repo: https://gitlab.com/pycqa/flake8 + - repo: https://github.com/pycqa/flake8 rev: 3.8.4 hooks: - id: flake8 From 4d4ebfd99094422ddc77a78f48845c762f605549 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 16:14:08 -0500 Subject: [PATCH 090/266] nothing commit --- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index b0c7316e85..b80460ab9b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,9 +2,9 @@ ignore = # top-level module docstring D100, D104, - # space before : conflicts with black + # space before: conflicts with black E203, - # import not in alphabetical 
: conflicts with isort + # import not in alphabetical: conflicts with isort H306 per-file-ignores = # imported but unused From c2470ce02e6f196267cec1c554ba7ae389c0956a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 17:19:16 -0500 Subject: [PATCH 091/266] only import big packages at runtime --- openproblems/data/allen_brain_atlas.py | 3 ++- openproblems/data/cengen.py | 3 ++- openproblems/data/immune_cells.py | 3 ++- openproblems/data/mouse_blood_olssen_labelled.py | 3 ++- openproblems/data/mouse_hspc_nestorowa2016.py | 3 ++- openproblems/data/multimodal/utils.py | 3 ++- openproblems/data/pancreas.py | 3 ++- openproblems/data/tabula_muris_senis.py | 3 ++- openproblems/data/tenx.py | 5 ++++- openproblems/data/tnbc_wu2021.py | 3 ++- openproblems/data/utils.py | 5 ++++- .../_batch_integration/batch_integration_embed/api.py | 3 ++- .../batch_integration_feature/api.py | 3 ++- .../_batch_integration/batch_integration_graph/api.py | 3 ++- .../batch_integration_graph/datasets/immune.py | 4 ++-- .../batch_integration_graph/datasets/pancreas.py | 4 ++-- .../tasks/_cell_cell_communication/_common/api.py | 3 ++- .../_cell_cell_communication/_common/metrics/auprc.py | 5 +++-- openproblems/tasks/denoising/datasets/utils.py | 2 +- openproblems/tasks/denoising/methods/dca.py | 9 +++++---- openproblems/tasks/denoising/metrics/mse.py | 7 +++---- openproblems/tasks/denoising/metrics/poisson.py | 4 ++-- openproblems/tasks/dimensionality_reduction/api.py | 3 ++- .../tasks/dimensionality_reduction/methods/densmap.py | 3 ++- .../tasks/dimensionality_reduction/methods/pca.py | 4 ++-- .../tasks/dimensionality_reduction/methods/tsne.py | 4 ++-- .../tasks/dimensionality_reduction/methods/umap.py | 4 ++-- .../tasks/dimensionality_reduction/metrics/density.py | 4 ++-- .../dimensionality_reduction/metrics/nn_ranking.py | 6 ++++-- .../metrics/root_mean_square_error.py | 10 ++++++---- .../metrics/trustworthiness.py | 3 ++- .../tasks/label_projection/methods/knn_classifier.py | 5 ++++- .../label_projection/methods/logistic_regression.py | 3 ++- openproblems/tasks/label_projection/methods/mlp.py | 3 ++- openproblems/tasks/label_projection/methods/sklearn.py | 5 +++-- openproblems/tasks/label_projection/methods/utils.py | 6 ++---- .../tasks/label_projection/metrics/accuracy.py | 3 ++- openproblems/tasks/label_projection/metrics/f1.py | 6 +++--- .../multimodal_data_integration/methods/baseline.py | 5 ++++- .../methods/harmonic_alignment.py | 2 +- .../multimodal_data_integration/methods/procrustes.py | 6 +++--- .../multimodal_data_integration/metrics/knn_auc.py | 5 +++-- .../tasks/regulatory_effect_prediction/methods/beta.py | 8 +++++++- .../metrics/correlation.py | 5 +++-- .../spatial_decomposition/datasets/destvi/utils.py | 9 +++++---- .../tasks/spatial_decomposition/datasets/pancreas.py | 3 ++- openproblems/tools/conversion.py | 1 + openproblems/tools/decorators.py | 2 +- openproblems/tools/normalize.py | 8 +++++++- 49 files changed, 131 insertions(+), 79 deletions(-) diff --git a/openproblems/data/allen_brain_atlas.py b/openproblems/data/allen_brain_atlas.py index 66412172ee..fecbca85d1 100644 --- a/openproblems/data/allen_brain_atlas.py +++ b/openproblems/data/allen_brain_atlas.py @@ -2,7 +2,6 @@ import numpy as np import os -import scanpy as sc import scprep import tempfile @@ -17,6 +16,8 @@ def load_mouse_brain_atlas(test=False): to the dataset is available at: https://figshare.com/articles/dataset/allen_brain_h5ad/20338089 """ + import scanpy as sc + if test: # load full data first, cached if 
available adata = load_mouse_brain_atlas(test=False) diff --git a/openproblems/data/cengen.py b/openproblems/data/cengen.py index e47f312fa4..859755f558 100644 --- a/openproblems/data/cengen.py +++ b/openproblems/data/cengen.py @@ -1,7 +1,6 @@ from . import utils import os -import scanpy as sc import scprep import tempfile @@ -22,6 +21,8 @@ def load_cengen(test=False): To learn about WormBase curation efforts for C. elegans single cell data visit https://wormbase.github.io/single-cell/ """ + import scanpy as sc + with tempfile.TemporaryDirectory() as tempdir: filepath = os.path.join(tempdir, "cengen.h5ad") scprep.io.download.download_url(URL, filepath) diff --git a/openproblems/data/immune_cells.py b/openproblems/data/immune_cells.py index abed1d0a28..ce740d14f0 100644 --- a/openproblems/data/immune_cells.py +++ b/openproblems/data/immune_cells.py @@ -1,7 +1,6 @@ from . import utils import os -import scanpy as sc import scprep import tempfile @@ -12,6 +11,8 @@ @utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/s41592-021-01336-8") def load_immune(test=False): """Download immune human data from figshare.""" + import scanpy as sc + if test: # load full data first, cached if available adata = load_immune(test=False) diff --git a/openproblems/data/mouse_blood_olssen_labelled.py b/openproblems/data/mouse_blood_olssen_labelled.py index d04ecb353f..9deb09a4e1 100644 --- a/openproblems/data/mouse_blood_olssen_labelled.py +++ b/openproblems/data/mouse_blood_olssen_labelled.py @@ -1,7 +1,6 @@ from . import utils import os -import scanpy as sc import scprep import tempfile @@ -13,6 +12,8 @@ @utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/nature19348") def load_olsson_2016_mouse_blood(test=False): """Download Olsson, 2016_mouse_blood, Nature, 2016 data from Figshare.""" + import scanpy as sc + if test: # load full data first, cached if available adata = load_olsson_2016_mouse_blood(test=False) diff --git a/openproblems/data/mouse_hspc_nestorowa2016.py b/openproblems/data/mouse_hspc_nestorowa2016.py index 81218ef9f0..d409c2ce70 100644 --- a/openproblems/data/mouse_hspc_nestorowa2016.py +++ b/openproblems/data/mouse_hspc_nestorowa2016.py @@ -1,7 +1,6 @@ from . 
import utils import os -import scanpy as sc import scprep import tempfile @@ -15,6 +14,8 @@ ) def load_mouse_hspc_nestorowa2016(test=False): """Download Nesterova data from Figshare.""" + import scanpy as sc + if test: # load full data first, cached if available adata = load_mouse_hspc_nestorowa2016(test=False) diff --git a/openproblems/data/multimodal/utils.py b/openproblems/data/multimodal/utils.py index 6757ba0675..d4b4245d8a 100644 --- a/openproblems/data/multimodal/utils.py +++ b/openproblems/data/multimodal/utils.py @@ -1,7 +1,6 @@ import anndata import numpy as np import pandas as pd -import scanpy as sc import scprep @@ -17,6 +16,8 @@ def subset_mode2_genes(adata, keep_genes): def filter_joint_data_empty_cells(adata): """Remove empty cells and genes from a multimodal dataset.""" + import scanpy as sc + assert np.all(adata.uns["mode2_obs"] == adata.obs.index) # filter cells n_cells_mode1 = scprep.utils.toarray(adata.X.sum(axis=1)).flatten() diff --git a/openproblems/data/pancreas.py b/openproblems/data/pancreas.py index a67943a1fc..ff0c0af843 100644 --- a/openproblems/data/pancreas.py +++ b/openproblems/data/pancreas.py @@ -2,7 +2,6 @@ import numpy as np import os -import scanpy as sc import scprep import tempfile @@ -13,6 +12,8 @@ @utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/s41592-021-01336-8") def load_pancreas(test=False, keep_techs=None): """Download pancreas data from figshare.""" + import scanpy as sc + if test: # load full data first, cached if available adata = load_pancreas( diff --git a/openproblems/data/tabula_muris_senis.py b/openproblems/data/tabula_muris_senis.py index b687e3c401..c634e85d43 100644 --- a/openproblems/data/tabula_muris_senis.py +++ b/openproblems/data/tabula_muris_senis.py @@ -3,7 +3,6 @@ import anndata as ad import os import requests -import scanpy as sc import scprep import tempfile import time @@ -59,6 +58,8 @@ def matching_dataset(dataset, method_list, organ_list): def load_raw_counts(dataset): + import scanpy as sc + dataset_id = dataset["id"] assets_path = ( f"/curation/v1/collections/{COLLECTION_ID}/datasets/{dataset_id}/assets" diff --git a/openproblems/data/tenx.py b/openproblems/data/tenx.py index 73cccee21c..59f9cb93da 100644 --- a/openproblems/data/tenx.py +++ b/openproblems/data/tenx.py @@ -1,7 +1,6 @@ from . 
import utils import os -import scanpy as sc import scprep import tempfile @@ -17,6 +16,8 @@ @utils.loader(data_url=PBMC_1K_URL, data_reference=REFERENCE_URL) def load_tenx_1k_pbmc(test=False): """Download PBMC data from Figshare.""" + import scanpy as sc + if test: adata = load_tenx_1k_pbmc(test=False) sc.pp.subsample(adata, n_obs=100) @@ -34,6 +35,8 @@ def load_tenx_1k_pbmc(test=False): @utils.loader(data_url=PBMC_5K_URL, data_reference=REFERENCE_URL) def load_tenx_5k_pbmc(test=False): """Download 5k PBMCs from 10x Genomics.""" + import scanpy as sc + if test: # load full data first, cached if available adata = load_tenx_5k_pbmc(test=False) diff --git a/openproblems/data/tnbc_wu2021.py b/openproblems/data/tnbc_wu2021.py index 58dcc53d8b..c0a3e6a941 100644 --- a/openproblems/data/tnbc_wu2021.py +++ b/openproblems/data/tnbc_wu2021.py @@ -2,7 +2,6 @@ import numpy as np import os -import scanpy as sc import scipy.sparse import scprep import tempfile @@ -19,6 +18,8 @@ def load_tnbc_data(test=False): https://figshare.com/articles/dataset/TNBC_Data_from_Wu_et_al_2021/20338536 """ + import scanpy as sc + if test: # load full data first, cached if available adata = load_tnbc_data(test=False) diff --git a/openproblems/data/utils.py b/openproblems/data/utils.py index d4e8312f52..9bc05a6e8b 100644 --- a/openproblems/data/utils.py +++ b/openproblems/data/utils.py @@ -5,7 +5,6 @@ import hashlib import logging import os -import scanpy as sc import scipy.sparse log = logging.getLogger("openproblems") @@ -94,6 +93,8 @@ def apply_func(*args, **kwargs): def filter_genes_cells(adata): """Remove empty cells and genes.""" + import scanpy as sc + if "var_names_all" not in adata.uns: # fill in original var names before filtering adata.uns["var_names_all"] = adata.var.index.to_numpy() @@ -117,6 +118,8 @@ def subsample_even(adata, n_obs, even_obs): adata : AnnData Subsampled AnnData object """ + import scanpy as sc + values = adata.obs[even_obs].unique() adatas = [] n_obs_per_value = n_obs // len(values) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py index c867a03157..52f26ee0f0 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py @@ -2,7 +2,6 @@ from ....tools.decorators import dataset import numpy as np -import scanpy as sc def check_dataset(adata): @@ -25,6 +24,8 @@ def check_method(adata, is_baseline=False): @dataset() def sample_dataset(): """Create a simple dataset to use for testing methods in this task.""" + import scanpy as sc + adata = load_sample_data() adata.var.index = adata.var.gene_short_name.astype(str) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py index adf42b38c2..088cd38ea9 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py @@ -2,7 +2,6 @@ from ....tools.decorators import dataset import numpy as np -import scanpy as sc def check_dataset(adata): @@ -29,6 +28,8 @@ def check_method(adata, is_baseline=False): @dataset() def sample_dataset(): """Create a simple dataset to use for testing methods in this task.""" + import scanpy as sc + adata = load_sample_data() adata.var.index = adata.var.gene_short_name.astype(str) diff --git 
a/openproblems/tasks/_batch_integration/batch_integration_graph/api.py b/openproblems/tasks/_batch_integration/batch_integration_graph/api.py index 123860d6f2..5cc9831f99 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/api.py @@ -2,7 +2,6 @@ from ....tools.decorators import dataset import numpy as np -import scanpy as sc def check_dataset(adata): @@ -34,6 +33,8 @@ def check_method(adata, is_baseline=False): @dataset() def sample_dataset(): """Create a simple dataset to use for testing methods in this task.""" + import scanpy as sc + adata = load_sample_data() adata.obsm["X_uni_pca"] = sc.pp.pca(adata.X) adata.layers["log_normalized"] = adata.X diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py index 8c39c0fe28..bee6e7699f 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py @@ -1,8 +1,6 @@ from .....data.immune_cells import load_immune from .....tools.decorators import dataset -import scanpy as sc - @dataset( dataset_name="Immune (by batch)", @@ -14,6 +12,8 @@ image="openproblems", ) def immune_batch(test=False): + import scanpy as sc + adata = load_immune(test) adata.obs["labels"] = adata.obs["final_annotation"] diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py index 6d8af4f505..23dcdd016b 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py @@ -1,8 +1,6 @@ from .....data.pancreas import load_pancreas from .....tools.decorators import dataset -import scanpy as sc - @dataset( dataset_name="Pancreas (by batch)", @@ -14,6 +12,8 @@ image="openproblems", ) def pancreas_batch(test=False): + import scanpy as sc + adata = load_pancreas(test) adata.obs["labels"] = adata.obs["celltype"] adata.obs["batch"] = adata.obs["tech"] diff --git a/openproblems/tasks/_cell_cell_communication/_common/api.py b/openproblems/tasks/_cell_cell_communication/_common/api.py index 8ebdb66c3d..2010de8c2b 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/api.py +++ b/openproblems/tasks/_cell_cell_communication/_common/api.py @@ -3,7 +3,6 @@ import numbers import numpy as np import pandas as pd -import scanpy as sc SAMPLE_RECEPTOR_NAMES = [ "LGALS9", @@ -197,6 +196,8 @@ def check_method(adata, merge_keys, is_baseline=False): def sample_dataset(merge_keys): """Create a simple dataset to use for testing methods in this task.""" + import scanpy as sc + adata = load_sample_data() adata.uns["merge_keys"] = merge_keys diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py index 430541190b..9365998a30 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py +++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py @@ -1,11 +1,12 @@ from .....tools.decorators import metric from ..utils import join_truth_and_pred -from sklearn.metrics import auc -from sklearn.metrics import precision_recall_curve @metric(metric_name="Precision-recall AUC", maximize=True) def 
auprc(adata): + from sklearn.metrics import auc + from sklearn.metrics import precision_recall_curve + gt = join_truth_and_pred(adata) precision, recall, _ = precision_recall_curve( gt["response"], gt["score"], pos_label=1 diff --git a/openproblems/tasks/denoising/datasets/utils.py b/openproblems/tasks/denoising/datasets/utils.py index ec31002e72..3a91e9cebb 100644 --- a/openproblems/tasks/denoising/datasets/utils.py +++ b/openproblems/tasks/denoising/datasets/utils.py @@ -1,6 +1,5 @@ import anndata import numpy as np -import scipy.sparse def split_data( @@ -11,6 +10,7 @@ def split_data( Stores "train" and "test" dataset using the AnnData.obsm property. """ import molecular_cross_validation.util + import scipy.sparse random_state = np.random.RandomState(seed) diff --git a/openproblems/tasks/denoising/methods/dca.py b/openproblems/tasks/denoising/methods/dca.py index eee6c78354..fddfb85d51 100644 --- a/openproblems/tasks/denoising/methods/dca.py +++ b/openproblems/tasks/denoising/methods/dca.py @@ -1,18 +1,19 @@ from ....tools.decorators import method from ....tools.utils import check_version -import scanpy as sc - def _dca(adata, test=False, epochs=None): + from dca.api import dca + + import anndata + if test: epochs = epochs or 30 else: # pragma: nocover epochs = epochs or 300 - from dca.api import dca # make adata object with train counts - adata_train = sc.AnnData(adata.obsm["train"]) + adata_train = anndata.AnnData(adata.obsm["train"]) # run DCA dca(adata_train, epochs=epochs) diff --git a/openproblems/tasks/denoising/metrics/mse.py b/openproblems/tasks/denoising/metrics/mse.py index 63c8f17a16..4a663ed6e6 100644 --- a/openproblems/tasks/denoising/metrics/mse.py +++ b/openproblems/tasks/denoising/metrics/mse.py @@ -1,13 +1,12 @@ from ....tools.decorators import metric -import anndata -import scanpy as sc -import sklearn.metrics - @metric(metric_name="Mean-squared error", maximize=False) def mse(adata): + import anndata + import scanpy as sc import scprep + import sklearn.metrics test_data = anndata.AnnData(X=adata.obsm["test"], obs=adata.obs, var=adata.var) denoised_data = anndata.AnnData( diff --git a/openproblems/tasks/denoising/metrics/poisson.py b/openproblems/tasks/denoising/metrics/poisson.py index e4f0f6a749..ebd2a73378 100644 --- a/openproblems/tasks/denoising/metrics/poisson.py +++ b/openproblems/tasks/denoising/metrics/poisson.py @@ -1,12 +1,12 @@ from ....tools.decorators import metric -import scprep - @metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-extras") def poisson(adata): from molecular_cross_validation.mcv_sweep import poisson_nll_loss + import scprep + test_data = adata.obsm["test"] denoised_data = adata.obsm["denoised"] diff --git a/openproblems/tasks/dimensionality_reduction/api.py b/openproblems/tasks/dimensionality_reduction/api.py index c58f025a1c..d217c2df58 100644 --- a/openproblems/tasks/dimensionality_reduction/api.py +++ b/openproblems/tasks/dimensionality_reduction/api.py @@ -2,7 +2,6 @@ from ...tools.decorators import dataset import numpy as np -import scanpy as sc def check_dataset(adata): @@ -27,6 +26,8 @@ def sample_dataset(): def sample_method(adata): """Create sample method output for testing metrics in this task.""" + import scanpy as sc + sc.tl.pca(adata) adata.obsm["X_emb"] = adata.obsm["X_pca"][:, :2] return adata diff --git a/openproblems/tasks/dimensionality_reduction/methods/densmap.py b/openproblems/tasks/dimensionality_reduction/methods/densmap.py index 900e9f78f2..b7a0285774 100644 --- 
a/openproblems/tasks/dimensionality_reduction/methods/densmap.py +++ b/openproblems/tasks/dimensionality_reduction/methods/densmap.py @@ -3,7 +3,6 @@ from ....tools.utils import check_version import functools -import scanpy as sc _densmap_method = functools.partial( method, @@ -36,6 +35,8 @@ def densmap_logCPM_1kHVG(adata, test: bool = False): @_densmap_method(method_name="densMAP PCA (logCPM, 1kHVG)") def densmap_pca_logCPM_1kHVG(adata, test: bool = False): + import scanpy as sc + adata = log_cpm_hvg(adata) sc.tl.pca(adata, n_comps=50, svd_solver="arpack") return _densmap(adata, obsm="X_pca") diff --git a/openproblems/tasks/dimensionality_reduction/methods/pca.py b/openproblems/tasks/dimensionality_reduction/methods/pca.py index e81772d1f0..1c7c186471 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/pca.py +++ b/openproblems/tasks/dimensionality_reduction/methods/pca.py @@ -2,8 +2,6 @@ from ....tools.normalize import log_cpm_hvg from ....tools.utils import check_version -import scanpy as sc - @method( method_name="Principle Component Analysis (PCA) (logCPM, 1kHVG)", @@ -14,6 +12,8 @@ "sklearn.decomposition.PCA.html", ) def pca_logCPM_1kHVG(adata, test: bool = False): + import scanpy as sc + adata = log_cpm_hvg(adata) sc.tl.pca(adata, n_comps=50, svd_solver="arpack") adata.obsm["X_emb"] = adata.obsm["X_pca"][:, :2] diff --git a/openproblems/tasks/dimensionality_reduction/methods/tsne.py b/openproblems/tasks/dimensionality_reduction/methods/tsne.py index 6b987c6457..e19fa6cd3e 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/tsne.py +++ b/openproblems/tasks/dimensionality_reduction/methods/tsne.py @@ -2,8 +2,6 @@ from ....tools.normalize import log_cpm_hvg from ....tools.utils import check_version -import scanpy as sc - @method( method_name="“t-Distributed Stochastic Neighbor Embedding (t-SNE) (logCPM, 1kHVG)", @@ -15,6 +13,8 @@ image="openproblems-python-extras", ) def tsne_logCPM_1kHVG(adata, test: bool = False, n_pca=50): + import scanpy as sc + adata = log_cpm_hvg(adata) sc.tl.pca(adata, n_comps=n_pca, svd_solver="arpack") sc.tl.tsne(adata, use_rep="X_pca", n_pcs=n_pca) diff --git a/openproblems/tasks/dimensionality_reduction/methods/umap.py b/openproblems/tasks/dimensionality_reduction/methods/umap.py index a64533f0b8..b9e73adf90 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/umap.py +++ b/openproblems/tasks/dimensionality_reduction/methods/umap.py @@ -2,8 +2,6 @@ from ....tools.normalize import log_cpm_hvg from ....tools.utils import check_version -import scanpy as sc - @method( method_name="Uniform Manifold Approximation and Projection (UMAP), " @@ -15,6 +13,8 @@ code_url="https://github.com/lmcinnes/umap", ) def umap_logCPM_1kHVG(adata, test: bool = False, n_pca=50): + import scanpy as sc + adata = log_cpm_hvg(adata) sc.tl.pca(adata, n_comps=50, svd_solver="arpack") sc.pp.neighbors(adata, use_rep="X_pca", n_pcs=n_pca) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/density.py b/openproblems/tasks/dimensionality_reduction/metrics/density.py index a44a248aeb..680fd00ec0 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/density.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/density.py @@ -1,7 +1,5 @@ from anndata import AnnData from openproblems.tools.decorators import metric -from scipy.sparse import issparse -from scipy.stats import pearsonr from typing import Optional import numpy as np @@ -100,6 +98,8 @@ def _calculate_radii( @metric("density preservation", maximize=True, 
image="openproblems-python-extras") def density_preservation(adata: AnnData) -> float: + from scipy.sparse import issparse + from scipy.stats import pearsonr from umap import UMAP emb = adata.obsm["X_emb"] diff --git a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py index 946a4d5095..690b95eaeb 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py @@ -18,8 +18,6 @@ from ....tools.normalize import log_cpm_hvg from anndata import AnnData from numba import njit -from scipy.sparse import issparse -from sklearn.metrics import pairwise_distances from typing import Tuple import numpy as np @@ -155,6 +153,8 @@ def _metrics( def _high_dim(adata: AnnData) -> np.ndarray: + from scipy.sparse import issparse + adata.X = adata.layers["counts"] adata = log_cpm_hvg(adata) high_dim = adata.X @@ -164,6 +164,8 @@ def _high_dim(adata: AnnData) -> np.ndarray: def _fit( X: np.ndarray, E: np.ndarray ) -> Tuple[float, float, float, float, float, float, float]: + from sklearn.metrics import pairwise_distances + if np.any(np.isnan(E)): return 0.0, 0.0, 0.0, 0.5, -np.inf, -np.inf, -np.inf diff --git a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py b/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py index 9e12d5d798..52136ee29b 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py @@ -1,9 +1,6 @@ from ....tools.decorators import metric import numpy as np -import scipy as sp -import sklearn.decomposition -import sklearn.metrics def calculate_squareform_pairwise_distance(data): @@ -12,11 +9,16 @@ def calculate_squareform_pairwise_distance(data): Compute pairwise distance between points in a matrix / vector and then format this into a squareform vector. 
""" - return sp.spatial.distance.squareform(sp.spatial.distance.pdist(data)) + import scipy.spatial + + return scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(data)) def calculate_rmse(adata, n_svd=200): """Calculate dimensional reduction stress via root mean square error.""" + import sklearn.decomposition + import sklearn.metrics + X = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) high_dimensional_distance_matrix = calculate_squareform_pairwise_distance(X) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py b/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py index 3daf3360cd..d308ef4486 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py @@ -1,12 +1,13 @@ from ....tools.decorators import metric from anndata import AnnData -from sklearn import manifold import numpy as np @metric(metric_name="trustworthiness", maximize=True) def trustworthiness(adata: AnnData) -> float: + from sklearn import manifold + high_dim, low_dim = adata.X, adata.obsm["X_emb"] score = manifold.trustworthiness( diff --git a/openproblems/tasks/label_projection/methods/knn_classifier.py b/openproblems/tasks/label_projection/methods/knn_classifier.py index ffe576bc72..62109c89b5 100644 --- a/openproblems/tasks/label_projection/methods/knn_classifier.py +++ b/openproblems/tasks/label_projection/methods/knn_classifier.py @@ -4,7 +4,6 @@ from .sklearn import classifier import functools -import sklearn.neighbors _knn_classifier_method = functools.partial( method, @@ -20,6 +19,8 @@ method_name="K-neighbors classifier (log CPM)", ) def knn_classifier_log_cpm(adata, test=False): + import sklearn.neighbors + adata = log_cpm(adata) return classifier(adata, estimator=sklearn.neighbors.KNeighborsClassifier) @@ -29,5 +30,7 @@ def knn_classifier_log_cpm(adata, test=False): image="openproblems-r-base", ) def knn_classifier_scran(adata, test=False): + import sklearn.neighbors + adata = log_scran_pooling(adata) return classifier(adata, estimator=sklearn.neighbors.KNeighborsClassifier) diff --git a/openproblems/tasks/label_projection/methods/logistic_regression.py b/openproblems/tasks/label_projection/methods/logistic_regression.py index 25a2c40a5a..8c393b5387 100644 --- a/openproblems/tasks/label_projection/methods/logistic_regression.py +++ b/openproblems/tasks/label_projection/methods/logistic_regression.py @@ -4,7 +4,6 @@ from .sklearn import classifier import functools -import sklearn.linear_model _logistic_regression_method = functools.partial( method, @@ -17,6 +16,8 @@ def _logistic_regression(adata, test=False, max_iter=None): + import sklearn.linear_model + if test: max_iter = max_iter or 100 else: # pragma: no cover diff --git a/openproblems/tasks/label_projection/methods/mlp.py b/openproblems/tasks/label_projection/methods/mlp.py index 87e626bab6..71d1dcea9b 100644 --- a/openproblems/tasks/label_projection/methods/mlp.py +++ b/openproblems/tasks/label_projection/methods/mlp.py @@ -4,7 +4,6 @@ from .sklearn import classifier import functools -import sklearn.neural_network _mlp_method = functools.partial( method, @@ -17,6 +16,8 @@ def _mlp(adata, test=False, max_iter=None, hidden_layer_sizes=None): + import sklearn.neural_network + if test: hidden_layer_sizes = hidden_layer_sizes or (20,) max_iter = max_iter or 100 diff --git a/openproblems/tasks/label_projection/methods/sklearn.py 
b/openproblems/tasks/label_projection/methods/sklearn.py index 977446167a..18ec00ca65 100644 --- a/openproblems/tasks/label_projection/methods/sklearn.py +++ b/openproblems/tasks/label_projection/methods/sklearn.py @@ -2,12 +2,13 @@ from .utils import pca_op import numpy as np -import sklearn.pipeline -import sklearn.preprocessing def classifier(adata, estimator, n_pca=100, **kwargs): """Run a generic scikit-learn classifier.""" + import sklearn.pipeline + import sklearn.preprocessing + adata_train = adata[adata.obs["is_train"]] adata_test = adata[~adata.obs["is_train"]].copy() diff --git a/openproblems/tasks/label_projection/methods/utils.py b/openproblems/tasks/label_projection/methods/utils.py index 056b586e5d..f1925684f3 100644 --- a/openproblems/tasks/label_projection/methods/utils.py +++ b/openproblems/tasks/label_projection/methods/utils.py @@ -1,8 +1,6 @@ -import scipy.sparse -import sklearn.decomposition - - def pca_op(adata_train, adata_test, n_components=100): + import scipy.sparse + import sklearn.decomposition is_sparse = scipy.sparse.issparse(adata_train.X) diff --git a/openproblems/tasks/label_projection/metrics/accuracy.py b/openproblems/tasks/label_projection/metrics/accuracy.py index 5e661fa097..d86bf8ec48 100644 --- a/openproblems/tasks/label_projection/metrics/accuracy.py +++ b/openproblems/tasks/label_projection/metrics/accuracy.py @@ -1,11 +1,12 @@ from ....tools.decorators import metric import numpy as np -import sklearn.preprocessing @metric(metric_name="Accuracy", maximize=True) def accuracy(adata): + import sklearn.preprocessing + encoder = sklearn.preprocessing.LabelEncoder().fit(adata.obs["labels"]) test_data = adata[~adata.obs["is_train"]] diff --git a/openproblems/tasks/label_projection/metrics/f1.py b/openproblems/tasks/label_projection/metrics/f1.py index 94ea5446e7..e2e870a6ee 100644 --- a/openproblems/tasks/label_projection/metrics/f1.py +++ b/openproblems/tasks/label_projection/metrics/f1.py @@ -1,10 +1,10 @@ from ....tools.decorators import metric -import sklearn.metrics -import sklearn.preprocessing - def _f1(adata, average="weighted"): + import sklearn.metrics + import sklearn.preprocessing + encoder = sklearn.preprocessing.LabelEncoder().fit(adata.obs["labels"]) test_data = adata[~adata.obs["is_train"]] diff --git a/openproblems/tasks/multimodal_data_integration/methods/baseline.py b/openproblems/tasks/multimodal_data_integration/methods/baseline.py index 8419e90efb..49e9a3c55e 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/baseline.py +++ b/openproblems/tasks/multimodal_data_integration/methods/baseline.py @@ -3,7 +3,6 @@ from ....tools.utils import check_version import numpy as np -import sklearn.decomposition @method( @@ -15,6 +14,8 @@ is_baseline=True, ) def random_features(adata, test=False, n_svd=20): + import sklearn.decomposition + n_svd = min([n_svd, min(adata.X.shape) - 1, min(adata.obsm["mode2"].shape) - 1]) adata = log_cpm(adata) X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) @@ -35,6 +36,8 @@ def random_features(adata, test=False, n_svd=20): is_baseline=True, ) def true_features(adata, test=False, n_svd=20): + import sklearn.decomposition + n_svd = min([n_svd, min(adata.X.shape) - 1, min(adata.obsm["mode2"].shape) - 1]) adata = log_cpm(adata) X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) diff --git a/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py b/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py index 
e17db7332e..c60b689ec3 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py +++ b/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py @@ -5,7 +5,6 @@ from ....tools.utils import check_version import functools -import sklearn.decomposition _harmonic_alignment_method = functools.partial( method, @@ -20,6 +19,7 @@ def _harmonic_alignment( adata, test=False, n_svd=None, n_eigenvectors=None, n_pca_XY=None, n_filters=None ): import harmonicalignment + import sklearn.decomposition if test: n_svd = n_svd or 20 diff --git a/openproblems/tasks/multimodal_data_integration/methods/procrustes.py b/openproblems/tasks/multimodal_data_integration/methods/procrustes.py index be3a1e8e0f..82ce06fa72 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/procrustes.py +++ b/openproblems/tasks/multimodal_data_integration/methods/procrustes.py @@ -2,9 +2,6 @@ from ....tools.normalize import log_cpm from ....tools.utils import check_version -import scipy.spatial -import sklearn.decomposition - @method( method_name="Procrustes", @@ -15,6 +12,9 @@ "scipy.spatial.procrustes.html", ) def procrustes(adata, test=False, n_svd=None): + import scipy.spatial + import sklearn.decomposition + if test: n_svd = n_svd or 20 else: # pragma: no cover diff --git a/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py b/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py index 88d3d12c0b..7a458ee386 100644 --- a/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py +++ b/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py @@ -1,12 +1,13 @@ from ....tools.decorators import metric import numpy as np -import sklearn.decomposition -import sklearn.neighbors @metric(metric_name="kNN Area Under the Curve", maximize=True) def knn_auc(adata, proportion_neighbors=0.1, n_svd=100): + import sklearn.decomposition + import sklearn.neighbors + n_svd = min([n_svd, min(adata.X.shape) - 1]) n_neighbors = int(np.ceil(proportion_neighbors * adata.X.shape[0])) X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py index 68aa9746ce..e5dffb6fa7 100644 --- a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py +++ b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd -import scanpy as sc import scipy.sparse @@ -70,6 +69,8 @@ def _get_annotation(adata, retries=3): def _filter_mitochondrial(adata): + import scanpy as sc + if adata.uns["species"] in ["mus_musculus", "homo_sapiens"]: adata.var["mt"] = adata.var.gene_short_name.str.lower().str.startswith( "mt-" @@ -92,6 +93,8 @@ def _filter_n_genes_max(adata): def _filter_n_genes_min(adata): + import scanpy as sc + adata_filter = adata.copy() sc.pp.filter_cells(adata_filter, min_genes=200) if adata_filter.shape[0] > 100: @@ -100,6 +103,8 @@ def _filter_n_genes_min(adata): def _filter_n_cells(adata): + import scanpy as sc + adata_filter = adata.copy() sc.pp.filter_genes(adata_filter, min_cells=5) if adata_filter.shape[1] > 100: @@ -117,6 +122,7 @@ def _filter_has_chr(adata): def _beta(adata, test=False, top_genes=None, threshold=1): """Calculate gene scores and insert into .obsm.""" import pybedtools + import scanpy as sc if test: top_genes = top_genes or 100 diff --git a/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py 
b/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py index 29366d8565..e586a15e8a 100644 --- a/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py +++ b/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py @@ -1,11 +1,12 @@ from ....tools.decorators import metric import numpy as np -import scipy.sparse -import scipy.stats def _correlation(adata, method="pearson"): + import scipy.sparse + import scipy.stats + if method == "pearson": method = scipy.stats.pearsonr else: diff --git a/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py index a5dc35b03f..b48ef15b12 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py +++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py @@ -9,15 +9,11 @@ from scipy.sparse import csr_matrix from scipy.spatial.distance import pdist from scipy.spatial.distance import squareform -from sklearn.cluster import AgglomerativeClustering -from sklearn.decomposition import PCA -from sklearn.neighbors import kneighbors_graph from typing import Optional import anndata import numpy as np import pandas as pd -import scanpy as sc def categorical(p, n_samples): @@ -66,6 +62,11 @@ def generate_synthetic_dataset( K_sampled: Optional[int] = None, # cells sampled for each spot seed: int = 0, ): + from sklearn.cluster import AgglomerativeClustering + from sklearn.decomposition import PCA + from sklearn.neighbors import kneighbors_graph + + import scanpy as sc import torch np.random.seed(seed) diff --git a/openproblems/tasks/spatial_decomposition/datasets/pancreas.py b/openproblems/tasks/spatial_decomposition/datasets/pancreas.py index 7263b55007..9c5c70bcbd 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/pancreas.py +++ b/openproblems/tasks/spatial_decomposition/datasets/pancreas.py @@ -6,7 +6,6 @@ from typing import Optional import functools -import scanpy as sc _pancreas_dataset = functools.partial( dataset, @@ -24,6 +23,8 @@ def _pancreas_synthetic( n_obs: int = 100, keep_techs: Optional[List[str]] = None, ): + import scanpy as sc + adata = load_pancreas(test=test, keep_techs=keep_techs or ["inDrop3"]) sc.pp.filter_genes(adata, min_counts=10) adata.obs["label"] = adata.obs["celltype"] diff --git a/openproblems/tools/conversion.py b/openproblems/tools/conversion.py index 7b32dc65f8..d80393accd 100644 --- a/openproblems/tools/conversion.py +++ b/openproblems/tools/conversion.py @@ -20,6 +20,7 @@ def r_function(filename, args="sce"): fun : scprep.run.RFunction Python callable evaluating the R code """ + assert filename.endswith(".R") # get the path to the module that called `r_function` diff --git a/openproblems/tools/decorators.py b/openproblems/tools/decorators.py index 7912799d19..9157403414 100644 --- a/openproblems/tools/decorators.py +++ b/openproblems/tools/decorators.py @@ -4,7 +4,6 @@ import anndata import functools import logging -import memory_profiler import time log = logging.getLogger("openproblems") @@ -180,6 +179,7 @@ def profile(func): result : dict Contains 'result', 'runtime_s', 'memory_mb', 'memory_leaked_mb' """ + import memory_profiler @functools.wraps(func) def decorated(*args, **kwargs): diff --git a/openproblems/tools/normalize.py b/openproblems/tools/normalize.py index e8fba61c23..e124cc0dab 100644 --- a/openproblems/tools/normalize.py +++ b/openproblems/tools/normalize.py @@ -2,7 +2,6 @@ import anndata as ad import logging -import scanpy as sc import scprep log = 
logging.getLogger("openproblems") @@ -31,6 +30,8 @@ @decorators.normalizer def log_scran_pooling(adata: ad.AnnData) -> ad.AnnData: """Normalize data with scran via rpy2.""" + import scanpy as sc + scprep.run.install_bioconductor("scran") adata.obs["size_factors"] = _scran(adata) adata.X = scprep.utils.matrix_vector_elementwise_multiply( @@ -41,6 +42,8 @@ def log_scran_pooling(adata: ad.AnnData) -> ad.AnnData: def _cpm(adata: ad.AnnData): + import scanpy as sc + adata.layers["counts"] = adata.X.copy() sc.pp.normalize_total(adata, target_sum=1e6, key_added="size_factors") @@ -55,6 +58,8 @@ def cpm(adata: ad.AnnData) -> ad.AnnData: @decorators.normalizer def log_cpm(adata: ad.AnnData) -> ad.AnnData: """Normalize data to log counts per million.""" + import scanpy as sc + _cpm(adata) sc.pp.log1p(adata) return adata @@ -75,6 +80,7 @@ def log_cpm_hvg(adata: ad.AnnData, n_genes: int = 1000) -> ad.AnnData: Normalize data to log counts per million and select n_genes highly variable genes """ + import scanpy as sc adata = log_cpm(adata) From a540edc22b2be6dc4994d19698b535e967d413f7 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 18:05:16 -0500 Subject: [PATCH 092/266] temp --- .github/workflows/run_tests.yml | 106 ++++++++++++++++---------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 69eb675464..7ca2080d13 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -237,8 +237,8 @@ jobs: with: fetch-depth: 1000 - - name: Clear space on runner - run: ./scripts/clear_runner_diskspace.sh +# - name: Clear space on runner +# run: ./scripts/clear_runner_diskspace.sh - name: Install system dependencies run: | @@ -355,55 +355,55 @@ jobs: python -c "import openproblems" openproblems-cli --version openproblems-cli --test-hash - - - name: Pull Docker images - if: | - startsWith(github.ref, 'refs/heads/main') && - github.repository == 'openproblems-bio/openproblems' - run: | - cd workflow - snakemake -j $(nproc) docker_pull - cd .. - - - name: Build Docker images - if: | - !( - startsWith(github.ref, 'refs/heads/main') && - github.repository == 'openproblems-bio/openproblems' - ) - run: | - cd workflow - SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker - cd .. 
- - - name: Upload Docker images - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 - run: | - ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" - aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ - docker login --username AWS --password-stdin $ECR_ENDPOINT - for image in $(cd docker && ls -1d */ | tr -d '/'); do - docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} - docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} - done - - - name: Upload Docker images for full benchmark - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 - if: >- - startsWith(github.ref, 'refs/tags') || - startsWith(github.ref, 'refs/heads/test_benchmark') - run: | - ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" - for image in $(cd docker && ls -1d */ | tr -d '/'); do - docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} - docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} - done +# +# - name: Pull Docker images +# if: | +# startsWith(github.ref, 'refs/heads/main') && +# github.repository == 'openproblems-bio/openproblems' +# run: | +# cd workflow +# snakemake -j $(nproc) docker_pull +# cd .. +# +# - name: Build Docker images +# if: | +# !( +# startsWith(github.ref, 'refs/heads/main') && +# github.repository == 'openproblems-bio/openproblems' +# ) +# run: | +# cd workflow +# SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker +# cd .. +# +# - name: Upload Docker images +# env: +# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} +# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} +# AWS_DEFAULT_REGION: us-west-2 +# run: | +# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" +# aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ +# docker login --username AWS --password-stdin $ECR_ENDPOINT +# for image in $(cd docker && ls -1d */ | tr -d '/'); do +# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} +# docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} +# done +# +# - name: Upload Docker images for full benchmark +# env: +# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} +# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} +# AWS_DEFAULT_REGION: us-west-2 +# if: >- +# startsWith(github.ref, 'refs/tags') || +# startsWith(github.ref, 'refs/heads/test_full_benchmark') +# run: | +# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" +# for image in $(cd docker && ls -1d */ | tr -d '/'); do +# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} +# docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} +# done - name: Run test benchmark env: @@ -460,11 +460,11 @@ jobs: github.event_name == 'push' && ( needs.run_tester.result == 'success' || - startsWith(github.ref, 'refs/heads/test_benchmark') + startsWith(github.ref, 'refs/heads/test_full_benchmark') ) && ( startsWith(github.ref, 'refs/tags') || - startsWith(github.ref, 'refs/heads/test_benchmark') + startsWith(github.ref, 'refs/heads/test_full_benchmark') ) steps: From 4d9edcb77d4a1b56f3653d031837018d5dfe7e7c Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 18:45:29 -0500 Subject: [PATCH 093/266] back to master --- 
.github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 7ca2080d13..c5293f779d 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -417,7 +417,7 @@ jobs: RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" cd /mnt/openproblems-nextflow/cwd/${BRANCH} nextflow run \ - -revision v1.9 \ + -revision master \ -with-tower \ -ansi-log false -resume \ -profile aws,test \ From d24a30f56629d305002f669fe1215f6cf21465a7 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 18:57:53 -0500 Subject: [PATCH 094/266] empty From 1d7bf243aaac2e34990727dca854f79e1858d61f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 15 Nov 2022 19:05:46 -0500 Subject: [PATCH 095/266] empty From 310474247cfe983bc85fb610866f3be0dba48b08 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 15 Nov 2022 19:19:09 -0500 Subject: [PATCH 096/266] add perfect celltype mixing baseline (#686) * add perfect celltype mixing baseline * fix typo * another typo * remove pip check for now * migrate flake8 from gitlab to github * fix dimensionality --- .pre-commit-config.yaml | 2 +- .../methods/__init__.py | 1 + .../methods/baseline.py | 15 +++++++++++ .../methods/__init__.py | 1 + .../methods/baseline.py | 27 +++++++++++++++++++ 5 files changed, 45 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 85cc5bd5b2..57a3fce70c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: rev: v1.5.4 hooks: - id: autopep8 - - repo: https://gitlab.com/pycqa/flake8 + - repo: https://github.com/pycqa/flake8 rev: 3.8.4 hooks: - id: flake8 diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py index 3fd0c826ff..abac573ae2 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py @@ -29,6 +29,7 @@ from ...batch_integration_graph.methods.scvi import scvi_full_unscaled from ...batch_integration_graph.methods.scvi import scvi_hvg_unscaled from .baseline import batch_random_integration +from .baseline import celltype_random_embedding from .baseline import celltype_random_integration from .baseline import no_integration from .baseline import random_integration diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py index a7a0aa3c21..783814fb65 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py @@ -1,5 +1,6 @@ from .....tools.decorators import method from .....tools.utils import check_version +from ...batch_integration_graph.methods.baseline import _random_embedding from ...batch_integration_graph.methods.baseline import _randomize_features @@ -47,6 +48,20 @@ def celltype_random_integration(adata, test=False): return adata +@method( + method_name="Random Embedding by Celltype", + paper_name="Random Embedding by Celltype (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + 
code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def celltype_random_embedding(adata, test=False): + adata.obsm["X_emb"] = _random_embedding(partition=adata.obs["labels"]) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + @method( method_name="Random Integration by Batch", paper_name="Random Integration by Batch (baseline)", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py index 89519752f9..f664b9139d 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py @@ -1,4 +1,5 @@ from .baseline import batch_random_integration +from .baseline import celltype_random_graph from .baseline import celltype_random_integration from .baseline import no_integration from .baseline import random_integration diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py index b54f1cb018..015fecd0e8 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py @@ -2,6 +2,7 @@ from .....tools.utils import check_version import numpy as np +import scanpy as sc def _set_uns(adata): @@ -50,6 +51,17 @@ def _randomize_graph(adata, partition=None): return adata +def _random_embedding(partition): + from sklearn.preprocessing import LabelEncoder + from sklearn.preprocessing import OneHotEncoder + + embedding = OneHotEncoder().fit_transform( + LabelEncoder().fit_transform(partition)[:, None] + ) + embedding = embedding + np.random.uniform(-0.1, 0.1, embedding.shape) + return embedding + + @method( method_name="Random Integration", paper_name="Random Integration (baseline)", @@ -96,3 +108,18 @@ def batch_random_integration(adata, test=False): ) adata.uns["method_code_version"] = check_version("openproblems") return adata + + +@method( + method_name="Random Graph by Celltype", + paper_name="Random Graph by Celltype (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def celltype_random_graph(adata, test=False): + adata.obsm["X_emb"] = _random_embedding(partition=adata.obs["labels"]) + sc.pp.neighbors(adata, use_rep="X_emb") + adata.uns["method_code_version"] = check_version("openproblems") + return adata From 48e777cf8b4f6394cea333c7769c314fc80aeda1 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 10:25:18 -0500 Subject: [PATCH 097/266] upgrade nextflow --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index c5293f779d..60c34db9c2 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -312,7 +312,7 @@ jobs: - name: Install Nextflow env: CAPSULE_LOG: none - NXF_VER: 22.04.0 + NXF_VER: 22.10.2 run: | mkdir /tmp/nextflow cd /tmp/nextflow From 324661f270098f5ddc5ca7144ee37d4c2549888d Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 10:36:22 -0500 Subject: [PATCH 098/266] overwrite --- .github/workflows/run_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 60c34db9c2..50a2aab5a1 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -426,6 +426,7 @@ jobs: -name "${RUN_NAME}" \ -e.PYTHONPATH="${PYTHONPATH}" \ openproblems-bio/nf-openproblems \ + --overwrite true \ --branch ${BRANCH} | \ tee >(grep --color=never --line-buffered "Monitor the execution with Nextflow Tower using this url" >> $GITHUB_STEP_SUMMARY) shell: /bin/bash -eou pipefail {0} From a54014b04064bc640a2db45c1b591b5e329b4a01 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 10:45:29 -0500 Subject: [PATCH 099/266] just remove it --- .github/workflows/run_tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 50a2aab5a1..45279012de 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -416,6 +416,9 @@ jobs: run: | RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" cd /mnt/openproblems-nextflow/cwd/${BRANCH} + if [ -f results/pipeline_info/execution_trace.txt ]; then + rm results/pipeline_info/execution_trace.txt + fi nextflow run \ -revision master \ -with-tower \ @@ -426,7 +429,6 @@ jobs: -name "${RUN_NAME}" \ -e.PYTHONPATH="${PYTHONPATH}" \ openproblems-bio/nf-openproblems \ - --overwrite true \ --branch ${BRANCH} | \ tee >(grep --color=never --line-buffered "Monitor the execution with Nextflow Tower using this url" >> $GITHUB_STEP_SUMMARY) shell: /bin/bash -eou pipefail {0} From f628d6b7eb145464853516406ecb4ef59c3bbffe Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 10:59:08 -0500 Subject: [PATCH 100/266] v1.9 --- .github/workflows/run_tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 45279012de..4f6527386e 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -412,7 +412,6 @@ jobs: TOWER_ACCESS_TOKEN: ${{ secrets.TOWER_ACCESS_KEY }} TOWER_WORKSPACE_ID: 53907369739130 AWS_DEFAULT_REGION: us-west-2 - NXF_DEFAULT_DSL: 1 run: | RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" cd /mnt/openproblems-nextflow/cwd/${BRANCH} @@ -420,7 +419,7 @@ jobs: rm results/pipeline_info/execution_trace.txt fi nextflow run \ - -revision master \ + -revision v1.9 \ -with-tower \ -ansi-log false -resume \ -profile aws,test \ From bfe2905d4ef5937dcb31bbb032eea4062dc25f7e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 11:06:15 -0500 Subject: [PATCH 101/266] undo temp --- .github/workflows/run_tests.yml | 102 ++++++++++++++++---------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 4f6527386e..a7d4998bff 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -237,8 +237,8 @@ jobs: with: fetch-depth: 1000 -# - name: Clear space on runner -# run: ./scripts/clear_runner_diskspace.sh + - name: Clear space on runner + run: ./scripts/clear_runner_diskspace.sh - name: Install system dependencies run: | @@ -355,55 +355,55 @@ jobs: python -c "import openproblems" openproblems-cli --version openproblems-cli --test-hash -# -# - name: Pull Docker images -# if: | -# startsWith(github.ref, 'refs/heads/main') && -# github.repository == 'openproblems-bio/openproblems' 
-# run: | -# cd workflow -# snakemake -j $(nproc) docker_pull -# cd .. -# -# - name: Build Docker images -# if: | -# !( -# startsWith(github.ref, 'refs/heads/main') && -# github.repository == 'openproblems-bio/openproblems' -# ) -# run: | -# cd workflow -# SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker -# cd .. -# -# - name: Upload Docker images -# env: -# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} -# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} -# AWS_DEFAULT_REGION: us-west-2 -# run: | -# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" -# aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ -# docker login --username AWS --password-stdin $ECR_ENDPOINT -# for image in $(cd docker && ls -1d */ | tr -d '/'); do -# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} -# docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} -# done -# -# - name: Upload Docker images for full benchmark -# env: -# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} -# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} -# AWS_DEFAULT_REGION: us-west-2 -# if: >- -# startsWith(github.ref, 'refs/tags') || -# startsWith(github.ref, 'refs/heads/test_full_benchmark') -# run: | -# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" -# for image in $(cd docker && ls -1d */ | tr -d '/'); do -# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} -# docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} -# done + + - name: Pull Docker images + if: | + startsWith(github.ref, 'refs/heads/main') && + github.repository == 'openproblems-bio/openproblems' + run: | + cd workflow + snakemake -j $(nproc) docker_pull + cd .. + + - name: Build Docker images + if: | + !( + startsWith(github.ref, 'refs/heads/main') && + github.repository == 'openproblems-bio/openproblems' + ) + run: | + cd workflow + SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker + cd .. + + - name: Upload Docker images + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + run: | + ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" + aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ + docker login --username AWS --password-stdin $ECR_ENDPOINT + for image in $(cd docker && ls -1d */ | tr -d '/'); do + docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} + docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} + done + + - name: Upload Docker images for full benchmark + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + if: >- + startsWith(github.ref, 'refs/tags') || + startsWith(github.ref, 'refs/heads/test_full_benchmark') + run: | + ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" + for image in $(cd docker && ls -1d */ | tr -d '/'); do + docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} + docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} + done - name: Run test benchmark env: From 736f41dfff9bb2e56a8f35110f55d71e2402672c Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 11:08:21 -0500 Subject: [PATCH 102/266] Revert "undo temp" This reverts commit bfe2905d4ef5937dcb31bbb032eea4062dc25f7e. 
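Note on the Upload steps restored above: each task image under docker/ is
pushed to ECR twice, once under a branch-scoped tag for test runs and, on
tags or test_full_benchmark branches, once under a prod tag for the full
benchmark. A small Python sketch of the tag scheme (the helper function is
an illustrative assumption, not part of the workflow):

    ECR_ENDPOINT = "490915662541.dkr.ecr.us-west-2.amazonaws.com"

    def ecr_tag(image: str, branch: str, prod: bool = False) -> str:
        # prod tags are shared across runs; branch tags isolate PR testing
        prefix = "prod" if prod else branch
        return f"{ECR_ENDPOINT}/openproblems:{prefix}-{image}"

    assert ecr_tag("openproblems-r-base", "main") == (
        ECR_ENDPOINT + "/openproblems:main-openproblems-r-base"
    )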
--- .github/workflows/run_tests.yml | 102 ++++++++++++++++---------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index a7d4998bff..4f6527386e 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -237,8 +237,8 @@ jobs: with: fetch-depth: 1000 - - name: Clear space on runner - run: ./scripts/clear_runner_diskspace.sh +# - name: Clear space on runner +# run: ./scripts/clear_runner_diskspace.sh - name: Install system dependencies run: | @@ -355,55 +355,55 @@ jobs: python -c "import openproblems" openproblems-cli --version openproblems-cli --test-hash - - - name: Pull Docker images - if: | - startsWith(github.ref, 'refs/heads/main') && - github.repository == 'openproblems-bio/openproblems' - run: | - cd workflow - snakemake -j $(nproc) docker_pull - cd .. - - - name: Build Docker images - if: | - !( - startsWith(github.ref, 'refs/heads/main') && - github.repository == 'openproblems-bio/openproblems' - ) - run: | - cd workflow - SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker - cd .. - - - name: Upload Docker images - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 - run: | - ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" - aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ - docker login --username AWS --password-stdin $ECR_ENDPOINT - for image in $(cd docker && ls -1d */ | tr -d '/'); do - docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} - docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} - done - - - name: Upload Docker images for full benchmark - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 - if: >- - startsWith(github.ref, 'refs/tags') || - startsWith(github.ref, 'refs/heads/test_full_benchmark') - run: | - ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" - for image in $(cd docker && ls -1d */ | tr -d '/'); do - docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} - docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} - done +# +# - name: Pull Docker images +# if: | +# startsWith(github.ref, 'refs/heads/main') && +# github.repository == 'openproblems-bio/openproblems' +# run: | +# cd workflow +# snakemake -j $(nproc) docker_pull +# cd .. +# +# - name: Build Docker images +# if: | +# !( +# startsWith(github.ref, 'refs/heads/main') && +# github.repository == 'openproblems-bio/openproblems' +# ) +# run: | +# cd workflow +# SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker +# cd .. 
+# +# - name: Upload Docker images +# env: +# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} +# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} +# AWS_DEFAULT_REGION: us-west-2 +# run: | +# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" +# aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ +# docker login --username AWS --password-stdin $ECR_ENDPOINT +# for image in $(cd docker && ls -1d */ | tr -d '/'); do +# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} +# docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} +# done +# +# - name: Upload Docker images for full benchmark +# env: +# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} +# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} +# AWS_DEFAULT_REGION: us-west-2 +# if: >- +# startsWith(github.ref, 'refs/tags') || +# startsWith(github.ref, 'refs/heads/test_full_benchmark') +# run: | +# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" +# for image in $(cd docker && ls -1d */ | tr -d '/'); do +# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} +# docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} +# done - name: Run test benchmark env: From ed7c4dbab16dc7ffe72f0d728b5aa6c33b583dbf Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 11:09:52 -0500 Subject: [PATCH 103/266] unique cwd --- .github/workflows/run_tests.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 4f6527386e..65524ae165 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -345,6 +345,7 @@ jobs: for dir in bucket work cwd; do mkdir -p /mnt/openproblems-nextflow/${dir}/${BRANCH} done + mkdir -p /mnt/openproblems-nextflow/${dir}/${BRANCH}/cwd/${{ github.run_id }}_${{ github.run_attempt }} ls -l /mnt/openproblems-nextflow/*/${BRANCH} - name: Install package & dependencies @@ -414,10 +415,7 @@ jobs: AWS_DEFAULT_REGION: us-west-2 run: | RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" - cd /mnt/openproblems-nextflow/cwd/${BRANCH} - if [ -f results/pipeline_info/execution_trace.txt ]; then - rm results/pipeline_info/execution_trace.txt - fi + cd /mnt/openproblems-nextflow/cwd/${BRANCH}/${{ github.run_id }}_${{ github.run_attempt }} nextflow run \ -revision v1.9 \ -with-tower \ From 49d0eb6714926ad305fdb8f6d52995a3e4ecc999 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 11:09:59 -0500 Subject: [PATCH 104/266] Revert "Revert "undo temp"" This reverts commit 736f41dfff9bb2e56a8f35110f55d71e2402672c. 
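Note on the _random_embedding helper introduced in PATCH 096 above: it
places each cell at the one-hot corner for its cell-type label and jitters
the result with uniform noise, so the baseline separates cell types
perfectly while keeping within-type neighbors well defined. A
self-contained sketch of the same idea (variable names are illustrative):

    import numpy as np
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import OneHotEncoder

    labels = np.array(["B", "T", "NK", "T", "B"])
    codes = LabelEncoder().fit_transform(labels)[:, None]
    onehot = OneHotEncoder().fit_transform(codes).toarray()
    # same scale as the patch: uniform noise in [-0.1, 0.1) breaks ties
    embedding = onehot + np.random.uniform(-0.1, 0.1, onehot.shape)
    assert embedding.shape == (5, 3)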
--- .github/workflows/run_tests.yml | 102 ++++++++++++++++---------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 65524ae165..34b4d13968 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -237,8 +237,8 @@ jobs: with: fetch-depth: 1000 -# - name: Clear space on runner -# run: ./scripts/clear_runner_diskspace.sh + - name: Clear space on runner + run: ./scripts/clear_runner_diskspace.sh - name: Install system dependencies run: | @@ -356,55 +356,55 @@ jobs: python -c "import openproblems" openproblems-cli --version openproblems-cli --test-hash -# -# - name: Pull Docker images -# if: | -# startsWith(github.ref, 'refs/heads/main') && -# github.repository == 'openproblems-bio/openproblems' -# run: | -# cd workflow -# snakemake -j $(nproc) docker_pull -# cd .. -# -# - name: Build Docker images -# if: | -# !( -# startsWith(github.ref, 'refs/heads/main') && -# github.repository == 'openproblems-bio/openproblems' -# ) -# run: | -# cd workflow -# SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker -# cd .. -# -# - name: Upload Docker images -# env: -# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} -# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} -# AWS_DEFAULT_REGION: us-west-2 -# run: | -# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" -# aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ -# docker login --username AWS --password-stdin $ECR_ENDPOINT -# for image in $(cd docker && ls -1d */ | tr -d '/'); do -# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} -# docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} -# done -# -# - name: Upload Docker images for full benchmark -# env: -# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} -# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} -# AWS_DEFAULT_REGION: us-west-2 -# if: >- -# startsWith(github.ref, 'refs/tags') || -# startsWith(github.ref, 'refs/heads/test_full_benchmark') -# run: | -# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" -# for image in $(cd docker && ls -1d */ | tr -d '/'); do -# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} -# docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} -# done + + - name: Pull Docker images + if: | + startsWith(github.ref, 'refs/heads/main') && + github.repository == 'openproblems-bio/openproblems' + run: | + cd workflow + snakemake -j $(nproc) docker_pull + cd .. + + - name: Build Docker images + if: | + !( + startsWith(github.ref, 'refs/heads/main') && + github.repository == 'openproblems-bio/openproblems' + ) + run: | + cd workflow + SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker + cd .. 
+ + - name: Upload Docker images + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + run: | + ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" + aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ + docker login --username AWS --password-stdin $ECR_ENDPOINT + for image in $(cd docker && ls -1d */ | tr -d '/'); do + docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} + docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} + done + + - name: Upload Docker images for full benchmark + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + if: >- + startsWith(github.ref, 'refs/tags') || + startsWith(github.ref, 'refs/heads/test_full_benchmark') + run: | + ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" + for image in $(cd docker && ls -1d */ | tr -d '/'); do + docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} + docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} + done - name: Run test benchmark env: From 829aef5b0c3f5f639e8e8fca2ed65e7a93eefeb4 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 11:20:58 -0500 Subject: [PATCH 105/266] mkdir cwd --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 34b4d13968..b7a70af2a5 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -345,7 +345,7 @@ jobs: for dir in bucket work cwd; do mkdir -p /mnt/openproblems-nextflow/${dir}/${BRANCH} done - mkdir -p /mnt/openproblems-nextflow/${dir}/${BRANCH}/cwd/${{ github.run_id }}_${{ github.run_attempt }} + mkdir -p /mnt/openproblems-nextflow/cwd/${BRANCH}/cwd/${{ github.run_id }}_${{ github.run_attempt }} ls -l /mnt/openproblems-nextflow/*/${BRANCH} - name: Install package & dependencies From 9e42c7c6ec0b4d4b6646a84fabb6dc4600f5bba0 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 12:14:00 -0500 Subject: [PATCH 106/266] Revert "Revert "Revert "undo temp""" This reverts commit 49d0eb6714926ad305fdb8f6d52995a3e4ecc999. --- .github/workflows/run_tests.yml | 102 ++++++++++++++++---------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index b7a70af2a5..dba6f489c1 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -237,8 +237,8 @@ jobs: with: fetch-depth: 1000 - - name: Clear space on runner - run: ./scripts/clear_runner_diskspace.sh +# - name: Clear space on runner +# run: ./scripts/clear_runner_diskspace.sh - name: Install system dependencies run: | @@ -356,55 +356,55 @@ jobs: python -c "import openproblems" openproblems-cli --version openproblems-cli --test-hash - - - name: Pull Docker images - if: | - startsWith(github.ref, 'refs/heads/main') && - github.repository == 'openproblems-bio/openproblems' - run: | - cd workflow - snakemake -j $(nproc) docker_pull - cd .. - - - name: Build Docker images - if: | - !( - startsWith(github.ref, 'refs/heads/main') && - github.repository == 'openproblems-bio/openproblems' - ) - run: | - cd workflow - SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker - cd .. 
- - - name: Upload Docker images - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 - run: | - ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" - aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ - docker login --username AWS --password-stdin $ECR_ENDPOINT - for image in $(cd docker && ls -1d */ | tr -d '/'); do - docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} - docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} - done - - - name: Upload Docker images for full benchmark - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 - if: >- - startsWith(github.ref, 'refs/tags') || - startsWith(github.ref, 'refs/heads/test_full_benchmark') - run: | - ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" - for image in $(cd docker && ls -1d */ | tr -d '/'); do - docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} - docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} - done +# +# - name: Pull Docker images +# if: | +# startsWith(github.ref, 'refs/heads/main') && +# github.repository == 'openproblems-bio/openproblems' +# run: | +# cd workflow +# snakemake -j $(nproc) docker_pull +# cd .. +# +# - name: Build Docker images +# if: | +# !( +# startsWith(github.ref, 'refs/heads/main') && +# github.repository == 'openproblems-bio/openproblems' +# ) +# run: | +# cd workflow +# SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker +# cd .. +# +# - name: Upload Docker images +# env: +# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} +# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} +# AWS_DEFAULT_REGION: us-west-2 +# run: | +# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" +# aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ +# docker login --username AWS --password-stdin $ECR_ENDPOINT +# for image in $(cd docker && ls -1d */ | tr -d '/'); do +# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} +# docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} +# done +# +# - name: Upload Docker images for full benchmark +# env: +# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} +# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} +# AWS_DEFAULT_REGION: us-west-2 +# if: >- +# startsWith(github.ref, 'refs/tags') || +# startsWith(github.ref, 'refs/heads/test_full_benchmark') +# run: | +# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" +# for image in $(cd docker && ls -1d */ | tr -d '/'); do +# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} +# docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} +# done - name: Run test benchmark env: From 38aeae693f095832bab1c54e08c2bf4281405e6f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 12:15:59 -0500 Subject: [PATCH 107/266] fix mkdir --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index dba6f489c1..07980b17e2 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -345,7 +345,7 @@ jobs: for dir in bucket work cwd; do mkdir -p 
/mnt/openproblems-nextflow/${dir}/${BRANCH} done - mkdir -p /mnt/openproblems-nextflow/cwd/${BRANCH}/cwd/${{ github.run_id }}_${{ github.run_attempt }} + mkdir -p /mnt/openproblems-nextflow/cwd/${BRANCH}/${{ github.run_id }}_${{ github.run_attempt }} ls -l /mnt/openproblems-nextflow/*/${BRANCH} - name: Install package & dependencies From 3c7e2cbc92392b90448b8fcfc6690bf860356ad9 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 12:34:47 -0500 Subject: [PATCH 108/266] master --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 07980b17e2..c88793b8ae 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -417,7 +417,7 @@ jobs: RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" cd /mnt/openproblems-nextflow/cwd/${BRANCH}/${{ github.run_id }}_${{ github.run_attempt }} nextflow run \ - -revision v1.9 \ + -revision master \ -with-tower \ -ansi-log false -resume \ -profile aws,test \ From 355b0afb8fc8301efdb1193f608e9fc494c0ba8b Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 12:45:16 -0500 Subject: [PATCH 109/266] fix parsing --- .github/workflows/run_tests.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index c88793b8ae..048c9872f7 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -345,7 +345,8 @@ jobs: for dir in bucket work cwd; do mkdir -p /mnt/openproblems-nextflow/${dir}/${BRANCH} done - mkdir -p /mnt/openproblems-nextflow/cwd/${BRANCH}/${{ github.run_id }}_${{ github.run_attempt }} + echo "CWD=/mnt/openproblems-nextflow/cwd/${BRANCH}/${{ github.run_id }}_${{ github.run_attempt }}" >> $GITHUB_ENV + mkdir -p $CWD ls -l /mnt/openproblems-nextflow/*/${BRANCH} - name: Install package & dependencies @@ -415,7 +416,7 @@ jobs: AWS_DEFAULT_REGION: us-west-2 run: | RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" - cd /mnt/openproblems-nextflow/cwd/${BRANCH}/${{ github.run_id }}_${{ github.run_attempt }} + cd $CWD nextflow run \ -revision master \ -with-tower \ @@ -432,13 +433,13 @@ jobs: - name: Parse results run: | - python workflow/parse_nextflow.py /mnt/openproblems-nextflow/cwd/${BRANCH} /tmp/website + python workflow/parse_nextflow.py $CWD /tmp/website python workflow/generate_website_markdown.py /tmp/website - name: Rename nextflow log if: always() run: | - mv /mnt/openproblems-nextflow/cwd/${{ env.BRANCH }}/.nextflow.log /tmp/nextflow.log + mv ${CWD}/.nextflow.log /tmp/nextflow.log continue-on-error: true - name: Upload nextflow log From 0645d0450a3ada2d80ee242b3f3878413cbace0a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 13:07:05 -0500 Subject: [PATCH 110/266] Revert "upgrade nextflow" This reverts commit 48e777cf8b4f6394cea333c7769c314fc80aeda1. 
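Note on the CWD export added in PATCH 109 ("fix parsing") above: GitHub
Actions re-reads the file named by $GITHUB_ENV after each step and exports
any KEY=value lines to all later steps of the job, which is how the per-run
working directory survives between the setup, benchmark, and parsing steps.
A hypothetical Python equivalent of appending "CWD=..." to $GITHUB_ENV:

    import os

    def export_for_later_steps(key: str, value: str) -> None:
        # GITHUB_ENV is set by the Actions runner; each appended line
        # becomes an environment variable in subsequent steps of the job
        with open(os.environ["GITHUB_ENV"], "a") as fh:
            fh.write(f"{key}={value}\n")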
--- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 048c9872f7..cf3b36457d 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -312,7 +312,7 @@ jobs: - name: Install Nextflow env: CAPSULE_LOG: none - NXF_VER: 22.10.2 + NXF_VER: 22.04.0 run: | mkdir /tmp/nextflow cd /tmp/nextflow From 959bee34fe584ef513d5e2f56d95313d85b9446b Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 13:55:04 -0500 Subject: [PATCH 111/266] fix cwd --- .github/workflows/run_tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index cf3b36457d..b58fb160f5 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -345,7 +345,8 @@ jobs: for dir in bucket work cwd; do mkdir -p /mnt/openproblems-nextflow/${dir}/${BRANCH} done - echo "CWD=/mnt/openproblems-nextflow/cwd/${BRANCH}/${{ github.run_id }}_${{ github.run_attempt }}" >> $GITHUB_ENV + CWD=/mnt/openproblems-nextflow/cwd/${BRANCH}/${{ github.run_id }}_${{ github.run_attempt }} + echo "CWD=$CWD" >> $GITHUB_ENV mkdir -p $CWD ls -l /mnt/openproblems-nextflow/*/${BRANCH} From c52dc8a9ce1ff7a91fc366f3b1a5e9eb2e787108 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 14:43:07 -0500 Subject: [PATCH 112/266] Revert "Revert "upgrade nextflow"" This reverts commit 0645d0450a3ada2d80ee242b3f3878413cbace0a. --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index b58fb160f5..8974403f3e 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -312,7 +312,7 @@ jobs: - name: Install Nextflow env: CAPSULE_LOG: none - NXF_VER: 22.04.0 + NXF_VER: 22.10.2 run: | mkdir /tmp/nextflow cd /tmp/nextflow From be74dc5395b82a81bbc7be03071eb36c79f9bbf5 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 14:43:19 -0500 Subject: [PATCH 113/266] Revert "Revert "Revert "Revert "undo temp"""" This reverts commit 9e42c7c6ec0b4d4b6646a84fabb6dc4600f5bba0. --- .github/workflows/run_tests.yml | 102 ++++++++++++++++---------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 8974403f3e..751750927a 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -237,8 +237,8 @@ jobs: with: fetch-depth: 1000 -# - name: Clear space on runner -# run: ./scripts/clear_runner_diskspace.sh + - name: Clear space on runner + run: ./scripts/clear_runner_diskspace.sh - name: Install system dependencies run: | @@ -358,55 +358,55 @@ jobs: python -c "import openproblems" openproblems-cli --version openproblems-cli --test-hash -# -# - name: Pull Docker images -# if: | -# startsWith(github.ref, 'refs/heads/main') && -# github.repository == 'openproblems-bio/openproblems' -# run: | -# cd workflow -# snakemake -j $(nproc) docker_pull -# cd .. -# -# - name: Build Docker images -# if: | -# !( -# startsWith(github.ref, 'refs/heads/main') && -# github.repository == 'openproblems-bio/openproblems' -# ) -# run: | -# cd workflow -# SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker -# cd .. 
-# -# - name: Upload Docker images -# env: -# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} -# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} -# AWS_DEFAULT_REGION: us-west-2 -# run: | -# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" -# aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ -# docker login --username AWS --password-stdin $ECR_ENDPOINT -# for image in $(cd docker && ls -1d */ | tr -d '/'); do -# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} -# docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} -# done -# -# - name: Upload Docker images for full benchmark -# env: -# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} -# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} -# AWS_DEFAULT_REGION: us-west-2 -# if: >- -# startsWith(github.ref, 'refs/tags') || -# startsWith(github.ref, 'refs/heads/test_full_benchmark') -# run: | -# ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" -# for image in $(cd docker && ls -1d */ | tr -d '/'); do -# docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} -# docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} -# done + + - name: Pull Docker images + if: | + startsWith(github.ref, 'refs/heads/main') && + github.repository == 'openproblems-bio/openproblems' + run: | + cd workflow + snakemake -j $(nproc) docker_pull + cd .. + + - name: Build Docker images + if: | + !( + startsWith(github.ref, 'refs/heads/main') && + github.repository == 'openproblems-bio/openproblems' + ) + run: | + cd workflow + SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker + cd .. + + - name: Upload Docker images + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + run: | + ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" + aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ + docker login --username AWS --password-stdin $ECR_ENDPOINT + for image in $(cd docker && ls -1d */ | tr -d '/'); do + docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} + docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} + done + + - name: Upload Docker images for full benchmark + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + if: >- + startsWith(github.ref, 'refs/tags') || + startsWith(github.ref, 'refs/heads/test_full_benchmark') + run: | + ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" + for image in $(cd docker && ls -1d */ | tr -d '/'); do + docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} + docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} + done - name: Run test benchmark env: From 58149a0ed7e1a280df8e030c2ceb1cb21a30b483 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 14:43:57 -0500 Subject: [PATCH 114/266] v1.9 --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 751750927a..34b93297f3 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -419,7 +419,7 @@ jobs: RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short 
HEAD)_${GITHUB_RUN_ATTEMPT}" cd $CWD nextflow run \ - -revision master \ + -revision v1.9 \ -with-tower \ -ansi-log false -resume \ -profile aws,test \ From 6259565edd4a19a0a9ec246504ff7938e87b6b22 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 16 Nov 2022 16:20:11 -0500 Subject: [PATCH 115/266] fix quotes --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 34b93297f3..135e87f62e 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -89,7 +89,7 @@ jobs: title: '[auto] Update docker version' reviewers: scottgigante, dburkhardt author: "openproblems-bio " - commit-message: Update docker version # ci skip + commit-message: "Update docker version # ci skip" add-paths: docker/.version - name: Upload check results on fail From 48eec7b494f21c0e3fa1c0b951fc2dab570a8041 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Wed, 16 Nov 2022 18:50:50 -0500 Subject: [PATCH 116/266] Bump nf-openproblems to v1.9 (#691) * use v1.9 * remove pip check * skip * update tag * ubuntu-latest * gitlab -> github * nothing commit * only import big packages at runtime * temp * back to master * empty * empty * upgrade nextflow * overwrite * just remove it * v1.9 * undo temp * Revert "undo temp" This reverts commit bfe2905d4ef5937dcb31bbb032eea4062dc25f7e. * unique cwd * Revert "Revert "undo temp"" This reverts commit 736f41dfff9bb2e56a8f35110f55d71e2402672c. * mkdir cwd * Revert "Revert "Revert "undo temp""" This reverts commit 49d0eb6714926ad305fdb8f6d52995a3e4ecc999. * fix mkdir * master * fix parsing * Revert "upgrade nextflow" This reverts commit 48e777cf8b4f6394cea333c7769c314fc80aeda1. * fix cwd * Revert "Revert "upgrade nextflow"" This reverts commit 0645d0450a3ada2d80ee242b3f3878413cbace0a. * Revert "Revert "Revert "Revert "undo temp"""" This reverts commit 9e42c7c6ec0b4d4b6646a84fabb6dc4600f5bba0. 
* v1.9 * fix quotes --- .github/workflows/pre-commit.yml | 2 +- .github/workflows/run_tests.yml | 24 ++++++++++--------- openproblems/data/allen_brain_atlas.py | 3 ++- openproblems/data/cengen.py | 3 ++- openproblems/data/immune_cells.py | 3 ++- .../data/mouse_blood_olssen_labelled.py | 3 ++- openproblems/data/mouse_hspc_nestorowa2016.py | 3 ++- openproblems/data/multimodal/utils.py | 3 ++- openproblems/data/pancreas.py | 3 ++- openproblems/data/tabula_muris_senis.py | 3 ++- openproblems/data/tenx.py | 5 +++- openproblems/data/tnbc_wu2021.py | 3 ++- openproblems/data/utils.py | 5 +++- .../batch_integration_embed/api.py | 3 ++- .../batch_integration_feature/api.py | 3 ++- .../batch_integration_graph/api.py | 3 ++- .../datasets/immune.py | 4 ++-- .../datasets/pancreas.py | 4 ++-- .../_cell_cell_communication/_common/api.py | 3 ++- .../_common/metrics/auprc.py | 5 ++-- .../tasks/denoising/datasets/utils.py | 2 +- openproblems/tasks/denoising/methods/dca.py | 9 +++---- openproblems/tasks/denoising/metrics/mse.py | 7 +++--- .../tasks/denoising/metrics/poisson.py | 4 ++-- .../tasks/dimensionality_reduction/api.py | 3 ++- .../methods/densmap.py | 3 ++- .../dimensionality_reduction/methods/pca.py | 4 ++-- .../dimensionality_reduction/methods/tsne.py | 4 ++-- .../dimensionality_reduction/methods/umap.py | 4 ++-- .../metrics/density.py | 4 ++-- .../metrics/nn_ranking.py | 6 +++-- .../metrics/root_mean_square_error.py | 10 ++++---- .../metrics/trustworthiness.py | 3 ++- .../methods/knn_classifier.py | 5 +++- .../methods/logistic_regression.py | 3 ++- .../tasks/label_projection/methods/mlp.py | 3 ++- .../tasks/label_projection/methods/sklearn.py | 5 ++-- .../tasks/label_projection/methods/utils.py | 6 ++--- .../label_projection/metrics/accuracy.py | 3 ++- .../tasks/label_projection/metrics/f1.py | 6 ++--- .../methods/baseline.py | 5 +++- .../methods/harmonic_alignment.py | 2 +- .../methods/procrustes.py | 6 ++--- .../metrics/knn_auc.py | 5 ++-- .../methods/beta.py | 8 ++++++- .../metrics/correlation.py | 5 ++-- .../datasets/destvi/utils.py | 9 +++---- .../datasets/pancreas.py | 3 ++- openproblems/tools/conversion.py | 1 + openproblems/tools/decorators.py | 2 +- openproblems/tools/normalize.py | 8 ++++++- setup.cfg | 4 ++-- 52 files changed, 147 insertions(+), 93 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 0929ca3594..47c14ff13e 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -8,7 +8,7 @@ on: jobs: pre-commit: - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest container: image: singlecellopenproblems/openproblems-github-actions:latest diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 82f99b53ab..135e87f62e 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -89,7 +89,7 @@ jobs: title: '[auto] Update docker version' reviewers: scottgigante, dburkhardt author: "openproblems-bio " - commit-message: Update docker version # ci skip + commit-message: "Update docker version # ci skip" add-paths: docker/.version - name: Upload check results on fail @@ -312,7 +312,7 @@ jobs: - name: Install Nextflow env: CAPSULE_LOG: none - NXF_VER: 22.04.0 + NXF_VER: 22.10.2 run: | mkdir /tmp/nextflow cd /tmp/nextflow @@ -345,6 +345,9 @@ jobs: for dir in bucket work cwd; do mkdir -p /mnt/openproblems-nextflow/${dir}/${BRANCH} done + CWD=/mnt/openproblems-nextflow/cwd/${BRANCH}/${{ github.run_id }}_${{ github.run_attempt }} + echo "CWD=$CWD" >> $GITHUB_ENV + mkdir -p 
$CWD ls -l /mnt/openproblems-nextflow/*/${BRANCH} - name: Install package & dependencies @@ -397,7 +400,7 @@ jobs: AWS_DEFAULT_REGION: us-west-2 if: >- startsWith(github.ref, 'refs/tags') || - startsWith(github.ref, 'refs/heads/test_benchmark') + startsWith(github.ref, 'refs/heads/test_full_benchmark') run: | ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" for image in $(cd docker && ls -1d */ | tr -d '/'); do @@ -412,12 +415,11 @@ jobs: TOWER_ACCESS_TOKEN: ${{ secrets.TOWER_ACCESS_KEY }} TOWER_WORKSPACE_ID: 53907369739130 AWS_DEFAULT_REGION: us-west-2 - NXF_DEFAULT_DSL: 1 run: | RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" - cd /mnt/openproblems-nextflow/cwd/${BRANCH} + cd $CWD nextflow run \ - -revision v1.8 \ + -revision v1.9 \ -with-tower \ -ansi-log false -resume \ -profile aws,test \ @@ -432,13 +434,13 @@ jobs: - name: Parse results run: | - python workflow/parse_nextflow.py /mnt/openproblems-nextflow/cwd/${BRANCH} /tmp/website + python workflow/parse_nextflow.py $CWD /tmp/website python workflow/generate_website_markdown.py /tmp/website - name: Rename nextflow log if: always() run: | - mv /mnt/openproblems-nextflow/cwd/${{ env.BRANCH }}/.nextflow.log /tmp/nextflow.log + mv ${CWD}/.nextflow.log /tmp/nextflow.log continue-on-error: true - name: Upload nextflow log @@ -460,11 +462,11 @@ jobs: github.event_name == 'push' && ( needs.run_tester.result == 'success' || - startsWith(github.ref, 'refs/heads/test_benchmark') + startsWith(github.ref, 'refs/heads/test_full_benchmark') ) && ( startsWith(github.ref, 'refs/tags') || - startsWith(github.ref, 'refs/heads/test_benchmark') + startsWith(github.ref, 'refs/heads/test_full_benchmark') ) steps: @@ -473,7 +475,7 @@ jobs: env: TOWER_WATCH_URL: https://tower.nf/orgs/openproblems-bio/workspaces/openproblems-bio/watch TOWER_WORKSPACE_ID: 53907369739130 - TOWER_ACTION_ID: 7jylKuFGbSN65qSA4NfdFY + TOWER_ACTION_ID: bVQhVSNah1JmJfnKkfyjg run: | generate_parameters() { diff --git a/openproblems/data/allen_brain_atlas.py b/openproblems/data/allen_brain_atlas.py index 66412172ee..fecbca85d1 100644 --- a/openproblems/data/allen_brain_atlas.py +++ b/openproblems/data/allen_brain_atlas.py @@ -2,7 +2,6 @@ import numpy as np import os -import scanpy as sc import scprep import tempfile @@ -17,6 +16,8 @@ def load_mouse_brain_atlas(test=False): to the dataset is available at: https://figshare.com/articles/dataset/allen_brain_h5ad/20338089 """ + import scanpy as sc + if test: # load full data first, cached if available adata = load_mouse_brain_atlas(test=False) diff --git a/openproblems/data/cengen.py b/openproblems/data/cengen.py index e47f312fa4..859755f558 100644 --- a/openproblems/data/cengen.py +++ b/openproblems/data/cengen.py @@ -1,7 +1,6 @@ from . import utils import os -import scanpy as sc import scprep import tempfile @@ -22,6 +21,8 @@ def load_cengen(test=False): To learn about WormBase curation efforts for C. elegans single cell data visit https://wormbase.github.io/single-cell/ """ + import scanpy as sc + with tempfile.TemporaryDirectory() as tempdir: filepath = os.path.join(tempdir, "cengen.h5ad") scprep.io.download.download_url(URL, filepath) diff --git a/openproblems/data/immune_cells.py b/openproblems/data/immune_cells.py index abed1d0a28..ce740d14f0 100644 --- a/openproblems/data/immune_cells.py +++ b/openproblems/data/immune_cells.py @@ -1,7 +1,6 @@ from . 
import utils import os -import scanpy as sc import scprep import tempfile @@ -12,6 +11,8 @@ @utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/s41592-021-01336-8") def load_immune(test=False): """Download immune human data from figshare.""" + import scanpy as sc + if test: # load full data first, cached if available adata = load_immune(test=False) diff --git a/openproblems/data/mouse_blood_olssen_labelled.py b/openproblems/data/mouse_blood_olssen_labelled.py index d04ecb353f..9deb09a4e1 100644 --- a/openproblems/data/mouse_blood_olssen_labelled.py +++ b/openproblems/data/mouse_blood_olssen_labelled.py @@ -1,7 +1,6 @@ from . import utils import os -import scanpy as sc import scprep import tempfile @@ -13,6 +12,8 @@ @utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/nature19348") def load_olsson_2016_mouse_blood(test=False): """Download Olsson, 2016_mouse_blood, Nature, 2016 data from Figshare.""" + import scanpy as sc + if test: # load full data first, cached if available adata = load_olsson_2016_mouse_blood(test=False) diff --git a/openproblems/data/mouse_hspc_nestorowa2016.py b/openproblems/data/mouse_hspc_nestorowa2016.py index 81218ef9f0..d409c2ce70 100644 --- a/openproblems/data/mouse_hspc_nestorowa2016.py +++ b/openproblems/data/mouse_hspc_nestorowa2016.py @@ -1,7 +1,6 @@ from . import utils import os -import scanpy as sc import scprep import tempfile @@ -15,6 +14,8 @@ ) def load_mouse_hspc_nestorowa2016(test=False): """Download Nesterova data from Figshare.""" + import scanpy as sc + if test: # load full data first, cached if available adata = load_mouse_hspc_nestorowa2016(test=False) diff --git a/openproblems/data/multimodal/utils.py b/openproblems/data/multimodal/utils.py index 6757ba0675..d4b4245d8a 100644 --- a/openproblems/data/multimodal/utils.py +++ b/openproblems/data/multimodal/utils.py @@ -1,7 +1,6 @@ import anndata import numpy as np import pandas as pd -import scanpy as sc import scprep @@ -17,6 +16,8 @@ def subset_mode2_genes(adata, keep_genes): def filter_joint_data_empty_cells(adata): """Remove empty cells and genes from a multimodal dataset.""" + import scanpy as sc + assert np.all(adata.uns["mode2_obs"] == adata.obs.index) # filter cells n_cells_mode1 = scprep.utils.toarray(adata.X.sum(axis=1)).flatten() diff --git a/openproblems/data/pancreas.py b/openproblems/data/pancreas.py index a67943a1fc..ff0c0af843 100644 --- a/openproblems/data/pancreas.py +++ b/openproblems/data/pancreas.py @@ -2,7 +2,6 @@ import numpy as np import os -import scanpy as sc import scprep import tempfile @@ -13,6 +12,8 @@ @utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/s41592-021-01336-8") def load_pancreas(test=False, keep_techs=None): """Download pancreas data from figshare.""" + import scanpy as sc + if test: # load full data first, cached if available adata = load_pancreas( diff --git a/openproblems/data/tabula_muris_senis.py b/openproblems/data/tabula_muris_senis.py index b687e3c401..c634e85d43 100644 --- a/openproblems/data/tabula_muris_senis.py +++ b/openproblems/data/tabula_muris_senis.py @@ -3,7 +3,6 @@ import anndata as ad import os import requests -import scanpy as sc import scprep import tempfile import time @@ -59,6 +58,8 @@ def matching_dataset(dataset, method_list, organ_list): def load_raw_counts(dataset): + import scanpy as sc + dataset_id = dataset["id"] assets_path = ( f"/curation/v1/collections/{COLLECTION_ID}/datasets/{dataset_id}/assets" diff --git a/openproblems/data/tenx.py b/openproblems/data/tenx.py index 
73cccee21c..59f9cb93da 100644 --- a/openproblems/data/tenx.py +++ b/openproblems/data/tenx.py @@ -1,7 +1,6 @@ from . import utils import os -import scanpy as sc import scprep import tempfile @@ -17,6 +16,8 @@ @utils.loader(data_url=PBMC_1K_URL, data_reference=REFERENCE_URL) def load_tenx_1k_pbmc(test=False): """Download PBMC data from Figshare.""" + import scanpy as sc + if test: adata = load_tenx_1k_pbmc(test=False) sc.pp.subsample(adata, n_obs=100) @@ -34,6 +35,8 @@ def load_tenx_1k_pbmc(test=False): @utils.loader(data_url=PBMC_5K_URL, data_reference=REFERENCE_URL) def load_tenx_5k_pbmc(test=False): """Download 5k PBMCs from 10x Genomics.""" + import scanpy as sc + if test: # load full data first, cached if available adata = load_tenx_5k_pbmc(test=False) diff --git a/openproblems/data/tnbc_wu2021.py b/openproblems/data/tnbc_wu2021.py index 58dcc53d8b..c0a3e6a941 100644 --- a/openproblems/data/tnbc_wu2021.py +++ b/openproblems/data/tnbc_wu2021.py @@ -2,7 +2,6 @@ import numpy as np import os -import scanpy as sc import scipy.sparse import scprep import tempfile @@ -19,6 +18,8 @@ def load_tnbc_data(test=False): https://figshare.com/articles/dataset/TNBC_Data_from_Wu_et_al_2021/20338536 """ + import scanpy as sc + if test: # load full data first, cached if available adata = load_tnbc_data(test=False) diff --git a/openproblems/data/utils.py b/openproblems/data/utils.py index d4e8312f52..9bc05a6e8b 100644 --- a/openproblems/data/utils.py +++ b/openproblems/data/utils.py @@ -5,7 +5,6 @@ import hashlib import logging import os -import scanpy as sc import scipy.sparse log = logging.getLogger("openproblems") @@ -94,6 +93,8 @@ def apply_func(*args, **kwargs): def filter_genes_cells(adata): """Remove empty cells and genes.""" + import scanpy as sc + if "var_names_all" not in adata.uns: # fill in original var names before filtering adata.uns["var_names_all"] = adata.var.index.to_numpy() @@ -117,6 +118,8 @@ def subsample_even(adata, n_obs, even_obs): adata : AnnData Subsampled AnnData object """ + import scanpy as sc + values = adata.obs[even_obs].unique() adatas = [] n_obs_per_value = n_obs // len(values) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py index c867a03157..52f26ee0f0 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py @@ -2,7 +2,6 @@ from ....tools.decorators import dataset import numpy as np -import scanpy as sc def check_dataset(adata): @@ -25,6 +24,8 @@ def check_method(adata, is_baseline=False): @dataset() def sample_dataset(): """Create a simple dataset to use for testing methods in this task.""" + import scanpy as sc + adata = load_sample_data() adata.var.index = adata.var.gene_short_name.astype(str) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py index adf42b38c2..088cd38ea9 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py @@ -2,7 +2,6 @@ from ....tools.decorators import dataset import numpy as np -import scanpy as sc def check_dataset(adata): @@ -29,6 +28,8 @@ def check_method(adata, is_baseline=False): @dataset() def sample_dataset(): """Create a simple dataset to use for testing methods in this task.""" + import scanpy as sc + adata = load_sample_data() 
adata.var.index = adata.var.gene_short_name.astype(str) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/api.py b/openproblems/tasks/_batch_integration/batch_integration_graph/api.py index 123860d6f2..5cc9831f99 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/api.py @@ -2,7 +2,6 @@ from ....tools.decorators import dataset import numpy as np -import scanpy as sc def check_dataset(adata): @@ -34,6 +33,8 @@ def check_method(adata, is_baseline=False): @dataset() def sample_dataset(): """Create a simple dataset to use for testing methods in this task.""" + import scanpy as sc + adata = load_sample_data() adata.obsm["X_uni_pca"] = sc.pp.pca(adata.X) adata.layers["log_normalized"] = adata.X diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py index 8c39c0fe28..bee6e7699f 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py @@ -1,8 +1,6 @@ from .....data.immune_cells import load_immune from .....tools.decorators import dataset -import scanpy as sc - @dataset( dataset_name="Immune (by batch)", @@ -14,6 +12,8 @@ image="openproblems", ) def immune_batch(test=False): + import scanpy as sc + adata = load_immune(test) adata.obs["labels"] = adata.obs["final_annotation"] diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py index 6d8af4f505..23dcdd016b 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py @@ -1,8 +1,6 @@ from .....data.pancreas import load_pancreas from .....tools.decorators import dataset -import scanpy as sc - @dataset( dataset_name="Pancreas (by batch)", @@ -14,6 +12,8 @@ image="openproblems", ) def pancreas_batch(test=False): + import scanpy as sc + adata = load_pancreas(test) adata.obs["labels"] = adata.obs["celltype"] adata.obs["batch"] = adata.obs["tech"] diff --git a/openproblems/tasks/_cell_cell_communication/_common/api.py b/openproblems/tasks/_cell_cell_communication/_common/api.py index 8ebdb66c3d..2010de8c2b 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/api.py +++ b/openproblems/tasks/_cell_cell_communication/_common/api.py @@ -3,7 +3,6 @@ import numbers import numpy as np import pandas as pd -import scanpy as sc SAMPLE_RECEPTOR_NAMES = [ "LGALS9", @@ -197,6 +196,8 @@ def check_method(adata, merge_keys, is_baseline=False): def sample_dataset(merge_keys): """Create a simple dataset to use for testing methods in this task.""" + import scanpy as sc + adata = load_sample_data() adata.uns["merge_keys"] = merge_keys diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py index 430541190b..9365998a30 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py +++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py @@ -1,11 +1,12 @@ from .....tools.decorators import metric from ..utils import join_truth_and_pred -from sklearn.metrics import auc -from sklearn.metrics import precision_recall_curve 
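# Editorial sketch of the deferred-import pattern this commit applies
# repo-wide ("only import big packages at runtime"): module import stays
# cheap because heavy dependencies are only loaded inside the function body
# on first call. The function name is illustrative, not part of this diff.
def _auprc_sketch(y_true, y_score):
    # deferred imports: paid when the metric runs, not at package import
    from sklearn.metrics import auc
    from sklearn.metrics import precision_recall_curve

    precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=1)
    return auc(recall, precision)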
@metric(metric_name="Precision-recall AUC", maximize=True) def auprc(adata): + from sklearn.metrics import auc + from sklearn.metrics import precision_recall_curve + gt = join_truth_and_pred(adata) precision, recall, _ = precision_recall_curve( gt["response"], gt["score"], pos_label=1 diff --git a/openproblems/tasks/denoising/datasets/utils.py b/openproblems/tasks/denoising/datasets/utils.py index ec31002e72..3a91e9cebb 100644 --- a/openproblems/tasks/denoising/datasets/utils.py +++ b/openproblems/tasks/denoising/datasets/utils.py @@ -1,6 +1,5 @@ import anndata import numpy as np -import scipy.sparse def split_data( @@ -11,6 +10,7 @@ def split_data( Stores "train" and "test" dataset using the AnnData.obsm property. """ import molecular_cross_validation.util + import scipy.sparse random_state = np.random.RandomState(seed) diff --git a/openproblems/tasks/denoising/methods/dca.py b/openproblems/tasks/denoising/methods/dca.py index eee6c78354..fddfb85d51 100644 --- a/openproblems/tasks/denoising/methods/dca.py +++ b/openproblems/tasks/denoising/methods/dca.py @@ -1,18 +1,19 @@ from ....tools.decorators import method from ....tools.utils import check_version -import scanpy as sc - def _dca(adata, test=False, epochs=None): + from dca.api import dca + + import anndata + if test: epochs = epochs or 30 else: # pragma: nocover epochs = epochs or 300 - from dca.api import dca # make adata object with train counts - adata_train = sc.AnnData(adata.obsm["train"]) + adata_train = anndata.AnnData(adata.obsm["train"]) # run DCA dca(adata_train, epochs=epochs) diff --git a/openproblems/tasks/denoising/metrics/mse.py b/openproblems/tasks/denoising/metrics/mse.py index 63c8f17a16..4a663ed6e6 100644 --- a/openproblems/tasks/denoising/metrics/mse.py +++ b/openproblems/tasks/denoising/metrics/mse.py @@ -1,13 +1,12 @@ from ....tools.decorators import metric -import anndata -import scanpy as sc -import sklearn.metrics - @metric(metric_name="Mean-squared error", maximize=False) def mse(adata): + import anndata + import scanpy as sc import scprep + import sklearn.metrics test_data = anndata.AnnData(X=adata.obsm["test"], obs=adata.obs, var=adata.var) denoised_data = anndata.AnnData( diff --git a/openproblems/tasks/denoising/metrics/poisson.py b/openproblems/tasks/denoising/metrics/poisson.py index e4f0f6a749..ebd2a73378 100644 --- a/openproblems/tasks/denoising/metrics/poisson.py +++ b/openproblems/tasks/denoising/metrics/poisson.py @@ -1,12 +1,12 @@ from ....tools.decorators import metric -import scprep - @metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-extras") def poisson(adata): from molecular_cross_validation.mcv_sweep import poisson_nll_loss + import scprep + test_data = adata.obsm["test"] denoised_data = adata.obsm["denoised"] diff --git a/openproblems/tasks/dimensionality_reduction/api.py b/openproblems/tasks/dimensionality_reduction/api.py index c58f025a1c..d217c2df58 100644 --- a/openproblems/tasks/dimensionality_reduction/api.py +++ b/openproblems/tasks/dimensionality_reduction/api.py @@ -2,7 +2,6 @@ from ...tools.decorators import dataset import numpy as np -import scanpy as sc def check_dataset(adata): @@ -27,6 +26,8 @@ def sample_dataset(): def sample_method(adata): """Create sample method output for testing metrics in this task.""" + import scanpy as sc + sc.tl.pca(adata) adata.obsm["X_emb"] = adata.obsm["X_pca"][:, :2] return adata diff --git a/openproblems/tasks/dimensionality_reduction/methods/densmap.py b/openproblems/tasks/dimensionality_reduction/methods/densmap.py 
index 900e9f78f2..b7a0285774 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/densmap.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/densmap.py
@@ -3,7 +3,6 @@
 from ....tools.utils import check_version

 import functools
-import scanpy as sc

 _densmap_method = functools.partial(
     method,
@@ -36,6 +35,8 @@ def densmap_logCPM_1kHVG(adata, test: bool = False):

 @_densmap_method(method_name="densMAP PCA (logCPM, 1kHVG)")
 def densmap_pca_logCPM_1kHVG(adata, test: bool = False):
+    import scanpy as sc
+
     adata = log_cpm_hvg(adata)
     sc.tl.pca(adata, n_comps=50, svd_solver="arpack")
     return _densmap(adata, obsm="X_pca")
diff --git a/openproblems/tasks/dimensionality_reduction/methods/pca.py b/openproblems/tasks/dimensionality_reduction/methods/pca.py
index e81772d1f0..1c7c186471 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/pca.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/pca.py
@@ -2,8 +2,6 @@
 from ....tools.normalize import log_cpm_hvg
 from ....tools.utils import check_version

-import scanpy as sc
-

 @method(
     method_name="Principal Component Analysis (PCA) (logCPM, 1kHVG)",
@@ -14,6 +12,8 @@
     "sklearn.decomposition.PCA.html",
 )
 def pca_logCPM_1kHVG(adata, test: bool = False):
+    import scanpy as sc
+
     adata = log_cpm_hvg(adata)
     sc.tl.pca(adata, n_comps=50, svd_solver="arpack")
     adata.obsm["X_emb"] = adata.obsm["X_pca"][:, :2]
diff --git a/openproblems/tasks/dimensionality_reduction/methods/tsne.py b/openproblems/tasks/dimensionality_reduction/methods/tsne.py
index 6b987c6457..e19fa6cd3e 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/tsne.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/tsne.py
@@ -2,8 +2,6 @@
 from ....tools.normalize import log_cpm_hvg
 from ....tools.utils import check_version

-import scanpy as sc
-

 @method(
     method_name="t-Distributed Stochastic Neighbor Embedding (t-SNE) (logCPM, 1kHVG)",
@@ -15,6 +13,8 @@
     image="openproblems-python-extras",
 )
 def tsne_logCPM_1kHVG(adata, test: bool = False, n_pca=50):
+    import scanpy as sc
+
     adata = log_cpm_hvg(adata)
     sc.tl.pca(adata, n_comps=n_pca, svd_solver="arpack")
     sc.tl.tsne(adata, use_rep="X_pca", n_pcs=n_pca)
diff --git a/openproblems/tasks/dimensionality_reduction/methods/umap.py b/openproblems/tasks/dimensionality_reduction/methods/umap.py
index a64533f0b8..b9e73adf90 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/umap.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/umap.py
@@ -2,8 +2,6 @@
 from ....tools.normalize import log_cpm_hvg
 from ....tools.utils import check_version

-import scanpy as sc
-

 @method(
     method_name="Uniform Manifold Approximation and Projection (UMAP), "
@@ -15,6 +13,8 @@
     code_url="https://github.com/lmcinnes/umap",
 )
 def umap_logCPM_1kHVG(adata, test: bool = False, n_pca=50):
+    import scanpy as sc
+
     adata = log_cpm_hvg(adata)
     sc.tl.pca(adata, n_comps=50, svd_solver="arpack")
     sc.pp.neighbors(adata, use_rep="X_pca", n_pcs=n_pca)
diff --git a/openproblems/tasks/dimensionality_reduction/metrics/density.py b/openproblems/tasks/dimensionality_reduction/metrics/density.py
index a44a248aeb..680fd00ec0 100644
--- a/openproblems/tasks/dimensionality_reduction/metrics/density.py
+++ b/openproblems/tasks/dimensionality_reduction/metrics/density.py
@@ -1,7 +1,5 @@
 from anndata import AnnData
 from openproblems.tools.decorators import metric
-from scipy.sparse import issparse
-from scipy.stats import pearsonr
 from typing import Optional

 import numpy as np
@@ -100,6 +98,8 @@ def _calculate_radii(


 @metric("density
preservation", maximize=True, image="openproblems-python-extras") def density_preservation(adata: AnnData) -> float: + from scipy.sparse import issparse + from scipy.stats import pearsonr from umap import UMAP emb = adata.obsm["X_emb"] diff --git a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py index 946a4d5095..690b95eaeb 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py @@ -18,8 +18,6 @@ from ....tools.normalize import log_cpm_hvg from anndata import AnnData from numba import njit -from scipy.sparse import issparse -from sklearn.metrics import pairwise_distances from typing import Tuple import numpy as np @@ -155,6 +153,8 @@ def _metrics( def _high_dim(adata: AnnData) -> np.ndarray: + from scipy.sparse import issparse + adata.X = adata.layers["counts"] adata = log_cpm_hvg(adata) high_dim = adata.X @@ -164,6 +164,8 @@ def _high_dim(adata: AnnData) -> np.ndarray: def _fit( X: np.ndarray, E: np.ndarray ) -> Tuple[float, float, float, float, float, float, float]: + from sklearn.metrics import pairwise_distances + if np.any(np.isnan(E)): return 0.0, 0.0, 0.0, 0.5, -np.inf, -np.inf, -np.inf diff --git a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py b/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py index 9e12d5d798..52136ee29b 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py @@ -1,9 +1,6 @@ from ....tools.decorators import metric import numpy as np -import scipy as sp -import sklearn.decomposition -import sklearn.metrics def calculate_squareform_pairwise_distance(data): @@ -12,11 +9,16 @@ def calculate_squareform_pairwise_distance(data): Compute pairwise distance between points in a matrix / vector and then format this into a squareform vector. 
""" - return sp.spatial.distance.squareform(sp.spatial.distance.pdist(data)) + import scipy.spatial + + return scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(data)) def calculate_rmse(adata, n_svd=200): """Calculate dimensional reduction stress via root mean square error.""" + import sklearn.decomposition + import sklearn.metrics + X = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) high_dimensional_distance_matrix = calculate_squareform_pairwise_distance(X) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py b/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py index 3daf3360cd..d308ef4486 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py @@ -1,12 +1,13 @@ from ....tools.decorators import metric from anndata import AnnData -from sklearn import manifold import numpy as np @metric(metric_name="trustworthiness", maximize=True) def trustworthiness(adata: AnnData) -> float: + from sklearn import manifold + high_dim, low_dim = adata.X, adata.obsm["X_emb"] score = manifold.trustworthiness( diff --git a/openproblems/tasks/label_projection/methods/knn_classifier.py b/openproblems/tasks/label_projection/methods/knn_classifier.py index ffe576bc72..62109c89b5 100644 --- a/openproblems/tasks/label_projection/methods/knn_classifier.py +++ b/openproblems/tasks/label_projection/methods/knn_classifier.py @@ -4,7 +4,6 @@ from .sklearn import classifier import functools -import sklearn.neighbors _knn_classifier_method = functools.partial( method, @@ -20,6 +19,8 @@ method_name="K-neighbors classifier (log CPM)", ) def knn_classifier_log_cpm(adata, test=False): + import sklearn.neighbors + adata = log_cpm(adata) return classifier(adata, estimator=sklearn.neighbors.KNeighborsClassifier) @@ -29,5 +30,7 @@ def knn_classifier_log_cpm(adata, test=False): image="openproblems-r-base", ) def knn_classifier_scran(adata, test=False): + import sklearn.neighbors + adata = log_scran_pooling(adata) return classifier(adata, estimator=sklearn.neighbors.KNeighborsClassifier) diff --git a/openproblems/tasks/label_projection/methods/logistic_regression.py b/openproblems/tasks/label_projection/methods/logistic_regression.py index 25a2c40a5a..8c393b5387 100644 --- a/openproblems/tasks/label_projection/methods/logistic_regression.py +++ b/openproblems/tasks/label_projection/methods/logistic_regression.py @@ -4,7 +4,6 @@ from .sklearn import classifier import functools -import sklearn.linear_model _logistic_regression_method = functools.partial( method, @@ -17,6 +16,8 @@ def _logistic_regression(adata, test=False, max_iter=None): + import sklearn.linear_model + if test: max_iter = max_iter or 100 else: # pragma: no cover diff --git a/openproblems/tasks/label_projection/methods/mlp.py b/openproblems/tasks/label_projection/methods/mlp.py index 87e626bab6..71d1dcea9b 100644 --- a/openproblems/tasks/label_projection/methods/mlp.py +++ b/openproblems/tasks/label_projection/methods/mlp.py @@ -4,7 +4,6 @@ from .sklearn import classifier import functools -import sklearn.neural_network _mlp_method = functools.partial( method, @@ -17,6 +16,8 @@ def _mlp(adata, test=False, max_iter=None, hidden_layer_sizes=None): + import sklearn.neural_network + if test: hidden_layer_sizes = hidden_layer_sizes or (20,) max_iter = max_iter or 100 diff --git a/openproblems/tasks/label_projection/methods/sklearn.py 
b/openproblems/tasks/label_projection/methods/sklearn.py index 977446167a..18ec00ca65 100644 --- a/openproblems/tasks/label_projection/methods/sklearn.py +++ b/openproblems/tasks/label_projection/methods/sklearn.py @@ -2,12 +2,13 @@ from .utils import pca_op import numpy as np -import sklearn.pipeline -import sklearn.preprocessing def classifier(adata, estimator, n_pca=100, **kwargs): """Run a generic scikit-learn classifier.""" + import sklearn.pipeline + import sklearn.preprocessing + adata_train = adata[adata.obs["is_train"]] adata_test = adata[~adata.obs["is_train"]].copy() diff --git a/openproblems/tasks/label_projection/methods/utils.py b/openproblems/tasks/label_projection/methods/utils.py index 056b586e5d..f1925684f3 100644 --- a/openproblems/tasks/label_projection/methods/utils.py +++ b/openproblems/tasks/label_projection/methods/utils.py @@ -1,8 +1,6 @@ -import scipy.sparse -import sklearn.decomposition - - def pca_op(adata_train, adata_test, n_components=100): + import scipy.sparse + import sklearn.decomposition is_sparse = scipy.sparse.issparse(adata_train.X) diff --git a/openproblems/tasks/label_projection/metrics/accuracy.py b/openproblems/tasks/label_projection/metrics/accuracy.py index 5e661fa097..d86bf8ec48 100644 --- a/openproblems/tasks/label_projection/metrics/accuracy.py +++ b/openproblems/tasks/label_projection/metrics/accuracy.py @@ -1,11 +1,12 @@ from ....tools.decorators import metric import numpy as np -import sklearn.preprocessing @metric(metric_name="Accuracy", maximize=True) def accuracy(adata): + import sklearn.preprocessing + encoder = sklearn.preprocessing.LabelEncoder().fit(adata.obs["labels"]) test_data = adata[~adata.obs["is_train"]] diff --git a/openproblems/tasks/label_projection/metrics/f1.py b/openproblems/tasks/label_projection/metrics/f1.py index 94ea5446e7..e2e870a6ee 100644 --- a/openproblems/tasks/label_projection/metrics/f1.py +++ b/openproblems/tasks/label_projection/metrics/f1.py @@ -1,10 +1,10 @@ from ....tools.decorators import metric -import sklearn.metrics -import sklearn.preprocessing - def _f1(adata, average="weighted"): + import sklearn.metrics + import sklearn.preprocessing + encoder = sklearn.preprocessing.LabelEncoder().fit(adata.obs["labels"]) test_data = adata[~adata.obs["is_train"]] diff --git a/openproblems/tasks/multimodal_data_integration/methods/baseline.py b/openproblems/tasks/multimodal_data_integration/methods/baseline.py index 8419e90efb..49e9a3c55e 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/baseline.py +++ b/openproblems/tasks/multimodal_data_integration/methods/baseline.py @@ -3,7 +3,6 @@ from ....tools.utils import check_version import numpy as np -import sklearn.decomposition @method( @@ -15,6 +14,8 @@ is_baseline=True, ) def random_features(adata, test=False, n_svd=20): + import sklearn.decomposition + n_svd = min([n_svd, min(adata.X.shape) - 1, min(adata.obsm["mode2"].shape) - 1]) adata = log_cpm(adata) X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) @@ -35,6 +36,8 @@ def random_features(adata, test=False, n_svd=20): is_baseline=True, ) def true_features(adata, test=False, n_svd=20): + import sklearn.decomposition + n_svd = min([n_svd, min(adata.X.shape) - 1, min(adata.obsm["mode2"].shape) - 1]) adata = log_cpm(adata) X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) diff --git a/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py b/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py index 
e17db7332e..c60b689ec3 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py +++ b/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py @@ -5,7 +5,6 @@ from ....tools.utils import check_version import functools -import sklearn.decomposition _harmonic_alignment_method = functools.partial( method, @@ -20,6 +19,7 @@ def _harmonic_alignment( adata, test=False, n_svd=None, n_eigenvectors=None, n_pca_XY=None, n_filters=None ): import harmonicalignment + import sklearn.decomposition if test: n_svd = n_svd or 20 diff --git a/openproblems/tasks/multimodal_data_integration/methods/procrustes.py b/openproblems/tasks/multimodal_data_integration/methods/procrustes.py index be3a1e8e0f..82ce06fa72 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/procrustes.py +++ b/openproblems/tasks/multimodal_data_integration/methods/procrustes.py @@ -2,9 +2,6 @@ from ....tools.normalize import log_cpm from ....tools.utils import check_version -import scipy.spatial -import sklearn.decomposition - @method( method_name="Procrustes", @@ -15,6 +12,9 @@ "scipy.spatial.procrustes.html", ) def procrustes(adata, test=False, n_svd=None): + import scipy.spatial + import sklearn.decomposition + if test: n_svd = n_svd or 20 else: # pragma: no cover diff --git a/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py b/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py index 88d3d12c0b..7a458ee386 100644 --- a/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py +++ b/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py @@ -1,12 +1,13 @@ from ....tools.decorators import metric import numpy as np -import sklearn.decomposition -import sklearn.neighbors @metric(metric_name="kNN Area Under the Curve", maximize=True) def knn_auc(adata, proportion_neighbors=0.1, n_svd=100): + import sklearn.decomposition + import sklearn.neighbors + n_svd = min([n_svd, min(adata.X.shape) - 1]) n_neighbors = int(np.ceil(proportion_neighbors * adata.X.shape[0])) X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py index 68aa9746ce..e5dffb6fa7 100644 --- a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py +++ b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd -import scanpy as sc import scipy.sparse @@ -70,6 +69,8 @@ def _get_annotation(adata, retries=3): def _filter_mitochondrial(adata): + import scanpy as sc + if adata.uns["species"] in ["mus_musculus", "homo_sapiens"]: adata.var["mt"] = adata.var.gene_short_name.str.lower().str.startswith( "mt-" @@ -92,6 +93,8 @@ def _filter_n_genes_max(adata): def _filter_n_genes_min(adata): + import scanpy as sc + adata_filter = adata.copy() sc.pp.filter_cells(adata_filter, min_genes=200) if adata_filter.shape[0] > 100: @@ -100,6 +103,8 @@ def _filter_n_genes_min(adata): def _filter_n_cells(adata): + import scanpy as sc + adata_filter = adata.copy() sc.pp.filter_genes(adata_filter, min_cells=5) if adata_filter.shape[1] > 100: @@ -117,6 +122,7 @@ def _filter_has_chr(adata): def _beta(adata, test=False, top_genes=None, threshold=1): """Calculate gene scores and insert into .obsm.""" import pybedtools + import scanpy as sc if test: top_genes = top_genes or 100 diff --git a/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py 
b/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py index 29366d8565..e586a15e8a 100644 --- a/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py +++ b/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py @@ -1,11 +1,12 @@ from ....tools.decorators import metric import numpy as np -import scipy.sparse -import scipy.stats def _correlation(adata, method="pearson"): + import scipy.sparse + import scipy.stats + if method == "pearson": method = scipy.stats.pearsonr else: diff --git a/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py index a5dc35b03f..b48ef15b12 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py +++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py @@ -9,15 +9,11 @@ from scipy.sparse import csr_matrix from scipy.spatial.distance import pdist from scipy.spatial.distance import squareform -from sklearn.cluster import AgglomerativeClustering -from sklearn.decomposition import PCA -from sklearn.neighbors import kneighbors_graph from typing import Optional import anndata import numpy as np import pandas as pd -import scanpy as sc def categorical(p, n_samples): @@ -66,6 +62,11 @@ def generate_synthetic_dataset( K_sampled: Optional[int] = None, # cells sampled for each spot seed: int = 0, ): + from sklearn.cluster import AgglomerativeClustering + from sklearn.decomposition import PCA + from sklearn.neighbors import kneighbors_graph + + import scanpy as sc import torch np.random.seed(seed) diff --git a/openproblems/tasks/spatial_decomposition/datasets/pancreas.py b/openproblems/tasks/spatial_decomposition/datasets/pancreas.py index 7263b55007..9c5c70bcbd 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/pancreas.py +++ b/openproblems/tasks/spatial_decomposition/datasets/pancreas.py @@ -6,7 +6,6 @@ from typing import Optional import functools -import scanpy as sc _pancreas_dataset = functools.partial( dataset, @@ -24,6 +23,8 @@ def _pancreas_synthetic( n_obs: int = 100, keep_techs: Optional[List[str]] = None, ): + import scanpy as sc + adata = load_pancreas(test=test, keep_techs=keep_techs or ["inDrop3"]) sc.pp.filter_genes(adata, min_counts=10) adata.obs["label"] = adata.obs["celltype"] diff --git a/openproblems/tools/conversion.py b/openproblems/tools/conversion.py index 7b32dc65f8..d80393accd 100644 --- a/openproblems/tools/conversion.py +++ b/openproblems/tools/conversion.py @@ -20,6 +20,7 @@ def r_function(filename, args="sce"): fun : scprep.run.RFunction Python callable evaluating the R code """ + assert filename.endswith(".R") # get the path to the module that called `r_function` diff --git a/openproblems/tools/decorators.py b/openproblems/tools/decorators.py index 7912799d19..9157403414 100644 --- a/openproblems/tools/decorators.py +++ b/openproblems/tools/decorators.py @@ -4,7 +4,6 @@ import anndata import functools import logging -import memory_profiler import time log = logging.getLogger("openproblems") @@ -180,6 +179,7 @@ def profile(func): result : dict Contains 'result', 'runtime_s', 'memory_mb', 'memory_leaked_mb' """ + import memory_profiler @functools.wraps(func) def decorated(*args, **kwargs): diff --git a/openproblems/tools/normalize.py b/openproblems/tools/normalize.py index e8fba61c23..e124cc0dab 100644 --- a/openproblems/tools/normalize.py +++ b/openproblems/tools/normalize.py @@ -2,7 +2,6 @@ import anndata as ad import logging -import scanpy as sc import scprep log = 
logging.getLogger("openproblems") @@ -31,6 +30,8 @@ @decorators.normalizer def log_scran_pooling(adata: ad.AnnData) -> ad.AnnData: """Normalize data with scran via rpy2.""" + import scanpy as sc + scprep.run.install_bioconductor("scran") adata.obs["size_factors"] = _scran(adata) adata.X = scprep.utils.matrix_vector_elementwise_multiply( @@ -41,6 +42,8 @@ def log_scran_pooling(adata: ad.AnnData) -> ad.AnnData: def _cpm(adata: ad.AnnData): + import scanpy as sc + adata.layers["counts"] = adata.X.copy() sc.pp.normalize_total(adata, target_sum=1e6, key_added="size_factors") @@ -55,6 +58,8 @@ def cpm(adata: ad.AnnData) -> ad.AnnData: @decorators.normalizer def log_cpm(adata: ad.AnnData) -> ad.AnnData: """Normalize data to log counts per million.""" + import scanpy as sc + _cpm(adata) sc.pp.log1p(adata) return adata @@ -75,6 +80,7 @@ def log_cpm_hvg(adata: ad.AnnData, n_genes: int = 1000) -> ad.AnnData: Normalize data to log counts per million and select n_genes highly variable genes """ + import scanpy as sc adata = log_cpm(adata) diff --git a/setup.cfg b/setup.cfg index b0c7316e85..b80460ab9b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,9 +2,9 @@ ignore = # top-level module docstring D100, D104, - # space before : conflicts with black + # space before: conflicts with black E203, - # import not in alphabetical : conflicts with isort + # import not in alphabetical: conflicts with isort H306 per-file-ignores = # imported but unused From d71a30219c2493c1b02fc43342a0aaf5c3a8c994 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 17 Nov 2022 09:08:10 -0500 Subject: [PATCH 117/266] move cwd back --- .github/workflows/run_tests.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 135e87f62e..af116047e7 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -345,9 +345,6 @@ jobs: for dir in bucket work cwd; do mkdir -p /mnt/openproblems-nextflow/${dir}/${BRANCH} done - CWD=/mnt/openproblems-nextflow/cwd/${BRANCH}/${{ github.run_id }}_${{ github.run_attempt }} - echo "CWD=$CWD" >> $GITHUB_ENV - mkdir -p $CWD ls -l /mnt/openproblems-nextflow/*/${BRANCH} - name: Install package & dependencies @@ -417,7 +414,7 @@ jobs: AWS_DEFAULT_REGION: us-west-2 run: | RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" - cd $CWD + cd /mnt/openproblems-nextflow/cwd/${BRANCH} nextflow run \ -revision v1.9 \ -with-tower \ @@ -434,7 +431,7 @@ jobs: - name: Parse results run: | - python workflow/parse_nextflow.py $CWD /tmp/website + python workflow/parse_nextflow.py /mnt/openproblems-nextflow/cwd/${BRANCH} /tmp/website python workflow/generate_website_markdown.py /tmp/website - name: Rename nextflow log From 2d7198652875e48c63b6b6d6ab440eec27bafda0 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 17 Nov 2022 10:00:52 -0500 Subject: [PATCH 118/266] fix matrix error --- openproblems/api/load.py | 3 ++- openproblems/api/run.py | 3 ++- openproblems/api/utils.py | 7 ------- openproblems/data/utils.py | 21 +++++++++++++++------ 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/openproblems/api/load.py b/openproblems/api/load.py index a238bfa72e..43afd06c22 100644 --- a/openproblems/api/load.py +++ b/openproblems/api/load.py @@ -1,3 +1,4 @@ +from ..data.utils import write_h5ad from . 
import utils @@ -10,4 +11,4 @@ def load_dataset(task_name, function_name, test): def main(args): """Run the ``load`` subcommand.""" adata = load_dataset(args.task, args.name, args.test) - utils.write_h5ad(adata, args.output) + write_h5ad(adata, args.output) diff --git a/openproblems/api/run.py b/openproblems/api/run.py index 77f4d9dcb9..55a1daef56 100644 --- a/openproblems/api/run.py +++ b/openproblems/api/run.py @@ -1,3 +1,4 @@ +from ..data.utils import write_h5ad from ..utils import temporary from . import utils @@ -23,7 +24,7 @@ def main(args): """Run the ``run`` subcommand.""" adata = anndata.read_h5ad(args.input) adata = run_method(adata, args.task, args.name, args.test) - utils.write_h5ad(adata, args.output) + write_h5ad(adata, args.output) if args.version_file is not None: with open(args.version_file, "w") as handle: handle.write(adata.uns["method_code_version"]) diff --git a/openproblems/api/utils.py b/openproblems/api/utils.py index a6bc3c4222..1953f23633 100644 --- a/openproblems/api/utils.py +++ b/openproblems/api/utils.py @@ -1,5 +1,4 @@ import openproblems -import os class NoSuchFunctionError(RuntimeError): @@ -50,9 +49,3 @@ def print_output(output): print("\n".join(output)) else: print(output) - - -def write_h5ad(adata, filename): - if os.path.isfile(filename): - os.unlink(filename) - adata.write_h5ad(filename) diff --git a/openproblems/data/utils.py b/openproblems/data/utils.py index 9bc05a6e8b..9c63d2254f 100644 --- a/openproblems/data/utils.py +++ b/openproblems/data/utils.py @@ -4,6 +4,7 @@ import functools import hashlib import logging +import numpy as np import os import scipy.sparse @@ -37,9 +38,11 @@ def _cache_path(func, *args, **kwargs): return os.path.join(TEMPDIR, filename) -def _fix_sparse_format(X): +def _fix_matrix_format(X): if scipy.sparse.issparse(X) and not isinstance(X, scipy.sparse.csr_matrix): X = X.tocsr() + if isinstance(X, np.matrix): + X = X.A return X @@ -47,11 +50,11 @@ def _fix_adata(adata): adata.strings_to_categoricals() if "var_names_all" not in adata.uns: adata.uns["var_names_all"] = adata.var.index.to_numpy() - adata.X = _fix_sparse_format(adata.X) + adata.X = _fix_matrix_format(adata.X) for layer in adata.layers: - adata.layers[layer] = _fix_sparse_format(adata.layers[layer]) + adata.layers[layer] = _fix_matrix_format(adata.layers[layer]) for obsm in adata.obsm: - adata.obsm[obsm] = _fix_sparse_format(adata.obsm[obsm]) + adata.obsm[obsm] = _fix_matrix_format(adata.obsm[obsm]) if "counts" not in adata.layers: adata.layers["counts"] = adata.X @@ -80,9 +83,8 @@ def apply_func(*args, **kwargs): else: log.debug(f"Downloading {dataset_name} dataset") adata = func(*args, **kwargs) - _fix_adata(adata) adata.uns["_from_cache"] = False - adata.write_h5ad(filepath) + write_h5ad(adata, filepath) return adata apply_func.metadata = dict(data_url=data_url, data_reference=data_reference) @@ -134,3 +136,10 @@ def subsample_even(adata, n_obs, even_obs): adata_out.varm = adata.varm adata_out.varp = adata.varp return adata_out + + +def write_h5ad(adata, filepath): + if os.path.isfile(filepath): + os.unlink(filepath) + _fix_adata(adata) + adata.write_h5ad(filepath) From 820781294ec3595d97f22e3aa057f311c1d2c091 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 18 Nov 2022 12:16:09 -0500 Subject: [PATCH 119/266] Run test benchmark in Tower (#697) * run everything on tower * update PR template * set sessionId * setup env still * still fetch * fix * remove commented code --- 
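Note: this patch replaces the runner-local nextflow invocation with a launch
of a pre-configured Tower action over the REST API, so the workflow only
supplies branch-specific parameters while compute settings and credentials
stay in Tower. A minimal sketch of such a launch follows. It assumes Tower's
documented POST /actions/{actionId}/launch endpoint and the environment
variables set by the workflow step; the payload layout and Python variable
names are illustrative, not the workflow's exact call.

    import os

    import requests

    # Launch the saved Tower action; assumes the action accepts a pipeline
    # parameter named "branch", mirroring the --branch flag passed to the
    # nextflow run command this step replaces.
    response = requests.post(
        f"https://api.tower.nf/actions/{os.environ['TOWER_ACTION_ID']}/launch",
        params={"workspaceId": os.environ["TOWER_WORKSPACE_ID"]},
        headers={"Authorization": f"Bearer {os.environ['TOWER_ACCESS_TOKEN']}"},
        json={"params": {"branch": os.environ["BRANCH"]}},
    )
    response.raise_for_status()

    # The response carries the ID of the launched workflow, which the step
    # uses to write a watch URL to the GitHub Actions summary.
    workflow_id = response.json()["workflowId"]
    print(f"Benchmark running at {os.environ['TOWER_WATCH_URL']}/{workflow_id}")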
.github/pull_request_template.md | 10 +- .github/workflows/run_tests.yml | 218 +++---------------------------- 2 files changed, 24 insertions(+), 204 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 203bf09d76..4ff20f797a 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -12,11 +12,11 @@ ### Testing -* [ ] This submission was written on a forked copy of SingleCellOpenProblems -* [ ] GitHub Actions "Run Benchmark" tests are passing on this base branch of this pull - request (include link to passed test: ) -* [ ] If this pull request is not ready for review (including passing the "Run - Benchmark" tests), I will open this PR as a draft (click on the down arrow next to the +* [ ] This submission was written on a forked copy of openproblems +* [ ] Nextflow test pipeline is passing on this base branch of this pull + request (include link to passed test on NF Tower found in GitHub Actions summary: ) +* [ ] If this pull request is not ready for review (including passing the Nextflow test + pipeline), I will open this PR as a draft (click on the down arrow next to the "Create Pull Request" button) ### Submission guidelines diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index af116047e7..9675d7f7f8 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -131,6 +131,7 @@ jobs: ) && !startsWith(github.ref, 'refs/heads/test_docker') && !startsWith(github.ref, 'refs/heads/test_benchmark') && + !startsWith(github.ref, 'refs/heads/test_full_benchmark') && !startsWith(github.ref, 'refs/heads/test_process') steps: @@ -221,8 +222,9 @@ jobs: !endsWith(github.event.head_commit.message, '# ci skip') && github.event_name == 'push' && ( - needs.run_tester.result == 'success' || - startsWith(github.ref, 'refs/heads/test_benchmark') + needs.run_tester.result == 'success'|| + startsWith(github.ref, 'refs/heads/test_benchmark') || + startsWith(github.ref, 'refs/heads/test_full_benchmark') ) steps: @@ -235,15 +237,7 @@ jobs: - uses: actions/checkout@v3 with: - fetch-depth: 1000 - - - name: Clear space on runner - run: ./scripts/clear_runner_diskspace.sh - - - name: Install system dependencies - run: | - sudo apt-get update -qq || (sudo rm /etc/apt/sources.list.d/* && sudo apt-get update -yqq) - sudo apt-get install -qy --no-install-recommends libhdf5-dev pandoc gfortran libblas-dev liblapack-dev libedit-dev llvm-dev + fetch-depth: 1 - name: Check Tower authentication env: @@ -271,30 +265,15 @@ jobs: exit 1 fi - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.8" - - - name: Set up Java ${{ matrix.java_version }} - uses: actions/setup-java@v3 - with: - java-version: 15 - architecture: x64 - distribution: zulu - - name: Set up environment run: | - SCRIPTS_PATH=$(python3 -c 'import os, sysconfig; print(sysconfig.get_path("scripts",f"{os.name}_user"))') - echo "PATH=${SCRIPTS_PATH}:${PATH}" >> $GITHUB_ENV - echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV - echo "PYTHON_LOCATION=$(which python3)" >> $GITHUB_ENV - echo "UBUNTU_VERSION=`grep DISTRIB_RELEASE /etc/lsb-release | sed 's/.*=//g'`" >> $GITHUB_ENV # If not on the base repository, append first 6 characters of username to the image name # to avoid clashes on ECR REPO_PARSED=$(echo ${{ github.repository }} | awk '{print $1}' FS=/ | head -c 6) BRANCH_PARSED=$(echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[^a-zA-Z0-9]:-:g') - if [[ "${{ github.repository }}" == 
"openproblems-bio/openproblems" ]]; then + if [[ ${{ startsWith(github.ref, 'refs/tags') || startsWith(github.ref, 'refs/heads/test_full_benchmark') }} == true ]]; then + BRANCH="prod" + elif [[ "${{ github.repository }}" == "openproblems-bio/openproblems" ]]; then BRANCH=`echo $BRANCH_PARSED | head -c 40` else BRANCH="${REPO_PARSED}-`echo $BRANCH_PARSED | head -c 33`" @@ -302,184 +281,25 @@ jobs: BRANCH=`echo $BRANCH | sed 's/[^a-zA-Z0-9]*$//'` echo "BRANCH=${BRANCH}" >> $GITHUB_ENV - - name: Cache Python packages - uses: actions/cache@v3 - with: - path: ${{ env.pythonLocation }} - key: ${{ env.UBUNTU_VERSION }}-pip-${{ env.pythonLocation }}-${{ hashFiles('setup.py') }} - restore-keys: ${{ env.UBUNTU_VERSION}}-pip-${{ env.pythonLocation }}- - - - name: Install Nextflow - env: - CAPSULE_LOG: none - NXF_VER: 22.10.2 - run: | - mkdir /tmp/nextflow - cd /tmp/nextflow - wget -qO- get.nextflow.io | bash - sudo ln -s /tmp/nextflow/nextflow /usr/local/bin/nextflow - nextflow -version - - - name: Install AWS CLI - run: | - mkdir /tmp/awscli - cd /tmp/awscli - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" - unzip -q awscliv2.zip - sudo ./aws/install || sudo ./aws/install --update - aws --version - - - name: Set up S3FS - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 - run: | - sudo apt-get install -qy --no-install-recommends s3fs - echo $AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY > ~/.passwd-s3fs - chmod 600 ~/.passwd-s3fs - sudo mkdir -p /mnt/openproblems-nextflow - sudo chown $USER /mnt/openproblems-nextflow - s3fs -o umask=0277,uid=$(id -u) openproblems-nextflow /mnt/openproblems-nextflow - # Create bucket/ work/ and cwd/ - for dir in bucket work cwd; do - mkdir -p /mnt/openproblems-nextflow/${dir}/${BRANCH} - done - ls -l /mnt/openproblems-nextflow/*/${BRANCH} - - - name: Install package & dependencies - run: | - python -m pip install --upgrade pip - pip install -U wheel setuptools - pip install -U --editable .[evaluate,process] - python -c "import openproblems" - openproblems-cli --version - openproblems-cli --test-hash - - - name: Pull Docker images - if: | - startsWith(github.ref, 'refs/heads/main') && - github.repository == 'openproblems-bio/openproblems' - run: | - cd workflow - snakemake -j $(nproc) docker_pull - cd .. - - - name: Build Docker images - if: | - !( - startsWith(github.ref, 'refs/heads/main') && - github.repository == 'openproblems-bio/openproblems' - ) - run: | - cd workflow - SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker - cd .. 
- - - name: Upload Docker images - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 - run: | - ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" - aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ - docker login --username AWS --password-stdin $ECR_ENDPOINT - for image in $(cd docker && ls -1d */ | tr -d '/'); do - docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} - docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} - done - - - name: Upload Docker images for full benchmark - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 - if: >- - startsWith(github.ref, 'refs/tags') || - startsWith(github.ref, 'refs/heads/test_full_benchmark') - run: | - ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" - for image in $(cd docker && ls -1d */ | tr -d '/'); do - docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:prod-${image} - docker push --quiet ${ECR_ENDPOINT}/openproblems:prod-${image} - done - - - name: Run test benchmark - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - TOWER_ACCESS_TOKEN: ${{ secrets.TOWER_ACCESS_KEY }} - TOWER_WORKSPACE_ID: 53907369739130 - AWS_DEFAULT_REGION: us-west-2 - run: | - RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" - cd /mnt/openproblems-nextflow/cwd/${BRANCH} - nextflow run \ - -revision v1.9 \ - -with-tower \ - -ansi-log false -resume \ - -profile aws,test \ - -work-dir "/mnt/openproblems-nextflow/work/${BRANCH}" \ - -bucket-dir "s3://openproblems-nextflow/bucket/${BRANCH}" \ - -name "${RUN_NAME}" \ - -e.PYTHONPATH="${PYTHONPATH}" \ - openproblems-bio/nf-openproblems \ - --branch ${BRANCH} | \ - tee >(grep --color=never --line-buffered "Monitor the execution with Nextflow Tower using this url" >> $GITHUB_STEP_SUMMARY) - shell: /bin/bash -eou pipefail {0} - - - name: Parse results - run: | - python workflow/parse_nextflow.py /mnt/openproblems-nextflow/cwd/${BRANCH} /tmp/website - python workflow/generate_website_markdown.py /tmp/website - - - name: Rename nextflow log - if: always() - run: | - mv ${CWD}/.nextflow.log /tmp/nextflow.log - continue-on-error: true - - - name: Upload nextflow log - if: always() - uses: actions/upload-artifact@main - with: - name: nextflow.log - path: /tmp/nextflow.log - - run_full_benchmark: - needs: - - run_test_benchmark - - run_tester - runs-on: ubuntu-latest - if: >- - always() && - (needs.run_test_benchmark.result == 'success' || needs.run_test_benchmark.result == 'skipped') && - !endsWith(github.event.head_commit.message, '# ci skip') && - github.event_name == 'push' && - ( - needs.run_tester.result == 'success' || - startsWith(github.ref, 'refs/heads/test_full_benchmark') - ) && - ( - startsWith(github.ref, 'refs/tags') || - startsWith(github.ref, 'refs/heads/test_full_benchmark') - ) - - steps: - - - name: Run full benchmark + - name: Run benchmark env: TOWER_WATCH_URL: https://tower.nf/orgs/openproblems-bio/workspaces/openproblems-bio/watch TOWER_WORKSPACE_ID: 53907369739130 - TOWER_ACTION_ID: bVQhVSNah1JmJfnKkfyjg run: | + RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" + if [[ ${{ 
startsWith(github.ref, 'refs/tags') || startsWith(github.ref, 'refs/heads/test_full_benchmark') }} == true ]]; then + TOWER_ACTION_ID="bVQhVSNah1JmJfnKkfyjg" + WORKDIR="s3://openproblems-nextflow/work_main" + else + TOWER_ACTION_ID="5BQc88ZvjuXCYbc55Hot27" + WORKDIR="s3://openproblems-nextflow/work/$BRANCH" + fi generate_parameters() { cat <> $GITHUB_STEP_SUMMARY + echo "Benchmark running at ${TOWER_WATCH_URL}/${WORKFLOW_ID}" >> $GITHUB_STEP_SUMMARY From 635feca6b878894050780857363d31842f9037f2 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 18 Nov 2022 14:44:55 -0500 Subject: [PATCH 120/266] set num_samples on test --- .../spatial_decomposition/methods/cell2location.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/openproblems/tasks/spatial_decomposition/methods/cell2location.py b/openproblems/tasks/spatial_decomposition/methods/cell2location.py index 3a599cb7e6..7907c79d86 100644 --- a/openproblems/tasks/spatial_decomposition/methods/cell2location.py +++ b/openproblems/tasks/spatial_decomposition/methods/cell2location.py @@ -22,7 +22,7 @@ def _cell2location( n_cells_per_location=20, hard_coded_reference=True, amortised=False, - num_samples=1000, + num_samples=None, sc_batch_size=2500, st_batch_size=None, test=False, @@ -160,7 +160,7 @@ def cell2location_detection_alpha_20( n_cells_per_location=20, hard_coded_reference=True, amortised=False, - num_samples=1000, + num_samples=None, sc_batch_size=2500, st_batch_size=None, test: bool = False, @@ -191,7 +191,7 @@ def cell2location_detection_alpha_20_nb( n_cells_per_location=20, hard_coded_reference=False, amortised=False, - num_samples=1000, + num_samples=None, sc_batch_size=2500, st_batch_size=None, test: bool = False, @@ -222,7 +222,7 @@ def cell2location_detection_alpha_200( n_cells_per_location=20, hard_coded_reference=True, amortised=False, - num_samples=1000, + num_samples=None, sc_batch_size=2500, st_batch_size=None, test: bool = False, @@ -253,7 +253,7 @@ def cell2location_amortised_detection_alpha_20( n_cells_per_location=20, hard_coded_reference=True, amortised=True, - num_samples=1000, + num_samples=None, sc_batch_size=2500, st_batch_size=1024, test: bool = False, From 17b6f681e4ec27367659c7be70561d92cd59617f Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 18 Nov 2022 16:12:12 -0500 Subject: [PATCH 121/266] Matrixed CI (#693) * matrix tests * don't fail fast * upload docker image artifact * runs on * don't login to save * mkdir * docker load --input * export all layers in one tar * use less disk space maybe * filter within docker * use ghcr * temporarily docker_pull * find * set -x * unquo * don't use jq * paste * document * need to checkout first * docker can handle multithreading * fix needs * temp * skip * wait * needs build_images * cleaner rm * parallel * fix cache * simplify run logic * tag on pull * dynamic matrix * fix typo * fix parens * use container-retention-policy * remove the old hacky way * bugfix * fewer suites * fix order * cache on linux version * retry coverage on failure * multi line with: * use actor, not repo owner * no retry * use repo owner * just don't run on pull request * uname -rs * maybe don't need to clear space for now * upload PR coverage * temp * don't use jq * tee * just run the curl twice * build docker images on PR * typo * don't skip on PR * checkout * install * do clear disk --- .github/workflows/run_tests.yml | 214 +++++++++++++++--- scripts/generate_test_matrix.py | 22 ++ 
test/{test_5_cli.py => test_0_cli.py} | 0 test/{test_5_tools.py => test_0_tools.py} | 0 test/{test_5_utils.py => test_0_utils.py} | 0 test/test_1_methods.py | 6 +- test/test_1_metrics.py | 9 +- ...t_2_load_data.py => test_2_1_load_data.py} | 8 +- ...est_3_datasets.py => test_2_2_datasets.py} | 6 +- ...n.py => test_3_cell_cell_communication.py} | 5 - ....py => test_3_dimensionality_reduction.py} | 5 - test/utils/git.py | 32 --- 12 files changed, 205 insertions(+), 102 deletions(-) create mode 100644 scripts/generate_test_matrix.py rename test/{test_5_cli.py => test_0_cli.py} (100%) rename test/{test_5_tools.py => test_0_tools.py} (100%) rename test/{test_5_utils.py => test_0_utils.py} (100%) rename test/{test_2_load_data.py => test_2_1_load_data.py} (84%) rename test/{test_3_datasets.py => test_2_2_datasets.py} (96%) rename test/{test_4_cell_cell_communication.py => test_3_cell_cell_communication.py} (96%) rename test/{test_4_dimensionality_reduction.py => test_3_dimensionality_reduction.py} (92%) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 9675d7f7f8..cbaeceb568 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -31,17 +31,18 @@ jobs: runs-on: ubuntu-latest if: | !endsWith(github.event.head_commit.message, '# ci skip') && + !startsWith(github.ref, 'refs/heads/test_process') && ( - startsWith(github.ref, 'refs/heads/test_docker') || - ( - startsWith(github.ref, 'refs/heads/main') && - github.repository == 'openproblems-bio/openproblems' - ) + github.event_name == 'push' || + startsWith(github.ref, 'refs/heads/test_docker') ) env: BRANCH_NAME: "auto_update_docker_${{ github.run_number }}" + outputs: + images: ${{ steps.export-images.outputs.images }} + steps: - uses: actions/checkout@v3 with: @@ -55,6 +56,17 @@ jobs: with: python-version: "3.8" + - name: Set up environment + run: | + echo "LINUX_VERSION=$(uname -rs)" >> $GITHUB_ENV + + - name: Cache Python packages + uses: actions/cache@v3 + with: + path: ${{ env.pythonLocation }} + key: pip-${{ env.LINUX_VERSION }}-${{ env.pythonLocation }}-${{ hashFiles('setup.py') }} + restore-keys: pip-${{ env.LINUX_VERSION }}-${{ env.pythonLocation }}- + - name: Install package & dependencies run: | python -m pip install --upgrade pip @@ -62,15 +74,35 @@ jobs: pip install --editable .[evaluate] python -c "import openproblems" + - name: Download docker images + run: | + for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do + docker pull singlecellopenproblems/${image} & + done + wait + + - name: Update Docker images + if: | + !( + startsWith(github.ref, 'refs/heads/test_docker') || + startsWith(github.ref, 'refs/heads/main') + ) + run: | + cd workflow + SNAKEMAKE_COPY_SOURCE=1 snakemake -j $(nproc) docker + cd .. + - name: Build Docker images - if: "!startsWith(github.ref, 'refs/heads/main')" + if: | + startsWith(github.ref, 'refs/heads/test_docker') run: | cd workflow snakemake -j $(nproc) docker_build cd .. 
- name: Build and push Docker images - if: "startsWith(github.ref, 'refs/heads/main')" + if: | + startsWith(github.ref, 'refs/heads/main') env: DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} run: | @@ -92,12 +124,69 @@ jobs: commit-message: "Update docker version # ci skip" add-paths: docker/.version - - name: Upload check results on fail - if: failure() - uses: actions/upload-artifact@main + - name: Log in to the Container registry + uses: docker/login-action@v2 with: - name: ${{ matrix.config.name }}_results - path: check + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Export docker images + id: export-images + run: | + IMAGES="$(find ./docker -mindepth 1 -type d -exec basename {} \;)" + for image in ${IMAGES}; do + GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}" + docker tag singlecellopenproblems/$image $GHCR_IMAGE + docker push $GHCR_IMAGE & + done + wait + # convert to JSON + echo "images=[\"$(paste -s -d ' ' <(echo $IMAGES) | sed 's/ */\",\"/g')\"]" >> $GITHUB_OUTPUT + + create_matrix: + needs: cancel_previous_runs + runs-on: ubuntu-latest + if: | + !endsWith(github.event.head_commit.message, '# ci skip') + + outputs: + matrix: ${{ steps.generate-matrix.outputs.matrix }} + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.8" + + - name: Set up environment + run: | + echo "LINUX_VERSION=$(uname -rs)" >> $GITHUB_ENV + + - name: Cache Python packages + uses: actions/cache@v3 + with: + path: ${{ env.pythonLocation }} + key: pip-${{ env.LINUX_VERSION }}-${{ env.pythonLocation }}-${{ hashFiles('setup.py') }} + restore-keys: pip-${{ env.LINUX_VERSION }}-${{ env.pythonLocation }}- + + - name: Install package & dependencies + run: | + python -m pip install --upgrade pip + pip install -U wheel setuptools + pip install --editable .[evaluate] + python -c "import openproblems" + + + - name: Create test matrix + id: generate-matrix + run: | + set -eo pipefail + echo "matrix=$(python scripts/generate_test_matrix.py)" >> $GITHUB_OUTPUT run_tester: runs-on: ubuntu-latest @@ -119,11 +208,17 @@ jobs: - /usr/local/lib/android:/opt/remove/android options: --user root - needs: build_images + needs: + - build_images + - create_matrix if: | always() && - (needs.build_images.result == 'success' || needs.build_images.result == 'skipped') && !endsWith(github.event.head_commit.message, '# ci skip') && + needs.create_matrix.result == 'success' && + ( + needs.build_images.result == 'success' || + needs.build_images.result == 'skipped' + ) && ( startsWith(github.ref, 'refs/heads') || startsWith(github.ref, 'refs/tags') || @@ -131,28 +226,50 @@ jobs: ) && !startsWith(github.ref, 'refs/heads/test_docker') && !startsWith(github.ref, 'refs/heads/test_benchmark') && - !startsWith(github.ref, 'refs/heads/test_full_benchmark') && !startsWith(github.ref, 'refs/heads/test_process') + strategy: + fail-fast: false + matrix: + tests: ${{ fromJSON(needs.create_matrix.outputs.matrix) }} + steps: - name: Clear space on runner run: | - sudo rm -rf /opt/remove/*/* + sudo find /opt/remove -mindepth 2 -maxdepth 2 -type d -exec rm -rf {} \; - uses: actions/checkout@v3 with: fetch-depth: 0 + - name: Log in to the Container registry + uses: docker/login-action@v2 + if: "github.event_name == 'push'" + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Download docker 
images + if: "github.event_name == 'push'" + run: | + for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do + GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}" + (docker pull $GHCR_IMAGE && docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest) & + done + wait + - name: Set up environment run: | - echo "LINUX_VERSION=$(uname -a)" >> $GITHUB_ENV + echo "LINUX_VERSION=$(uname -rs)" >> $GITHUB_ENV + echo "pythonLocation=$(which python)" >> $GITHUB_ENV echo "PYTHON_VERSION=$(python --version)" >> $GITHUB_ENV echo "R_VERSION=$(R --version | head -n 1)" >> $GITHUB_ENV - name: Cache Python packages uses: actions/cache@v3 with: - path: ${{ env.PYTHON_VERSION }} + path: ${{ env.pythonLocation }} key: ${{env.LINUX_VERSION}}-pip-${{ env.PYTHON_VERSION }}-${{ hashFiles('setup.py') }} restore-keys: ${{env.LINUX_VERSION}}-pip-${{ env.PYTHON_VERSION }}- @@ -181,22 +298,15 @@ jobs: install_renv("docker/openproblems-github-actions/r_requirements.txt") shell: Rscript {0} - - name: Pull Docker images - if: "startsWith(github.ref, 'refs/heads/main') && github.repository == 'openproblems-bio/openproblems'" - run: | - cd workflow - snakemake -j $(nproc) docker_pull - cd .. - - - name: Update Docker images - if: "!(startsWith(github.ref, 'refs/heads/main') && github.repository == 'openproblems-bio/openproblems')" + - name: Update Docker docker images + if: "github.event_name == 'pull_request'" run: | cd workflow snakemake -j $(nproc) docker cd .. - name: Run tests - run: pytest --cov=openproblems --cov-report=xml -vv --durations=15 --tb=native + run: pytest --cov=openproblems --cov-report=xml -vv --durations=15 --tb=native -k "${{ matrix.tests }}" - name: Upload coverage uses: codecov/codecov-action@v3 @@ -214,7 +324,7 @@ jobs: path: results - run_test_benchmark: + run_benchmark: needs: run_tester runs-on: ubuntu-latest if: >- @@ -222,9 +332,8 @@ jobs: !endsWith(github.event.head_commit.message, '# ci skip') && github.event_name == 'push' && ( - needs.run_tester.result == 'success'|| - startsWith(github.ref, 'refs/heads/test_benchmark') || - startsWith(github.ref, 'refs/heads/test_full_benchmark') + needs.run_tester.result == 'success' || + startsWith(github.ref, 'refs/heads/test_benchmark') ) steps: @@ -235,10 +344,6 @@ jobs: exit 1 fi - - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Check Tower authentication env: TOWER_ACCESS_TOKEN: ${{ secrets.TOWER_ACCESS_KEY }} @@ -265,6 +370,28 @@ jobs: exit 1 fi + - uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: Clear space on runner + run: ./scripts/clear_runner_diskspace.sh + + - name: Log in to the Container registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Download docker images + run: | + for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do + GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}" + (docker pull $GHCR_IMAGE && docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest) & + done + wait + - name: Set up environment run: | # If not on the base repository, append first 6 characters of username to the image name @@ -281,6 +408,21 @@ jobs: BRANCH=`echo $BRANCH | sed 's/[^a-zA-Z0-9]*$//'` echo "BRANCH=${BRANCH}" >> $GITHUB_ENV + - name: Upload Docker images + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: 
us-west-2 + run: | + ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" + aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \ + docker login --username AWS --password-stdin $ECR_ENDPOINT + for image in $(cd docker && ls -1d */ | tr -d '/'); do + docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} + docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} & + done + wait + - name: Run benchmark env: TOWER_WATCH_URL: https://tower.nf/orgs/openproblems-bio/workspaces/openproblems-bio/watch diff --git a/scripts/generate_test_matrix.py b/scripts/generate_test_matrix.py new file mode 100644 index 0000000000..11b974b145 --- /dev/null +++ b/scripts/generate_test_matrix.py @@ -0,0 +1,22 @@ +import json +import openproblems + +_CORE_TEST_SUITES = ["(test_0_ or test_3_)"] +_TASK_TEST_SUITES = ["test_1_", "test_2_"] + + +def generate_matrix(): + suites = _CORE_TEST_SUITES.copy() + for task in openproblems.TASKS: + task_name = task.__name__.split(".")[-1] + suites.extend([f"{suite} and {task_name}" for suite in _TASK_TEST_SUITES]) + return suites + + +def main(): + matrix = generate_matrix() + print(json.dumps(matrix)) + + +if __name__ == "__main__": + main() diff --git a/test/test_5_cli.py b/test/test_0_cli.py similarity index 100% rename from test/test_5_cli.py rename to test/test_0_cli.py diff --git a/test/test_5_tools.py b/test/test_0_tools.py similarity index 100% rename from test/test_5_tools.py rename to test/test_0_tools.py diff --git a/test/test_5_utils.py b/test/test_0_utils.py similarity index 100% rename from test/test_5_utils.py rename to test/test_0_utils.py diff --git a/test/test_1_methods.py b/test/test_1_methods.py index a1f49ea3ca..84f0c220bb 100644 --- a/test/test_1_methods.py +++ b/test/test_1_methods.py @@ -1,14 +1,10 @@ import openproblems import os import parameterized -import pytest import utils.docker import utils.git import utils.name -pytestmark = pytest.mark.skipif( - len(utils.git.list_modified_tasks()) == 0, reason="No tasks have been modified" -) RETRIES = ( int(os.environ["PYTEST_MAX_RETRIES"]) if "PYTEST_MAX_RETRIES" in os.environ else 2 ) @@ -21,7 +17,7 @@ method.__name__, method.metadata["image"], ) - for task in utils.git.list_modified_tasks() + for task in openproblems.TASKS for method in task.METHODS ], name_func=utils.name.name_test, diff --git a/test/test_1_metrics.py b/test/test_1_metrics.py index 97ac695c99..55075ad785 100644 --- a/test/test_1_metrics.py +++ b/test/test_1_metrics.py @@ -1,13 +1,8 @@ import openproblems import parameterized -import pytest -import utils.git +import utils.docker import utils.name -pytestmark = pytest.mark.skipif( - len(utils.git.list_modified_tasks()) == 0, reason="No tasks have been modified" -) - @parameterized.parameterized.expand( [(metric,) for task in openproblems.TASKS for metric in task.METRICS], @@ -31,7 +26,7 @@ def test_metric_metadata(metric): metric.__name__, metric.metadata["image"], ) - for task in utils.git.list_modified_tasks() + for task in openproblems.TASKS for metric in task.METRICS ], name_func=utils.name.name_test, diff --git a/test/test_2_load_data.py b/test/test_2_1_load_data.py similarity index 84% rename from test/test_2_load_data.py rename to test/test_2_1_load_data.py index bff0acee8c..18f3422889 100644 --- a/test/test_2_load_data.py +++ b/test/test_2_1_load_data.py @@ -1,14 +1,8 @@ import openproblems import parameterized -import pytest import utils.docker -import utils.git import utils.name -pytestmark = 
pytest.mark.skipif( - len(utils.git.list_modified_tasks()) == 0, reason="No tasks have been modified" -) - @parameterized.parameterized.expand( [ @@ -19,7 +13,7 @@ utils.TEMPDIR.name, dataset.metadata["image"], ) - for task in utils.git.list_modified_tasks() + for task in openproblems.TASKS for dataset in task.DATASETS for test in [True] ], diff --git a/test/test_3_datasets.py b/test/test_2_2_datasets.py similarity index 96% rename from test/test_3_datasets.py rename to test/test_2_2_datasets.py index 97666a0ed7..f35aaa7713 100644 --- a/test/test_3_datasets.py +++ b/test/test_2_2_datasets.py @@ -15,10 +15,6 @@ DATASET_SUMMARY_MINLEN = 40 DATASET_SUMMARY_MAXLEN = 1000 -pytestmark = pytest.mark.skipif( - len(utils.git.list_modified_tasks()) == 0, reason="No tasks have been modified" -) - def _assert_not_bytes(X): if isinstance(X, pd.Series): @@ -43,7 +39,7 @@ def _assert_not_bytes(X): ("dataset", "task", "test", "tempdir"), [ (staticmethod(dataset), task, test, utils.TEMPDIR.name) - for task in utils.git.list_modified_tasks() + for task in openproblems.TASKS for dataset in task.DATASETS for test in [True] ], diff --git a/test/test_4_cell_cell_communication.py b/test/test_3_cell_cell_communication.py similarity index 96% rename from test/test_4_cell_cell_communication.py rename to test/test_3_cell_cell_communication.py index d44678e2c8..5550ab94c5 100644 --- a/test/test_4_cell_cell_communication.py +++ b/test/test_3_cell_cell_communication.py @@ -6,7 +6,6 @@ import openproblems.tasks._cell_cell_communication._common.utils import os import pandas as pd -import pytest import tempfile import unittest import utils.docker @@ -17,10 +16,6 @@ openproblems.tasks.cell_cell_communication_source_target, openproblems.tasks.cell_cell_communication_ligand_target, ] -pytestmark = pytest.mark.skipif( - any([task not in utils.git.list_modified_tasks() for task in SUBTASKS]), - reason="Relevant task has not been modified", -) class TestApi(unittest.TestCase): diff --git a/test/test_4_dimensionality_reduction.py b/test/test_3_dimensionality_reduction.py similarity index 92% rename from test/test_4_dimensionality_reduction.py rename to test/test_3_dimensionality_reduction.py index e713c8d6ae..a933e4f7c9 100644 --- a/test/test_4_dimensionality_reduction.py +++ b/test/test_3_dimensionality_reduction.py @@ -1,15 +1,10 @@ """Specific tests for the dimensionality_reduction task""" import openproblems -import pytest import utils.docker import utils.git # global skip TASK = openproblems.tasks.dimensionality_reduction -pytestmark = pytest.mark.skipif( - TASK not in utils.git.list_modified_tasks(), - reason="Relevant task has not been modified", -) @utils.docker.docker_test(image=TASK.metrics.trustworthiness.metadata["image"]) diff --git a/test/utils/git.py b/test/utils/git.py index 3c20a6b575..261e83e0e2 100644 --- a/test/utils/git.py +++ b/test/utils/git.py @@ -1,7 +1,6 @@ from . import run import functools -import openproblems import os TESTDIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -59,22 +58,6 @@ def task_dir(task): return os.path.relpath(os.path.dirname(task.__file__), BASEDIR) -def task_modified(task): - """Check if the task has changed relative to base/main.""" - return git_has_diff(task_dir(task)) - - -def core_modified(): - """Check if the core repo has changed relative to base/main. - - We exclude all task directories as well as any md files and the website. 
- """ - task_exclusions = [f":^{task_dir(task)}" for task in openproblems.TASKS] - diff_target = ["./openproblems", "./docker", "./test", ":^*.md", ":^website"] - diff_target += task_exclusions - return git_has_diff(diff_target) - - def git_rev_parse(branch): """Get the current commit of a branch""" return run.run( @@ -94,18 +77,3 @@ def is_pull_request(): if "GITHUB_EVENT_NAME" in os.environ: return os.environ["GITHUB_EVENT_NAME"] == "pull_request" return False - - -@functools.lru_cache(None) -def list_modified_tasks(): - """List tasks for which testing must be run. - - Return all tasks if the core repo has changed, - otherwise just those that have changed relative to base/main. - - If we are currently in a pull request or at the HEAD of base/main, test all tasks. - """ - if is_pull_request() or core_modified() or is_main_head(): - return openproblems.TASKS - - return [task for task in openproblems.TASKS if task_modified(task)] From eb629a226a3646d3f2442891526c041d78bc9c15 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 18 Nov 2022 17:53:32 -0500 Subject: [PATCH 122/266] state success --- .github/workflows/run_tests.yml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index cbaeceb568..bbf80b58e2 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -226,6 +226,7 @@ jobs: ) && !startsWith(github.ref, 'refs/heads/test_docker') && !startsWith(github.ref, 'refs/heads/test_benchmark') && + !startsWith(github.ref, 'refs/heads/test_full_benchmark') && !startsWith(github.ref, 'refs/heads/test_process') strategy: @@ -323,6 +324,17 @@ jobs: name: ${{ matrix.config.name }}_results path: results + state_completion: + needs: run_tester + runs-on: ubuntu-latest + if: >- + always() && + !endsWith(github.event.head_commit.message, '# ci skip') && + needs.run_tester.result + steps: + - name: State that tests succeeded + run: | + echo "We did it! 
🥳🥳🥳" run_benchmark: needs: run_tester @@ -333,7 +345,8 @@ jobs: github.event_name == 'push' && ( needs.run_tester.result == 'success' || - startsWith(github.ref, 'refs/heads/test_benchmark') + startsWith(github.ref, 'refs/heads/test_benchmark') || + startsWith(github.ref, 'refs/heads/test_full_benchmark') ) steps: From cddb2d171ff4559169033ed4ed0c7e6dcb34c8cf Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 18 Nov 2022 17:58:34 -0500 Subject: [PATCH 123/266] don't rebase automatically --- .github/dependabot.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index c69bbe6c5a..db7d0e9586 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,63 +5,74 @@ updates: schedule: interval: "daily" open-pull-requests-limit: 1 + rebase-strategy: "disabled" - package-ecosystem: "pip" directory: "/docker/openproblems" schedule: interval: "daily" open-pull-requests-limit: 1 + rebase-strategy: "disabled" - package-ecosystem: "pip" directory: "/docker/openproblems-github-actions" schedule: interval: "daily" open-pull-requests-limit: 1 + rebase-strategy: "disabled" - package-ecosystem: "pip" directory: "/docker/openproblems-python-batch-integration" schedule: interval: "daily" open-pull-requests-limit: 1 + rebase-strategy: "disabled" - package-ecosystem: "pip" directory: "/docker/openproblems-python-extras" schedule: interval: "daily" open-pull-requests-limit: 1 + rebase-strategy: "disabled" - package-ecosystem: "pip" directory: "/docker/openproblems-python-scvi" schedule: interval: "daily" open-pull-requests-limit: 1 + rebase-strategy: "disabled" - package-ecosystem: "pip" directory: "/docker/openproblems-python-tf2.4" schedule: interval: "daily" open-pull-requests-limit: 1 + rebase-strategy: "disabled" - package-ecosystem: "pip" directory: "/docker/openproblems-r-base" schedule: interval: "daily" open-pull-requests-limit: 1 + rebase-strategy: "disabled" - package-ecosystem: "pip" directory: "/docker/openproblems-r-extras" schedule: interval: "daily" open-pull-requests-limit: 1 + rebase-strategy: "disabled" - package-ecosystem: "pip" directory: "/docker/openproblems-r-pytorch" schedule: interval: "daily" open-pull-requests-limit: 1 + rebase-strategy: "disabled" - package-ecosystem: "github-actions" directory: "/" schedule: interval: "daily" open-pull-requests-limit: 1 + rebase-strategy: "disabled" From 6b50d483333b3838a489a674a36cd5cba88c98ef Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 18 Nov 2022 18:03:10 -0500 Subject: [PATCH 124/266] ci skip on R package update --- .github/workflows/check_r_dependencies.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check_r_dependencies.yml b/.github/workflows/check_r_dependencies.yml index 91d87dd44a..ab5d30f2a7 100644 --- a/.github/workflows/check_r_dependencies.yml +++ b/.github/workflows/check_r_dependencies.yml @@ -54,5 +54,5 @@ jobs: title: "Update ${{ env.PKG_CHANGED }}" committer: "openproblems-bio " author: "openproblems-bio " - commit-message: "Update ${{ env.PKG_CHANGED }}" + commit-message: "Update ${{ env.PKG_CHANGED }} # ci skip" draft: true From c70d693232cbf59592a75d994c51c95da8ab469a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 21 Nov 2022 09:39:27 -0500 Subject: [PATCH 125/266] require success --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index bbf80b58e2..f7f15927ed 100644 --- 
a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -330,7 +330,7 @@ jobs: if: >- always() && !endsWith(github.event.head_commit.message, '# ci skip') && - needs.run_tester.result + needs.run_tester.result == 'success' steps: - name: State that tests succeeded run: | From a2855aaa4b51c8e6a2944814488a5c90a18211dc Mon Sep 17 00:00:00 2001 From: Daniel Dimitrov <50865230+dbdimitrov@users.noreply.github.com> Date: Mon, 21 Nov 2022 15:54:33 +0100 Subject: [PATCH 126/266] Metric odds ratio fix (#688) * inverse rank_aggregate * set seed for samples * ensure dropped duplicates are stable * run pre-commit locall * pre-commit * add sigmoid transf to odds * pre-commit * Remove pytestmark * Rename test_4_cell_cell_communication.py to test_3_cell_cell_communication.py Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../_cell_cell_communication/_common/api.py | 55 +++++++++++-------- .../_common/methods/liana.R | 4 +- .../_common/metrics/odds_ratio.py | 10 +++- test/test_3_cell_cell_communication.py | 15 +++-- 4 files changed, 51 insertions(+), 33 deletions(-) diff --git a/openproblems/tasks/_cell_cell_communication/_common/api.py b/openproblems/tasks/_cell_cell_communication/_common/api.py index 2010de8c2b..2b2f4a0be1 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/api.py +++ b/openproblems/tasks/_cell_cell_communication/_common/api.py @@ -199,6 +199,7 @@ def sample_dataset(merge_keys): import scanpy as sc adata = load_sample_data() + rng = np.random.default_rng(seed=1234) adata.uns["merge_keys"] = merge_keys @@ -215,19 +216,24 @@ def sample_dataset(merge_keys): # generate target interactions adata.uns["ccc_target"] = pd.DataFrame( { - "ligand": np.random.choice(adata.var.index, 50), - "receptor": np.random.choice(adata.var.index, 50), - "source": np.random.choice(list(set(adata.obs.label)), 50), - "target": np.random.choice(list(set(adata.obs.label)), 50), + "ligand": rng.choice(adata.var.index, 50), + "receptor": rng.choice(adata.var.index, 50), + "source": rng.choice(list(set(adata.obs.label)), 50), + "target": rng.choice(list(set(adata.obs.label)), 50), } ) # drop duplicates - adata.uns["ccc_target"] = adata.uns["ccc_target"].drop_duplicates(subset=merge_keys) - # ensure positive response class is always present + adata.uns["ccc_target"] = ( + adata.uns["ccc_target"] + .sort_values(merge_keys) + .reset_index() + .drop_duplicates(subset=merge_keys, keep="first") + .reset_index() + ) + n_rows = adata.uns["ccc_target"].shape[0] - response = np.zeros(n_rows, dtype=np.int64) - response[0 : np.int(n_rows * 0.3)] = 1 - adata.uns["ccc_target"]["response"] = response + adata.uns["ccc_target"]["response"] = rng.binomial(1, 0.5, n_rows) + # subset columns adata.uns["ccc_target"] = adata.uns["ccc_target"][["response"] + merge_keys] @@ -236,23 +242,23 @@ def sample_dataset(merge_keys): n_complexes = 5 n_genes = len(adata.var.index) ligand_complexes = [ - "_".join(np.random.choice(adata.var.index, 2)) for _ in range(n_complexes) + "_".join(rng.choice(adata.var.index, 2)) for _ in range(n_complexes) ] receptor_complexes = [ - "_".join(np.random.choice(adata.var.index, 2)) for _ in range(n_complexes) + "_".join(rng.choice(adata.var.index, 2)) for _ in range(n_complexes) ] adata.uns["ligand_receptor_resource"] = pd.DataFrame( { "ligand_genesymbol": np.concatenate( [ ligand_complexes, - np.random.choice(adata.var.index, n_genes, replace=False), 
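
The added line below replaces the global-state call removed above with a draw
from the `rng = np.random.default_rng(seed=1234)` Generator created at the top
of `sample_dataset`, so the mock interactions are identical on every test run.
A minimal sketch of the difference, assuming only NumPy:

```
import numpy as np

rng_a = np.random.default_rng(seed=1234)
rng_b = np.random.default_rng(seed=1234)
# two Generators built from the same seed replay identical draws;
# the global np.random state gives no such guarantee between runs
assert (rng_a.choice(100, size=5) == rng_b.choice(100, size=5)).all()
```
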
+ rng.choice(adata.var.index, n_genes, replace=False), ] ), "receptor_genesymbol": np.concatenate( [ receptor_complexes, - np.random.choice(adata.var.index, n_genes, replace=False), + rng.choice(adata.var.index, n_genes, replace=False), ] ), } @@ -264,7 +270,7 @@ def sample_dataset(merge_keys): def sample_method(adata, merge_keys): """Create sample method output for testing metrics in this task.""" row_num = 500 - np.random.seed(1234) + rng = np.random.default_rng(seed=1234) ligand_msk = ~adata.uns["ligand_receptor_resource"]["ligand_genesymbol"].isin( adata.var.index @@ -276,21 +282,22 @@ def sample_method(adata, merge_keys): # keep only plausible interactions resource = adata.uns["ligand_receptor_resource"][msk] - df = pd.DataFrame(np.random.random((row_num, 1)), columns=["score"]) - df["source"] = np.random.choice(np.unique(adata.obs[["label"]]), row_num) - df["target"] = np.random.choice(np.unique(adata.obs[["label"]]), row_num) - df["ligand"] = np.random.choice( - np.unique(resource["ligand_genesymbol"].values), row_num - ) - df["receptor"] = np.random.choice( + df = pd.DataFrame(rng.random((row_num, 1)), columns=["score"]) + df["source"] = rng.choice(np.unique(adata.obs[["label"]]), row_num) + df["target"] = rng.choice(np.unique(adata.obs[["label"]]), row_num) + df["ligand"] = rng.choice(np.unique(resource["ligand_genesymbol"].values), row_num) + df["receptor"] = rng.choice( np.unique(resource["receptor_genesymbol"].values), row_num ) + + # remove duplicates + df = df.sort_values(merge_keys + ["score"]).drop_duplicates( + subset=merge_keys, keep="first" + ) + # subset columns df = df[["score"] + merge_keys] - # deduplicate - df = df.loc[~df[merge_keys].duplicated()] - adata.uns["ccc_pred"] = df return adata diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.R b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.R index 3d2a976907..302016bde4 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.R +++ b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.R @@ -37,7 +37,9 @@ liana_res <- liana_wrap(sce, # Aggregate if a run /w multiple methods if (!is.tibble(liana_res)) { liana_res <- liana_res %>% - liana_aggregate() + liana_aggregate() %>% + # inverse distribution + mutate(aggregate_rank = 1 - aggregate_rank) } # Return (Keep Complexes [not subunits] for Consistency) diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py index 5b4c1dcfe0..9fcabd7394 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py +++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py @@ -4,13 +4,17 @@ import numpy as np +def _sigmoid_transform(x): + return 1 - 1 / (1 + x / 2) + + @metric(metric_name="Odds Ratio", maximize=True) -def odds_ratio(adata, top_prop=0.05): +def odds_ratio(adata): # Join benchmark (assumed truth) and ccc results # Get /w ccc_target and a response [0, 1] column gt = join_truth_and_pred(adata) gt = gt.sort_values("score", ascending=False) - top_n = np.int(adata.uns["ccc_target"].shape[0] * top_prop) + top_n = np.sum(adata.uns["ccc_target"].response) # assign the top rank interactions to 1 a = np.zeros(len(gt["score"])) @@ -35,6 +39,6 @@ def odds_ratio(adata, top_prop=0.05): # perfect score oddsratio = np.inf else: - oddsratio = numerator / denominator + oddsratio = _sigmoid_transform(numerator / denominator) return oddsratio diff --git 
a/test/test_3_cell_cell_communication.py b/test/test_3_cell_cell_communication.py index 5550ab94c5..479ab8520a 100644 --- a/test/test_3_cell_cell_communication.py +++ b/test/test_3_cell_cell_communication.py @@ -119,13 +119,18 @@ def test_odds_ratio_no_match(): adata = task.api.sample_dataset() + # check expected output adata = task.api.sample_method(adata) - m = metric(adata, top_prop=0) # force numerator exception - assert m is np.nan - - m = metric(adata, top_prop=0.5) # check non-exception output + m = metric(adata) assert np.issubdtype("float64", m) + assert m == 0.813953488372093 + # force perfect score adata = task.methods.true_events(adata) - m = metric(adata, top_prop=0.9) # force denominator exception + m = metric(adata) assert m is np.inf + + # force exception + adata.uns["ccc_target"]["response"] = 0 + m = metric(adata) + assert m is np.nan From f07f75075a2d729e9afb7b4ce6cc5f6bd067b2d1 Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Mon, 21 Nov 2022 10:27:59 -0500 Subject: [PATCH 127/266] Add Dataset 'Tabula Muris Senis' to Task 'Denoising' (#611) * Create tabula_muris_senis.py * pre-commit * Update __init__.py * Update tabula_muris_senis.py * Update tabula_muris_senis.py * pre-commit * Update tabula_muris_senis.py * pre-commit * Update __init__.py * Update tabula_muris_senis.py * Update tabula_muris_senis.py * Update tabula_muris_senis.py * pre-commit * Update tabula_muris_senis.py * pre-commit * add ncells ngenes to data summary * Update tabula_muris_senis.py * Update tabula_muris_senis.py * pre-commit Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- .../tasks/denoising/datasets/__init__.py | 1 + .../denoising/datasets/tabula_muris_senis.py | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 openproblems/tasks/denoising/datasets/tabula_muris_senis.py diff --git a/openproblems/tasks/denoising/datasets/__init__.py b/openproblems/tasks/denoising/datasets/__init__.py index d9891bd987..7c5a8d850e 100644 --- a/openproblems/tasks/denoising/datasets/__init__.py +++ b/openproblems/tasks/denoising/datasets/__init__.py @@ -1 +1,2 @@ from .pbmc import pbmc +from .tabula_muris_senis import tabula_muris_senis_lung_random diff --git a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py new file mode 100644 index 0000000000..1e29d32288 --- /dev/null +++ b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py @@ -0,0 +1,22 @@ +from ....data.tabula_muris_senis import load_tabula_muris_senis +from ....tools.decorators import dataset +from . import utils + + +@dataset( + "Tabula Muris Senis Lung (random split)", + data_url=load_tabula_muris_senis.metadata["data_url"], + data_reference=load_tabula_muris_senis.metadata["data_reference"], + dataset_summary="All lung cells from Tabula Muris Senis, a 500k cell-atlas from 18 " + "organs and tissues across the mouse lifespan. Split into train/test randomly." 
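+    # NB: adjacent Python string literals concatenate with no separator, so the
+    # fragment above joins the one below as "randomly.24540 cells"; PATCH 136
+    # later in this series restores the missing space.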
+ "24540 cells × 16160 genes across 3 time points.", + image="openproblems-python-extras", +) +def tabula_muris_senis_lung_random(test=False): + adata = load_tabula_muris_senis( + organ_list=["lung"], + method_list=["droplet"], + test=test, + ) + adata = utils.split_data(adata) + return adata From 42a5138034c7894e79d849f3c6c7b2248d8e8893 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Mon, 21 Nov 2022 14:07:13 -0500 Subject: [PATCH 128/266] add ncells to seuratv3 test mode (#699) * add ncells to seuratv3 test mode * define arg --- .../spatial_decomposition/methods/seuratv3.R | 13 +++++++++++-- .../spatial_decomposition/methods/seuratv3.py | 16 +++++++++++++--- .../methods/seuratv3_wrapper.R | 8 +++++++- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/openproblems/tasks/spatial_decomposition/methods/seuratv3.R b/openproblems/tasks/spatial_decomposition/methods/seuratv3.R index 7aa7d05273..a75b3961f2 100644 --- a/openproblems/tasks/spatial_decomposition/methods/seuratv3.R +++ b/openproblems/tasks/spatial_decomposition/methods/seuratv3.R @@ -2,6 +2,7 @@ #' @param sce_sc SingleCellExperiment single-cell data #' @param sce_sp SingleCellExperiment spatial data #' @param n_pcs int Number of principal components +#' @param sctransform_n_cells int Number of cells sampled to build NB regression options(error = rlang::entrace) @@ -16,20 +17,28 @@ args <- readRDS("/tmp/openproblems_seurat_args.rds") sce_sc <- args$sce_sc sce_sp <- args$sce_sp n_pcs <- args$n_pcs +sctransform_n_cells <- args$sctransform_n_cells # R base for seuratv3.py sce_sc <- as.Seurat(sce_sc, counts = "X", data = NULL) sce_sp <- as.Seurat(sce_sp, counts = "X", data = NULL) # Normalize and do dimred for spatial data -sce_sp <- SCTransform(sce_sp, assay = "originalexp", verbose = TRUE) +sce_sp <- SCTransform( + sce_sp, + assay = "originalexp", + ncells = min(sctransform_n_cells, nrow(sce_sp)), + verbose = TRUE +) sce_sp <- RunPCA(sce_sp, assay = "SCT", verbose = FALSE, n_pcs = n_pcs) # Normalize and do dimred for single cell data sce_sc <- SCTransform( sce_sc, - assay = "originalexp", ncells = min(3000, nrow(sce_sc)), verbose = TRUE + assay = "originalexp", + ncells = min(sctransform_n_cells, nrow(sce_sc)), + verbose = TRUE ) sce_sc <- RunPCA(sce_sc, verbose = FALSE, n_pcs = n_pcs) diff --git a/openproblems/tasks/spatial_decomposition/methods/seuratv3.py b/openproblems/tasks/spatial_decomposition/methods/seuratv3.py index 9e5809d33f..98f1516089 100644 --- a/openproblems/tasks/spatial_decomposition/methods/seuratv3.py +++ b/openproblems/tasks/spatial_decomposition/methods/seuratv3.py @@ -7,7 +7,9 @@ import pandas as pd import pathlib -_seuratv3 = r_function("seuratv3_wrapper.R", args="sce_sc, sce_sp, n_pcs, script_path") +_seuratv3 = r_function( + "seuratv3_wrapper.R", args="sce_sc, sce_sp, n_pcs, sctransform_n_cells, script_path" +) @method( @@ -18,11 +20,18 @@ code_url="https://satijalab.org/seurat/archive/v3.2/spatial_vignette.html", image="openproblems-r-extras", ) -def seuratv3(adata, test: bool = False, n_pca: Optional[int] = None): +def seuratv3( + adata, + test: bool = False, + n_pca: Optional[int] = None, + sctransform_n_cells: Optional[int] = None, +): if test: - n_pca = n_pca or 10 + n_pca = n_pca or 2 + sctransform_n_cells = sctransform_n_cells or 50 else: # pragma: nocover n_pca = n_pca or 30 + sctransform_n_cells = sctransform_n_cells or 5000 # extract single cell reference data adata_sc, adata = split_sc_and_sp(adata) # proportions_true gets lost 
in translation @@ -31,6 +40,7 @@ def seuratv3(adata, test: bool = False, n_pca: Optional[int] = None): adata_sc, adata, n_pcs=n_pca, + sctransform_n_cells=sctransform_n_cells, script_path=pathlib.Path(__file__).parent.joinpath("seuratv3.R").as_posix(), ) # get predicted cell type proportions from obs diff --git a/openproblems/tasks/spatial_decomposition/methods/seuratv3_wrapper.R b/openproblems/tasks/spatial_decomposition/methods/seuratv3_wrapper.R index f5fb26a1aa..f7f42c691e 100644 --- a/openproblems/tasks/spatial_decomposition/methods/seuratv3_wrapper.R +++ b/openproblems/tasks/spatial_decomposition/methods/seuratv3_wrapper.R @@ -6,10 +6,16 @@ #' @param sce_sc SingleCellExperiment single-cell data #' @param sce_sp SingleCellExperiment spatial data #' @param n_pcs int Number of principal components +#' @param sctransform_n_cells int Number of cells sampled to build NB regression #' @param script_path character Path to seuratv3.R saveRDS( - list(sce_sc = sce_sc, sce_sp = sce_sp, n_pcs = n_pcs), + list( + sce_sc = sce_sc, + sce_sp = sce_sp, + n_pcs = n_pcs, + sctransform_n_cells = sctransform_n_cells + ), "/tmp/openproblems_seurat_args.rds" ) # clear memory From 1adb3921c6fee532254f1cd6a031a17879ccac6c Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Mon, 21 Nov 2022 18:09:28 -0500 Subject: [PATCH 129/266] Put sample cells and genes in the repo (#701) * put sample cells and genes in the repo * fix api * add sample data --- openproblems/data/sample/__init__.py | 1 + openproblems/data/{ => sample}/sample.py | 28 +++++++++++++++---- openproblems/data/sample/sample_cells.csv.gz | Bin 0 -> 1132 bytes openproblems/data/sample/sample_genes.csv.gz | Bin 0 -> 5013 bytes 4 files changed, 24 insertions(+), 5 deletions(-) create mode 100644 openproblems/data/sample/__init__.py rename openproblems/data/{ => sample}/sample.py (55%) create mode 100644 openproblems/data/sample/sample_cells.csv.gz create mode 100644 openproblems/data/sample/sample_genes.csv.gz diff --git a/openproblems/data/sample/__init__.py b/openproblems/data/sample/__init__.py new file mode 100644 index 0000000000..723bebb943 --- /dev/null +++ b/openproblems/data/sample/__init__.py @@ -0,0 +1 @@ +from .sample import load_sample_data diff --git a/openproblems/data/sample.py b/openproblems/data/sample/sample.py similarity index 55% rename from openproblems/data/sample.py rename to openproblems/data/sample/sample.py index 008406807f..900d6b9aac 100644 --- a/openproblems/data/sample.py +++ b/openproblems/data/sample/sample.py @@ -1,23 +1,41 @@ -from .multimodal.scicar.cell_lines import rna_cells_url -from .multimodal.scicar.cell_lines import rna_genes_url -from .utils import loader +from ..utils import loader import anndata import numpy as np import pandas as pd +import pathlib import scipy.sparse +SCRIPT_PATH = pathlib.Path(__file__) + @loader( data_url="https://openproblems.bio", data_reference="https://github.com/openproblems-bio/openproblems", ) def load_sample_data(test=True): - """Create a simple dataset to use for testing in multimodal applications.""" - assert test + """Create a simple dataset to use for testing in multimodal applications. 
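+
+    The gene and cell tables are read from sample_genes.csv.gz and
+    sample_cells.csv.gz shipped next to this module (via SCRIPT_PATH), rather
+    than downloaded from the sci-CAR URLs at load time.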
+ Genes and cells generated by: + ``` + from ..multimodal.scicar.cell_lines import rna_cells_url + from ..multimodal.scicar.cell_lines import rna_genes_url genes = pd.read_csv(rna_genes_url, low_memory=False, index_col=0, nrows=500) cells = pd.read_csv(rna_cells_url, low_memory=False, index_col=0, nrows=200) + ``` + """ + assert test + + genes = pd.read_csv( + SCRIPT_PATH.parent.joinpath("sample_genes.csv.gz"), + low_memory=False, + index_col=0, + ) + cells = pd.read_csv( + SCRIPT_PATH.parent.joinpath("sample_cells.csv.gz"), + low_memory=False, + index_col=0, + ) rna_data = scipy.sparse.csr_matrix( np.random.poisson(0.3, (cells.shape[0], genes.shape[0])).astype(np.float32) diff --git a/openproblems/data/sample/sample_cells.csv.gz b/openproblems/data/sample/sample_cells.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..13b7452b6a5852ab093ea4ef73f6071f8f51bc1c GIT binary patch literal 1132 zcmV-y1e5z8iwFpV&wFD6|8rq&aBO8?V`Xe?b1q|Zb^wf6O>f&U488BSxJG2jUq?X) z-F6ssFM`g80&O=mIduPid~~t7aV$Or4T9o<6v@XYQkV1V@0X={T3%ir-_EZ~vwZws z-k)EWw`+5KU(Q$jdc4X%zFnT4AAWp~5Aor2a&5{f#hgm(b1up9T*PE7F~|HB>F%9T zoSQ*Qys(&Kv<$+QIF1rdCFmpwge|EtLXnDSAh*c9XMrUR#Vib5!)~!tuGJCuv6`MN z;YvU`Wy_!>GM3d~^^jeQ^9dWpbj5W?)3Kw8CP=UXye;M-C672}l|DFRS&3`Vr)!aH zlshI1DWwroJ`iCG2Q=M!%qX*@^~zjjC9hA}L45WRTplskcIg zWSHK?u^ymQ%ScftYz5^o-vN4}b-vBw?dbx29R*rX0;eZIBVNnR7*t_4C@zN-l*V2x zB&!lg=BoA8>m$%&5@?peX3lyknbm=79eZ+>7Xm}fcFH$&Jha!htL`9=b$dO{$ev?Q z->z;Ev*lU~t$&Pb#8V8{q&iB}XqcpzR?Q&HDKeS{3Ayd*enA%XPDr`fS;W;*b~DeN zERv1TqX$*%vzbVC;u?^ltb>XX`#_U-pvgOI$$MCzVau}`3gj+`yM|&_bfC3ypu}Xr zsE)fWCo0pJ8?A;Jx0BIv!;Gq+prf_PjNCpr(gXO2IaC+ITh8v0sGsty?wY+Fb!uD$qLG-yQ{9vwZUxKPV0xondo7C z+Onr+kI)-4Cm<9z&z6HT%N^2ifV^&$jJ{_%?$+aKZ<|bA>Bc(gA$uVa|=Jn;c z8n?bGNqx*bZYcKp{IUGJZ_%*|AxNU{9sRNmB-i)wO!CaJyBloUGc{1!Ef8BX8t%zT zGeqrBuA^S`0OO?FbyNwv?F?K&kt49{XhEUhpj}57y4wm$Es2{)|5|Fue(u=Y+2jVvJU4K0O3jhHB{{sNWhsqKvAOHZilQT5{ literal 0 HcmV?d00001 diff --git a/openproblems/data/sample/sample_genes.csv.gz b/openproblems/data/sample/sample_genes.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..9058128d51864eda988e44189dbde790cf24642f GIT binary patch literal 5013 zcmV;G6Kd=qiwFpM&wFD6|8rq&aBO8?XJu|>b1q|Zb^wfA*^V2>l6}A5VhS0#=VP&o zWYb+_7mCy}dJzm+<11j?G9=Bz{QX20saBrIEa4ikByp?{qyPeuW$eU^7Z?N=Rf}b^sn9da!~m{txXF0mHq9XZ=e46&o5c-w_$ikG*T%P zv=KsYyURnE3K?r`bg%Sp|MU6z<9zOZI~`{-N^htf$OKQO5ZZSqGO;Tpd{kPoL^0LV zY!4QuQpVFOk}^bv5Jj)jvgWT}zJL1s{Ndx5KR-YJl?C^%Tv1di6)T!2qK&Z=FEf;4 zDOtx;HpF5nW`a|%Ou4fD*iSm(8AS8S$%S>J*TL{w2u>p@gF7u|rg9@brAX_d9yg;r zi~jNa{qwg^&)-nemM5K$G2h15FCRaB`}XP258uE3@%-)M*U$fahYy)jTeR1C(chkz zlRCfltOQEI4U$W#eVnNrDOzv%4u6PslTx0?f2Cn8AB4ud$f543WM741PP_3wjTByy zD&(AU!O4Ss5yji~&eXSEUbePzQTX;?(G$$r4*$YcviPEE1}Es-n8`7m(JO#6mEV+ zDm!>8-fi%;g~5T?nQZ6HB>-%Fgl6ter~)ml6%tu7@Mi{Wqvp&fH{IBEpY@FMKM6 z5L_sZi{pL_TQHLtRk5q_c<%Xg;9YbK?7;1RFg08{gN2BY!%|-g*gzt1USDswY$l-A z$*9;oBkMf>ro5uSV5m|KXP<%y!-`|Jyz!eDgg1(+IhY2O4Z;KdkTRv&1#j#tFE?@N zPJ^&v3h5V{EnP#5iA}T`^ZeB75vERz*^(4{93MC|fShA=R1G(cyT^w!;a;q+VN_3F zzWz4{7q|{bx_uombJR)IGe7JuRs=3BL)^=F3LTSAwdvmk#ZLH4tDPJ#hf@7{eZ)3Y zonUvE&!xI>Sjy_M%=qE+^Y^b`oB=^s=GOQ`+(^PPWarMB>UuY&k}2SF}ta`Af$ zz$<97w6cuy1eDW@H~o3<*Xzh`Y13WGTld<9hC$q9fyczuk~{C`%f3ANEXfhXwm!T9RJ}$jP5R}@QLpNR_nll79wctYTG>|)W^D8B?N=i?8G3C)H6|=9D z1HRtw^zRT06%&c}cuE{KJAlfPe6ys~pu9b{MbIK)6WXAi!5v!@1my3LMiLmN>i_rM~)x z(wyGSU8#i}=oydIZtC^?*c}0SX1Y*uqaQm4JVq(R`ndc4JZj`xLDg!Q83+fJ-wA`X z&Cs`wai@72b?L@A$|SRRg>N4}-0)zyjNU==gCo9kf=Auw1Zr#;_s058y7S{actI{I z-1stN0B=L&>rG!zyHkvauykBoB0#ldbIL)<5#PM+%FD~{+cmNn*KRO9+H%wO7Qt=l 
zeM8{&rKVg58&*j*)S5TUmLX<#jg4F`uY-o2ghV2LkbzFns4(Od+4`hkWCgE|R`Tz= z9B6iQ$<|7^1VL>KMxS=h#!a?#4}6Y}qhPG``nWA!Y)qLP)>0>CuUnvp#xz`eKRgm` zVRaZOoUkH*Tl5BGVDp#ta~+C5J5}z0g{Vq-fc4p_fx{vX3$fq(778PA>^A3MyrcFoVS>kVe^tl zz9FkR$V+l;iLqh*eK(%Q5*aYk&{V5gHda@vqh)B2*3h8V5c7?*x&^Dlpe-_!1rJsz z;?Qm9p}Z5ng434)3ma6P5s`14^)x^B?@MD~#wKkw$2I*Z7RZb?r#HpS6zM>a%)+7%(wx4n(xMcli$5Ofb z%j2;SB2TV?eNRtwNd>cS^iBB05splUuGwb?**qYI|3;+WnG*PQ3+Lgj>e0h6SFx&2 zf!+iiLI$l?7QF}u(5R&8SHcO^&xT(K16+%D8eK*zN5%$7Lo7kkH?Zuyo23}lj$zup zgCJnjl65pS&F>!&(wQhjfZ3SjP8zdfjL{a9))X|bK<3f4N=*#GEJV?K;I5Gj5O-*i z&%De@;Ikqv*_jnRdihYqh2~_XSs{KF!J^4|?dIm88WELQJYWcH-hnG{2x(+DA3G?7yM_ZX zth0kJk0b|l=D#(x-c6(|n4@F4`qGB|9vFhP8LHp4i5V7iQjMz45u9`l0`F;Ja3~xI zMVirU$nDT}*zXu+rI;uCNLVP*-h?$dj9faZ7I_J_ms7hXp9JYHK2o*~taC8}@S?E5 z!!1m|9F6tK;{xwerJW_umR6r%_j@-Xad6bAQs`bRjbVNex%iEXTXA;YnkCixGZSkG zRNN<9J0)iyM3tGmvbiVahkh96eKKMyQaF{52~vU_8X95vJUvVn%*4k~s3K3LJ?u*H zqBBFO9L!^X?n-jb#M0bK{1T!n<~|CdWp;0_mA&g91}DTCcqS*iG;$$p>o}v;A=3c> zdND_&*vj*(fD>oPQElefpRNF5I5O}}$+&7vIan~BF;QrsGJ{J?4U<1iGdro`K2lIi z14AA3pRNeLgRO(FR{AL-Jxs9$TzGYmc?oLiIFVwz%T(5JyfaB-F~T9Pi9W6@N@TZU zqquZKk-Ah7b{g3cwM%aY2a(9;`hluzeq+f)oXPbAD5D2c_Gq#1bCg=ev?wBEzIoqD zBQDwO(w>!#u904PzrAj4#64v1s#^s~EetF)+O?*KeVN%+mLp9oVqG7gbignp=@uLr z&fV`l6k&A?u5zhN8uysy(1wYbUyk`p7UREOW6n7+$Zn+3NN=%{%X*%{V~sGn)3w9+ zl3X)nEkk0ng7J;^2V?A6xN61xz?aHGn>gVvl&(qbZEc(56}iTej}Ix~p`NG=Tf zD$DL{%4icZ4Ec1bel-)%BD@9mSZ+m5W3yg}D{`P!Npk?JyzHCq1U)c3oOUh7&%5?v zq%uRXjOiQJJMCsr8;)PTKE!1>D~4;7rC(o5ueV60M%I3DLy)}4NvTGEQ_}m~NQZ_Q z8(4Dc=0_-ys9VF-3x55t{HGYFamKrO-X966NviD88hVuWsN(`o%ATcbw_#5_kZ3eZ zM!AL^-3NtK9Y-sN<^csLy>xc87iX|#vF>BN%K@BmmTZOkgN7z3my0&&;8TJoT=ufM zrE8b5w+?)Y(+iGSzoy*S&`J@{yLktvpGeoFi-@N~o7SYF`RO+==cl;~5cidix>m&> zV}YNuOl%rGqsAN&bZb6q*XWR31i>}&>C!LGL+2}UYg1y4Df1eQJ)Vmc>LSPgMkuD} zwgd{Uf{jd+D~sQqpGf$RRQseXCEd010U7lo1ngI{1J(|8jd|E4GOywe9!`g|o{&0d zox8PgpX}E{I0a23}A?T4I#Fmct9pvR5S*i@mIxrXJMwS}Xee87?&=z7VH@FBo zu&SlD|Dxl4$A1y@ZZNxVVYvk@gC7miyAqHZ+hHYBW2dg!FeMxWvoqG&6)_PmOrt#c z;HLq07YmyBK1F4%Kr~)AlD(JdvPIvM7VPo$b}yZ~Ol2iGw`Th)Gr;ith?&bgT`Sj+ zZ^d;`%{4@gX8oLQI3X^g82N_U6*RU9oLsd4=yZHE{Qw32Xw^#N9LE_rI$vCO!X$7&P2p3hs^bdqAD+81p{P@l=6 zi`=j@I1QN#=M(gQlUcxR zb7?9ZWCGV_QxD2iag=^Sw@JHdTtR@ z7$HdIwjA3a=)-;wT|SJcq(51pS8kLr;$#}=Jr^9|Arsp|STcdo#YH88$VzKfGm>B4 zW4&30*f?i<5IwW`{9*%CvCeN1nCB~LmSk>;YDn8)nKTxrQ{B|&joVR#+9!F>I#Ty*&L;>Y4EP!_ z^xe~?$3|=)ZCaZ);8TIBJo+N&@i|N~jqa)I(~M7l6jG5aAaJcSDhhw)(2IoJlDD4U z$>%_+)l+zwt|KBihMPE*2D+B()S-HAIGO}+cd}aO2I1cxy8-z!C4t9t1MeFxp6?jb zG7++ALf)a3^}6wTVY>DY0|W}^WvBYarxU&z%R1J2kg(srdn(0Des$6ovsMh(c(;({ zJc>07>!)teqlm|?*9gPhy*r+vAnijdZg+?4+)bja-sq0K`4}Zulsk${_LS6*9bd3Pg5tVpzJ4A>$wR_qK=vdYGx?}A4xUd z(j|7}gd8vN&mhy`b{j+;QIYsPgciQ{j8Lft9bLztE^~g-EJ&0r{6i4A zuNYjzg)lTYA+2Pzo*lei>8SzqXS{d};*CfhCto+OGeU`j;6_X@Z<&xX(WFYY8-r@3 zP~S`!DgkHC^jkTQOTfZ8V#^9KGsqLBpBLxi>yR-#%opstzk&0gN8eq$o}%b{kJaNU z8|;&KEvq>8#73`SH+207i721T8I9Y#jj8eJS+q5jyp0KTJBC2N2%&Cyw6yY;%po!f zx^Z&T+?{rN(n7M;Xm*ptho9oLAm5pFPuk+MB0j)Y9qKk7*5XOE-c{Vj1n7{c^^WD? 
zJfEi?pR%ar^xD4RS%p!d!5D6W79aXqx4IuCcCKDgTT%cVW5v5&M!A4+tZF6|n5iAb zoAMlw);8kd`vPKGAJXDRR9qVpo19amM*H{Mqu#Kl#!0--59;=ezof>uK@BXF$TX5p zZ09G!F4nwK(WibN&|xc#@iH%V9}QbvZq=@piLvT*e|g##wfInlbdK>wzIRdF2_Dke z3WerVNvW3}Biu>| z5IONXQnYDx$lqqgSDitr^;zZNtR5qNxd$J4_2B(Bps;_N+?v@ysXI%q&Q0h|ZCo`) z@>zq8^HZ)?Z`Wh@{tSFPYT>f3+rFEizsC_1tkCX|U!Dt*B!UcG8pQl2 zi(2XJ8t+XG^J-e*w_GhfRrEwr<{X=iWBUh@0Wr5Hrd12YqflS7^eL_gVgo-}B_15y z#Vjt6fl%Gju!lrRwguCAS9_g%(WLGgQoFH($K8BaN(;q-bS0Ohl<@MWXg0C#xV; f>lmUyZXs+K+f4uZe*gdg|NjF3U!&zG#!UbK-Dk|> literal 0 HcmV?d00001 From bfcc974be437780d641660016951c7de391cb088 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 22 Nov 2022 14:43:53 -0500 Subject: [PATCH 130/266] store mean scaled metric as mean score (#703) --- workflow/parse_nextflow.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py index a88fdf5417..336aa3cb9a 100644 --- a/workflow/parse_nextflow.py +++ b/workflow/parse_nextflow.py @@ -229,7 +229,9 @@ def compute_ranking(dataset_results): method_names[method_idx]: rank + 1 for rank, method_idx in enumerate(np.argsort(metric_sums)[::-1]) } - return final_ranking + for method_name, metrics_sum in zip(method_names, metric_sums): + dataset_results[method_name]["mean_score"] = metrics_sum / len(metric_names) + return dataset_results, final_ranking def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): @@ -239,12 +241,12 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): name=dataset.metadata["dataset_name"], data_url=dataset.metadata["data_url"], data_reference=dataset.metadata["data_reference"], - headers=dict(names=["Rank"], fixed=["Name", "Paper", "Library"]), + headers=dict(names=["Rank", "Mean score"], fixed=["Name", "Paper", "Library"]), results=list(), ) dataset_results_raw = normalize_scores(task_name, dataset_results_raw) dataset_results = drop_baselines(task_name, dataset_results_raw) - ranking = compute_ranking(dataset_results) + dataset_results, ranking = compute_ranking(dataset_results) metric_names = set() for method_name, rank in ranking.items(): method_results = dataset_results[method_name] @@ -262,6 +264,7 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): "CPU (%)": float(method_results["%cpu"].replace("%", "")), "Memory (GB)": parse_size_to_gb(method_results["peak_rss"]), "Rank": rank, + "Mean score": method_results["mean_score"], } for metric_name, metric_result in method_results["metrics"].items(): metric = openproblems.api.utils.get_function( From 8b4a31e4db25f34f9199ae538b0435939ed2643c Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 22 Nov 2022 15:52:12 -0500 Subject: [PATCH 131/266] time out after an hour of tests --- .github/workflows/run_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index f7f15927ed..fec225e4b3 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -307,6 +307,7 @@ jobs: cd .. 
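
An aside on the pytest step below: the `-k "${{ matrix.tests }}"` filter is fed
one of the expressions emitted by scripts/generate_test_matrix.py, added earlier
in this series. A minimal sketch of that output, assuming a single task package
named "denoising":

```
core_suites = ["(test_0_ or test_3_)"]
task_suites = ["test_1_", "test_2_"]
print(core_suites + [f"{suite} and denoising" for suite in task_suites])
# ['(test_0_ or test_3_)', 'test_1_ and denoising', 'test_2_ and denoising']
```
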
- name: Run tests + timeout-minutes: 60 run: pytest --cov=openproblems --cov-report=xml -vv --durations=15 --tb=native -k "${{ matrix.tests }}" - name: Upload coverage From 0c748c90c9d49e1e4088786061e30042aaeb36e9 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Wed, 23 Nov 2022 09:10:22 -0500 Subject: [PATCH 132/266] Fix proportions test (#702) * fix proportions test * Set atol * fix cell2location proportions --- openproblems/tasks/spatial_decomposition/api.py | 4 ++-- .../tasks/spatial_decomposition/methods/cell2location.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/openproblems/tasks/spatial_decomposition/api.py b/openproblems/tasks/spatial_decomposition/api.py index deb6a0187b..53a6a9797a 100644 --- a/openproblems/tasks/spatial_decomposition/api.py +++ b/openproblems/tasks/spatial_decomposition/api.py @@ -41,8 +41,8 @@ def check_method(adata: AnnData, is_baseline=False): assert isinstance(adata.obsm["proportions_true"], np.ndarray) assert np.all(np.isfinite(adata.obsm["proportions_true"])) assert adata.obsm["proportions_pred"].shape == adata.obsm["proportions_true"].shape - proportions_sum = np.sum(adata.obsm["proportions_true"], axis=1) - np.testing.assert_allclose(proportions_sum, 1) + proportions_sum = np.sum(adata.obsm["proportions_pred"], axis=1) + np.testing.assert_allclose(proportions_sum, 1, atol=1e-6) return True diff --git a/openproblems/tasks/spatial_decomposition/methods/cell2location.py b/openproblems/tasks/spatial_decomposition/methods/cell2location.py index 7907c79d86..a32ec54597 100644 --- a/openproblems/tasks/spatial_decomposition/methods/cell2location.py +++ b/openproblems/tasks/spatial_decomposition/methods/cell2location.py @@ -147,6 +147,9 @@ def _cell2location( ) adata.obsm["proportions_pred"] = adata.obsm["q05_cell_abundance_w_sf"].values + adata.obsm["proportions_pred"] /= adata.obsm["proportions_pred"].sum(axis=1)[ + :, None + ] adata.uns["method_code_version"] = check_version("cell2location") return adata From 725ff0c46140aaa6bbded68646256f64bc63df6d Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Wed, 23 Nov 2022 10:32:02 -0500 Subject: [PATCH 133/266] Task denoising dataset pancreas (#593) * Create mouse_blood_olssen_labelled.py * Update __init__.py * Update mouse_blood_olssen_labelled.py * pre-commit * Update mouse_blood_olssen_labelled.py * pre-commit * add image spec * pre-commit * Rename mouse_blood_olssen_labelled.py to pancreas.py * Update pancreas.py * Update __init__.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * pre-commit * Update pancreas.py * Update pancreas.py * pre-commit * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Update pancreas.py * Improve dataset name Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- .../tasks/denoising/datasets/__init__.py | 1 + .../tasks/denoising/datasets/pancreas.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 
openproblems/tasks/denoising/datasets/pancreas.py

diff --git a/openproblems/tasks/denoising/datasets/__init__.py b/openproblems/tasks/denoising/datasets/__init__.py
index 7c5a8d850e..6f8abdf0a7 100644
--- a/openproblems/tasks/denoising/datasets/__init__.py
+++ b/openproblems/tasks/denoising/datasets/__init__.py
@@ -1,2 +1,3 @@
+from .pancreas import pancreas
 from .pbmc import pbmc
 from .tabula_muris_senis import tabula_muris_senis_lung_random
diff --git a/openproblems/tasks/denoising/datasets/pancreas.py b/openproblems/tasks/denoising/datasets/pancreas.py
new file mode 100644
index 0000000000..c18ddbeee1
--- /dev/null
+++ b/openproblems/tasks/denoising/datasets/pancreas.py
@@ -0,0 +1,19 @@
+from ....data.pancreas import load_pancreas
+from ....tools.decorators import dataset
+from . import utils
+
+
+@dataset(
+    dataset_name="Pancreas (inDrop)",
+    data_url=load_pancreas.metadata["data_url"],
+    data_reference=load_pancreas.metadata["data_reference"],
+    dataset_summary="Human pancreatic islet scRNA-seq data from 6 datasets "
+    "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, "
+    "and SMARTER-seq). Here we just use the inDrop1 batch, which includes "
+    "1937 cells × 15502 genes.",
+    image="openproblems-python-extras",
+)
+def pancreas(test=False):
+    adata = load_pancreas(test=test, keep_techs=["inDrop1"])
+    adata = utils.split_data(adata)
+    return adata

From b065ac1369410571a6d1fe180c90ef73cb1babd3 Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Fri, 25 Nov 2022 14:06:46 -0500
Subject: [PATCH 134/266] Perfect score is 1, not inf

---
 .../_cell_cell_communication/_common/metrics/odds_ratio.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py
index 9fcabd7394..cfb361f12b 100644
--- a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py
+++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py
@@ -39,6 +39,6 @@ def odds_ratio(adata):
         # perfect score
         oddsratio = np.inf
     else:
-        oddsratio = _sigmoid_transform(numerator / denominator)
+        oddsratio = numerator / denominator
 
-    return oddsratio
+    return _sigmoid_transform(oddsratio)

From 3bb88517eb327bd8733a1a9a4c2fa9ea724dc987 Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Sat, 26 Nov 2022 15:27:17 -0500
Subject: [PATCH 135/266] Upload coverage in a single job (#707)

* try to run codecov only once

* quo

* download all artifacts by path

* remove artifacts on completion

* mkdir

* fix filenames?

* fix path

* cleanup

* single quotes

* exclude .

* fix file format?

* fix upload

* checkout

* Switch order

* remove redundant

* temp

* Revert "temp"

This reverts commit b5604f4ea13dd51535d9f0eadeee734806c9ddcf.
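
For reference, the filename mangling in the run_tester step below, i.e.
`sed 's/[^a-z0-9\.]/_/g'` applied to 'coverage_${{ matrix.tests }}.xml', can be
mirrored in Python. A minimal sketch; `coverage_filename` is a hypothetical
helper, not part of this patch:

```
import re

def coverage_filename(tests: str) -> str:
    # mirror the sed call: every character outside [a-z0-9.] becomes "_"
    return re.sub(r"[^a-z0-9.]", "_", f"coverage_{tests}.xml")

assert coverage_filename("(test_0_ or test_3_)") == "coverage__test_0__or_test_3__.xml"
```
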
--- .github/workflows/run_tests.yml | 48 +++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index fec225e4b3..6c7f6d4bcd 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -308,34 +308,48 @@ jobs: - name: Run tests timeout-minutes: 60 - run: pytest --cov=openproblems --cov-report=xml -vv --durations=15 --tb=native -k "${{ matrix.tests }}" - - - name: Upload coverage - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - flags: unittests - fail_ci_if_error: ${{ github.repository == 'openproblems-bio/openproblems' }} - verbose: true + run: | + pytest --cov=openproblems --cov-report=xml -vv --durations=15 --tb=native -k "${{ matrix.tests }}" + mkdir -p coverage + mv coverage.xml "$(echo 'coverage_${{ matrix.tests }}.xml' | sed 's/[^a-z0-9\.]/_/g')" - - name: Upload check results on fail - if: failure() + - name: Upload coverage to GitHub Actions uses: actions/upload-artifact@main with: - name: ${{ matrix.config.name }}_results - path: results + path: coverage_*.xml + name: coverage - state_completion: + upload_coverage: needs: run_tester runs-on: ubuntu-latest if: >- always() && !endsWith(github.event.head_commit.message, '# ci skip') && needs.run_tester.result == 'success' + steps: - - name: State that tests succeeded - run: | - echo "We did it! 🥳🥳🥳" + + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Download coverage from GitHub Actions + uses: actions/download-artifact@v3 + with: + name: coverage + + - name: Upload coverage + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + flags: unittests + fail_ci_if_error: ${{ github.repository == 'openproblems-bio/openproblems' }} + verbose: true + + - name: Delete coverage artifacts + uses: geekyeggo/delete-artifact@v2 + with: + name: coverage run_benchmark: needs: run_tester From ab4bec497cd993325addb50462b3eac0829c500d Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sat, 26 Nov 2022 15:39:37 -0500 Subject: [PATCH 136/266] improve tabula muris senis docs --- openproblems/tasks/denoising/datasets/tabula_muris_senis.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py index 1e29d32288..a5c62f953e 100644 --- a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py +++ b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py @@ -4,12 +4,12 @@ @dataset( - "Tabula Muris Senis Lung (random split)", + "Tabula Muris Senis Lung", data_url=load_tabula_muris_senis.metadata["data_url"], data_reference=load_tabula_muris_senis.metadata["data_reference"], dataset_summary="All lung cells from Tabula Muris Senis, a 500k cell-atlas from 18 " - "organs and tissues across the mouse lifespan. Split into train/test randomly." - "24540 cells × 16160 genes across 3 time points.", + "organs and tissues across the mouse lifespan. Here we use just 10x data from lung." 
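+    # note the leading space in the fragment below: it supplies the separator
+    # that the original summary omitted between "randomly." and "24540 cells"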
+ " 24540 cells × 16160 genes across 3 time points.", image="openproblems-python-extras", ) def tabula_muris_senis_lung_random(test=False): From 5bc4eb1ddece0ea894e7258c8f6f911f6a7d6761 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Sat, 26 Nov 2022 22:02:41 -0500 Subject: [PATCH 137/266] Fix test --- test/test_3_cell_cell_communication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_3_cell_cell_communication.py b/test/test_3_cell_cell_communication.py index 479ab8520a..fc04d46542 100644 --- a/test/test_3_cell_cell_communication.py +++ b/test/test_3_cell_cell_communication.py @@ -128,7 +128,7 @@ def test_odds_ratio_no_match(): # force perfect score adata = task.methods.true_events(adata) m = metric(adata) - assert m is np.inf + assert m == 1 # force exception adata.uns["ccc_target"]["response"] = 0 From d965e48bc1f71838443943fdb4767bdfad2ec39a Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Sun, 27 Nov 2022 10:07:06 -0500 Subject: [PATCH 138/266] Fix test again --- .../_cell_cell_communication/_common/metrics/odds_ratio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py index cfb361f12b..21e9903e1a 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py +++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py @@ -34,7 +34,7 @@ def odds_ratio(adata): if denominator == 0: if numerator == 0: # undefined - oddsratio = np.nan + return np.nan else: # perfect score oddsratio = np.inf From a796e02e13a43e8861b124edbb9d287f162d4a14 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Sun, 27 Nov 2022 10:08:46 -0500 Subject: [PATCH 139/266] Fix references to `adata.layers["counts"]` (#698) * remove redundant references to layers['counts'] * test normalizations do not occur in-place * Rename test_5_tools.py to test_0_tools.py * Don't run normalize_total inplace * pre-commit * Set X in simulated data * allow for changes in shape * use isclose * require that shape and class not change in normalize() Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- openproblems/data/immune_cells.py | 1 - openproblems/data/pancreas.py | 1 - .../batch_integration_feature/api.py | 1 - .../tasks/dimensionality_reduction/README.md | 10 ++++---- .../methods/densmap.py | 2 ++ .../methods/neuralee.py | 1 + .../dimensionality_reduction/methods/pca.py | 1 + .../dimensionality_reduction/methods/phate.py | 1 + .../dimensionality_reduction/methods/tsne.py | 1 + .../dimensionality_reduction/methods/umap.py | 1 + .../metrics/nn_ranking.py | 1 + .../datasets/destvi/utils.py | 3 --- .../spatial_decomposition/datasets/utils.py | 2 +- openproblems/tools/normalize.py | 13 ++++++---- test/test_0_tools.py | 24 +++++++++++++++---- test/utils/data.py | 8 +++++-- 16 files changed, 47 insertions(+), 24 deletions(-) diff --git a/openproblems/data/immune_cells.py b/openproblems/data/immune_cells.py index ce740d14f0..81b1f6cd58 100644 --- a/openproblems/data/immune_cells.py +++ b/openproblems/data/immune_cells.py @@ -38,7 +38,6 @@ def load_immune(test=False): # NOTE: adata.X contains log-normalized data, so we're moving it adata.layers["log_normalized"] = adata.X adata.X = 
adata.layers["counts"] - del adata.layers["counts"] # Ensure there are no cells or genes with 0 counts utils.filter_genes_cells(adata) diff --git a/openproblems/data/pancreas.py b/openproblems/data/pancreas.py index ff0c0af843..98303becfc 100644 --- a/openproblems/data/pancreas.py +++ b/openproblems/data/pancreas.py @@ -58,7 +58,6 @@ def load_pancreas(test=False, keep_techs=None): # NOTE: adata.X contains log-normalized data, so we're moving it adata.layers["log_normalized"] = adata.X adata.X = adata.layers["counts"] - del adata.layers["counts"] # Ensure there are no cells or genes with 0 counts utils.filter_genes_cells(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py index 088cd38ea9..8baf5ff663 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py @@ -38,7 +38,6 @@ def sample_dataset(): adata.obsm["X_uni"] = sc.pp.pca(adata.X) adata.obs["batch"] = np.random.choice(2, adata.shape[0], replace=True).astype(str) adata.obs["labels"] = np.random.choice(5, adata.shape[0], replace=True).astype(str) - adata.layers["counts"] = adata.X adata.layers["log_normalized"] = adata.X.multiply( 10000 / adata.X.sum(axis=1) ).tocsr() diff --git a/openproblems/tasks/dimensionality_reduction/README.md b/openproblems/tasks/dimensionality_reduction/README.md index 96376edad1..0c78cb164d 100644 --- a/openproblems/tasks/dimensionality_reduction/README.md +++ b/openproblems/tasks/dimensionality_reduction/README.md @@ -56,11 +56,11 @@ slot as the input to the method. Raw counts are also stored in `adata.layers["co by the standard pre-processing functions, if a method performs its own pre-processing it should also do this for use by metrics. For most methods a standard pre-processing with the `log_cpm_hvg()` function is used which normalizes the expression matrix to counts -per million (CPM), performs a log transformation and subsets the data to highly-variable -genes (HVGs) as selected by scanpy's `high_variable_genes(adata, n_top_genes=n_genes, -flavor="cell_ranger")` (1000 genes by default). Variants of methods can be created by -applying different pre-processing prior to the method itself (see `phate.py` for an -example). +per million (CPM), performs a log transformation and annotates highly-variable +genes (HVGs) (as selected by scanpy's `high_variable_genes(adata, n_top_genes=1000, +flavor="cell_ranger")`) to `adata.var["highly_variable"]`. Variants of methods can be +created by applying different pre-processing prior to the method itself (see `phate.py` +for an example). 
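+
+In practice a method's pre-processing therefore reads like the following sketch
+(a sketch only, assuming `adata` holds raw counts as described above):
+
+```
+import scanpy as sc
+from openproblems.tools.normalize import log_cpm_hvg
+
+adata = log_cpm_hvg(adata)                             # annotate the 1k HVGs
+adata = adata[:, adata.var["highly_variable"]].copy()  # subset to the HVGs
+sc.tl.pca(adata, n_comps=50, svd_solver="arpack")      # then embed, e.g. with PCA
+```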
## The methods diff --git a/openproblems/tasks/dimensionality_reduction/methods/densmap.py b/openproblems/tasks/dimensionality_reduction/methods/densmap.py index b7a0285774..a31567d3cc 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/densmap.py +++ b/openproblems/tasks/dimensionality_reduction/methods/densmap.py @@ -30,6 +30,7 @@ def _densmap(adata, obsm=None): @_densmap_method(method_name="densMAP (logCPM, 1kHVG)") def densmap_logCPM_1kHVG(adata, test: bool = False): adata = log_cpm_hvg(adata) + adata = adata[:, adata.var["highly_variable"]].copy() return _densmap(adata) @@ -38,5 +39,6 @@ def densmap_pca_logCPM_1kHVG(adata, test: bool = False): import scanpy as sc adata = log_cpm_hvg(adata) + adata = adata[:, adata.var["highly_variable"]].copy() sc.tl.pca(adata, n_comps=50, svd_solver="arpack") return _densmap(adata, obsm="X_pca") diff --git a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py index 279123b42f..41f55dbb65 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py +++ b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py @@ -97,4 +97,5 @@ def neuralee_default(adata: AnnData, test: bool = False) -> AnnData: @_neuralee_method(method_name="NeuralEE (CPU) (logCPM, 1kHVG)") def neuralee_logCPM_1kHVG(adata: AnnData, test: bool = False) -> AnnData: adata = log_cpm_hvg(adata) + adata = adata[:, adata.var["highly_variable"]].copy() return _neuralee(adata, test=test, normalize=False, subsample_genes=None) diff --git a/openproblems/tasks/dimensionality_reduction/methods/pca.py b/openproblems/tasks/dimensionality_reduction/methods/pca.py index 1c7c186471..20a9d08902 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/pca.py +++ b/openproblems/tasks/dimensionality_reduction/methods/pca.py @@ -15,6 +15,7 @@ def pca_logCPM_1kHVG(adata, test: bool = False): import scanpy as sc adata = log_cpm_hvg(adata) + adata = adata[:, adata.var["highly_variable"]].copy() sc.tl.pca(adata, n_comps=50, svd_solver="arpack") adata.obsm["X_emb"] = adata.obsm["X_pca"][:, :2] adata.uns["method_code_version"] = check_version("scikit-learn") diff --git a/openproblems/tasks/dimensionality_reduction/methods/phate.py b/openproblems/tasks/dimensionality_reduction/methods/phate.py index 9384880d13..43009bd34d 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/phate.py +++ b/openproblems/tasks/dimensionality_reduction/methods/phate.py @@ -45,4 +45,5 @@ def phate_sqrt(adata, test: bool = False, n_pca: Optional[int] = None): @_phate_method(method_name="PHATE (logCPM, 1kHVG)") def phate_logCPM_1kHVG(adata, test: bool = False, n_pca: Optional[int] = None): adata = log_cpm_hvg(adata) + adata = adata[:, adata.var["highly_variable"]].copy() return _phate(adata, test=test, n_pca=n_pca) diff --git a/openproblems/tasks/dimensionality_reduction/methods/tsne.py b/openproblems/tasks/dimensionality_reduction/methods/tsne.py index e19fa6cd3e..0a9bf7a44e 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/tsne.py +++ b/openproblems/tasks/dimensionality_reduction/methods/tsne.py @@ -16,6 +16,7 @@ def tsne_logCPM_1kHVG(adata, test: bool = False, n_pca=50): import scanpy as sc adata = log_cpm_hvg(adata) + adata = adata[:, adata.var["highly_variable"]].copy() sc.tl.pca(adata, n_comps=n_pca, svd_solver="arpack") sc.tl.tsne(adata, use_rep="X_pca", n_pcs=n_pca) adata.obsm["X_emb"] = adata.obsm["X_tsne"] diff --git a/openproblems/tasks/dimensionality_reduction/methods/umap.py 
b/openproblems/tasks/dimensionality_reduction/methods/umap.py index b9e73adf90..6a8f96bf43 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/umap.py +++ b/openproblems/tasks/dimensionality_reduction/methods/umap.py @@ -16,6 +16,7 @@ def umap_logCPM_1kHVG(adata, test: bool = False, n_pca=50): import scanpy as sc adata = log_cpm_hvg(adata) + adata = adata[:, adata.var["highly_variable"]].copy() sc.tl.pca(adata, n_comps=50, svd_solver="arpack") sc.pp.neighbors(adata, use_rep="X_pca", n_pcs=n_pca) sc.tl.umap(adata) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py index 690b95eaeb..9ee479dbb7 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py @@ -157,6 +157,7 @@ def _high_dim(adata: AnnData) -> np.ndarray: adata.X = adata.layers["counts"] adata = log_cpm_hvg(adata) + adata = adata[:, adata.var["highly_variable"]].copy() high_dim = adata.X return high_dim.A if issparse(high_dim) else high_dim diff --git a/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py index b48ef15b12..95a50c079e 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py +++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/utils.py @@ -226,9 +226,6 @@ def generate_synthetic_dataset( st_anndata.uns["key_clustering"] = key_list st_anndata.uns["target_list"] = [1] + target_list - sc_anndata.layers["counts"] = sc_anndata.X.copy() - st_anndata.layers["counts"] = st_anndata.X.copy() - merged_anndata = merge_sc_and_sp(sc_anndata, st_anndata, test=test) return merged_anndata diff --git a/openproblems/tasks/spatial_decomposition/datasets/utils.py b/openproblems/tasks/spatial_decomposition/datasets/utils.py index 07f8b604ca..bdb7ffe648 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/utils.py +++ b/openproblems/tasks/spatial_decomposition/datasets/utils.py @@ -139,7 +139,7 @@ def generate_synthetic_dataset( adata_spatial.obsm["n_cells"] = sp_c adata_merged = merge_sc_and_sp(adata, adata_spatial, test=test) adata_merged.X[adata_merged.X == np.inf] = adata_merged.X.max() # remove inf - adata_merged.layers["counts"] = adata_merged.X.copy() + adata_merged.layers["counts"] = adata_merged.X return adata_merged diff --git a/openproblems/tools/normalize.py b/openproblems/tools/normalize.py index e124cc0dab..c43b956610 100644 --- a/openproblems/tools/normalize.py +++ b/openproblems/tools/normalize.py @@ -44,8 +44,9 @@ def log_scran_pooling(adata: ad.AnnData) -> ad.AnnData: def _cpm(adata: ad.AnnData): import scanpy as sc - adata.layers["counts"] = adata.X.copy() - sc.pp.normalize_total(adata, target_sum=1e6, key_added="size_factors") + adata.X = sc.pp.normalize_total( + adata, target_sum=1e6, key_added="size_factors", inplace=False + )["X"] @decorators.normalizer @@ -77,8 +78,11 @@ def sqrt_cpm(adata: ad.AnnData) -> ad.AnnData: def log_cpm_hvg(adata: ad.AnnData, n_genes: int = 1000) -> ad.AnnData: """Normalize logCPM HVG - Normalize data to log counts per million and select n_genes highly - variable genes + Normalize data to log counts per million and annotate n_genes highly + variable genes. 
In order to subset the data to HVGs, use + ``` + adata = adata[:, adata.var["highly_variable"]].copy() + ``` """ import scanpy as sc @@ -91,6 +95,5 @@ def log_cpm_hvg(adata: ad.AnnData, n_genes: int = 1000) -> ad.AnnData: n_genes = int(adata.n_vars * 0.5) sc.pp.highly_variable_genes(adata, n_top_genes=n_genes, flavor="cell_ranger") - adata = adata[:, adata.var["highly_variable"]].copy() return adata diff --git a/test/test_0_tools.py b/test/test_0_tools.py index d0667fd9a8..cd45106482 100644 --- a/test/test_0_tools.py +++ b/test/test_0_tools.py @@ -17,11 +17,12 @@ def _dense_data(X): @parameterized.parameterized_class( - ("normalizer"), + ("normalizer", "sparse"), [ - (staticmethod(normalizer),) - for normalizer in openproblems.utils.get_callable_members( - openproblems.tools.normalize + (staticmethod(normalizer), sparse) + for normalizer, sparse in zip( + openproblems.utils.get_callable_members(openproblems.tools.normalize), + [True, False], ) ], class_name_func=utils.name.name_test, @@ -32,16 +33,29 @@ class TestNormalizeX(unittest.TestCase): @classmethod def setUpClass(cls): """Generate and normalize data.""" - cls.adata = utils.data.data() + cls.adata = utils.data.data(sparse=cls.sparse) + cls.counts = cls.adata.layers["counts"].copy() cls.cache_name = cls.normalizer.__name__ assert utils.asserts.assert_finite(cls.adata.X) assert cls.cache_name not in cls.adata.layers cls.adata = cls.normalizer(cls.adata) + def test_shape(self): + """Test that normalized data is the same shape as the input.""" + assert self.adata.X.shape == self.counts.shape + + def test_class(self): + """Test that normalized data is the same class as the input.""" + assert isinstance(self.adata.X, type(self.counts)) + def test_finite(self): """Test that normalized data is finite.""" assert utils.asserts.assert_finite(self.adata.X) + def test_not_inplace(self): + """Test that normalization does not happen inplace.""" + utils.asserts.assert_array_equal(self.adata.layers["counts"], self.counts) + def test_layers(self): """Test that normalized data is cached in adata.layers.""" assert self.cache_name in self.adata.layers diff --git a/test/utils/data.py b/test/utils/data.py index 2e1d8921b9..8b71216fc3 100644 --- a/test/utils/data.py +++ b/test/utils/data.py @@ -1,10 +1,14 @@ import anndata import numpy as np +import scipy.sparse -def data(obsm=None): +def data(sparse=False, obsm=None): """Create fake data.""" - adata = anndata.AnnData(np.random.poisson(2, (100, 30)).astype(np.float32)) + data = np.random.poisson(2, (100, 30)).astype(np.float32) + if sparse: + data = scipy.sparse.csr_matrix(data) + adata = anndata.AnnData(data, layers={"counts": data}) if obsm is not None: adata.obsm[obsm] = adata.X * 2 + 1 adata.uns["{}_obs".format(obsm)] = np.arange(adata.shape[0]) + 5 From 3bfa313a509b04bbcb12d7f6f0d1d66b7f5fb06c Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 28 Nov 2022 11:21:20 -0500 Subject: [PATCH 140/266] use concurrency --- .github/workflows/check_r_dependencies.yml | 4 ++++ .github/workflows/pre-commit.yml | 8 ++++---- .github/workflows/process_results.yml | 4 ++++ .github/workflows/run_tests.yml | 21 ++++----------------- 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/.github/workflows/check_r_dependencies.yml b/.github/workflows/check_r_dependencies.yml index ab5d30f2a7..d95e7ddd82 100644 --- a/.github/workflows/check_r_dependencies.yml +++ b/.github/workflows/check_r_dependencies.yml @@ -12,6 +12,10 @@ on: branches: - 'main' +concurrency: + group: ${{ github.workflow }}-${{ github.ref 
}} + cancel-in-progress: true + jobs: check-r-dependencies: runs-on: ubuntu-latest diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 47c14ff13e..c2e9331f64 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -6,6 +6,10 @@ on: pull_request: types: [opened, synchronize, reopened, ready_for_review] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: pre-commit: runs-on: ubuntu-latest @@ -22,10 +26,6 @@ jobs: ) steps: - - name: Cancel Previous Runs - uses: styfle/cancel-workflow-action@0.11.0 - with: - access_token: ${{ github.token }} - uses: actions/checkout@v3 with: diff --git a/.github/workflows/process_results.yml b/.github/workflows/process_results.yml index 5838f52773..4b70b69d00 100644 --- a/.github/workflows/process_results.yml +++ b/.github/workflows/process_results.yml @@ -7,6 +7,10 @@ on: branches: - 'test_process' +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: process_results: runs-on: ubuntu-latest diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 6c7f6d4bcd..810f5ce898 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -9,25 +9,13 @@ on: pull_request: types: [opened, synchronize, reopened, ready_for_review] -jobs: - cancel_previous_runs: - if: | - !endsWith(github.event.head_commit.message, '# ci skip') && - ( - startsWith(github.ref, 'refs/heads') || - startsWith(github.ref, 'refs/tags') || - github.event.pull_request.draft == false - ) - runs-on: ubuntu-latest - steps: - - name: Cancel Previous Runs - uses: styfle/cancel-workflow-action@0.11.0 - with: - access_token: ${{ github.token }} +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true +jobs: build_images: - needs: cancel_previous_runs runs-on: ubuntu-latest if: | !endsWith(github.event.head_commit.message, '# ci skip') && @@ -145,7 +133,6 @@ jobs: echo "images=[\"$(paste -s -d ' ' <(echo $IMAGES) | sed 's/ */\",\"/g')\"]" >> $GITHUB_OUTPUT create_matrix: - needs: cancel_previous_runs runs-on: ubuntu-latest if: | !endsWith(github.event.head_commit.message, '# ci skip') From 4866d1ee5108397f562f5fffbce525fc3932dda7 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 28 Nov 2022 13:36:49 -0500 Subject: [PATCH 141/266] skip matrix on draft --- .github/workflows/run_tests.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 810f5ce898..d813b2c9d2 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -135,7 +135,12 @@ jobs: create_matrix: runs-on: ubuntu-latest if: | - !endsWith(github.event.head_commit.message, '# ci skip') + !endsWith(github.event.head_commit.message, '# ci skip') && + ( + startsWith(github.ref, 'refs/heads') || + startsWith(github.ref, 'refs/tags') || + github.event.pull_request.draft == false + ) outputs: matrix: ${{ steps.generate-matrix.outputs.matrix }} @@ -206,11 +211,6 @@ jobs: needs.build_images.result == 'success' || needs.build_images.result == 'skipped' ) && - ( - startsWith(github.ref, 'refs/heads') || - startsWith(github.ref, 'refs/tags') || - github.event.pull_request.draft == false - ) && !startsWith(github.ref, 'refs/heads/test_docker') && !startsWith(github.ref, 'refs/heads/test_benchmark') && !startsWith(github.ref, 'refs/heads/test_full_benchmark') && From 
f42dcb4e4c16944382f660b2c974e5b9d8f9eee1 Mon Sep 17 00:00:00 2001 From: Vitalii Kleshchevnikov Date: Tue, 29 Nov 2022 14:03:27 +0000 Subject: [PATCH 142/266] cell2location with max normalisation flexibility (#704) * cell2location with max normalisation flexibility I think this is necessary to properly analyse the data where total UMI is completely decoupled from biological RNA count https://github.com/openproblems-bio/openproblems/issues/589#issuecomment-1325831148 * Update __init__.py * pre-commit Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- .../spatial_decomposition/methods/__init__.py | 1 + .../methods/cell2location.py | 31 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/openproblems/tasks/spatial_decomposition/methods/__init__.py b/openproblems/tasks/spatial_decomposition/methods/__init__.py index 2610c91831..68c1251809 100644 --- a/openproblems/tasks/spatial_decomposition/methods/__init__.py +++ b/openproblems/tasks/spatial_decomposition/methods/__init__.py @@ -1,6 +1,7 @@ from .baseline import random_proportions from .baseline import true_proportions from .cell2location import cell2location_amortised_detection_alpha_20 +from .cell2location import cell2location_detection_alpha_1 from .cell2location import cell2location_detection_alpha_20 from .cell2location import cell2location_detection_alpha_20_nb from .cell2location import cell2location_detection_alpha_200 diff --git a/openproblems/tasks/spatial_decomposition/methods/cell2location.py b/openproblems/tasks/spatial_decomposition/methods/cell2location.py index a32ec54597..2caf9d4956 100644 --- a/openproblems/tasks/spatial_decomposition/methods/cell2location.py +++ b/openproblems/tasks/spatial_decomposition/methods/cell2location.py @@ -185,6 +185,37 @@ def cell2location_detection_alpha_20( ) +@_cell2location_method( + method_name="Cell2location (detection_alpha=1, reference hard-coded)" +) +def cell2location_detection_alpha_1( + adata, + detection_alpha=1, + n_cells_per_location=20, + hard_coded_reference=True, + amortised=False, + num_samples=None, + sc_batch_size=2500, + st_batch_size=None, + test: bool = False, + max_epochs_sc: Optional[int] = None, + max_epochs_st: Optional[int] = None, +): + return _cell2location( + adata, + detection_alpha=detection_alpha, + n_cells_per_location=n_cells_per_location, + hard_coded_reference=hard_coded_reference, + amortised=amortised, + num_samples=num_samples, + sc_batch_size=sc_batch_size, + st_batch_size=st_batch_size, + test=test, + max_epochs_sc=max_epochs_sc, + max_epochs_st=max_epochs_st, + ) + + @_cell2location_method( method_name="Cell2location (detection_alpha=20, reference NB without batch info)" ) From 8cfae942ab0d317ce9fa1647c0648ed93dfe3e46 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 29 Nov 2022 09:46:31 -0500 Subject: [PATCH 143/266] Add spectral baseline to dimensionality reduction (#705) * add spectral baseline * import --- .../methods/__init__.py | 1 + .../methods/baseline.py | 28 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/openproblems/tasks/dimensionality_reduction/methods/__init__.py b/openproblems/tasks/dimensionality_reduction/methods/__init__.py index 1ddab56615..08b09386d1 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/methods/__init__.py @@ -1,4 +1,5 
@@ from .baseline import high_dim_pca +from .baseline import high_dim_spectral from .baseline import random_features from .densmap import densmap_logCPM_1kHVG from .densmap import densmap_pca_logCPM_1kHVG diff --git a/openproblems/tasks/dimensionality_reduction/methods/baseline.py b/openproblems/tasks/dimensionality_reduction/methods/baseline.py index c2a19bc80f..ec8b5e71bf 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/baseline.py +++ b/openproblems/tasks/dimensionality_reduction/methods/baseline.py @@ -41,3 +41,31 @@ def high_dim_pca(adata, n_comps: Optional[int] = None, test=False): adata.obsm["X_emb"] = adata.obsm["X_pca"] adata.uns["method_code_version"] = check_version("openproblems") return adata + + +@method( + method_name="High-dimensional Laplacian Eigenmaps", + paper_name="High-dimensional Laplacian Eigenmaps (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def high_dim_spectral(adata, n_comps: Optional[int] = None, test=False): + # We wanted to use all features, but output must be dense + # so this is a close approximation + import umap + import umap.spectral + + if test: + n_comps = n_comps or 10 + else: # pragma: nocover + n_comps = n_comps or 200 + + graph = umap.UMAP(transform_mode="graph").fit_transform(adata.X) + adata.obsm["X_emb"] = umap.spectral.spectral_layout( + adata.X, graph, n_comps, random_state=None + ) + + adata.uns["method_code_version"] = check_version("openproblems") + return adata From 5f62be8c8dbffb3f84d77094645bc3e1aaeddf37 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 29 Nov 2022 10:14:33 -0500 Subject: [PATCH 144/266] make celltype clusters tighter in celltype random embedding/graph (#709) --- .../batch_integration_graph/methods/baseline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py index 015fecd0e8..e95153e25a 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py @@ -58,7 +58,7 @@ def _random_embedding(partition): embedding = OneHotEncoder().fit_transform( LabelEncoder().fit_transform(partition)[:, None] ) - embedding = embedding + np.random.uniform(-0.1, 0.1, embedding.shape) + embedding = embedding + np.random.uniform(-0.01, 0.01, embedding.shape) return embedding From 33581306f24d757fea9ada00e609afe9f5f28cba Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 29 Nov 2022 13:17:07 -0500 Subject: [PATCH 145/266] Send raw files to NBT repro (#713) * store mean scaled metric as mean score * # publish * send PR to nbt2022-reproducibility * don't delete qmd etc * temp * empty * verbose * skip * better logic * fix rsync args * include dirs * bugfix * remove temp --- .github/workflows/process_results.yml | 48 +++++++++++++++++++++++++-- .github/workflows/run_tests.yml | 18 +++++----- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/.github/workflows/process_results.yml b/.github/workflows/process_results.yml index 4b70b69d00..cdefe415f4 100644 --- a/.github/workflows/process_results.yml +++ b/.github/workflows/process_results.yml @@ -32,11 +32,24 @@ jobs: path: website token: ${{ 
secrets.GH_ACTIONS_WEBSITE_PAT }}
 
-      - name: Set up Git branch
+      - name: Checkout NBT reproducibility repo
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          repository: openproblems-bio/nbt2022-reproducibility
+          path: nbt2022-reproducibility
+          token: ${{ secrets.GH_ACTIONS_NBT_REPRODUCIBILITY_PAT }}
+
+      - name: Set up website Git branch
         working-directory: website
         run: |
           git checkout -b $UPDATE_BRANCH_NAME
 
+      - name: Set up nbt2022-reproducibility Git branch
+        working-directory: nbt2022-reproducibility
+        run: |
+          git checkout -b $UPDATE_BRANCH_NAME
+
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
@@ -70,6 +83,7 @@
           S3_URI="s3://openproblems-nextflow/cwd_example"
         fi
         aws s3 cp --quiet --recursive "${S3_URI}" /tmp/results/
+        rm -r nbt2022-reproducibility/results/*/*.json
         rm -r website/data/results/*/
         rm -r website/content/benchmarks/*/
         python openproblems/workflow/parse_nextflow.py /tmp website/data/results
@@ -87,11 +101,12 @@
       name: markdown
       path: website/content/benchmarks
 
-      - name: Remove raw output
+      - name: Move raw output
        if: |
          github.event_name == 'repository_dispatch' ||
          endsWith(github.event.head_commit.message, '# publish')
        run: |
+         rsync -v -r --include "*.raw.json" --include "*/" --exclude "*" website/data/results/ nbt2022-reproducibility/results
         rm website/data/results/*/*.raw.json
 
       - name: Push to openproblems-bio/website
@@ -105,7 +120,7 @@
        run: |
          git push origin "${UPDATE_BRANCH_NAME}"
 
-      - name: Create Pull Request
+      - name: Create website Pull Request
         if: |
           github.event_name == 'repository_dispatch' ||
           endsWith(github.event.head_commit.message, '# publish')
@@ -121,6 +136,33 @@
           author: "openproblems-bio "
           commit-message: "Update benchmark results # ci skip"
 
+      - name: Push to openproblems-bio/nbt2022-reproducibility
+        if: |
+          github.event_name == 'repository_dispatch' ||
+          endsWith(github.event.head_commit.message, '# publish')
+        shell: bash
+        working-directory: './nbt2022-reproducibility'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GH_ACTIONS_NBT_REPRODUCIBILITY_PAT }}
+        run: |
+          git push origin "${UPDATE_BRANCH_NAME}"
+
+      - name: Create nbt2022-reproducibility Pull Request
+        if: |
+          github.event_name == 'repository_dispatch' ||
+          endsWith(github.event.head_commit.message, '# publish')
+        uses: peter-evans/create-pull-request@v4
+        with:
+          branch: ${{ env.UPDATE_BRANCH_NAME }}
+          delete-branch: true
+          base: main
+          title: '[auto] Update benchmark results'
+          reviewers: scottgigante-immunai,rcannood
+          path: './nbt2022-reproducibility'
+          token: ${{ secrets.GH_ACTIONS_NBT_REPRODUCIBILITY_PAT }}
+          author: "openproblems-bio "
+          commit-message: "Update benchmark results # ci skip"
+
       - name: AWS S3 cleanup
         if: "github.event_name == 'repository_dispatch'"
         env:
diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index d813b2c9d2..48c4e31ca6 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -136,6 +136,10 @@ jobs:
     runs-on: ubuntu-latest
     if: |
       !endsWith(github.event.head_commit.message, '# ci skip') &&
+      !startsWith(github.ref, 'refs/heads/test_docker') &&
+      !startsWith(github.ref, 'refs/heads/test_benchmark') &&
+      !startsWith(github.ref, 'refs/heads/test_full_benchmark') &&
+      !startsWith(github.ref, 'refs/heads/test_process') &&
       (
         startsWith(github.ref, 'refs/heads') ||
         startsWith(github.ref, 'refs/tags') ||
@@ -210,11 +214,7 @@
       (
         needs.build_images.result == 'success' ||
         needs.build_images.result == 'skipped'
-      ) &&
-      (
-        startsWith(github.ref, 'refs/heads') ||
-        startsWith(github.ref, 'refs/tags') ||
-        github.event.pull_request.draft == false
-      ) &&
       !startsWith(github.ref, 'refs/heads/test_docker') &&
       !startsWith(github.ref, 
'refs/heads/test_benchmark') &&
       !startsWith(github.ref, 'refs/heads/test_full_benchmark') &&
       !startsWith(github.ref, 'refs/heads/test_process')
 
     strategy:
       fail-fast: false
@@ -339,16 +339,18 @@
           name: coverage
 
   run_benchmark:
-    needs: run_tester
+    needs:
+      - run_tester
+      - build_images
     runs-on: ubuntu-latest
     if: >-
       always() &&
       !endsWith(github.event.head_commit.message, '# ci skip') &&
+      needs.build_images.result == 'success' &&
       github.event_name == 'push' &&
       (
         needs.run_tester.result == 'success' ||
-        startsWith(github.ref, 'refs/heads/test_benchmark') ||
-        startsWith(github.ref, 'refs/heads/test_full_benchmark')
+        needs.run_tester.result == 'skipped'
       )
 
     steps:

From 0869273c7c023349b85680a9c766012166156346 Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Tue, 29 Nov 2022 14:03:47 -0500
Subject: [PATCH 146/266] Push task readmes to website from main (#715)

* store mean scaled metric as mean score

* # publish

* send PR to nbt2022-reproducibility

* don't delete qmd etc

* temp

* empty

* verbose

* skip

* better logic

* fix rsync args

* include dirs

* bugfix

* remove temp

* update website content from main, not benchmark

* don't run tests

* rename
---
 .github/workflows/process_results.yml        | 12 +--
 .github/workflows/run_tests.yml              |  1 +
 .github/workflows/update_website_content.yml | 87 ++++++++++++++++++++
 3 files changed, 90 insertions(+), 10 deletions(-)
 create mode 100644 .github/workflows/update_website_content.yml

diff --git a/.github/workflows/process_results.yml b/.github/workflows/process_results.yml
index cdefe415f4..d2087c890a 100644
--- a/.github/workflows/process_results.yml
+++ b/.github/workflows/process_results.yml
@@ -27,7 +27,7 @@ jobs:
       - name: Checkout website repo
         uses: actions/checkout@v3
         with:
-          fetch-depth: 0
+          fetch-depth: 1
           repository: openproblems-bio/website
           path: website
           token: ${{ secrets.GH_ACTIONS_WEBSITE_PAT }}
 
-      - name: Checkout NBT reproducibility repo
         uses: actions/checkout@v3
         with:
-          fetch-depth: 0
+          fetch-depth: 1
           repository: openproblems-bio/nbt2022-reproducibility
           path: nbt2022-reproducibility
           token: ${{ secrets.GH_ACTIONS_NBT_REPRODUCIBILITY_PAT }}
@@ -85,9 +85,7 @@
           aws s3 cp --quiet --recursive "${S3_URI}" /tmp/results/
           rm -r nbt2022-reproducibility/results/*/*.json
           rm -r website/data/results/*/
-          rm -r website/content/benchmarks/*/
           python openproblems/workflow/parse_nextflow.py /tmp website/data/results
-          python openproblems/workflow/generate_website_markdown.py website/content/benchmarks
 
       - name: Upload results
         uses: actions/upload-artifact@main
@@ -95,12 +93,6 @@
           name: results
           path: website/data/results
 
-      - name: Upload markdown
-        uses: actions/upload-artifact@main
-        with:
-          name: markdown
-          path: website/content/benchmarks
-
       - name: Move raw output
         if: |
           github.event_name == 'repository_dispatch' ||
diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 48c4e31ca6..27e95f4656 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -140,6 +140,7 @@ jobs:
     !startsWith(github.ref, 'refs/heads/test_benchmark') &&
     !startsWith(github.ref, 'refs/heads/test_full_benchmark') &&
     !startsWith(github.ref, 'refs/heads/test_process') &&
+    !startsWith(github.ref, 'refs/heads/test_website') &&
     (
       startsWith(github.ref, 'refs/heads') ||
       startsWith(github.ref, 'refs/tags') ||
diff --git a/.github/workflows/update_website_content.yml b/.github/workflows/update_website_content.yml
new file mode 100644
index 
0000000000..721096e356 --- /dev/null +++ b/.github/workflows/update_website_content.yml @@ -0,0 +1,87 @@ +name: Update website content + +on: + push: + branches: + - 'main' + - 'test_website' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + update_content: + runs-on: ubuntu-latest + + env: + UPDATE_BRANCH_NAME: "auto_update_content_${{ github.run_number }}" + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 1 + path: openproblems + + - name: Checkout website repo + uses: actions/checkout@v3 + with: + fetch-depth: 1 + repository: openproblems-bio/website + path: website + token: ${{ secrets.GH_ACTIONS_WEBSITE_PAT }} + + - name: Set up website Git branch + working-directory: website + run: | + git checkout -b $UPDATE_BRANCH_NAME + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.8" + + - name: Install package & dependencies + run: | + python -m pip install --upgrade pip + pip install -U wheel setuptools + pip install -U --editable ./openproblems[process] + python -c "import openproblems" + + - name: Parse results + run: | + rm -r website/content/benchmarks/*/ + python openproblems/workflow/generate_website_markdown.py website/content/benchmarks + + - name: Upload markdown + uses: actions/upload-artifact@main + with: + name: markdown + path: website/content/benchmarks + + - name: Push to openproblems-bio/website + if: | + startsWith(github.ref, 'refs/heads/main') || + endsWith(github.event.head_commit.message, '# publish') + shell: bash + working-directory: './website' + env: + GITHUB_TOKEN: ${{ secrets.GH_ACTIONS_WEBSITE_PAT }} + run: | + git diff --exit-code --quiet || git push origin "${UPDATE_BRANCH_NAME}" + + - name: Create website Pull Request + if: | + startsWith(github.ref, 'refs/heads/main') || + endsWith(github.event.head_commit.message, '# publish') + uses: peter-evans/create-pull-request@v4 + with: + branch: ${{ env.UPDATE_BRANCH_NAME }} + delete-branch: true + base: main + title: '[auto] Update benchmark content' + reviewers: scottgigante-immunai,rcannood,dburkhardt + path: './website' + token: ${{ secrets.GH_ACTIONS_WEBSITE_PAT }} + author: "openproblems-bio " + commit-message: "Update benchmark content # ci skip" From f5e587fc6817a830bee3c78653081665265004b9 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 29 Nov 2022 20:26:53 +0100 Subject: [PATCH 147/266] Turn off ci skip (#714) * don't skip ci on pr * Don't skip ci on website either Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- .github/workflows/process_results.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/process_results.yml b/.github/workflows/process_results.yml index d2087c890a..4a9389143e 100644 --- a/.github/workflows/process_results.yml +++ b/.github/workflows/process_results.yml @@ -126,7 +126,7 @@ jobs: path: './website' token: ${{ secrets.GH_ACTIONS_WEBSITE_PAT }} author: "openproblems-bio " - commit-message: "Update benchmark results # ci skip" + commit-message: "Update benchmark results" - name: Push to openproblems-bio/nbt2022-reproducibility if: | @@ -153,7 +153,7 @@ jobs: path: './nbt2022-reproducibility' token: ${{ secrets.GH_ACTIONS_NBT_REPRODUCIBILITY_PAT }} author: "openproblems-bio " - commit-message: "Update benchmark results # ci skip" + commit-message: "Update benchmark results" - name: AWS S3 cleanup if: "github.event_name == 'repository_dispatch'" From 566df047ea6bd3c34c8295324b0fcae506464005 Mon 
Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 29 Nov 2022 14:53:12 -0500 Subject: [PATCH 148/266] fix diff check # publish (#716) --- .github/workflows/update_website_content.yml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/update_website_content.yml b/.github/workflows/update_website_content.yml index 721096e356..0d72f23d62 100644 --- a/.github/workflows/update_website_content.yml +++ b/.github/workflows/update_website_content.yml @@ -52,6 +52,8 @@ jobs: run: | rm -r website/content/benchmarks/*/ python openproblems/workflow/generate_website_markdown.py website/content/benchmarks + cd website + git diff --exit-code --quiet || echo "CHANGED=true" >> $GITHUB_ENV - name: Upload markdown uses: actions/upload-artifact@main @@ -61,19 +63,25 @@ jobs: - name: Push to openproblems-bio/website if: | - startsWith(github.ref, 'refs/heads/main') || - endsWith(github.event.head_commit.message, '# publish') + env.CHANGED == 'true' && + ( + startsWith(github.ref, 'refs/heads/main') || + endsWith(github.event.head_commit.message, '# publish') + ) shell: bash working-directory: './website' env: GITHUB_TOKEN: ${{ secrets.GH_ACTIONS_WEBSITE_PAT }} run: | - git diff --exit-code --quiet || git push origin "${UPDATE_BRANCH_NAME}" + git push origin "${UPDATE_BRANCH_NAME}" - name: Create website Pull Request if: | - startsWith(github.ref, 'refs/heads/main') || - endsWith(github.event.head_commit.message, '# publish') + env.CHANGED == 'true' && + ( + startsWith(github.ref, 'refs/heads/main') || + endsWith(github.event.head_commit.message, '# publish') + ) uses: peter-evans/create-pull-request@v4 with: branch: ${{ env.UPDATE_BRANCH_NAME }} From c8c08c8bf50cc1ee829e3b99d692a83333411973 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 29 Nov 2022 16:49:49 -0500 Subject: [PATCH 149/266] Run pip check on build (#678) * squash * revert changes to installation * building takes half a cpu --- .github/workflows/run_tests.yml | 7 ----- .../Dockerfile | 24 ---------------- .../README.md | 15 ---------- .../requirements.txt | 6 ---- .../openproblems-python-bedtools/Dockerfile | 28 +++++++++++++++++++ docker/openproblems-python-bedtools/README.md | 12 ++++++++ .../requirements.txt | 2 ++ .../requirements.txt | 10 ------- .../Dockerfile | 4 +-- .../README.md | 6 ++++ .../requirements.txt | 11 ++++++++ .../openproblems-python-scvi/requirements.txt | 5 ---- .../openproblems-python-tensorflow/Dockerfile | 15 ++++++++++ .../README.md | 5 ++-- .../requirements.txt | 4 +++ docker/openproblems-python-tf2.4/Dockerfile | 18 ------------ .../requirements.txt | 4 --- docker/openproblems-r-base/README.md | 2 +- .../openproblems-r-extras/r_requirements.txt | 1 + docker/openproblems-r-extras/requirements.txt | 2 +- docker/openproblems-r-pytorch/README.md | 14 +++++----- .../openproblems-r-pytorch/requirements.txt | 7 ++++- .../metrics/cc_score.py | 2 +- .../metrics/iso_label_sil.py | 2 +- .../batch_integration_embed/metrics/pcr.py | 2 +- .../metrics/sil_batch.py | 2 +- .../metrics/silhouette.py | 2 +- .../metrics/hvg_conservation.py | 2 +- .../batch_integration_graph/methods/bbknn.py | 2 +- .../batch_integration_graph/methods/combat.py | 2 +- .../batch_integration_graph/methods/mnn.py | 2 +- .../batch_integration_graph/methods/scalex.py | 2 +- .../methods/scanorama.py | 2 +- .../batch_integration_graph/methods/scanvi.py | 2 +- 
.../batch_integration_graph/methods/scvi.py | 2 +- .../batch_integration_graph/metrics/ari.py | 2 +- .../metrics/graph_connectivity.py | 2 +- .../metrics/iso_label_f1.py | 2 +- .../batch_integration_graph/metrics/nmi.py | 2 +- .../_common/methods/liana.py | 7 +++++ .../tasks/denoising/datasets/pancreas.py | 2 +- openproblems/tasks/denoising/datasets/pbmc.py | 2 +- .../denoising/datasets/tabula_muris_senis.py | 2 +- openproblems/tasks/denoising/methods/dca.py | 2 +- .../tasks/denoising/metrics/poisson.py | 2 +- .../methods/neuralee.py | 2 +- .../label_projection/methods/scvi_tools.py | 4 +-- .../methods/beta.py | 2 +- .../datasets/destvi/generate.py | 2 +- .../methods/cell2location.py | 2 +- .../spatial_decomposition/methods/destvi.py | 2 +- .../methods/stereoscope.py | 2 +- .../spatial_decomposition/methods/tangram.py | 2 +- pytest.ini | 1 + setup.py | 10 +++---- workflow/Snakefile | 4 +++ workflow/snakemake_tools.py | 12 ++++++++ 57 files changed, 158 insertions(+), 140 deletions(-) delete mode 100644 docker/openproblems-python-batch-integration/Dockerfile delete mode 100644 docker/openproblems-python-batch-integration/README.md delete mode 100644 docker/openproblems-python-batch-integration/requirements.txt create mode 100644 docker/openproblems-python-bedtools/Dockerfile create mode 100644 docker/openproblems-python-bedtools/README.md create mode 100644 docker/openproblems-python-bedtools/requirements.txt rename docker/{openproblems-python-scvi => openproblems-python-pytorch}/Dockerfile (65%) rename docker/{openproblems-python-scvi => openproblems-python-pytorch}/README.md (64%) create mode 100644 docker/openproblems-python-pytorch/requirements.txt delete mode 100644 docker/openproblems-python-scvi/requirements.txt create mode 100644 docker/openproblems-python-tensorflow/Dockerfile rename docker/{openproblems-python-tf2.4 => openproblems-python-tensorflow}/README.md (56%) create mode 100644 docker/openproblems-python-tensorflow/requirements.txt delete mode 100644 docker/openproblems-python-tf2.4/Dockerfile delete mode 100644 docker/openproblems-python-tf2.4/requirements.txt diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 27e95f4656..653769fbca 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -62,13 +62,6 @@ jobs: pip install --editable .[evaluate] python -c "import openproblems" - - name: Download docker images - run: | - for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do - docker pull singlecellopenproblems/${image} & - done - wait - - name: Update Docker images if: | !( diff --git a/docker/openproblems-python-batch-integration/Dockerfile b/docker/openproblems-python-batch-integration/Dockerfile deleted file mode 100644 index 697905d79c..0000000000 --- a/docker/openproblems-python-batch-integration/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -FROM singlecellopenproblems/openproblems-r-base:latest - -USER root -WORKDIR / - -ARG NB_USER="sagemaker-user" -ARG NB_UID="1000" -ARG NB_GID="100" - -RUN sed -i '$ d' /etc/apt/sources.list -RUN \ -apt-get update --allow-releaseinfo-change && \ -apt-get -y install --no-install-recommends gcc git python3-llvmlite && \ -apt-get autoremove -y && \ -rm -rf /var/lib/apt/lists/* - -# Install Python packages -COPY ./docker/openproblems-python-batch-integration/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r requirements.txt -# force reinstall annoy addresses https://github.com/spotify/annoy/issues/513 -RUN pip install --no-cache-dir 
--force annoy==1.17.0
-
-USER $NB_UID
-WORKDIR /home/$NB_USER
diff --git a/docker/openproblems-python-batch-integration/README.md b/docker/openproblems-python-batch-integration/README.md
deleted file mode 100644
index 02a18e1c20..0000000000
--- a/docker/openproblems-python-batch-integration/README.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# openproblems-python-extras Docker image
-
-Base image: singlecellopenproblems/openproblems-r-base
-
-OS: Debian Stretch
-
-Python: 3.8
-
-Python packages:
-
-* scIB
-* mnnpy
-* scanorama
-* bbknn
-* scVI
diff --git a/docker/openproblems-python-batch-integration/requirements.txt b/docker/openproblems-python-batch-integration/requirements.txt
deleted file mode 100644
index 9c92302df9..0000000000
--- a/docker/openproblems-python-batch-integration/requirements.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-annoy==1.17.1
-bbknn==1.5.*
-git+https://github.com/scottgigante-immunai/mnnpy@eb4c551 # branch: patch-2
-git+https://github.com/theislab/scib@77ab015
-scanorama==1.7.0
-scvi-tools~=0.16 # pinned in #313
diff --git a/docker/openproblems-python-bedtools/Dockerfile b/docker/openproblems-python-bedtools/Dockerfile
new file mode 100644
index 0000000000..d2ad393378
--- /dev/null
+++ b/docker/openproblems-python-bedtools/Dockerfile
@@ -0,0 +1,28 @@
+FROM singlecellopenproblems/openproblems:latest
+
+ARG NB_USER="sagemaker-user"
+ARG NB_UID="1000"
+ARG NB_GID="100"
+
+USER root
+WORKDIR /
+
+# Install pybedtools dependency
+ARG BUILD_PACKAGES=""
+ARG PACKAGE_VERSION=2.27.1
+RUN apt-get update && \
+    apt-get install --yes git openssl build-essential zlib1g-dev && \
+    cd /tmp && \
+    git clone https://github.com/arq5x/bedtools2.git && \
+    cd bedtools2 && \
+    git checkout v$PACKAGE_VERSION && \
+    make && \
+    mv bin/* /usr/local/bin && \
+    cd /
+
+# install dependencies and openproblems
+COPY ./docker/openproblems-python-bedtools/requirements.txt ./requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+USER $NB_UID
+WORKDIR /home/$NB_USER
diff --git a/docker/openproblems-python-bedtools/README.md b/docker/openproblems-python-bedtools/README.md
new file mode 100644
index 0000000000..d7b9935395
--- /dev/null
+++ b/docker/openproblems-python-bedtools/README.md
@@ -0,0 +1,12 @@
+# openproblems-python-bedtools Docker image
+
+Base image: singlecellopenproblems/openproblems
+
+OS: Debian Stretch
+
+Python: 3.8
+
+Python packages:
+
+* pybedtools
+* pyensembl
diff --git a/docker/openproblems-python-bedtools/requirements.txt b/docker/openproblems-python-bedtools/requirements.txt
new file mode 100644
index 0000000000..5f308af4ed
--- /dev/null
+++ b/docker/openproblems-python-bedtools/requirements.txt
@@ -0,0 +1,2 @@
+pybedtools==0.9.*
+pyensembl==2.0.*
diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt
index c51a5d771a..7d6aee63d5 100644
--- a/docker/openproblems-python-extras/requirements.txt
+++ b/docker/openproblems-python-extras/requirements.txt
@@ -1,17 +1,7 @@
 cmake==3.24.1.1
-git+https://github.com/BayraktarLab/cell2location.git@7e7aa231cc61ff460da14402fa3b9a1fa3ec69ac
-git+https://github.com/czbiohub/molecular-cross-validation@04d9df0
 git+https://github.com/jorvis/Multicore-TSNE@6832575
 git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python
-git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix
 git+https://github.com/scottgigante-immunai/knn-smoothing@python_package
 magic-impute==3.0.*
 phate==1.0.*
-pybedtools==0.9.*
-pyensembl==2.0.*
-scalex==1.0.2 
-scvi-tools==0.16.* -tangram-sc==1.0.* -tensorflow-cpu==2.9.* -torch==1.12.* xgboost==1.6.* diff --git a/docker/openproblems-python-scvi/Dockerfile b/docker/openproblems-python-pytorch/Dockerfile similarity index 65% rename from docker/openproblems-python-scvi/Dockerfile rename to docker/openproblems-python-pytorch/Dockerfile index f7edd2e4dc..0f5b8521a3 100644 --- a/docker/openproblems-python-scvi/Dockerfile +++ b/docker/openproblems-python-pytorch/Dockerfile @@ -7,8 +7,8 @@ ARG NB_GID="100" USER root WORKDIR / -# Install Python packages -COPY ./docker/openproblems-python-scvi/requirements.txt ./requirements.txt +# install dependencies and openproblems +COPY ./docker/openproblems-python-pytorch/requirements.txt ./requirements.txt RUN pip install --no-cache-dir -r requirements.txt USER $NB_UID diff --git a/docker/openproblems-python-scvi/README.md b/docker/openproblems-python-pytorch/README.md similarity index 64% rename from docker/openproblems-python-scvi/README.md rename to docker/openproblems-python-pytorch/README.md index 546cec9bc0..d566a8efd5 100644 --- a/docker/openproblems-python-scvi/README.md +++ b/docker/openproblems-python-pytorch/README.md @@ -9,3 +9,9 @@ Python: 3.8 Python packages: * scvi-tools +* tangram +* torch +* neuralee +* xgboost +* molecular-cross-validation +* cell2location diff --git a/docker/openproblems-python-pytorch/requirements.txt b/docker/openproblems-python-pytorch/requirements.txt new file mode 100644 index 0000000000..56bd2a53dc --- /dev/null +++ b/docker/openproblems-python-pytorch/requirements.txt @@ -0,0 +1,11 @@ +git+https://github.com/BayraktarLab/cell2location.git@7e7aa231cc61ff460da14402fa3b9a1fa3ec69ac +git+https://github.com/czbiohub/molecular-cross-validation@04d9df0 +git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix +jax==0.3.23 +jaxlib==0.3.22 +scalex==1.0.2 +scikit-misc==0.1.* +scvi-tools~=0.17 # pinned in #313 +tangram-sc==1.0.* +torch==1.12.* +xgboost==1.6.* diff --git a/docker/openproblems-python-scvi/requirements.txt b/docker/openproblems-python-scvi/requirements.txt deleted file mode 100644 index 87aa041993..0000000000 --- a/docker/openproblems-python-scvi/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -jax==0.3.23 -jaxlib==0.3.22 -scikit-misc==0.1.* -scvi-tools~=0.17 # pinned in #313 -xgboost==1.6.* diff --git a/docker/openproblems-python-tensorflow/Dockerfile b/docker/openproblems-python-tensorflow/Dockerfile new file mode 100644 index 0000000000..0a996f9221 --- /dev/null +++ b/docker/openproblems-python-tensorflow/Dockerfile @@ -0,0 +1,15 @@ +FROM singlecellopenproblems/openproblems:latest + +ARG NB_USER="sagemaker-user" +ARG NB_UID="1000" +ARG NB_GID="100" + +USER root +WORKDIR / + +# install dependencies and openproblems +COPY ./docker/openproblems-python-tensorflow/requirements.txt ./requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +USER $NB_UID +WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-tf2.4/README.md b/docker/openproblems-python-tensorflow/README.md similarity index 56% rename from docker/openproblems-python-tf2.4/README.md rename to docker/openproblems-python-tensorflow/README.md index f08a69ea20..3f4c4dbf67 100644 --- a/docker/openproblems-python-tf2.4/README.md +++ b/docker/openproblems-python-tensorflow/README.md @@ -1,4 +1,4 @@ -# openproblems-python-tf2.4 Docker image +# openproblems-python-tensorflow Docker image Base image: singlecellopenproblems/openproblems @@ -8,6 +8,5 @@ Python: 3.8 Python packages: -* keras >=2.4,<2.6 -* tensorflow >=2.4,<2.5 
+* tensorflow * dca diff --git a/docker/openproblems-python-tensorflow/requirements.txt b/docker/openproblems-python-tensorflow/requirements.txt new file mode 100644 index 0000000000..f2a476acf9 --- /dev/null +++ b/docker/openproblems-python-tensorflow/requirements.txt @@ -0,0 +1,4 @@ +git+https://github.com/Avsecz/kopt@6a5c890 # master +git+https://github.com/scottgigante-immunai/dca@1f4edbc # patch-1 contains tf version bump +protobuf==3.20.* +tensorflow==2.9.0 diff --git a/docker/openproblems-python-tf2.4/Dockerfile b/docker/openproblems-python-tf2.4/Dockerfile deleted file mode 100644 index dcdabaf28d..0000000000 --- a/docker/openproblems-python-tf2.4/Dockerfile +++ /dev/null @@ -1,18 +0,0 @@ -FROM singlecellopenproblems/openproblems:latest - -ARG NB_USER="sagemaker-user" -ARG NB_UID="1000" -ARG NB_GID="100" - -USER root -WORKDIR / - -# Install Python packages -COPY ./docker/openproblems-python-tf2.4/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - -# tensorflow downgrades numpy and h5py (and therefore anndata) -RUN pip install --no-cache-dir -e /usr/src/singlecellopenproblems - -USER $NB_UID -WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-tf2.4/requirements.txt b/docker/openproblems-python-tf2.4/requirements.txt deleted file mode 100644 index 31a56c2ea2..0000000000 --- a/docker/openproblems-python-tf2.4/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -dca==0.3.* -keras>=2.4,<2.11 -pyyaml==6.0 # pinned in #431 -tensorflow-cpu==2.4.* # pinned in dca diff --git a/docker/openproblems-r-base/README.md b/docker/openproblems-r-base/README.md index 785a9ace1b..ebca77780d 100644 --- a/docker/openproblems-r-base/README.md +++ b/docker/openproblems-r-base/README.md @@ -28,4 +28,4 @@ R packages: Python packages: * rpy2 -* anndata2ri>=1.0.6 +* anndata2ri>=1.1 diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index efd4ceeba6..0b8da4bde9 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -46,6 +46,7 @@ shiny@1.4.0.2 sparsesvd@0.2 systemfonts@1.0.4 textshaping@0.3.6 +theislab/kBET@a10ffea # master tibble@3.1.7 tidymodels@0.1.2 tidyverse@1.3.0 diff --git a/docker/openproblems-r-extras/requirements.txt b/docker/openproblems-r-extras/requirements.txt index 9d03431a90..1795471bae 100644 --- a/docker/openproblems-r-extras/requirements.txt +++ b/docker/openproblems-r-extras/requirements.txt @@ -1,3 +1,3 @@ git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python -git+https://github.com/theislab/scib@v1.0.2 +git+https://github.com/theislab/scib@f0be826 xgboost==1.6.* diff --git a/docker/openproblems-r-pytorch/README.md b/docker/openproblems-r-pytorch/README.md index 03ccbc07d4..33d3e5a30c 100644 --- a/docker/openproblems-r-pytorch/README.md +++ b/docker/openproblems-r-pytorch/README.md @@ -8,12 +8,12 @@ Python: 3.8 R: 4.0 -R packages: - -* batchelor -* sparsesvd -* dplyr - Python packages: -* harmonic-alignment +* harmony-pytorch +* torch +* bbknn +* mnnpy +* scib +* scanorama +* scvi-tools diff --git a/docker/openproblems-r-pytorch/requirements.txt b/docker/openproblems-r-pytorch/requirements.txt index e7c8df42b7..c9dfe890a9 100644 --- a/docker/openproblems-r-pytorch/requirements.txt +++ b/docker/openproblems-r-pytorch/requirements.txt @@ -1,3 +1,8 @@ -git+https://github.com/theislab/scib@v1.0.2 +annoy==1.17.1 +bbknn==1.5.* +git+https://github.com/scottgigante-immunai/mnnpy@eb4c551 # branch: patch-2 
+git+https://github.com/theislab/scib@f0be826 harmony-pytorch==0.1.* +scanorama==1.7.0 +scvi-tools~=0.16 # pinned in #313 torch==1.13.* diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py index 322891b202..e83d47bb54 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py @@ -21,7 +21,7 @@ @metric( metric_name="Cell Cycle Score", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def cc_score(adata, test=False): from ._utils import _get_split diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py index c3575de5b8..c1f8c4be2d 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py @@ -15,7 +15,7 @@ @metric( metric_name="Isolated label Silhouette", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def isolated_labels_sil(adata): from scib.metrics import isolated_labels diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py index 886f26078b..7efca62ffe 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py @@ -18,7 +18,7 @@ @metric( metric_name="PC Regression", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def pcr(adata): from ._utils import _get_split diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py index c02e5e42aa..9f28cd1284 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py @@ -24,7 +24,7 @@ @metric( metric_name="Batch ASW", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def silhouette_batch(adata): from scib.metrics import silhouette_batch diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py index 36991e1d67..bb2bece193 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py @@ -12,7 +12,7 @@ @metric( metric_name="Silhouette", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def silhouette(adata): from scib.metrics import silhouette diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py index d40b36b740..bb7f90cae8 100644 --- 
a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py @@ -21,7 +21,7 @@ @metric( metric_name="HVG conservation", maximize=True, - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) def hvg_conservation(adata): from scib.metrics import hvg_overlap diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py index 017ca8f766..97570dccd8 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py @@ -10,7 +10,7 @@ paper_url="https://academic.oup.com/bioinformatics/article/36/3/964/5545955", paper_year=2020, code_url="https://github.com/Teichlab/bbknn", - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py index 3043a552e1..96e53538d3 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py @@ -10,7 +10,7 @@ paper_url="https://academic.oup.com/biostatistics/article/8/1/118/252073", paper_year=2007, code_url="https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html", - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py index 99dab39203..0146f5b6e3 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py @@ -10,7 +10,7 @@ paper_url="https://www.nature.com/articles/nbt.4091", paper_year=2018, code_url="https://github.com/chriscainx/mnnpy", - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py index 36843b81ed..461ea04a94 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py @@ -11,7 +11,7 @@ paper_url="https://doi.org/10.1038/s41467-022-33758-z", paper_year=2022, code_url="https://github.com/jsxlei/SCALEX", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py index d6e80162b5..a5efc04b35 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py @@ -10,7 +10,7 @@ paper_url="https://www.nature.com/articles/s41587-019-0113-3", paper_year=2019, code_url="https://github.com/brianhie/scanorama", - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) diff --git 
a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py index d5bf463974..8f98a3c931 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py @@ -11,7 +11,7 @@ paper_url="https://doi.org/10.15252/msb.20209620", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py index 9e9a82a9f2..35f1cd7ac5 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py @@ -10,7 +10,7 @@ paper_url="https://www.nature.com/articles/s41592-018-0229-2", paper_year=2018, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py index 13e7eb8ce1..0d082fff44 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py @@ -16,7 +16,7 @@ @metric( metric_name="ARI", maximize=True, - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) def ari(adata): from scib.metrics import ari diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py index 3a9732d0e2..52dd7c44b2 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py @@ -22,7 +22,7 @@ @metric( metric_name="Graph connectivity", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def graph_connectivity(adata): import scib.metrics diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py index df86b043d9..71cd7ca209 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py @@ -27,7 +27,7 @@ @metric( metric_name="Isolated label F1", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def isolated_labels_f1(adata): from scib.metrics import isolated_labels diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py index bbcdc7cd9d..3356507b2e 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py @@ -15,7 +15,7 @@ @metric( metric_name="NMI", maximize=True, - 
image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) def nmi(adata): from scib.metrics.clustering import opt_louvain # isort:skip diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py index dc2bba944c..a528c9c9e3 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py +++ b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py @@ -43,6 +43,10 @@ def _liana( adata.layers["logcounts"] = adata.layers["log_cpm"] del adata.layers["log_cpm"] + # remove dataframe before R conversion + target = adata.uns["ccc_target"] + del adata.uns["ccc_target"] + # Run LIANA liana_res = _r_liana( adata, @@ -53,6 +57,9 @@ def _liana( **kwargs, ) + # return target to uns + adata.uns["ccc_target"] = target + # Format results liana_res["score"] = liana_res[score_col] adata.uns["ccc_pred"] = liana_res diff --git a/openproblems/tasks/denoising/datasets/pancreas.py b/openproblems/tasks/denoising/datasets/pancreas.py index c18ddbeee1..bd039ad88e 100644 --- a/openproblems/tasks/denoising/datasets/pancreas.py +++ b/openproblems/tasks/denoising/datasets/pancreas.py @@ -11,7 +11,7 @@ "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, " "and SMARTER-seq). Here we just use the inDrop1 batch, which includes" "1937 cells × 15502 genes.", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def pancreas(test=False): adata = load_pancreas(test=test, keep_techs=["inDrop1"]) diff --git a/openproblems/tasks/denoising/datasets/pbmc.py b/openproblems/tasks/denoising/datasets/pbmc.py index fcf0fc782b..440ebe8a3b 100644 --- a/openproblems/tasks/denoising/datasets/pbmc.py +++ b/openproblems/tasks/denoising/datasets/pbmc.py @@ -11,7 +11,7 @@ "1k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. " "Sequenced on 10X v3 chemistry in November 2018 by 10X Genomics." ), - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def pbmc(test=False): adata = load_tenx_1k_pbmc(test=test) diff --git a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py index a5c62f953e..9524cc4e95 100644 --- a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py +++ b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py @@ -10,7 +10,7 @@ dataset_summary="All lung cells from Tabula Muris Senis, a 500k cell-atlas from 18 " "organs and tissues across the mouse lifespan. Here we use just 10x data from lung." 
" 24540 cells × 16160 genes across 3 time points.", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def tabula_muris_senis_lung_random(test=False): adata = load_tabula_muris_senis( diff --git a/openproblems/tasks/denoising/methods/dca.py b/openproblems/tasks/denoising/methods/dca.py index fddfb85d51..458f9ecb05 100644 --- a/openproblems/tasks/denoising/methods/dca.py +++ b/openproblems/tasks/denoising/methods/dca.py @@ -30,7 +30,7 @@ def _dca(adata, test=False, epochs=None): paper_url="https://www.nature.com/articles/s41467-018-07931-2", paper_year=2019, code_url="https://github.com/theislab/dca", - image="openproblems-python-tf2.4", + image="openproblems-python-tensorflow", ) def dca(adata, test=False, epochs=None): return _dca(adata, test=test, epochs=epochs) diff --git a/openproblems/tasks/denoising/metrics/poisson.py b/openproblems/tasks/denoising/metrics/poisson.py index ebd2a73378..93db71eee2 100644 --- a/openproblems/tasks/denoising/metrics/poisson.py +++ b/openproblems/tasks/denoising/metrics/poisson.py @@ -1,7 +1,7 @@ from ....tools.decorators import metric -@metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-extras") +@metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-pytorch") def poisson(adata): from molecular_cross_validation.mcv_sweep import poisson_nll_loss diff --git a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py index 41f55dbb65..420132dbf2 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py +++ b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py @@ -18,7 +18,7 @@ paper_url="https://www.frontiersin.org/articles/10.3389/fgene.2020.00786/full", paper_year=2020, code_url="https://github.com/HiBearME/NeuralEE", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 6212f8cd69..1cec698b27 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -10,7 +10,7 @@ paper_url="https://doi.org/10.15252/msb.20209620", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-scvi", + image="openproblems-python-pytorch", ) _scanvi_scarches_method = functools.partial( @@ -19,7 +19,7 @@ paper_url="https://doi.org/10.1101/2020.07.16.205997", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-scvi", + image="openproblems-python-pytorch", ) diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py index e5dffb6fa7..1eeff4992e 100644 --- a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py +++ b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py @@ -235,7 +235,7 @@ def _beta(adata, test=False, top_genes=None, threshold=1): paper_year=2013, code_version="1.0", code_url="http://cistrome.org/BETA", - image="openproblems-python-extras", + image="openproblems-python-bedtools", ) def beta(adata, test=False, top_genes=None, threshold=1): adata = _beta(adata, test=test, top_genes=top_genes, threshold=threshold) diff --git a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py 
index 86005d07f2..e446b07687 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py +++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py @@ -11,7 +11,7 @@ "from the destVI manuscripts leveraging sparsePCA. Number of cells and " "cell types present in each spatial spot is computed via combination of " "kernel-based parametrization of a categorical distribution and the NB model.", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def destvi(test=False): from .utils import generate_synthetic_dataset diff --git a/openproblems/tasks/spatial_decomposition/methods/cell2location.py b/openproblems/tasks/spatial_decomposition/methods/cell2location.py index 2caf9d4956..168472cbbb 100644 --- a/openproblems/tasks/spatial_decomposition/methods/cell2location.py +++ b/openproblems/tasks/spatial_decomposition/methods/cell2location.py @@ -12,7 +12,7 @@ paper_url="https://doi.org/10.1038/s41587-021-01139-4", paper_year=2022, code_url="https://github.com/BayraktarLab/cell2location", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) diff --git a/openproblems/tasks/spatial_decomposition/methods/destvi.py b/openproblems/tasks/spatial_decomposition/methods/destvi.py index 9330ba1d92..4338a465fd 100644 --- a/openproblems/tasks/spatial_decomposition/methods/destvi.py +++ b/openproblems/tasks/spatial_decomposition/methods/destvi.py @@ -11,7 +11,7 @@ paper_url="https://doi.org/10.1038/s41587-022-01272-8", paper_year=2022, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def destvi( adata, diff --git a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py index c2695e4253..f9c025319e 100644 --- a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py +++ b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py @@ -10,7 +10,7 @@ paper_url="https://doi.org/10.1038/s41587-022-01272-8", paper_year=2020, code_url="https://github.com/scverse/scvi-tools", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def stereoscope(adata, test=False, max_epochs_sc=None, max_epochs_sp=None): from scvi.external import RNAStereoscope diff --git a/openproblems/tasks/spatial_decomposition/methods/tangram.py b/openproblems/tasks/spatial_decomposition/methods/tangram.py index 329676c4aa..c05eb0339f 100644 --- a/openproblems/tasks/spatial_decomposition/methods/tangram.py +++ b/openproblems/tasks/spatial_decomposition/methods/tangram.py @@ -10,7 +10,7 @@ paper_url="https://doi.org/10.1038/s41592-021-01264-7", paper_year=2021, code_url="https://github.com/broadinstitute/Tangram", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def tangram(adata, test=False, num_epochs=None, n_markers=None): # analysis based on: diff --git a/pytest.ini b/pytest.ini index 447d92b2a1..273a414dbc 100644 --- a/pytest.ini +++ b/pytest.ini @@ -10,4 +10,5 @@ filterwarnings = ignore:X\.dtype being converted to np\.float32 from float64:FutureWarning ignore:is_categorical is deprecated and will be removed in a future version:FutureWarning ignore:The use of (converter|py2rpy|rpy2py) in module rpy2.robjects.conversion is deprecated.:DeprecationWarning + ignore:`Model\.state_updates` will be removed in a future version\.:UserWarning always:Container failed with AssertionError\. 
diff --git a/setup.py b/setup.py
index a6a1c213f4..89ead383cd 100644
--- a/setup.py
+++ b/setup.py
@@ -4,14 +4,14 @@ import os
 
 install_requires = [
-    "numpy>=1.22,<1.24",
-    "scikit-learn==1.1.*",
+    "numpy>=1.19.2,<1.24",
+    "scikit-learn<=1.1.*",
     "anndata==0.8.*",
     "scprep>=1.2.1",
-    "scipy>=1.8,<1.10",
+    "scipy>=1.7,<1.10",
     "scanpy>=1.6",
-    "louvain==0.7.*",
-    "python-igraph<0.10",
+    "louvain==0.8.*",
+    "python-igraph==0.10.*",
     "decorator<5.0",  # pinned in #324
     "memory-profiler==0.60",
     "colorama==0.4.*",
diff --git a/workflow/Snakefile b/workflow/Snakefile
index c8378a3cea..6f02c7fe2b 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -27,6 +27,7 @@ rule docker_refresh:
     input: tools.refresh_images
 
 rule refresh_docker_image:
+    threads: 0
     priority: 50
     input:
         dockerfile = "{}/{{image}}/refresh.Dockerfile".format(tools.IMAGES_DIR),
@@ -78,6 +79,7 @@ rule update_docker_image:
         "touch {output}"
 
 rule build_docker_image:
+    threads: 0.5
    input:
         dockerfile = "{}/{{image}}/Dockerfile".format(tools.IMAGES_DIR),
         requirements = tools.docker_build_requirements,
@@ -108,6 +110,7 @@ rule login_docker:
         "docker login --username=singlecellopenproblems --password=$(cat {input})"
 
 rule push_docker_image:
+    threads: 0
     input:
         build = "{}/{{image}}/.docker_update".format(tools.IMAGES_DIR),
         login = ".docker_login",
@@ -117,6 +120,7 @@ rule push_docker_image:
         "docker push --quiet singlecellopenproblems/{wildcards.image}"
 
 rule pull_docker_image:
+    threads: 0
     output: temp(touch("{}/{{image}}/.docker_pull".format(tools.IMAGES_DIR)))
     shell:
diff --git a/workflow/snakemake_tools.py b/workflow/snakemake_tools.py
index 8c9b92e42b..02d327dc55 100644
--- a/workflow/snakemake_tools.py
+++ b/workflow/snakemake_tools.py
@@ -264,6 +264,7 @@ def docker_image_label(image, label):
     return output
 
 
+@functools.lru_cache(None)
 def docker_imagespec_changed(image, dockerfile):
     """Check if the Dockerfile has changed
 
@@ -273,6 +274,17 @@ def docker_imagespec_changed(image, dockerfile):
     If working with a github actions-built image, check if there is any diff
     between the Dockerfile and base/main
     """
+    base_image = _docker_base(image)
+    if base_image is not None:
+        base_docker_path = os.path.join(IMAGES_DIR, base_image)
+        base_dockerfile = os.path.join(base_docker_path, "Dockerfile")
+        if docker_imagespec_changed(base_image, base_dockerfile):
+            print(
+                "{}: base image spec changed".format(image),
+                file=sys.stderr,
+            )
+            return True
+
     if not docker_image_exists(image):
         # will be downloaded from dockerhub
         build_type = "github_actions"

From ffd9f268caf5f2696c9f6dca17a3be1733b64e39 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Tue, 29 Nov 2022 20:20:10 -0500
Subject: [PATCH 150/266] Revert "Run pip check on build (#678)"

This reverts commit c8c08c8bf50cc1ee829e3b99d692a83333411973.
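[annotation] For context, the reverted commit had consolidated several task-specific Docker images and added a pip check gate to the builds; this revert restores the split images (bedtools, scvi, tf2.4) and the older dependency pins. pip check itself only verifies that installed distributions have compatible declared requirements; a minimal sketch of invoking it programmatically, under the assumption that you want a non-fatal report rather than a build failure:

    import subprocess
    import sys

    # Run `pip check` against the current interpreter's environment.
    # A non-zero return code means at least one installed package has
    # missing or conflicting requirements.
    result = subprocess.run(
        [sys.executable, "-m", "pip", "check"],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print("dependency conflicts found:\n" + result.stdout)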
---
 .github/workflows/run_tests.yml | 7 +++++
 .../Dockerfile | 24 ++++++++++++++++
 .../README.md | 15 ++++++++++
 .../requirements.txt | 6 ++++
 .../openproblems-python-bedtools/Dockerfile | 28 -------------------
 docker/openproblems-python-bedtools/README.md | 12 --------
 .../requirements.txt | 2 --
 .../requirements.txt | 10 +++++++
 .../requirements.txt | 11 --------
 .../Dockerfile | 4 +--
 .../README.md | 6 ----
 .../openproblems-python-scvi/requirements.txt | 5 ++++
 .../openproblems-python-tensorflow/Dockerfile | 15 ----------
 .../requirements.txt | 4 ---
 docker/openproblems-python-tf2.4/Dockerfile | 18 ++++++++++++
 .../README.md | 5 ++--
 .../requirements.txt | 4 +++
 docker/openproblems-r-base/README.md | 2 +-
 .../openproblems-r-extras/r_requirements.txt | 1 -
 docker/openproblems-r-extras/requirements.txt | 2 +-
 docker/openproblems-r-pytorch/README.md | 14 +++++-----
 .../openproblems-r-pytorch/requirements.txt | 7 +----
 .../metrics/cc_score.py | 2 +-
 .../metrics/iso_label_sil.py | 2 +-
 .../batch_integration_embed/metrics/pcr.py | 2 +-
 .../metrics/sil_batch.py | 2 +-
 .../metrics/silhouette.py | 2 +-
 .../metrics/hvg_conservation.py | 2 +-
 .../batch_integration_graph/methods/bbknn.py | 2 +-
 .../batch_integration_graph/methods/combat.py | 2 +-
 .../batch_integration_graph/methods/mnn.py | 2 +-
 .../batch_integration_graph/methods/scalex.py | 2 +-
 .../methods/scanorama.py | 2 +-
 .../batch_integration_graph/methods/scanvi.py | 2 +-
 .../batch_integration_graph/methods/scvi.py | 2 +-
 .../batch_integration_graph/metrics/ari.py | 2 +-
 .../metrics/graph_connectivity.py | 2 +-
 .../metrics/iso_label_f1.py | 2 +-
 .../batch_integration_graph/metrics/nmi.py | 2 +-
 .../_common/methods/liana.py | 7 -----
 .../tasks/denoising/datasets/pancreas.py | 2 +-
 openproblems/tasks/denoising/datasets/pbmc.py | 2 +-
 .../denoising/datasets/tabula_muris_senis.py | 2 +-
 openproblems/tasks/denoising/methods/dca.py | 2 +-
 .../tasks/denoising/metrics/poisson.py | 2 +-
 .../methods/neuralee.py | 2 +-
 .../label_projection/methods/scvi_tools.py | 4 +--
 .../methods/beta.py | 2 +-
 .../datasets/destvi/generate.py | 2 +-
 .../methods/cell2location.py | 2 +-
 .../spatial_decomposition/methods/destvi.py | 2 +-
 .../methods/stereoscope.py | 2 +-
 .../spatial_decomposition/methods/tangram.py | 2 +-
 pytest.ini | 1 -
 setup.py | 10 +++----
 workflow/Snakefile | 4 ---
 workflow/snakemake_tools.py | 12 --------
 57 files changed, 140 insertions(+), 158 deletions(-)
 create mode 100644 docker/openproblems-python-batch-integration/Dockerfile
 create mode 100644 docker/openproblems-python-batch-integration/README.md
 create mode 100644 docker/openproblems-python-batch-integration/requirements.txt
 delete mode 100644 docker/openproblems-python-bedtools/Dockerfile
 delete mode 100644 docker/openproblems-python-bedtools/README.md
 delete mode 100644 docker/openproblems-python-bedtools/requirements.txt
 delete mode 100644 docker/openproblems-python-pytorch/requirements.txt
 rename docker/{openproblems-python-pytorch => openproblems-python-scvi}/Dockerfile (65%)
 rename docker/{openproblems-python-pytorch => openproblems-python-scvi}/README.md (64%)
 create mode 100644 docker/openproblems-python-scvi/requirements.txt
 delete mode 100644 docker/openproblems-python-tensorflow/Dockerfile
 delete mode 100644 docker/openproblems-python-tensorflow/requirements.txt
 create mode 100644 docker/openproblems-python-tf2.4/Dockerfile
 rename docker/{openproblems-python-tensorflow => openproblems-python-tf2.4}/README.md (56%)
 create mode 100644 docker/openproblems-python-tf2.4/requirements.txt

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 653769fbca..27e95f4656 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -62,6 +62,13 @@ jobs:
           pip install --editable .[evaluate]
           python -c "import openproblems"
 
+      - name: Download docker images
+        run: |
+          for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
+            docker pull singlecellopenproblems/${image} &
+          done
+          wait
+
       - name: Update Docker images
         if: |
           !(
diff --git a/docker/openproblems-python-batch-integration/Dockerfile b/docker/openproblems-python-batch-integration/Dockerfile
new file mode 100644
index 0000000000..697905d79c
--- /dev/null
+++ b/docker/openproblems-python-batch-integration/Dockerfile
@@ -0,0 +1,24 @@
+FROM singlecellopenproblems/openproblems-r-base:latest
+
+USER root
+WORKDIR /
+
+ARG NB_USER="sagemaker-user"
+ARG NB_UID="1000"
+ARG NB_GID="100"
+
+RUN sed -i '$ d' /etc/apt/sources.list
+RUN \
+apt-get update --allow-releaseinfo-change && \
+apt-get -y install --no-install-recommends gcc git python3-llvmlite && \
+apt-get autoremove -y && \
+rm -rf /var/lib/apt/lists/*
+
+# Install Python packages
+COPY ./docker/openproblems-python-batch-integration/requirements.txt ./requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+# force reinstall annoy addresses https://github.com/spotify/annoy/issues/513
+RUN pip install --no-cache-dir --force annoy==1.17.0
+
+USER $NB_UID
+WORKDIR /home/$NB_USER
diff --git a/docker/openproblems-python-batch-integration/README.md b/docker/openproblems-python-batch-integration/README.md
new file mode 100644
index 0000000000..02a18e1c20
--- /dev/null
+++ b/docker/openproblems-python-batch-integration/README.md
@@ -0,0 +1,15 @@
+# openproblems-python-extras Docker image
+
+Base image: singlecellopenproblems/openproblems-r-base
+
+OS: Debian Stretch
+
+Python: 3.8
+
+Python packages:
+
+* scIB
+* mnnpy
+* scanorama
+* bbknn
+* scVI
diff --git a/docker/openproblems-python-batch-integration/requirements.txt b/docker/openproblems-python-batch-integration/requirements.txt
new file mode 100644
index 0000000000..9c92302df9
--- /dev/null
+++ b/docker/openproblems-python-batch-integration/requirements.txt
@@ -0,0 +1,6 @@
+annoy==1.17.1
+bbknn==1.5.*
+git+https://github.com/scottgigante-immunai/mnnpy@eb4c551 # branch: patch-2
+git+https://github.com/theislab/scib@77ab015
+scanorama==1.7.0
+scvi-tools~=0.16 # pinned in #313
diff --git a/docker/openproblems-python-bedtools/Dockerfile b/docker/openproblems-python-bedtools/Dockerfile
deleted file mode 100644
index d2ad393378..0000000000
--- a/docker/openproblems-python-bedtools/Dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-FROM singlecellopenproblems/openproblems:latest
-
-ARG NB_USER="sagemaker-user"
-ARG NB_UID="1000"
-ARG NB_GID="100"
-
-USER root
-WORKDIR /
-
-# Install pybedtools dependency
-ARG BUILD_PACKAGES=""
-ARG PACKAGE_VERSION=2.27.1
-RUN apt-get update && \
-    apt-get install --yes git openssl build-essential zlib1g-dev && \
-    cd /tmp && \
-    git clone https://github.com/arq5x/bedtools2.git && \
-    cd bedtools2 && \
-    git checkout v$PACKAGE_VERSION && \
-    make && \
-    mv bin/* /usr/local/bin && \
-    cd /
-
-# install dependencies and openproblems
-COPY ./docker/openproblems-python-bedtools/requirements.txt ./requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
-
-USER $NB_UID
-WORKDIR /home/$NB_USER
diff --git a/docker/openproblems-python-bedtools/README.md b/docker/openproblems-python-bedtools/README.md
deleted file mode 100644
index d7b9935395..0000000000
--- a/docker/openproblems-python-bedtools/README.md
+++ /dev/null
@@ -1,12 +0,0 @@
-# openproblems-python-extras Docker image
-
-Base image: singlecellopenproblems/openproblems
-
-OS: Debian Stretch
-
-Python: 3.8
-
-Python packages:
-
-* pybedtools
-* pyensembl
diff --git a/docker/openproblems-python-bedtools/requirements.txt b/docker/openproblems-python-bedtools/requirements.txt
deleted file mode 100644
index 5f308af4ed..0000000000
--- a/docker/openproblems-python-bedtools/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-pybedtools==0.9.*
-pyensembl==2.0.*
diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt
index 7d6aee63d5..c51a5d771a 100644
--- a/docker/openproblems-python-extras/requirements.txt
+++ b/docker/openproblems-python-extras/requirements.txt
@@ -1,7 +1,17 @@
 cmake==3.24.1.1
+git+https://github.com/BayraktarLab/cell2location.git@7e7aa231cc61ff460da14402fa3b9a1fa3ec69ac
+git+https://github.com/czbiohub/molecular-cross-validation@04d9df0
 git+https://github.com/jorvis/Multicore-TSNE@6832575
 git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python
+git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix
 git+https://github.com/scottgigante-immunai/knn-smoothing@python_package
 magic-impute==3.0.*
 phate==1.0.*
+pybedtools==0.9.*
+pyensembl==2.0.*
+scalex==1.0.2
+scvi-tools==0.16.*
+tangram-sc==1.0.*
+tensorflow-cpu==2.9.*
+torch==1.12.*
 xgboost==1.6.*
diff --git a/docker/openproblems-python-pytorch/requirements.txt b/docker/openproblems-python-pytorch/requirements.txt
deleted file mode 100644
index 56bd2a53dc..0000000000
--- a/docker/openproblems-python-pytorch/requirements.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-git+https://github.com/BayraktarLab/cell2location.git@7e7aa231cc61ff460da14402fa3b9a1fa3ec69ac
-git+https://github.com/czbiohub/molecular-cross-validation@04d9df0
-git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix
-jax==0.3.23
-jaxlib==0.3.22
-scalex==1.0.2
-scikit-misc==0.1.*
-scvi-tools~=0.17 # pinned in #313
-tangram-sc==1.0.*
-torch==1.12.*
-xgboost==1.6.*
diff --git a/docker/openproblems-python-pytorch/Dockerfile b/docker/openproblems-python-scvi/Dockerfile
similarity index 65%
rename from docker/openproblems-python-pytorch/Dockerfile
rename to docker/openproblems-python-scvi/Dockerfile
index 0f5b8521a3..f7edd2e4dc 100644
--- a/docker/openproblems-python-pytorch/Dockerfile
+++ b/docker/openproblems-python-scvi/Dockerfile
@@ -7,8 +7,8 @@ ARG NB_GID="100"
 USER root
 WORKDIR /
 
-# install dependencies and openproblems
-COPY ./docker/openproblems-python-pytorch/requirements.txt ./requirements.txt
+# Install Python packages
+COPY ./docker/openproblems-python-scvi/requirements.txt ./requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 USER $NB_UID
diff --git a/docker/openproblems-python-pytorch/README.md b/docker/openproblems-python-scvi/README.md
similarity index 64%
rename from docker/openproblems-python-pytorch/README.md
rename to docker/openproblems-python-scvi/README.md
index d566a8efd5..546cec9bc0 100644
--- a/docker/openproblems-python-pytorch/README.md
+++ b/docker/openproblems-python-scvi/README.md
@@ -9,9 +9,3 @@ Python: 3.8
 Python packages:
 
 * scvi-tools
-* tangram
-* torch
-* neuralee
-* xgboost
-* molecular-cross-validation
-* cell2location
diff --git a/docker/openproblems-python-scvi/requirements.txt b/docker/openproblems-python-scvi/requirements.txt
new file mode 100644
index 0000000000..87aa041993
--- /dev/null
+++ b/docker/openproblems-python-scvi/requirements.txt
@@ -0,0 +1,5 @@
+jax==0.3.23
+jaxlib==0.3.22
+scikit-misc==0.1.*
+scvi-tools~=0.17 # pinned in #313
+xgboost==1.6.*
diff --git a/docker/openproblems-python-tensorflow/Dockerfile b/docker/openproblems-python-tensorflow/Dockerfile
deleted file mode 100644
index 0a996f9221..0000000000
--- a/docker/openproblems-python-tensorflow/Dockerfile
+++ /dev/null
@@ -1,15 +0,0 @@
-FROM singlecellopenproblems/openproblems:latest
-
-ARG NB_USER="sagemaker-user"
-ARG NB_UID="1000"
-ARG NB_GID="100"
-
-USER root
-WORKDIR /
-
-# install dependencies and openproblems
-COPY ./docker/openproblems-python-tensorflow/requirements.txt ./requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
-
-USER $NB_UID
-WORKDIR /home/$NB_USER
diff --git a/docker/openproblems-python-tensorflow/requirements.txt b/docker/openproblems-python-tensorflow/requirements.txt
deleted file mode 100644
index f2a476acf9..0000000000
--- a/docker/openproblems-python-tensorflow/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-git+https://github.com/Avsecz/kopt@6a5c890 # master
-git+https://github.com/scottgigante-immunai/dca@1f4edbc # patch-1 contains tf version bump
-protobuf==3.20.*
-tensorflow==2.9.0
diff --git a/docker/openproblems-python-tf2.4/Dockerfile b/docker/openproblems-python-tf2.4/Dockerfile
new file mode 100644
index 0000000000..dcdabaf28d
--- /dev/null
+++ b/docker/openproblems-python-tf2.4/Dockerfile
@@ -0,0 +1,18 @@
+FROM singlecellopenproblems/openproblems:latest
+
+ARG NB_USER="sagemaker-user"
+ARG NB_UID="1000"
+ARG NB_GID="100"
+
+USER root
+WORKDIR /
+
+# Install Python packages
+COPY ./docker/openproblems-python-tf2.4/requirements.txt ./requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# tensorflow downgrades numpy and h5py (and therefore anndata)
+RUN pip install --no-cache-dir -e /usr/src/singlecellopenproblems
+
+USER $NB_UID
+WORKDIR /home/$NB_USER
diff --git a/docker/openproblems-python-tensorflow/README.md b/docker/openproblems-python-tf2.4/README.md
similarity index 56%
rename from docker/openproblems-python-tensorflow/README.md
rename to docker/openproblems-python-tf2.4/README.md
index 3f4c4dbf67..f08a69ea20 100644
--- a/docker/openproblems-python-tensorflow/README.md
+++ b/docker/openproblems-python-tf2.4/README.md
@@ -1,4 +1,4 @@
-# openproblems-python-tensorflow Docker image
+# openproblems-python-tf2.4 Docker image
 
 Base image: singlecellopenproblems/openproblems
 
@@ -8,5 +8,6 @@ Python: 3.8
 
 Python packages:
 
-* tensorflow
+* keras >=2.4,<2.6
+* tensorflow >=2.4,<2.5
 * dca
diff --git a/docker/openproblems-python-tf2.4/requirements.txt b/docker/openproblems-python-tf2.4/requirements.txt
new file mode 100644
index 0000000000..31a56c2ea2
--- /dev/null
+++ b/docker/openproblems-python-tf2.4/requirements.txt
@@ -0,0 +1,4 @@
+dca==0.3.*
+keras>=2.4,<2.11
+pyyaml==6.0 # pinned in #431
+tensorflow-cpu==2.4.* # pinned in dca
diff --git a/docker/openproblems-r-base/README.md b/docker/openproblems-r-base/README.md
index ebca77780d..785a9ace1b 100644
--- a/docker/openproblems-r-base/README.md
+++ b/docker/openproblems-r-base/README.md
@@ -28,4 +28,4 @@ R packages:
 Python packages:
 
 * rpy2
-* anndata2ri>=1.1
+* anndata2ri>=1.0.6
diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt
index 0b8da4bde9..efd4ceeba6 100644
--- a/docker/openproblems-r-extras/r_requirements.txt
+++ b/docker/openproblems-r-extras/r_requirements.txt
@@ -46,7 +46,6 @@ shiny@1.4.0.2
 sparsesvd@0.2
 systemfonts@1.0.4
 textshaping@0.3.6
-theislab/kBET@a10ffea # master
 tibble@3.1.7
 tidymodels@0.1.2
 tidyverse@1.3.0
diff --git a/docker/openproblems-r-extras/requirements.txt b/docker/openproblems-r-extras/requirements.txt
index 1795471bae..9d03431a90 100644
--- a/docker/openproblems-r-extras/requirements.txt
+++ b/docker/openproblems-r-extras/requirements.txt
@@ -1,3 +1,3 @@
 git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python
-git+https://github.com/theislab/scib@f0be826
+git+https://github.com/theislab/scib@v1.0.2
 xgboost==1.6.*
diff --git a/docker/openproblems-r-pytorch/README.md b/docker/openproblems-r-pytorch/README.md
index 33d3e5a30c..03ccbc07d4 100644
--- a/docker/openproblems-r-pytorch/README.md
+++ b/docker/openproblems-r-pytorch/README.md
@@ -8,12 +8,12 @@ Python: 3.8
 
 R: 4.0
 
+R packages:
+
+* batchelor
+* sparsesvd
+* dplyr
+
 Python packages:
 
-* harmony-pytorch
-* torch
-* bbknn
-* mnnpy
-* scib
-* scanorama
-* scvi-tools
+* harmonic-alignment
diff --git a/docker/openproblems-r-pytorch/requirements.txt b/docker/openproblems-r-pytorch/requirements.txt
index c9dfe890a9..e7c8df42b7 100644
--- a/docker/openproblems-r-pytorch/requirements.txt
+++ b/docker/openproblems-r-pytorch/requirements.txt
@@ -1,8 +1,3 @@
-annoy==1.17.1
-bbknn==1.5.*
-git+https://github.com/scottgigante-immunai/mnnpy@eb4c551 # branch: patch-2
-git+https://github.com/theislab/scib@f0be826
+git+https://github.com/theislab/scib@v1.0.2
 harmony-pytorch==0.1.*
-scanorama==1.7.0
-scvi-tools~=0.16 # pinned in #313
 torch==1.13.*
diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py
index e83d47bb54..322891b202 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py
@@ -21,7 +21,7 @@
 @metric(
     metric_name="Cell Cycle Score",
     maximize=True,
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",  # only if required
 )
 def cc_score(adata, test=False):
     from ._utils import _get_split
diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py
index c1f8c4be2d..c3575de5b8 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py
@@ -15,7 +15,7 @@
 @metric(
     metric_name="Isolated label Silhouette",
     maximize=True,
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",  # only if required
 )
 def isolated_labels_sil(adata):
     from scib.metrics import isolated_labels
diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py
index 7efca62ffe..886f26078b 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py
@@ -18,7 +18,7 @@
 @metric(
     metric_name="PC Regression",
     maximize=True,
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",  # only if required
 )
 def pcr(adata):
     from ._utils import _get_split
diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py
index 9f28cd1284..c02e5e42aa 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py
@@ -24,7 +24,7 @@
 @metric(
     metric_name="Batch ASW",
     maximize=True,
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",  # only if required
 )
 def silhouette_batch(adata):
     from scib.metrics import silhouette_batch
diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py
index bb2bece193..36991e1d67 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py
@@ -12,7 +12,7 @@
 @metric(
     metric_name="Silhouette",
     maximize=True,
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",  # only if required
 )
 def silhouette(adata):
     from scib.metrics import silhouette
diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py
index bb7f90cae8..d40b36b740 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py
@@ -21,7 +21,7 @@
 @metric(
     metric_name="HVG conservation",
     maximize=True,
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",
 )
 def hvg_conservation(adata):
     from scib.metrics import hvg_overlap
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py
index 97570dccd8..017ca8f766 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py
@@ -10,7 +10,7 @@
     paper_url="https://academic.oup.com/bioinformatics/article/36/3/964/5545955",
     paper_year=2020,
     code_url="https://github.com/Teichlab/bbknn",
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",  # only if required
 )
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py
index 96e53538d3..3043a552e1 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py
@@ -10,7 +10,7 @@
     paper_url="https://academic.oup.com/biostatistics/article/8/1/118/252073",
     paper_year=2007,
     code_url="https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html",
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",
 )
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py
index 0146f5b6e3..99dab39203 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py
@@ -10,7 +10,7 @@
     paper_url="https://www.nature.com/articles/nbt.4091",
     paper_year=2018,
     code_url="https://github.com/chriscainx/mnnpy",
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",
 )
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py
index 461ea04a94..36843b81ed 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py
@@ -11,7 +11,7 @@
     paper_url="https://doi.org/10.1038/s41467-022-33758-z",
     paper_year=2022,
     code_url="https://github.com/jsxlei/SCALEX",
-    image="openproblems-python-pytorch",
+    image="openproblems-python-extras",
 )
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py
index a5efc04b35..d6e80162b5 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py
@@ -10,7 +10,7 @@
     paper_url="https://www.nature.com/articles/s41587-019-0113-3",
     paper_year=2019,
     code_url="https://github.com/brianhie/scanorama",
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",
 )
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py
index 8f98a3c931..d5bf463974 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py
@@ -11,7 +11,7 @@
     paper_url="https://doi.org/10.15252/msb.20209620",
     paper_year=2021,
     code_url="https://github.com/YosefLab/scvi-tools",
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",
 )
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py
index 35f1cd7ac5..9e9a82a9f2 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py
@@ -10,7 +10,7 @@
     paper_url="https://www.nature.com/articles/s41592-018-0229-2",
     paper_year=2018,
     code_url="https://github.com/YosefLab/scvi-tools",
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",  # only if required
 )
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py
index 0d082fff44..13e7eb8ce1 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py
@@ -16,7 +16,7 @@
 @metric(
     metric_name="ARI",
     maximize=True,
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",
 )
 def ari(adata):
     from scib.metrics import ari
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py
index 52dd7c44b2..3a9732d0e2 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py
@@ -22,7 +22,7 @@
 @metric(
     metric_name="Graph connectivity",
     maximize=True,
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",  # only if required
 )
 def graph_connectivity(adata):
     import scib.metrics
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py
index 71cd7ca209..df86b043d9 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py
@@ -27,7 +27,7 @@
 @metric(
     metric_name="Isolated label F1",
     maximize=True,
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",  # only if required
 )
 def isolated_labels_f1(adata):
     from scib.metrics import isolated_labels
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py
index 3356507b2e..bbcdc7cd9d 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py
@@ -15,7 +15,7 @@
 @metric(
     metric_name="NMI",
     maximize=True,
-    image="openproblems-r-pytorch",
+    image="openproblems-python-batch-integration",
 )
 def nmi(adata):
     from scib.metrics.clustering import opt_louvain  # isort:skip
diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py
index a528c9c9e3..dc2bba944c 100644
--- a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py
+++ b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py
@@ -43,10 +43,6 @@ def _liana(
     adata.layers["logcounts"] = adata.layers["log_cpm"]
     del adata.layers["log_cpm"]
 
-    # remove dataframe before R conversion
-    target = adata.uns["ccc_target"]
-    del adata.uns["ccc_target"]
-
     # Run LIANA
     liana_res = _r_liana(
         adata,
@@ -57,9 +53,6 @@ def _liana(
         **kwargs,
     )
 
-    # return target to uns
-    adata.uns["ccc_target"] = target
-
     # Format results
     liana_res["score"] = liana_res[score_col]
     adata.uns["ccc_pred"] = liana_res
diff --git a/openproblems/tasks/denoising/datasets/pancreas.py b/openproblems/tasks/denoising/datasets/pancreas.py
index bd039ad88e..c18ddbeee1 100644
--- a/openproblems/tasks/denoising/datasets/pancreas.py
+++ b/openproblems/tasks/denoising/datasets/pancreas.py
@@ -11,7 +11,7 @@
     "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, "
     "and SMARTER-seq). Here we just use the inDrop1 batch, which includes"
     "1937 cells × 15502 genes.",
-    image="openproblems-python-pytorch",
+    image="openproblems-python-extras",
 )
 def pancreas(test=False):
     adata = load_pancreas(test=test, keep_techs=["inDrop1"])
diff --git a/openproblems/tasks/denoising/datasets/pbmc.py b/openproblems/tasks/denoising/datasets/pbmc.py
index 440ebe8a3b..fcf0fc782b 100644
--- a/openproblems/tasks/denoising/datasets/pbmc.py
+++ b/openproblems/tasks/denoising/datasets/pbmc.py
@@ -11,7 +11,7 @@
     "1k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. "
     "Sequenced on 10X v3 chemistry in November 2018 by 10X Genomics."
     ),
-    image="openproblems-python-pytorch",
+    image="openproblems-python-extras",
 )
 def pbmc(test=False):
     adata = load_tenx_1k_pbmc(test=test)
diff --git a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py
index 9524cc4e95..a5c62f953e 100644
--- a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py
+++ b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py
@@ -10,7 +10,7 @@
     dataset_summary="All lung cells from Tabula Muris Senis, a 500k cell-atlas from 18 "
     "organs and tissues across the mouse lifespan. Here we use just 10x data from lung."
     " 24540 cells × 16160 genes across 3 time points.",
-    image="openproblems-python-pytorch",
+    image="openproblems-python-extras",
 )
 def tabula_muris_senis_lung_random(test=False):
     adata = load_tabula_muris_senis(
diff --git a/openproblems/tasks/denoising/methods/dca.py b/openproblems/tasks/denoising/methods/dca.py
index 458f9ecb05..fddfb85d51 100644
--- a/openproblems/tasks/denoising/methods/dca.py
+++ b/openproblems/tasks/denoising/methods/dca.py
@@ -30,7 +30,7 @@ def _dca(adata, test=False, epochs=None):
     paper_url="https://www.nature.com/articles/s41467-018-07931-2",
     paper_year=2019,
     code_url="https://github.com/theislab/dca",
-    image="openproblems-python-tensorflow",
+    image="openproblems-python-tf2.4",
 )
 def dca(adata, test=False, epochs=None):
     return _dca(adata, test=test, epochs=epochs)
diff --git a/openproblems/tasks/denoising/metrics/poisson.py b/openproblems/tasks/denoising/metrics/poisson.py
index 93db71eee2..ebd2a73378 100644
--- a/openproblems/tasks/denoising/metrics/poisson.py
+++ b/openproblems/tasks/denoising/metrics/poisson.py
@@ -1,7 +1,7 @@
 from ....tools.decorators import metric
 
 
-@metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-pytorch")
+@metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-extras")
 def poisson(adata):
     from molecular_cross_validation.mcv_sweep import poisson_nll_loss
diff --git a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py
index 420132dbf2..41f55dbb65 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py
@@ -18,7 +18,7 @@
     paper_url="https://www.frontiersin.org/articles/10.3389/fgene.2020.00786/full",
     paper_year=2020,
     code_url="https://github.com/HiBearME/NeuralEE",
-    image="openproblems-python-pytorch",
+    image="openproblems-python-extras",
 )
diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py
index 1cec698b27..6212f8cd69 100644
--- a/openproblems/tasks/label_projection/methods/scvi_tools.py
+++ b/openproblems/tasks/label_projection/methods/scvi_tools.py
@@ -10,7 +10,7 @@
     paper_url="https://doi.org/10.15252/msb.20209620",
     paper_year=2021,
     code_url="https://github.com/YosefLab/scvi-tools",
-    image="openproblems-python-pytorch",
+    image="openproblems-python-scvi",
 )
 
 _scanvi_scarches_method = functools.partial(
@@ -19,7 +19,7 @@
     paper_url="https://doi.org/10.1101/2020.07.16.205997",
     paper_year=2021,
     code_url="https://github.com/YosefLab/scvi-tools",
-    image="openproblems-python-pytorch",
+    image="openproblems-python-scvi",
 )
diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py
index 1eeff4992e..e5dffb6fa7 100644
--- a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py
+++ b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py
@@ -235,7 +235,7 @@ def _beta(adata, test=False, top_genes=None, threshold=1):
     paper_year=2013,
     code_version="1.0",
     code_url="http://cistrome.org/BETA",
-    image="openproblems-python-bedtools",
+    image="openproblems-python-extras",
 )
 def beta(adata, test=False, top_genes=None, threshold=1):
     adata = _beta(adata, test=test, top_genes=top_genes, threshold=threshold)
diff --git a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py
index e446b07687..86005d07f2 100644
--- a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py
+++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py
@@ -11,7 +11,7 @@
     "from the destVI manuscripts leveraging sparsePCA. Number of cells and "
     "cell types present in each spatial spot is computed via combination of "
     "kernel-based parametrization of a categorical distribution and the NB model.",
-    image="openproblems-python-pytorch",
+    image="openproblems-python-extras",
 )
 def destvi(test=False):
     from .utils import generate_synthetic_dataset
diff --git a/openproblems/tasks/spatial_decomposition/methods/cell2location.py b/openproblems/tasks/spatial_decomposition/methods/cell2location.py
index 168472cbbb..2caf9d4956 100644
--- a/openproblems/tasks/spatial_decomposition/methods/cell2location.py
+++ b/openproblems/tasks/spatial_decomposition/methods/cell2location.py
@@ -12,7 +12,7 @@
     paper_url="https://doi.org/10.1038/s41587-021-01139-4",
     paper_year=2022,
     code_url="https://github.com/BayraktarLab/cell2location",
-    image="openproblems-python-pytorch",
+    image="openproblems-python-extras",
 )
diff --git a/openproblems/tasks/spatial_decomposition/methods/destvi.py b/openproblems/tasks/spatial_decomposition/methods/destvi.py
index 4338a465fd..9330ba1d92 100644
--- a/openproblems/tasks/spatial_decomposition/methods/destvi.py
+++ b/openproblems/tasks/spatial_decomposition/methods/destvi.py
@@ -11,7 +11,7 @@
     paper_url="https://doi.org/10.1038/s41587-022-01272-8",
     paper_year=2022,
     code_url="https://github.com/YosefLab/scvi-tools",
-    image="openproblems-python-pytorch",
+    image="openproblems-python-extras",
 )
 def destvi(
     adata,
diff --git a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py
index f9c025319e..c2695e4253 100644
--- a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py
+++ b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py
@@ -10,7 +10,7 @@
     paper_url="https://doi.org/10.1038/s41587-022-01272-8",
     paper_year=2020,
     code_url="https://github.com/scverse/scvi-tools",
-    image="openproblems-python-pytorch",
+    image="openproblems-python-extras",
 )
 def stereoscope(adata, test=False, max_epochs_sc=None, max_epochs_sp=None):
     from scvi.external import RNAStereoscope
diff --git a/openproblems/tasks/spatial_decomposition/methods/tangram.py b/openproblems/tasks/spatial_decomposition/methods/tangram.py
index c05eb0339f..329676c4aa 100644
--- a/openproblems/tasks/spatial_decomposition/methods/tangram.py
+++ b/openproblems/tasks/spatial_decomposition/methods/tangram.py
@@ -10,7 +10,7 @@
     paper_url="https://doi.org/10.1038/s41592-021-01264-7",
     paper_year=2021,
     code_url="https://github.com/broadinstitute/Tangram",
-    image="openproblems-python-pytorch",
+    image="openproblems-python-extras",
 )
 def tangram(adata, test=False, num_epochs=None, n_markers=None):
     # analysis based on:
diff --git a/pytest.ini b/pytest.ini
index 273a414dbc..447d92b2a1 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -10,5 +10,4 @@ filterwarnings =
     ignore:X\.dtype being converted to np\.float32 from float64:FutureWarning
     ignore:is_categorical is deprecated and will be removed in a future version:FutureWarning
     ignore:The use of (converter|py2rpy|rpy2py) in module rpy2.robjects.conversion is deprecated.:DeprecationWarning
-    ignore:`Model\.state_updates` will be removed in a future version\.:UserWarning
     always:Container failed with AssertionError\. Retrying [0-9]* more time:RuntimeWarning
diff --git a/setup.py b/setup.py
index 89ead383cd..a6a1c213f4 100644
--- a/setup.py
+++ b/setup.py
@@ -4,14 +4,14 @@ import os
 
 install_requires = [
-    "numpy>=1.19.2,<1.24",
-    "scikit-learn<=1.1.*",
+    "numpy>=1.22,<1.24",
+    "scikit-learn==1.1.*",
     "anndata==0.8.*",
     "scprep>=1.2.1",
-    "scipy>=1.7,<1.10",
+    "scipy>=1.8,<1.10",
     "scanpy>=1.6",
-    "louvain==0.8.*",
-    "python-igraph==0.10.*",
+    "louvain==0.7.*",
+    "python-igraph<0.10",
     "decorator<5.0",  # pinned in #324
     "memory-profiler==0.60",
     "colorama==0.4.*",
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 6f02c7fe2b..c8378a3cea 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -27,7 +27,6 @@ rule docker_refresh:
     input: tools.refresh_images
 
 rule refresh_docker_image:
-    threads: 0
     priority: 50
     input:
         dockerfile = "{}/{{image}}/refresh.Dockerfile".format(tools.IMAGES_DIR),
@@ -79,7 +78,6 @@ rule update_docker_image:
         "touch {output}"
 
 rule build_docker_image:
-    threads: 0.5
     input:
         dockerfile = "{}/{{image}}/Dockerfile".format(tools.IMAGES_DIR),
         requirements = tools.docker_build_requirements,
@@ -110,7 +108,6 @@ rule login_docker:
         "docker login --username=singlecellopenproblems --password=$(cat {input})"
 
 rule push_docker_image:
-    threads: 0
     input:
         build = "{}/{{image}}/.docker_update".format(tools.IMAGES_DIR),
         login = ".docker_login",
@@ -120,7 +117,6 @@ rule push_docker_image:
         "docker push --quiet singlecellopenproblems/{wildcards.image}"
 
 rule pull_docker_image:
-    threads: 0
     output: temp(touch("{}/{{image}}/.docker_pull".format(tools.IMAGES_DIR)))
     shell:
diff --git a/workflow/snakemake_tools.py b/workflow/snakemake_tools.py
index 02d327dc55..8c9b92e42b 100644
--- a/workflow/snakemake_tools.py
+++ b/workflow/snakemake_tools.py
@@ -264,7 +264,6 @@ def docker_image_label(image, label):
     return output
 
 
-@functools.lru_cache(None)
 def docker_imagespec_changed(image, dockerfile):
     """Check if the Dockerfile has changed
 
@@ -274,17 +273,6 @@ def docker_imagespec_changed(image, dockerfile):
     If working with a github actions-built image, check if there is any diff
     between the Dockerfile and base/main
     """
-    base_image = _docker_base(image)
-    if base_image is not None:
-        base_docker_path = os.path.join(IMAGES_DIR, base_image)
-        base_dockerfile = os.path.join(base_docker_path, "Dockerfile")
-        if docker_imagespec_changed(base_image, base_dockerfile):
-            print(
-                "{}: base image spec changed".format(image),
-                file=sys.stderr,
-            )
-            return True
-
     if not docker_image_exists(image):
         # will be downloaded from dockerhub
         build_type = "github_actions"

From 4a0ee9b3731ff10d8cd2e584726a61b502aef613 Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Thu, 1 Dec 2022 09:38:38 -0500
Subject: [PATCH 151/266] just use true features but smaller in test (#712)

---
 .../methods/__init__.py | 3 +-
 .../methods/baseline.py | 47 +++----------
 2 files changed, 7 insertions(+), 43 deletions(-)
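[annotation] The swap below replaces the "high-dimensional PCA" and "Laplacian eigenmaps" baselines with the raw expression matrix used directly as the embedding. Because the embedding slot is expected to hold a dense array, the sparse matrix is converted with .toarray(), and in test mode only the first 100 features are kept to bound memory. A minimal sketch of that densify-and-truncate pattern, with a hypothetical helper name:

    import numpy as np
    import scipy.sparse

    def densify(X, max_features=None):
        # Optionally keep only the first `max_features` columns (test mode),
        # then convert to a dense ndarray so it can serve as an embedding.
        if max_features is not None:
            X = X[:, :max_features]
        if scipy.sparse.issparse(X):
            X = X.toarray()
        return np.asarray(X)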
diff --git a/openproblems/tasks/dimensionality_reduction/methods/__init__.py b/openproblems/tasks/dimensionality_reduction/methods/__init__.py
index 08b09386d1..715ea1decc 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/__init__.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/__init__.py
@@ -1,6 +1,5 @@
-from .baseline import high_dim_pca
-from .baseline import high_dim_spectral
 from .baseline import random_features
+from .baseline import true_features
 from .densmap import densmap_logCPM_1kHVG
 from .densmap import densmap_pca_logCPM_1kHVG
 from .neuralee import neuralee_default
diff --git a/openproblems/tasks/dimensionality_reduction/methods/baseline.py b/openproblems/tasks/dimensionality_reduction/methods/baseline.py
index ec8b5e71bf..8035bdd403 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/baseline.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/baseline.py
@@ -1,6 +1,5 @@
 from ....tools.decorators import method
 from ....tools.utils import check_version
-from typing import Optional
 
 import numpy as np
 
@@ -20,52 +19,18 @@ def random_features(adata, test=False):
 
 
 @method(
-    method_name="High-dimensional PCA",
-    paper_name="High-dimensional PCA (baseline)",
+    method_name="True Features",
+    paper_name="True Features (baseline)",
     paper_url="https://openproblems.bio",
     paper_year=2022,
    code_url="https://github.com/openproblems-bio/openproblems",
     is_baseline=True,
 )
-def high_dim_pca(adata, n_comps: Optional[int] = None, test=False):
-    # We wanted to use all features, but output must be dense
-    # so this is a close approximation
-    import scanpy as sc
-
+def true_features(adata, test=False):
+    adata.obsm["X_emb"] = adata.X
     if test:
-        n_comps = n_comps or 50
-    else:  # pragma: nocover
-        n_comps = n_comps or 500
-
-    sc.pp.pca(adata, n_comps=min(min(adata.shape), n_comps))
-    adata.obsm["X_emb"] = adata.obsm["X_pca"]
-    adata.uns["method_code_version"] = check_version("openproblems")
-    return adata
-
-
-@method(
-    method_name="High-dimensional Laplacian Eigenmaps",
-    paper_name="High-dimensional Laplacian Eigenmaps (baseline)",
-    paper_url="https://openproblems.bio",
-    paper_year=2022,
-    code_url="https://github.com/openproblems-bio/openproblems",
-    is_baseline=True,
-)
-def high_dim_spectral(adata, n_comps: Optional[int] = None, test=False):
-    # We wanted to use all features, but output must be dense
-    # so this is a close approximation
-    import umap
-    import umap.spectral
-
-    if test:
-        n_comps = n_comps or 10
-    else:  # pragma: nocover
-        n_comps = n_comps or 200
-
-    graph = umap.UMAP(transform_mode="graph").fit_transform(adata.X)
-    adata.obsm["X_emb"] = umap.spectral.spectral_layout(
-        adata.X, graph, n_comps, random_state=None
-    )
+        adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100]
+    adata.obsm["X_emb"] = adata.obsm["X_emb"].toarray()
     adata.uns["method_code_version"] = check_version("openproblems")
     return adata

From c6c889ba16df36b7961eed3f88c3634d945f5c1d Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Thu, 1 Dec 2022 10:57:53 -0500
Subject: [PATCH 152/266] don't create matrix on PR if dependabot

---
 .github/workflows/run_tests.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 27e95f4656..26816cf648 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -144,7 +144,10 @@
       (
         startsWith(github.ref, 'refs/heads') ||
         startsWith(github.ref, 'refs/tags') ||
-        github.event.pull_request.draft == false
+        (
+          github.event.pull_request.draft == false &&
+          github.actor != 'dependabot[bot]'
+        )
       )
     outputs:
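[annotation] On the condition above: workflows triggered by Dependabot pull requests run with restricted permissions and without repository secrets, so jobs that depend on Docker Hub credentials would fail; the actor check skips matrix creation for those PRs. The gating logic restated as a plain Python predicate (a sketch; the arguments mirror the GitHub Actions context fields):

    def should_create_matrix(ref: str, is_draft: bool, actor: str) -> bool:
        # Mirrors the workflow condition: run on branch/tag pushes, or on
        # non-draft PRs that were not opened by dependabot[bot].
        return (
            ref.startswith("refs/heads")
            or ref.startswith("refs/tags")
            or (not is_draft and actor != "dependabot[bot]")
        )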
From 2d6950fd0c2125f46ec04a7eebf1c896153a9078 Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Thu, 1 Dec 2022 13:20:58 -0500
Subject: [PATCH 153/266] use mnnpy from base repo (#723)

---
 docker/openproblems-python-batch-integration/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/openproblems-python-batch-integration/requirements.txt b/docker/openproblems-python-batch-integration/requirements.txt
index 9c92302df9..61bdfd360a 100644
--- a/docker/openproblems-python-batch-integration/requirements.txt
+++ b/docker/openproblems-python-batch-integration/requirements.txt
@@ -1,6 +1,6 @@
 annoy==1.17.1
 bbknn==1.5.*
-git+https://github.com/scottgigante-immunai/mnnpy@eb4c551 # branch: patch-2
+git+https://github.com/chriscainx/mnnpy@2097dec # master
 git+https://github.com/theislab/scib@77ab015
 scanorama==1.7.0
 scvi-tools~=0.16 # pinned in #313

From fd38cc0b77250e9f790addb24b5c42cd1250e700 Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Thu, 1 Dec 2022 14:22:15 -0500
Subject: [PATCH 154/266] set num_samples=2 in test mode (#724)

---
 .../tasks/spatial_decomposition/methods/cell2location.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openproblems/tasks/spatial_decomposition/methods/cell2location.py b/openproblems/tasks/spatial_decomposition/methods/cell2location.py
index 2caf9d4956..b0645e2b0d 100644
--- a/openproblems/tasks/spatial_decomposition/methods/cell2location.py
+++ b/openproblems/tasks/spatial_decomposition/methods/cell2location.py
@@ -38,7 +38,7 @@ def _cell2location(
     if test:
         max_epochs_sc = max_epochs_sc or 2
         max_epochs_st = max_epochs_st or 2
-        num_samples = num_samples or 10
+        num_samples = num_samples or 2
     else:  # pragma: nocover
         max_epochs_sc = max_epochs_sc or 250
         max_epochs_st = max_epochs_st or 30000

From 2ca603719c5c1e72dae1b307f41dab65092395db Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Thu, 1 Dec 2022 14:28:43 -0500
Subject: [PATCH 155/266] reinstall if failed

---
 .github/workflows/run_tests.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 26816cf648..22aa6ac4b8 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -59,7 +59,7 @@
         run: |
           python -m pip install --upgrade pip
           pip install -U wheel setuptools
-          pip install --editable .[evaluate]
+          pip install --editable .[evaluate] || pip install --force-reinstall --editable .[evaluate]
           python -c "import openproblems"
 
       - name: Download docker images
@@ -178,7 +178,7 @@
        run: |
           python -m pip install --upgrade pip
           pip install -U wheel setuptools
-          pip install --editable .[evaluate]
+          pip install --editable .[evaluate] || pip install --force-reinstall --editable .[evaluate]
           python -c "import openproblems"
 
 
@@ -269,7 +269,7 @@
         run: |
           python -m pip install --upgrade pip
           pip install -U wheel setuptools
-          pip install -U --editable .[test,r,evaluate]
+          pip install --editable .[test,r,evaluate] || pip install --force-reinstall --editable .[test,r,evaluate]
           python -c "import openproblems"
 
       - name: Cache R packages
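[annotation] In the step above, `a || b` is shell short-circuiting: the `--force-reinstall` invocation runs only when the plain editable install fails. The same retry-on-failure shape in Python, as a sketch (the extras names are taken from the workflow):

    import subprocess
    import sys

    def pip_install(*args: str) -> bool:
        # Returns True when pip exits with status 0.
        cmd = [sys.executable, "-m", "pip", "install", *args]
        return subprocess.run(cmd).returncode == 0

    # Equivalent of:
    #   pip install --editable .[evaluate] || \
    #       pip install --force-reinstall --editable .[evaluate]
    if not pip_install("--editable", ".[evaluate]"):
        pip_install("--force-reinstall", "--editable", ".[evaluate]")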
From 23a69b25e482052f971009c33c686d0fcb3cb28d Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Thu, 1 Dec 2022 15:28:55 -0500
Subject: [PATCH 156/266] run builds in parallel (#728)

---
 .github/workflows/run_tests.yml | 7 -------
 workflow/Snakefile | 4 ++++
 workflow/snakemake_tools.py | 12 ++++++++++++
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 22aa6ac4b8..d559209b46 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -62,13 +62,6 @@
           pip install --editable .[evaluate] || pip install --force-reinstall --editable .[evaluate]
           python -c "import openproblems"
 
-      - name: Download docker images
-        run: |
-          for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
-            docker pull singlecellopenproblems/${image} &
-          done
-          wait
-
       - name: Update Docker images
         if: |
           !(
diff --git a/workflow/Snakefile b/workflow/Snakefile
index c8378a3cea..6f02c7fe2b 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -27,6 +27,7 @@ rule docker_refresh:
     input: tools.refresh_images
 
 rule refresh_docker_image:
+    threads: 0
     priority: 50
     input:
         dockerfile = "{}/{{image}}/refresh.Dockerfile".format(tools.IMAGES_DIR),
@@ -78,6 +79,7 @@ rule update_docker_image:
         "touch {output}"
 
 rule build_docker_image:
+    threads: 0.5
     input:
         dockerfile = "{}/{{image}}/Dockerfile".format(tools.IMAGES_DIR),
         requirements = tools.docker_build_requirements,
@@ -108,6 +110,7 @@ rule login_docker:
         "docker login --username=singlecellopenproblems --password=$(cat {input})"
 
 rule push_docker_image:
+    threads: 0
     input:
         build = "{}/{{image}}/.docker_update".format(tools.IMAGES_DIR),
         login = ".docker_login",
@@ -117,6 +120,7 @@ rule push_docker_image:
         "docker push --quiet singlecellopenproblems/{wildcards.image}"
 
 rule pull_docker_image:
+    threads: 0
     output: temp(touch("{}/{{image}}/.docker_pull".format(tools.IMAGES_DIR)))
     shell:
diff --git a/workflow/snakemake_tools.py b/workflow/snakemake_tools.py
index 8c9b92e42b..02d327dc55 100644
--- a/workflow/snakemake_tools.py
+++ b/workflow/snakemake_tools.py
@@ -264,6 +264,7 @@ def docker_image_label(image, label):
     return output
 
 
+@functools.lru_cache(None)
 def docker_imagespec_changed(image, dockerfile):
     """Check if the Dockerfile has changed
 
@@ -273,6 +274,17 @@ def docker_imagespec_changed(image, dockerfile):
     If working with a github actions-built image, check if there is any diff
     between the Dockerfile and base/main
     """
+    base_image = _docker_base(image)
+    if base_image is not None:
+        base_docker_path = os.path.join(IMAGES_DIR, base_image)
+        base_dockerfile = os.path.join(base_docker_path, "Dockerfile")
+        if docker_imagespec_changed(base_image, base_dockerfile):
+            print(
+                "{}: base image spec changed".format(image),
+                file=sys.stderr,
+            )
+            return True
+
     if not docker_image_exists(image):
         # will be downloaded from dockerhub
         build_type = "github_actions"
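[annotation] The reinstated @functools.lru_cache(None) decorator above memoizes docker_imagespec_changed, which matters now that the function recurses into each image's base image: without the cache, a shared base would be re-checked once per dependent image. A minimal illustration of the pattern (the expensive check here is a stand-in, not the real git diff):

    import functools

    def expensive_diff_check(image: str) -> bool:
        return False  # stand-in for the real Dockerfile comparison

    @functools.lru_cache(None)  # maxsize=None: unbounded cache, keyed on args
    def spec_changed(image: str) -> bool:
        print(f"checking {image}")  # executes once per distinct argument
        return expensive_diff_check(image)

    spec_changed("openproblems")
    spec_changed("openproblems")  # cache hit: no second "checking" line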
---
 .github/workflows/run_tests.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 22aa6ac4b8..26816cf648 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -59,7 +59,7 @@
         run: |
           python -m pip install --upgrade pip
           pip install -U wheel setuptools
-          pip install --editable .[evaluate] || pip install --force-reinstall --editable .[evaluate]
+          pip install --editable .[evaluate]
           python -c "import openproblems"
 
       - name: Download docker images
@@ -178,7 +178,7 @@
         run: |
           python -m pip install --upgrade pip
           pip install -U wheel setuptools
-          pip install --editable .[evaluate] || pip install --force-reinstall --editable .[evaluate]
+          pip install --editable .[evaluate]
           python -c "import openproblems"
 
 
@@ -269,7 +269,7 @@
         run: |
           python -m pip install --upgrade pip
           pip install -U wheel setuptools
-          pip install --editable .[test,r,evaluate] || pip install --force-reinstall --editable .[test,r,evaluate]
+          pip install -U --editable .[test,r,evaluate]
           python -c "import openproblems"
 
       - name: Cache R packages

From 97f9d8bb39b82abcae944e0bf0a166be9bb154db Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Thu, 1 Dec 2022 15:31:53 -0500
Subject: [PATCH 158/266] bust cache

---
 .github/workflows/run_tests.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 26816cf648..9cde7335c7 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -52,8 +52,8 @@
         uses: actions/cache@v3
         with:
           path: ${{ env.pythonLocation }}
-          key: pip-${{ env.LINUX_VERSION }}-${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}
-          restore-keys: pip-${{ env.LINUX_VERSION }}-${{ env.pythonLocation }}-
+          key: python-pip-${{ env.LINUX_VERSION }}-${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}
+          restore-keys: python-pip-${{ env.LINUX_VERSION }}-${{ env.pythonLocation }}-
 
       - name: Install package & dependencies
@@ -171,8 +171,8 @@
         uses: actions/cache@v3
         with:
           path: ${{ env.pythonLocation }}
-          key: pip-${{ env.LINUX_VERSION }}-${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}
-          restore-keys: pip-${{ env.LINUX_VERSION }}-${{ env.pythonLocation }}-
+          key: python-pip-${{ env.LINUX_VERSION }}-${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}
+          restore-keys: python-pip-${{ env.LINUX_VERSION }}-${{ env.pythonLocation }}-
 
       - name: Install package & dependencies
@@ -262,8 +262,8 @@
         uses: actions/cache@v3
         with:
           path: ${{ env.pythonLocation }}
-          key: ${{env.LINUX_VERSION}}-pip-${{ env.PYTHON_VERSION }}-${{ hashFiles('setup.py') }}
-          restore-keys: ${{env.LINUX_VERSION}}-pip-${{ env.PYTHON_VERSION }}-
+          key: python-pip-${{env.LINUX_VERSION}}-pip-${{ env.PYTHON_VERSION }}-${{ hashFiles('setup.py') }}
+          restore-keys: python-pip-${{env.LINUX_VERSION}}-pip-${{ env.PYTHON_VERSION }}-
 
       - name: Install package & dependencies

From aa2253779e9aa9cd178f54ac0f3b6ba521ecd59f Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Thu, 1 Dec 2022 15:38:21 -0500
Subject: [PATCH 159/266] Upgrade DCA and tf2.4 (#727)

* move dca to tf 2.9

* ignore warning
---
 .../openproblems-python-tensorflow/Dockerfile | 15 +++++++++++++++
 .../README.md | 5 ++---
 .../requirements.txt | 4 ++++
 docker/openproblems-python-tf2.4/Dockerfile | 18 ------------------
 .../openproblems-python-tf2.4/requirements.txt | 4 ----
 openproblems/tasks/denoising/methods/dca.py | 2 +-
 pytest.ini | 1 +
 7 files changed, 23 insertions(+), 26 deletions(-)
 create mode 100644 docker/openproblems-python-tensorflow/Dockerfile
 rename docker/{openproblems-python-tf2.4 => openproblems-python-tensorflow}/README.md (56%)
 create mode 100644 docker/openproblems-python-tensorflow/requirements.txt
 delete mode 100644 docker/openproblems-python-tf2.4/Dockerfile
 delete mode 100644 docker/openproblems-python-tf2.4/requirements.txt

diff --git a/docker/openproblems-python-tensorflow/Dockerfile b/docker/openproblems-python-tensorflow/Dockerfile
new file mode 100644
index 0000000000..170d6bc896
--- /dev/null
+++ b/docker/openproblems-python-tensorflow/Dockerfile
@@ -0,0 +1,15 @@
+FROM singlecellopenproblems/openproblems:latest
+
+ARG NB_USER="sagemaker-user"
+ARG NB_UID="1000"
+ARG NB_GID="100"
+
+USER root
+WORKDIR /
+
+# install dependencies and openproblems
+COPY ./docker/openproblems-python-tensorflow/requirements.txt ./requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt --editable /usr/src/singlecellopenproblems
+
+USER $NB_UID
+WORKDIR /home/$NB_USER
diff --git a/docker/openproblems-python-tf2.4/README.md b/docker/openproblems-python-tensorflow/README.md
similarity index 56%
rename from docker/openproblems-python-tf2.4/README.md
rename to docker/openproblems-python-tensorflow/README.md
index f08a69ea20..3f4c4dbf67 100644
--- a/docker/openproblems-python-tf2.4/README.md
+++ b/docker/openproblems-python-tensorflow/README.md
@@ -1,4 +1,4 @@
-# openproblems-python-tf2.4 Docker image
+# openproblems-python-tensorflow Docker image
 
 Base image: singlecellopenproblems/openproblems
 
@@ -8,6 +8,5 @@ Python: 3.8
 
 Python packages:
 
-* keras >=2.4,<2.6
-* tensorflow >=2.4,<2.5
+* tensorflow
 * dca
diff --git a/docker/openproblems-python-tensorflow/requirements.txt b/docker/openproblems-python-tensorflow/requirements.txt
new file mode 100644
index 0000000000..f2a476acf9
--- /dev/null
+++ b/docker/openproblems-python-tensorflow/requirements.txt
@@ -0,0 +1,4 @@
+git+https://github.com/Avsecz/kopt@6a5c890 # master
+git+https://github.com/scottgigante-immunai/dca@1f4edbc # patch-1 contains tf version bump
+protobuf==3.20.*
+tensorflow==2.9.0
diff --git a/docker/openproblems-python-tf2.4/Dockerfile b/docker/openproblems-python-tf2.4/Dockerfile
deleted file mode 100644
index dcdabaf28d..0000000000
--- a/docker/openproblems-python-tf2.4/Dockerfile
+++ /dev/null
@@ -1,18 +0,0 @@
-FROM singlecellopenproblems/openproblems:latest
-
-ARG NB_USER="sagemaker-user"
-ARG NB_UID="1000"
-ARG NB_GID="100"
-
-USER root
-WORKDIR /
-
-# Install Python packages
-COPY ./docker/openproblems-python-tf2.4/requirements.txt ./requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
-
-# tensorflow downgrades numpy and h5py (and therefore anndata)
-RUN pip install --no-cache-dir -e /usr/src/singlecellopenproblems
-
-USER $NB_UID
-WORKDIR /home/$NB_USER
diff --git a/docker/openproblems-python-tf2.4/requirements.txt b/docker/openproblems-python-tf2.4/requirements.txt
deleted file mode 100644
index 31a56c2ea2..0000000000
--- a/docker/openproblems-python-tf2.4/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-dca==0.3.*
-keras>=2.4,<2.11
-pyyaml==6.0 # pinned in #431
-tensorflow-cpu==2.4.* # pinned in dca
diff --git a/openproblems/tasks/denoising/methods/dca.py b/openproblems/tasks/denoising/methods/dca.py
index fddfb85d51..458f9ecb05 100644
--- a/openproblems/tasks/denoising/methods/dca.py
+++ b/openproblems/tasks/denoising/methods/dca.py
@@ -30,7 +30,7 @@ def _dca(adata, test=False, epochs=None):
     paper_url="https://www.nature.com/articles/s41467-018-07931-2",
     paper_year=2019,
     code_url="https://github.com/theislab/dca",
-    image="openproblems-python-tf2.4",
+    image="openproblems-python-tensorflow",
 )
 def dca(adata, test=False, epochs=None):
     return _dca(adata, test=test, epochs=epochs)
diff --git a/pytest.ini b/pytest.ini
index 447d92b2a1..273a414dbc 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -10,4 +10,5 @@ filterwarnings =
     ignore:X\.dtype being converted to np\.float32 from float64:FutureWarning
     ignore:is_categorical is deprecated and will be removed in a future version:FutureWarning
     ignore:The use of (converter|py2rpy|rpy2py) in module rpy2.robjects.conversion is deprecated.:DeprecationWarning
+    ignore:`Model\.state_updates` will be removed in a future version\.:UserWarning
     always:Container failed with AssertionError\. Retrying [0-9]* more time:RuntimeWarning
docker/openproblems-python-pytorch/requirements.txt delete mode 100644 docker/openproblems-python-scvi/Dockerfile delete mode 100644 docker/openproblems-python-scvi/requirements.txt diff --git a/docker/openproblems-python-batch-integration/Dockerfile b/docker/openproblems-python-batch-integration/Dockerfile deleted file mode 100644 index 697905d79c..0000000000 --- a/docker/openproblems-python-batch-integration/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -FROM singlecellopenproblems/openproblems-r-base:latest - -USER root -WORKDIR / - -ARG NB_USER="sagemaker-user" -ARG NB_UID="1000" -ARG NB_GID="100" - -RUN sed -i '$ d' /etc/apt/sources.list -RUN \ -apt-get update --allow-releaseinfo-change && \ -apt-get -y install --no-install-recommends gcc git python3-llvmlite && \ -apt-get autoremove -y && \ -rm -rf /var/lib/apt/lists/* - -# Install Python packages -COPY ./docker/openproblems-python-batch-integration/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r requirements.txt -# force reinstall annoy addresses https://github.com/spotify/annoy/issues/513 -RUN pip install --no-cache-dir --force annoy==1.17.0 - -USER $NB_UID -WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-batch-integration/README.md b/docker/openproblems-python-batch-integration/README.md deleted file mode 100644 index 02a18e1c20..0000000000 --- a/docker/openproblems-python-batch-integration/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# openproblems-python-extras Docker image - -Base image: singlecellopenproblems/openproblems-r-base - -OS: Debian Stretch - -Python: 3.8 - -Python packages: - -* scIB -* mnnpy -* scanorama -* bbknn -* scVI diff --git a/docker/openproblems-python-batch-integration/requirements.txt b/docker/openproblems-python-batch-integration/requirements.txt deleted file mode 100644 index 61bdfd360a..0000000000 --- a/docker/openproblems-python-batch-integration/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -annoy==1.17.1 -bbknn==1.5.* -git+https://github.com/chriscainx/mnnpy@2097dec # master -git+https://github.com/theislab/scib@77ab015 -scanorama==1.7.0 -scvi-tools~=0.16 # pinned in #313 diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt index c51a5d771a..8e3b692f9d 100644 --- a/docker/openproblems-python-extras/requirements.txt +++ b/docker/openproblems-python-extras/requirements.txt @@ -1,17 +1,9 @@ cmake==3.24.1.1 -git+https://github.com/BayraktarLab/cell2location.git@7e7aa231cc61ff460da14402fa3b9a1fa3ec69ac -git+https://github.com/czbiohub/molecular-cross-validation@04d9df0 git+https://github.com/jorvis/Multicore-TSNE@6832575 git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python -git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix git+https://github.com/scottgigante-immunai/knn-smoothing@python_package magic-impute==3.0.* phate==1.0.* pybedtools==0.9.* pyensembl==2.0.* -scalex==1.0.2 -scvi-tools==0.16.* -tangram-sc==1.0.* -tensorflow-cpu==2.9.* -torch==1.12.* xgboost==1.6.* diff --git a/docker/openproblems-python-pytorch/Dockerfile b/docker/openproblems-python-pytorch/Dockerfile new file mode 100644 index 0000000000..cdf852acc0 --- /dev/null +++ b/docker/openproblems-python-pytorch/Dockerfile @@ -0,0 +1,15 @@ +FROM singlecellopenproblems/openproblems:latest + +ARG NB_USER="sagemaker-user" +ARG NB_UID="1000" +ARG NB_GID="100" + +USER root +WORKDIR / + +# install dependencies and openproblems +COPY ./docker/openproblems-python-pytorch/requirements.txt 
./requirements.txt +RUN pip install --no-cache-dir -r requirements.txt --editable /usr/src/singlecellopenproblems + +USER $NB_UID +WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-scvi/README.md b/docker/openproblems-python-pytorch/README.md similarity index 64% rename from docker/openproblems-python-scvi/README.md rename to docker/openproblems-python-pytorch/README.md index 546cec9bc0..d566a8efd5 100644 --- a/docker/openproblems-python-scvi/README.md +++ b/docker/openproblems-python-pytorch/README.md @@ -9,3 +9,9 @@ Python: 3.8 Python packages: * scvi-tools +* tangram +* torch +* neuralee +* xgboost +* molecular-cross-validation +* cell2location diff --git a/docker/openproblems-python-pytorch/requirements.txt b/docker/openproblems-python-pytorch/requirements.txt new file mode 100644 index 0000000000..56bd2a53dc --- /dev/null +++ b/docker/openproblems-python-pytorch/requirements.txt @@ -0,0 +1,11 @@ +git+https://github.com/BayraktarLab/cell2location.git@7e7aa231cc61ff460da14402fa3b9a1fa3ec69ac +git+https://github.com/czbiohub/molecular-cross-validation@04d9df0 +git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix +jax==0.3.23 +jaxlib==0.3.22 +scalex==1.0.2 +scikit-misc==0.1.* +scvi-tools~=0.17 # pinned in #313 +tangram-sc==1.0.* +torch==1.12.* +xgboost==1.6.* diff --git a/docker/openproblems-python-scvi/Dockerfile b/docker/openproblems-python-scvi/Dockerfile deleted file mode 100644 index f7edd2e4dc..0000000000 --- a/docker/openproblems-python-scvi/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM singlecellopenproblems/openproblems:latest - -ARG NB_USER="sagemaker-user" -ARG NB_UID="1000" -ARG NB_GID="100" - -USER root -WORKDIR / - -# Install Python packages -COPY ./docker/openproblems-python-scvi/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - -USER $NB_UID -WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-scvi/requirements.txt b/docker/openproblems-python-scvi/requirements.txt deleted file mode 100644 index 87aa041993..0000000000 --- a/docker/openproblems-python-scvi/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -jax==0.3.23 -jaxlib==0.3.22 -scikit-misc==0.1.* -scvi-tools~=0.17 # pinned in #313 -xgboost==1.6.* diff --git a/docker/openproblems-r-base/README.md b/docker/openproblems-r-base/README.md index 785a9ace1b..ebca77780d 100644 --- a/docker/openproblems-r-base/README.md +++ b/docker/openproblems-r-base/README.md @@ -28,4 +28,4 @@ R packages: Python packages: * rpy2 -* anndata2ri>=1.0.6 +* anndata2ri>=1.1 diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index efd4ceeba6..0b8da4bde9 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -46,6 +46,7 @@ shiny@1.4.0.2 sparsesvd@0.2 systemfonts@1.0.4 textshaping@0.3.6 +theislab/kBET@a10ffea # master tibble@3.1.7 tidymodels@0.1.2 tidyverse@1.3.0 diff --git a/docker/openproblems-r-extras/requirements.txt b/docker/openproblems-r-extras/requirements.txt index 9d03431a90..1795471bae 100644 --- a/docker/openproblems-r-extras/requirements.txt +++ b/docker/openproblems-r-extras/requirements.txt @@ -1,3 +1,3 @@ git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python -git+https://github.com/theislab/scib@v1.0.2 +git+https://github.com/theislab/scib@f0be826 xgboost==1.6.* diff --git a/docker/openproblems-r-pytorch/README.md b/docker/openproblems-r-pytorch/README.md index 03ccbc07d4..33d3e5a30c 100644 --- 
a/docker/openproblems-r-pytorch/README.md +++ b/docker/openproblems-r-pytorch/README.md @@ -8,12 +8,12 @@ Python: 3.8 R: 4.0 -R packages: - -* batchelor -* sparsesvd -* dplyr - Python packages: -* harmonic-alignment +* harmony-pytorch +* torch +* bbknn +* mnnpy +* scib +* scanorama +* scvi-tools diff --git a/docker/openproblems-r-pytorch/requirements.txt b/docker/openproblems-r-pytorch/requirements.txt index e7c8df42b7..ad5b2f5449 100644 --- a/docker/openproblems-r-pytorch/requirements.txt +++ b/docker/openproblems-r-pytorch/requirements.txt @@ -1,3 +1,8 @@ -git+https://github.com/theislab/scib@v1.0.2 +annoy==1.17.1 +bbknn==1.5.* +git+https://github.com/chriscainx/mnnpy@2097dec # master +git+https://github.com/theislab/scib@f0be826 harmony-pytorch==0.1.* +scanorama==1.7.0 +scvi-tools~=0.16 # pinned in #313 torch==1.13.* diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py index 322891b202..e83d47bb54 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py @@ -21,7 +21,7 @@ @metric( metric_name="Cell Cycle Score", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def cc_score(adata, test=False): from ._utils import _get_split diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py index c3575de5b8..c1f8c4be2d 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py @@ -15,7 +15,7 @@ @metric( metric_name="Isolated label Silhouette", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def isolated_labels_sil(adata): from scib.metrics import isolated_labels diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py index 886f26078b..7efca62ffe 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py @@ -18,7 +18,7 @@ @metric( metric_name="PC Regression", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def pcr(adata): from ._utils import _get_split diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py index c02e5e42aa..9f28cd1284 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py @@ -24,7 +24,7 @@ @metric( metric_name="Batch ASW", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def silhouette_batch(adata): from scib.metrics import silhouette_batch diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py index 
36991e1d67..bb2bece193 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py @@ -12,7 +12,7 @@ @metric( metric_name="Silhouette", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def silhouette(adata): from scib.metrics import silhouette diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py index d40b36b740..bb7f90cae8 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py @@ -21,7 +21,7 @@ @metric( metric_name="HVG conservation", maximize=True, - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) def hvg_conservation(adata): from scib.metrics import hvg_overlap diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py index 017ca8f766..97570dccd8 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py @@ -10,7 +10,7 @@ paper_url="https://academic.oup.com/bioinformatics/article/36/3/964/5545955", paper_year=2020, code_url="https://github.com/Teichlab/bbknn", - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py index 3043a552e1..96e53538d3 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py @@ -10,7 +10,7 @@ paper_url="https://academic.oup.com/biostatistics/article/8/1/118/252073", paper_year=2007, code_url="https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html", - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py index 99dab39203..0146f5b6e3 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py @@ -10,7 +10,7 @@ paper_url="https://www.nature.com/articles/nbt.4091", paper_year=2018, code_url="https://github.com/chriscainx/mnnpy", - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py index 36843b81ed..461ea04a94 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py @@ -11,7 +11,7 @@ paper_url="https://doi.org/10.1038/s41467-022-33758-z", paper_year=2022, code_url="https://github.com/jsxlei/SCALEX", - image="openproblems-python-extras", + 
image="openproblems-python-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py index d6e80162b5..a5efc04b35 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py @@ -10,7 +10,7 @@ paper_url="https://www.nature.com/articles/s41587-019-0113-3", paper_year=2019, code_url="https://github.com/brianhie/scanorama", - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py index d5bf463974..8f98a3c931 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py @@ -11,7 +11,7 @@ paper_url="https://doi.org/10.15252/msb.20209620", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py index 9e9a82a9f2..35f1cd7ac5 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py @@ -10,7 +10,7 @@ paper_url="https://www.nature.com/articles/s41592-018-0229-2", paper_year=2018, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py index 13e7eb8ce1..0d082fff44 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py @@ -16,7 +16,7 @@ @metric( metric_name="ARI", maximize=True, - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) def ari(adata): from scib.metrics import ari diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py index 3a9732d0e2..52dd7c44b2 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py @@ -22,7 +22,7 @@ @metric( metric_name="Graph connectivity", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def graph_connectivity(adata): import scib.metrics diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py index df86b043d9..71cd7ca209 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py @@ -27,7 +27,7 @@ 
@metric( metric_name="Isolated label F1", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def isolated_labels_f1(adata): from scib.metrics import isolated_labels diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py index bbcdc7cd9d..3356507b2e 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py @@ -15,7 +15,7 @@ @metric( metric_name="NMI", maximize=True, - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) def nmi(adata): from scib.metrics.clustering import opt_louvain # isort:skip diff --git a/openproblems/tasks/denoising/datasets/pancreas.py b/openproblems/tasks/denoising/datasets/pancreas.py index c18ddbeee1..bd039ad88e 100644 --- a/openproblems/tasks/denoising/datasets/pancreas.py +++ b/openproblems/tasks/denoising/datasets/pancreas.py @@ -11,7 +11,7 @@ "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, " "and SMARTER-seq). Here we just use the inDrop1 batch, which includes" "1937 cells × 15502 genes.", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def pancreas(test=False): adata = load_pancreas(test=test, keep_techs=["inDrop1"]) diff --git a/openproblems/tasks/denoising/datasets/pbmc.py b/openproblems/tasks/denoising/datasets/pbmc.py index fcf0fc782b..440ebe8a3b 100644 --- a/openproblems/tasks/denoising/datasets/pbmc.py +++ b/openproblems/tasks/denoising/datasets/pbmc.py @@ -11,7 +11,7 @@ "1k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. " "Sequenced on 10X v3 chemistry in November 2018 by 10X Genomics." ), - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def pbmc(test=False): adata = load_tenx_1k_pbmc(test=test) diff --git a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py index a5c62f953e..9524cc4e95 100644 --- a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py +++ b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py @@ -10,7 +10,7 @@ dataset_summary="All lung cells from Tabula Muris Senis, a 500k cell-atlas from 18 " "organs and tissues across the mouse lifespan. Here we use just 10x data from lung." 
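The NMI metric above first optimises a Louvain clustering (`opt_louvain`) and then scores agreement between the clusters and the ground-truth labels. The scoring step is normalized mutual information as in scikit-learn; a self-contained sketch (scib's own implementation may differ in its averaging details):

    import numpy as np
    from sklearn.metrics import normalized_mutual_info_score

    labels = np.array([0, 0, 1, 1, 2, 2])      # ground-truth cell types
    clusters = np.array([1, 1, 0, 0, 2, 2])    # e.g. an optimised Louvain clustering

    # NMI is invariant to cluster relabelling; perfect correspondence scores 1.0.
    print(f"NMI: {normalized_mutual_info_score(labels, clusters):.3f}")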
" 24540 cells × 16160 genes across 3 time points.", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def tabula_muris_senis_lung_random(test=False): adata = load_tabula_muris_senis( diff --git a/openproblems/tasks/denoising/metrics/poisson.py b/openproblems/tasks/denoising/metrics/poisson.py index ebd2a73378..93db71eee2 100644 --- a/openproblems/tasks/denoising/metrics/poisson.py +++ b/openproblems/tasks/denoising/metrics/poisson.py @@ -1,7 +1,7 @@ from ....tools.decorators import metric -@metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-extras") +@metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-pytorch") def poisson(adata): from molecular_cross_validation.mcv_sweep import poisson_nll_loss diff --git a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py index 41f55dbb65..420132dbf2 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py +++ b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py @@ -18,7 +18,7 @@ paper_url="https://www.frontiersin.org/articles/10.3389/fgene.2020.00786/full", paper_year=2020, code_url="https://github.com/HiBearME/NeuralEE", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 6212f8cd69..1cec698b27 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -10,7 +10,7 @@ paper_url="https://doi.org/10.15252/msb.20209620", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-scvi", + image="openproblems-python-pytorch", ) _scanvi_scarches_method = functools.partial( @@ -19,7 +19,7 @@ paper_url="https://doi.org/10.1101/2020.07.16.205997", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-scvi", + image="openproblems-python-pytorch", ) diff --git a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py index 86005d07f2..e446b07687 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py +++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py @@ -11,7 +11,7 @@ "from the destVI manuscripts leveraging sparsePCA. 
Number of cells and " "cell types present in each spatial spot is computed via combination of " "kernel-based parametrization of a categorical distribution and the NB model.", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def destvi(test=False): from .utils import generate_synthetic_dataset diff --git a/openproblems/tasks/spatial_decomposition/methods/cell2location.py b/openproblems/tasks/spatial_decomposition/methods/cell2location.py index b0645e2b0d..01c4e4d32d 100644 --- a/openproblems/tasks/spatial_decomposition/methods/cell2location.py +++ b/openproblems/tasks/spatial_decomposition/methods/cell2location.py @@ -12,7 +12,7 @@ paper_url="https://doi.org/10.1038/s41587-021-01139-4", paper_year=2022, code_url="https://github.com/BayraktarLab/cell2location", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) diff --git a/openproblems/tasks/spatial_decomposition/methods/destvi.py b/openproblems/tasks/spatial_decomposition/methods/destvi.py index 9330ba1d92..4338a465fd 100644 --- a/openproblems/tasks/spatial_decomposition/methods/destvi.py +++ b/openproblems/tasks/spatial_decomposition/methods/destvi.py @@ -11,7 +11,7 @@ paper_url="https://doi.org/10.1038/s41587-022-01272-8", paper_year=2022, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def destvi( adata, diff --git a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py index c2695e4253..f9c025319e 100644 --- a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py +++ b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py @@ -10,7 +10,7 @@ paper_url="https://doi.org/10.1038/s41587-022-01272-8", paper_year=2020, code_url="https://github.com/scverse/scvi-tools", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def stereoscope(adata, test=False, max_epochs_sc=None, max_epochs_sp=None): from scvi.external import RNAStereoscope diff --git a/openproblems/tasks/spatial_decomposition/methods/tangram.py b/openproblems/tasks/spatial_decomposition/methods/tangram.py index 329676c4aa..c05eb0339f 100644 --- a/openproblems/tasks/spatial_decomposition/methods/tangram.py +++ b/openproblems/tasks/spatial_decomposition/methods/tangram.py @@ -10,7 +10,7 @@ paper_url="https://doi.org/10.1038/s41592-021-01264-7", paper_year=2021, code_url="https://github.com/broadinstitute/Tangram", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def tangram(adata, test=False, num_epochs=None, n_markers=None): # analysis based on: diff --git a/setup.py b/setup.py index a6a1c213f4..2fee0e2b0e 100644 --- a/setup.py +++ b/setup.py @@ -10,8 +10,8 @@ "scprep>=1.2.1", "scipy>=1.8,<1.10", "scanpy>=1.6", - "louvain==0.7.*", - "python-igraph<0.10", + "louvain==0.8.*", + "python-igraph==0.10.*", "decorator<5.0", # pinned in #324 "memory-profiler==0.60", "colorama==0.4.*", From dd4260a790468663ac6b4466a42aff8ceca39f05 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Thu, 1 Dec 2022 19:46:11 -0500 Subject: [PATCH 161/266] Publish both raw and scaled scores (#729) * publish both raw and scaled scores * publish to PR branch # publish * need to checkout ref # publish * fix typo # publish * sort metrics * # publish * sort formatted outputs too # publish * temp # publish * log some stuff * warn when skipping * error if skip task * skip if 
missing
* just let it fail
* catch earlier
* get keys upfront
* temp
* one more
* commit and push
* typo
* config
* remove raw
* push results to repro repo
* revert temp
* one more PR # publish
* uncomment
* clean up
---
 .github/workflows/process_results.yml |   3 -
 workflow/parse_nextflow.py            | 156 ++++++++++++++++++++------
 2 files changed, 124 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/process_results.yml b/.github/workflows/process_results.yml
index 4a9389143e..a7a381799d 100644
--- a/.github/workflows/process_results.yml
+++ b/.github/workflows/process_results.yml
@@ -94,9 +94,6 @@ jobs:
           path: website/data/results

     - name: Move raw output
-      if: |
-        github.event_name == 'repository_dispatch' ||
-        endsWith(github.event.head_commit.message, '# publish')
       run: |
         rsync -v -r --include "*.raw.json" --include "*/" --exclude "*" website/data/results/ nbt2022-reproducibility/results
         rm website/data/results/*/*.raw.json

diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py
index 336aa3cb9a..4d0d3e06e1 100644
--- a/workflow/parse_nextflow.py
+++ b/workflow/parse_nextflow.py
@@ -1,3 +1,54 @@
+"""
+Schema:
+
+# results/{task.__name__}/{dataset.__name__}.json
+{
+    "name": dataset.metadata["dataset_name"],
+    "data_url": dataset.metadata["data_url"],
+    "data_reference": dataset.metadata["data_reference"],
+    "headers": {
+        "names": [
+            "Rank",
+            "Name",
+            "Metric1 Raw",
+            "Metric2 Raw",
+            ...,
+            "Mean score Scaled",
+            "Metric1 Scaled",
+            ...,
+            "Memory (GB)",
+            "Runtime (min)",
+            "CPU (%)",
+            "Paper",
+            "Year",
+            "Library"
+        ],
+        "fixed": ["Name", "Paper", "Library"]
+    },
+    "results": [
+        {
+            "Name": method.metadata["method_name"],
+            "Paper": method.metadata["paper_name"],
+            "Paper URL": method.metadata["paper_url"],
+            "Year": method.metadata["year"],
+            "Library": method.metadata["code_url"],
+            "Implementation": "https://github.com/.../path/to/method.py",
+            "Version": method.metadata["method_version"],
+            "Runtime (min)": runtime,
+            "CPU (%)": cpu,
+            "Memory (GB)": memory,
+            "Rank": rank,
+            "Metric1 Raw": metric1_raw,
+            "Metric2 Raw": metric2_raw,
+            ...,
+            "Mean score Scaled": mean_score,
+            "Metric1 Scaled": metric1,
+            ...
+        },
+        ...
+ ] +} +""" import collections import copy import json @@ -84,11 +135,14 @@ def read_trace(filename): def parse_trace_to_dict(df): """Parse the trace dataframe and convert to dict.""" + print(f"Parsing {df.shape[0]} trace records") results = collections.defaultdict(lambda: collections.defaultdict(dict)) for task_name in df["task"].unique(): df_task = df.loc[df["task"] == task_name] + print(f"{task_name}: {df_task.shape[0]} records") for dataset_name in df_task["dataset"].unique(): df_dataset = df_task.loc[df_task["dataset"] == dataset_name] + print(f"{task_name}.{dataset_name}: {df_task.shape[0]} records") for _, row in df_dataset.iterrows(): method_name = row["method"] results[task_name][dataset_name][method_name] = row.to_dict() @@ -101,7 +155,9 @@ def parse_trace_to_dict(df): def parse_metric_results(results_path, results): """Add metric results to the trace output.""" missing_traces = [] - for filename in os.listdir(os.path.join(results_path, "results/metrics")): + metric_filenames = os.listdir(os.path.join(results_path, "results/metrics")) + print(f"Loading {len(metric_filenames)} metric results") + for filename in sorted(metric_filenames): with open( os.path.join(results_path, "results/metrics", filename), "r" ) as handle: @@ -159,8 +215,17 @@ def normalize_scores(task_name, dataset_results): dataset_results[method_name]["metrics"] ) metric_names = list(list(dataset_results.values())[0]["metrics"].keys()) + + n_removed = 0 for metric_name in metric_names: - metric = openproblems.api.utils.get_function(task_name, "metrics", metric_name) + try: + metric = openproblems.api.utils.get_function( + task_name, "metrics", metric_name + ) + except openproblems.api.utils.NoSuchFunctionError as e: + print(f"[WARN] {e}") + del dataset_results[method_name]["metrics"][metric_name] + continue metric_scores = np.array( [ dataset_results[method_name]["metrics"][metric_name] @@ -168,16 +233,23 @@ def normalize_scores(task_name, dataset_results): ] ) if np.all(np.isnan(metric_scores)): + n_removed += 1 for method_name in dataset_results: del dataset_results[method_name]["metrics"][metric_name] continue - baseline_methods = [ - method_name - for method_name in dataset_results - if openproblems.api.utils.get_function( - task_name, "methods", method_name - ).metadata["is_baseline"] - ] + baseline_methods = [] + for method_name in list(dataset_results.keys()): + try: + method = openproblems.api.utils.get_function( + task_name, + "methods", + method_name, + ) + except openproblems.api.utils.NoSuchFunctionError as e: + print(f"[WARN] {e}") + del dataset_results[method_name] + if method.metadata["is_baseline"]: + baseline_methods.append(method_name) if len(baseline_methods) < 2: # just use all methods as a fallback baseline_methods = dataset_results.keys() @@ -195,6 +267,8 @@ def normalize_scores(task_name, dataset_results): metric_scores = 1 - metric_scores for method_name, score in zip(dataset_results, metric_scores): dataset_results[method_name]["metrics"][metric_name] = score + if n_removed > 0: + print(f"[WARN] Removed {n_removed} all-NaN metrics") return dataset_results @@ -202,10 +276,18 @@ def drop_baselines(task_name, dataset_results): """Remove baseline methods from dataset results.""" dataset_results = copy.copy(dataset_results) method_names = list(dataset_results.keys()) + n_removed = 0 for method_name in method_names: - method = openproblems.api.utils.get_function(task_name, "methods", method_name) + method = openproblems.api.utils.get_function( + task_name, + "methods", + method_name, + ) if 
method.metadata["is_baseline"]: + n_removed += 1 del dataset_results[method_name] + + print(f"Dropped {n_removed} baseline methods") return dataset_results @@ -236,6 +318,9 @@ def compute_ranking(dataset_results): def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): """Convert the raw dataset results to pretty JSON for web.""" + print( + f"Formatting {len(dataset_results_raw)} methods for {task_name}.{dataset_name}" + ) dataset = openproblems.api.utils.get_function(task_name, "datasets", dataset_name) output = dict( name=dataset.metadata["dataset_name"], @@ -250,7 +335,11 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): metric_names = set() for method_name, rank in ranking.items(): method_results = dataset_results[method_name] - method = openproblems.api.utils.get_function(task_name, "methods", method_name) + method = openproblems.api.utils.get_function( + task_name, + "methods", + method_name, + ) result = { "Name": method.metadata["method_name"], "Paper": method.metadata["paper_name"], @@ -266,20 +355,25 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): "Rank": rank, "Mean score": method_results["mean_score"], } - for metric_name, metric_result in method_results["metrics"].items(): - metric = openproblems.api.utils.get_function( - task_name, "metrics", metric_name - ) - if np.isnan(metric_result): - metric_result = "NaN" - elif np.isneginf(metric_result): - metric_result = "-Inf" - elif np.isinf(metric_result): - metric_result = "Inf" - result[metric.metadata["metric_name"]] = metric_result - metric_names.add(metric.metadata["metric_name"]) + result_metrics = {} + for metric_type in ["metrics_raw", "metrics"]: + metric_type_name = "Raw" if metric_type == "metrics_raw" else "Scaled" + for metric_name, metric_result in method_results[metric_type].items(): + metric = openproblems.api.utils.get_function( + task_name, "metrics", metric_name + ) + if np.isnan(metric_result): + metric_result = "NaN" + elif np.isneginf(metric_result): + metric_result = "-Inf" + elif np.isinf(metric_result): + metric_result = "Inf" + metric_name_fmt = f"{metric.metadata['metric_name']} {metric_type_name}" + result_metrics[metric_name_fmt] = metric_result + metric_names.add(metric_name_fmt) + result.update(sorted(result_metrics.items())) output["results"].append(result) - output["headers"]["names"].extend(list(metric_names)) + output["headers"]["names"].extend(sorted(list(metric_names))) output["headers"]["names"].extend( [ "Memory (GB)", @@ -305,21 +399,19 @@ def results_to_json(results, outdir): os.mkdir(results_dir) filename = os.path.join(results_dir, "{}.json".format(dataset_name)) filename_raw = os.path.join(results_dir, "{}.raw.json".format(dataset_name)) - try: - dataset_results_json, dataset_results_raw = dataset_results_to_json( - task_name, dataset_name, dataset_results - ) - except openproblems.api.utils.NoSuchFunctionError: - continue + dataset_results_json, dataset_results_raw = dataset_results_to_json( + task_name, dataset_name, dataset_results + ) with open(filename_raw, "w") as handle: dump_json( dataset_results_raw, handle, ) - if not workflow_utils.task_is_incomplete( + if workflow_utils.task_is_incomplete( openproblems.api.utils.str_to_task(task_name) ): - # don't write results for incomplete tasks + print("Skipping stub task") + else: with open(filename, "w") as handle: dump_json( dataset_results_json, From 9f18bb496f9c5a7b992d093c2cecb7006f9ee32c Mon Sep 17 00:00:00 2001 From: Scott Gigante 
<84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 2 Dec 2022 09:58:03 -0500 Subject: [PATCH 162/266] Revert "upgrade louvain and scib (#725)" (#732) This reverts commit a144b53f44f977ae0f95c97a8716b037072680ff. --- .../Dockerfile | 24 +++++++++++++++++++ .../README.md | 15 ++++++++++++ .../requirements.txt | 6 +++++ .../requirements.txt | 8 +++++++ docker/openproblems-python-pytorch/Dockerfile | 15 ------------ .../requirements.txt | 11 --------- docker/openproblems-python-scvi/Dockerfile | 15 ++++++++++++ .../README.md | 6 ----- .../openproblems-python-scvi/requirements.txt | 5 ++++ docker/openproblems-r-base/README.md | 2 +- .../openproblems-r-extras/r_requirements.txt | 1 - docker/openproblems-r-extras/requirements.txt | 2 +- docker/openproblems-r-pytorch/README.md | 14 +++++------ .../openproblems-r-pytorch/requirements.txt | 7 +----- .../metrics/cc_score.py | 2 +- .../metrics/iso_label_sil.py | 2 +- .../batch_integration_embed/metrics/pcr.py | 2 +- .../metrics/sil_batch.py | 2 +- .../metrics/silhouette.py | 2 +- .../metrics/hvg_conservation.py | 2 +- .../batch_integration_graph/methods/bbknn.py | 2 +- .../batch_integration_graph/methods/combat.py | 2 +- .../batch_integration_graph/methods/mnn.py | 2 +- .../batch_integration_graph/methods/scalex.py | 2 +- .../methods/scanorama.py | 2 +- .../batch_integration_graph/methods/scanvi.py | 2 +- .../batch_integration_graph/methods/scvi.py | 2 +- .../batch_integration_graph/metrics/ari.py | 2 +- .../metrics/graph_connectivity.py | 2 +- .../metrics/iso_label_f1.py | 2 +- .../batch_integration_graph/metrics/nmi.py | 2 +- .../tasks/denoising/datasets/pancreas.py | 2 +- openproblems/tasks/denoising/datasets/pbmc.py | 2 +- .../denoising/datasets/tabula_muris_senis.py | 2 +- .../tasks/denoising/metrics/poisson.py | 2 +- .../methods/neuralee.py | 2 +- .../label_projection/methods/scvi_tools.py | 4 ++-- .../datasets/destvi/generate.py | 2 +- .../methods/cell2location.py | 2 +- .../spatial_decomposition/methods/destvi.py | 2 +- .../methods/stereoscope.py | 2 +- .../spatial_decomposition/methods/tangram.py | 2 +- setup.py | 4 ++-- 43 files changed, 114 insertions(+), 79 deletions(-) create mode 100644 docker/openproblems-python-batch-integration/Dockerfile create mode 100644 docker/openproblems-python-batch-integration/README.md create mode 100644 docker/openproblems-python-batch-integration/requirements.txt delete mode 100644 docker/openproblems-python-pytorch/Dockerfile delete mode 100644 docker/openproblems-python-pytorch/requirements.txt create mode 100644 docker/openproblems-python-scvi/Dockerfile rename docker/{openproblems-python-pytorch => openproblems-python-scvi}/README.md (64%) create mode 100644 docker/openproblems-python-scvi/requirements.txt diff --git a/docker/openproblems-python-batch-integration/Dockerfile b/docker/openproblems-python-batch-integration/Dockerfile new file mode 100644 index 0000000000..697905d79c --- /dev/null +++ b/docker/openproblems-python-batch-integration/Dockerfile @@ -0,0 +1,24 @@ +FROM singlecellopenproblems/openproblems-r-base:latest + +USER root +WORKDIR / + +ARG NB_USER="sagemaker-user" +ARG NB_UID="1000" +ARG NB_GID="100" + +RUN sed -i '$ d' /etc/apt/sources.list +RUN \ +apt-get update --allow-releaseinfo-change && \ +apt-get -y install --no-install-recommends gcc git python3-llvmlite && \ +apt-get autoremove -y && \ +rm -rf /var/lib/apt/lists/* + +# Install Python packages +COPY ./docker/openproblems-python-batch-integration/requirements.txt ./requirements.txt +RUN pip install 
--no-cache-dir -r requirements.txt +# force reinstall annoy addresses https://github.com/spotify/annoy/issues/513 +RUN pip install --no-cache-dir --force annoy==1.17.0 + +USER $NB_UID +WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-batch-integration/README.md b/docker/openproblems-python-batch-integration/README.md new file mode 100644 index 0000000000..02a18e1c20 --- /dev/null +++ b/docker/openproblems-python-batch-integration/README.md @@ -0,0 +1,15 @@ +# openproblems-python-extras Docker image + +Base image: singlecellopenproblems/openproblems-r-base + +OS: Debian Stretch + +Python: 3.8 + +Python packages: + +* scIB +* mnnpy +* scanorama +* bbknn +* scVI diff --git a/docker/openproblems-python-batch-integration/requirements.txt b/docker/openproblems-python-batch-integration/requirements.txt new file mode 100644 index 0000000000..61bdfd360a --- /dev/null +++ b/docker/openproblems-python-batch-integration/requirements.txt @@ -0,0 +1,6 @@ +annoy==1.17.1 +bbknn==1.5.* +git+https://github.com/chriscainx/mnnpy@2097dec # master +git+https://github.com/theislab/scib@77ab015 +scanorama==1.7.0 +scvi-tools~=0.16 # pinned in #313 diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt index 8e3b692f9d..c51a5d771a 100644 --- a/docker/openproblems-python-extras/requirements.txt +++ b/docker/openproblems-python-extras/requirements.txt @@ -1,9 +1,17 @@ cmake==3.24.1.1 +git+https://github.com/BayraktarLab/cell2location.git@7e7aa231cc61ff460da14402fa3b9a1fa3ec69ac +git+https://github.com/czbiohub/molecular-cross-validation@04d9df0 git+https://github.com/jorvis/Multicore-TSNE@6832575 git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python +git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix git+https://github.com/scottgigante-immunai/knn-smoothing@python_package magic-impute==3.0.* phate==1.0.* pybedtools==0.9.* pyensembl==2.0.* +scalex==1.0.2 +scvi-tools==0.16.* +tangram-sc==1.0.* +tensorflow-cpu==2.9.* +torch==1.12.* xgboost==1.6.* diff --git a/docker/openproblems-python-pytorch/Dockerfile b/docker/openproblems-python-pytorch/Dockerfile deleted file mode 100644 index cdf852acc0..0000000000 --- a/docker/openproblems-python-pytorch/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM singlecellopenproblems/openproblems:latest - -ARG NB_USER="sagemaker-user" -ARG NB_UID="1000" -ARG NB_GID="100" - -USER root -WORKDIR / - -# install dependencies and openproblems -COPY ./docker/openproblems-python-pytorch/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r requirements.txt --editable /usr/src/singlecellopenproblems - -USER $NB_UID -WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-pytorch/requirements.txt b/docker/openproblems-python-pytorch/requirements.txt deleted file mode 100644 index 56bd2a53dc..0000000000 --- a/docker/openproblems-python-pytorch/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -git+https://github.com/BayraktarLab/cell2location.git@7e7aa231cc61ff460da14402fa3b9a1fa3ec69ac -git+https://github.com/czbiohub/molecular-cross-validation@04d9df0 -git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix -jax==0.3.23 -jaxlib==0.3.22 -scalex==1.0.2 -scikit-misc==0.1.* -scvi-tools~=0.17 # pinned in #313 -tangram-sc==1.0.* -torch==1.12.* -xgboost==1.6.* diff --git a/docker/openproblems-python-scvi/Dockerfile b/docker/openproblems-python-scvi/Dockerfile new file mode 100644 index 0000000000..f7edd2e4dc --- /dev/null +++ 
b/docker/openproblems-python-scvi/Dockerfile @@ -0,0 +1,15 @@ +FROM singlecellopenproblems/openproblems:latest + +ARG NB_USER="sagemaker-user" +ARG NB_UID="1000" +ARG NB_GID="100" + +USER root +WORKDIR / + +# Install Python packages +COPY ./docker/openproblems-python-scvi/requirements.txt ./requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +USER $NB_UID +WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-pytorch/README.md b/docker/openproblems-python-scvi/README.md similarity index 64% rename from docker/openproblems-python-pytorch/README.md rename to docker/openproblems-python-scvi/README.md index d566a8efd5..546cec9bc0 100644 --- a/docker/openproblems-python-pytorch/README.md +++ b/docker/openproblems-python-scvi/README.md @@ -9,9 +9,3 @@ Python: 3.8 Python packages: * scvi-tools -* tangram -* torch -* neuralee -* xgboost -* molecular-cross-validation -* cell2location diff --git a/docker/openproblems-python-scvi/requirements.txt b/docker/openproblems-python-scvi/requirements.txt new file mode 100644 index 0000000000..87aa041993 --- /dev/null +++ b/docker/openproblems-python-scvi/requirements.txt @@ -0,0 +1,5 @@ +jax==0.3.23 +jaxlib==0.3.22 +scikit-misc==0.1.* +scvi-tools~=0.17 # pinned in #313 +xgboost==1.6.* diff --git a/docker/openproblems-r-base/README.md b/docker/openproblems-r-base/README.md index ebca77780d..785a9ace1b 100644 --- a/docker/openproblems-r-base/README.md +++ b/docker/openproblems-r-base/README.md @@ -28,4 +28,4 @@ R packages: Python packages: * rpy2 -* anndata2ri>=1.1 +* anndata2ri>=1.0.6 diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index 0b8da4bde9..efd4ceeba6 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -46,7 +46,6 @@ shiny@1.4.0.2 sparsesvd@0.2 systemfonts@1.0.4 textshaping@0.3.6 -theislab/kBET@a10ffea # master tibble@3.1.7 tidymodels@0.1.2 tidyverse@1.3.0 diff --git a/docker/openproblems-r-extras/requirements.txt b/docker/openproblems-r-extras/requirements.txt index 1795471bae..9d03431a90 100644 --- a/docker/openproblems-r-extras/requirements.txt +++ b/docker/openproblems-r-extras/requirements.txt @@ -1,3 +1,3 @@ git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python -git+https://github.com/theislab/scib@f0be826 +git+https://github.com/theislab/scib@v1.0.2 xgboost==1.6.* diff --git a/docker/openproblems-r-pytorch/README.md b/docker/openproblems-r-pytorch/README.md index 33d3e5a30c..03ccbc07d4 100644 --- a/docker/openproblems-r-pytorch/README.md +++ b/docker/openproblems-r-pytorch/README.md @@ -8,12 +8,12 @@ Python: 3.8 R: 4.0 +R packages: + +* batchelor +* sparsesvd +* dplyr + Python packages: -* harmony-pytorch -* torch -* bbknn -* mnnpy -* scib -* scanorama -* scvi-tools +* harmonic-alignment diff --git a/docker/openproblems-r-pytorch/requirements.txt b/docker/openproblems-r-pytorch/requirements.txt index ad5b2f5449..e7c8df42b7 100644 --- a/docker/openproblems-r-pytorch/requirements.txt +++ b/docker/openproblems-r-pytorch/requirements.txt @@ -1,8 +1,3 @@ -annoy==1.17.1 -bbknn==1.5.* -git+https://github.com/chriscainx/mnnpy@2097dec # master -git+https://github.com/theislab/scib@f0be826 +git+https://github.com/theislab/scib@v1.0.2 harmony-pytorch==0.1.* -scanorama==1.7.0 -scvi-tools~=0.16 # pinned in #313 torch==1.13.* diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py 
b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py index e83d47bb54..322891b202 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py @@ -21,7 +21,7 @@ @metric( metric_name="Cell Cycle Score", maximize=True, - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", # only if required ) def cc_score(adata, test=False): from ._utils import _get_split diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py index c1f8c4be2d..c3575de5b8 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py @@ -15,7 +15,7 @@ @metric( metric_name="Isolated label Silhouette", maximize=True, - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", # only if required ) def isolated_labels_sil(adata): from scib.metrics import isolated_labels diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py index 7efca62ffe..886f26078b 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py @@ -18,7 +18,7 @@ @metric( metric_name="PC Regression", maximize=True, - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", # only if required ) def pcr(adata): from ._utils import _get_split diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py index 9f28cd1284..c02e5e42aa 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py @@ -24,7 +24,7 @@ @metric( metric_name="Batch ASW", maximize=True, - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", # only if required ) def silhouette_batch(adata): from scib.metrics import silhouette_batch diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py index bb2bece193..36991e1d67 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py @@ -12,7 +12,7 @@ @metric( metric_name="Silhouette", maximize=True, - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", # only if required ) def silhouette(adata): from scib.metrics import silhouette diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py index bb7f90cae8..d40b36b740 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py @@ -21,7 +21,7 @@ @metric( metric_name="HVG 
conservation", maximize=True, - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", ) def hvg_conservation(adata): from scib.metrics import hvg_overlap diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py index 97570dccd8..017ca8f766 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py @@ -10,7 +10,7 @@ paper_url="https://academic.oup.com/bioinformatics/article/36/3/964/5545955", paper_year=2020, code_url="https://github.com/Teichlab/bbknn", - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", # only if required ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py index 96e53538d3..3043a552e1 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py @@ -10,7 +10,7 @@ paper_url="https://academic.oup.com/biostatistics/article/8/1/118/252073", paper_year=2007, code_url="https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html", - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py index 0146f5b6e3..99dab39203 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py @@ -10,7 +10,7 @@ paper_url="https://www.nature.com/articles/nbt.4091", paper_year=2018, code_url="https://github.com/chriscainx/mnnpy", - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py index 461ea04a94..36843b81ed 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py @@ -11,7 +11,7 @@ paper_url="https://doi.org/10.1038/s41467-022-33758-z", paper_year=2022, code_url="https://github.com/jsxlei/SCALEX", - image="openproblems-python-pytorch", + image="openproblems-python-extras", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py index a5efc04b35..d6e80162b5 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py @@ -10,7 +10,7 @@ paper_url="https://www.nature.com/articles/s41587-019-0113-3", paper_year=2019, code_url="https://github.com/brianhie/scanorama", - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py index 8f98a3c931..d5bf463974 100644 --- 
a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py @@ -11,7 +11,7 @@ paper_url="https://doi.org/10.15252/msb.20209620", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py index 35f1cd7ac5..9e9a82a9f2 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py @@ -10,7 +10,7 @@ paper_url="https://www.nature.com/articles/s41592-018-0229-2", paper_year=2018, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", # only if required ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py index 0d082fff44..13e7eb8ce1 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py @@ -16,7 +16,7 @@ @metric( metric_name="ARI", maximize=True, - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", ) def ari(adata): from scib.metrics import ari diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py index 52dd7c44b2..3a9732d0e2 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py @@ -22,7 +22,7 @@ @metric( metric_name="Graph connectivity", maximize=True, - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", # only if required ) def graph_connectivity(adata): import scib.metrics diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py index 71cd7ca209..df86b043d9 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py @@ -27,7 +27,7 @@ @metric( metric_name="Isolated label F1", maximize=True, - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", # only if required ) def isolated_labels_f1(adata): from scib.metrics import isolated_labels diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py index 3356507b2e..bbcdc7cd9d 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py @@ -15,7 +15,7 @@ @metric( metric_name="NMI", maximize=True, - image="openproblems-r-pytorch", + image="openproblems-python-batch-integration", ) def nmi(adata): from scib.metrics.clustering import opt_louvain # isort:skip diff --git 
a/openproblems/tasks/denoising/datasets/pancreas.py b/openproblems/tasks/denoising/datasets/pancreas.py index bd039ad88e..c18ddbeee1 100644 --- a/openproblems/tasks/denoising/datasets/pancreas.py +++ b/openproblems/tasks/denoising/datasets/pancreas.py @@ -11,7 +11,7 @@ "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, " "and SMARTER-seq). Here we just use the inDrop1 batch, which includes" "1937 cells × 15502 genes.", - image="openproblems-python-pytorch", + image="openproblems-python-extras", ) def pancreas(test=False): adata = load_pancreas(test=test, keep_techs=["inDrop1"]) diff --git a/openproblems/tasks/denoising/datasets/pbmc.py b/openproblems/tasks/denoising/datasets/pbmc.py index 440ebe8a3b..fcf0fc782b 100644 --- a/openproblems/tasks/denoising/datasets/pbmc.py +++ b/openproblems/tasks/denoising/datasets/pbmc.py @@ -11,7 +11,7 @@ "1k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. " "Sequenced on 10X v3 chemistry in November 2018 by 10X Genomics." ), - image="openproblems-python-pytorch", + image="openproblems-python-extras", ) def pbmc(test=False): adata = load_tenx_1k_pbmc(test=test) diff --git a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py index 9524cc4e95..a5c62f953e 100644 --- a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py +++ b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py @@ -10,7 +10,7 @@ dataset_summary="All lung cells from Tabula Muris Senis, a 500k cell-atlas from 18 " "organs and tissues across the mouse lifespan. Here we use just 10x data from lung." " 24540 cells × 16160 genes across 3 time points.", - image="openproblems-python-pytorch", + image="openproblems-python-extras", ) def tabula_muris_senis_lung_random(test=False): adata = load_tabula_muris_senis( diff --git a/openproblems/tasks/denoising/metrics/poisson.py b/openproblems/tasks/denoising/metrics/poisson.py index 93db71eee2..ebd2a73378 100644 --- a/openproblems/tasks/denoising/metrics/poisson.py +++ b/openproblems/tasks/denoising/metrics/poisson.py @@ -1,7 +1,7 @@ from ....tools.decorators import metric -@metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-pytorch") +@metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-extras") def poisson(adata): from molecular_cross_validation.mcv_sweep import poisson_nll_loss diff --git a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py index 420132dbf2..41f55dbb65 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py +++ b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py @@ -18,7 +18,7 @@ paper_url="https://www.frontiersin.org/articles/10.3389/fgene.2020.00786/full", paper_year=2020, code_url="https://github.com/HiBearME/NeuralEE", - image="openproblems-python-pytorch", + image="openproblems-python-extras", ) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 1cec698b27..6212f8cd69 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -10,7 +10,7 @@ paper_url="https://doi.org/10.15252/msb.20209620", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-pytorch", + image="openproblems-python-scvi", ) _scanvi_scarches_method = functools.partial( @@ -19,7 
+19,7 @@ paper_url="https://doi.org/10.1101/2020.07.16.205997", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-pytorch", + image="openproblems-python-scvi", ) diff --git a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py index e446b07687..86005d07f2 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py +++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py @@ -11,7 +11,7 @@ "from the destVI manuscripts leveraging sparsePCA. Number of cells and " "cell types present in each spatial spot is computed via combination of " "kernel-based parametrization of a categorical distribution and the NB model.", - image="openproblems-python-pytorch", + image="openproblems-python-extras", ) def destvi(test=False): from .utils import generate_synthetic_dataset diff --git a/openproblems/tasks/spatial_decomposition/methods/cell2location.py b/openproblems/tasks/spatial_decomposition/methods/cell2location.py index 01c4e4d32d..b0645e2b0d 100644 --- a/openproblems/tasks/spatial_decomposition/methods/cell2location.py +++ b/openproblems/tasks/spatial_decomposition/methods/cell2location.py @@ -12,7 +12,7 @@ paper_url="https://doi.org/10.1038/s41587-021-01139-4", paper_year=2022, code_url="https://github.com/BayraktarLab/cell2location", - image="openproblems-python-pytorch", + image="openproblems-python-extras", ) diff --git a/openproblems/tasks/spatial_decomposition/methods/destvi.py b/openproblems/tasks/spatial_decomposition/methods/destvi.py index 4338a465fd..9330ba1d92 100644 --- a/openproblems/tasks/spatial_decomposition/methods/destvi.py +++ b/openproblems/tasks/spatial_decomposition/methods/destvi.py @@ -11,7 +11,7 @@ paper_url="https://doi.org/10.1038/s41587-022-01272-8", paper_year=2022, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-pytorch", + image="openproblems-python-extras", ) def destvi( adata, diff --git a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py index f9c025319e..c2695e4253 100644 --- a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py +++ b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py @@ -10,7 +10,7 @@ paper_url="https://doi.org/10.1038/s41587-022-01272-8", paper_year=2020, code_url="https://github.com/scverse/scvi-tools", - image="openproblems-python-pytorch", + image="openproblems-python-extras", ) def stereoscope(adata, test=False, max_epochs_sc=None, max_epochs_sp=None): from scvi.external import RNAStereoscope diff --git a/openproblems/tasks/spatial_decomposition/methods/tangram.py b/openproblems/tasks/spatial_decomposition/methods/tangram.py index c05eb0339f..329676c4aa 100644 --- a/openproblems/tasks/spatial_decomposition/methods/tangram.py +++ b/openproblems/tasks/spatial_decomposition/methods/tangram.py @@ -10,7 +10,7 @@ paper_url="https://doi.org/10.1038/s41592-021-01264-7", paper_year=2021, code_url="https://github.com/broadinstitute/Tangram", - image="openproblems-python-pytorch", + image="openproblems-python-extras", ) def tangram(adata, test=False, num_epochs=None, n_markers=None): # analysis based on: diff --git a/setup.py b/setup.py index 2fee0e2b0e..a6a1c213f4 100644 --- a/setup.py +++ b/setup.py @@ -10,8 +10,8 @@ "scprep>=1.2.1", "scipy>=1.8,<1.10", "scanpy>=1.6", - "louvain==0.8.*", - "python-igraph==0.10.*", + "louvain==0.7.*", + 
"python-igraph<0.10", "decorator<5.0", # pinned in #324 "memory-profiler==0.60", "colorama==0.4.*", From 271c7ff2ac11fa1215395cb2cd5bea00848efd7d Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 2 Dec 2022 09:59:12 -0500 Subject: [PATCH 163/266] Check for nf rate limiting (#730) * check for nf rate limiting * run name needs to be precomputed * fix branch usage * fix outputs syntax --- .github/workflows/run_tests.yml | 37 ++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 682b8993ce..f4d9727798 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -335,7 +335,7 @@ jobs: with: name: coverage - run_benchmark: + setup_benchmark: needs: - run_tester - build_images @@ -350,6 +350,10 @@ jobs: needs.run_tester.result == 'skipped' ) + outputs: + branch: ${{ steps.setup-environment.outputs.branch }} + run_name: ${{ steps.setup-environment.outputs.run_name }} + steps: - name: Check dependabot run: | @@ -407,6 +411,7 @@ jobs: wait - name: Set up environment + id: setup-environment run: | # If not on the base repository, append first 6 characters of username to the image name # to avoid clashes on ECR @@ -421,6 +426,9 @@ jobs: fi BRANCH=`echo $BRANCH | sed 's/[^a-zA-Z0-9]*$//'` echo "BRANCH=${BRANCH}" >> $GITHUB_ENV + echo "branch=${BRANCH}" >> $GITHUB_OUTPUT + RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" + echo "run_name=${RUN_NAME}" >> $GITHUB_OUTPUT - name: Upload Docker images env: @@ -437,35 +445,48 @@ jobs: done wait + run_benchmark: + needs: + - setup_benchmark + runs-on: ubuntu-latest + if: >- + always() && + needs.setup_benchmark.result == 'success' + + steps: - name: Run benchmark env: TOWER_WATCH_URL: https://tower.nf/orgs/openproblems-bio/workspaces/openproblems-bio/watch TOWER_WORKSPACE_ID: 53907369739130 + BRANCH: ${{ needs.setup_benchmark.outputs.branch }} run: | - RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" if [[ ${{ startsWith(github.ref, 'refs/tags') || startsWith(github.ref, 'refs/heads/test_full_benchmark') }} == true ]]; then TOWER_ACTION_ID="bVQhVSNah1JmJfnKkfyjg" WORKDIR="s3://openproblems-nextflow/work_main" else TOWER_ACTION_ID="5BQc88ZvjuXCYbc55Hot27" - WORKDIR="s3://openproblems-nextflow/work/$BRANCH" + WORKDIR="s3://openproblems-nextflow/work/${BRANCH}" fi generate_parameters() { cat <> $GITHUB_STEP_SUMMARY From 89bb90fd654ad15e81f90e49de74a7d14a6f1e77 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 2 Dec 2022 11:33:52 -0500 Subject: [PATCH 164/266] consolidate test suites --- scripts/generate_test_matrix.py | 4 ++-- test/{test_0_cli.py => test_core_cli.py} | 0 test/{test_0_tasks.py => test_core_tasks.py} | 0 test/{test_0_tools.py => test_core_tools.py} | 0 test/{test_0_utils.py => test_core_utils.py} | 0 test/{test_2_1_load_data.py => test_task_1_load_data.py} | 0 test/{test_2_2_datasets.py => test_task_2_datasets.py} | 0 ..._communication.py => test_task_cell_cell_communication.py} | 0 ...ity_reduction.py => test_task_dimensionality_reduction.py} | 0 test/{test_1_methods.py => test_task_methods.py} | 0 test/{test_1_metrics.py => test_task_metrics.py} | 0 11 files changed, 2 insertions(+), 2 deletions(-) rename test/{test_0_cli.py => test_core_cli.py} (100%) rename test/{test_0_tasks.py => test_core_tasks.py} (100%) rename 
test/{test_0_tools.py => test_core_tools.py} (100%) rename test/{test_0_utils.py => test_core_utils.py} (100%) rename test/{test_2_1_load_data.py => test_task_1_load_data.py} (100%) rename test/{test_2_2_datasets.py => test_task_2_datasets.py} (100%) rename test/{test_3_cell_cell_communication.py => test_task_cell_cell_communication.py} (100%) rename test/{test_3_dimensionality_reduction.py => test_task_dimensionality_reduction.py} (100%) rename test/{test_1_methods.py => test_task_methods.py} (100%) rename test/{test_1_metrics.py => test_task_metrics.py} (100%) diff --git a/scripts/generate_test_matrix.py b/scripts/generate_test_matrix.py index 11b974b145..455f772a26 100644 --- a/scripts/generate_test_matrix.py +++ b/scripts/generate_test_matrix.py @@ -1,8 +1,8 @@ import json import openproblems -_CORE_TEST_SUITES = ["(test_0_ or test_3_)"] -_TASK_TEST_SUITES = ["test_1_", "test_2_"] +_CORE_TEST_SUITES = ["test_core"] +_TASK_TEST_SUITES = ["test_task"] def generate_matrix(): diff --git a/test/test_0_cli.py b/test/test_core_cli.py similarity index 100% rename from test/test_0_cli.py rename to test/test_core_cli.py diff --git a/test/test_0_tasks.py b/test/test_core_tasks.py similarity index 100% rename from test/test_0_tasks.py rename to test/test_core_tasks.py diff --git a/test/test_0_tools.py b/test/test_core_tools.py similarity index 100% rename from test/test_0_tools.py rename to test/test_core_tools.py diff --git a/test/test_0_utils.py b/test/test_core_utils.py similarity index 100% rename from test/test_0_utils.py rename to test/test_core_utils.py diff --git a/test/test_2_1_load_data.py b/test/test_task_1_load_data.py similarity index 100% rename from test/test_2_1_load_data.py rename to test/test_task_1_load_data.py diff --git a/test/test_2_2_datasets.py b/test/test_task_2_datasets.py similarity index 100% rename from test/test_2_2_datasets.py rename to test/test_task_2_datasets.py diff --git a/test/test_3_cell_cell_communication.py b/test/test_task_cell_cell_communication.py similarity index 100% rename from test/test_3_cell_cell_communication.py rename to test/test_task_cell_cell_communication.py diff --git a/test/test_3_dimensionality_reduction.py b/test/test_task_dimensionality_reduction.py similarity index 100% rename from test/test_3_dimensionality_reduction.py rename to test/test_task_dimensionality_reduction.py diff --git a/test/test_1_methods.py b/test/test_task_methods.py similarity index 100% rename from test/test_1_methods.py rename to test/test_task_methods.py diff --git a/test/test_1_metrics.py b/test/test_task_metrics.py similarity index 100% rename from test/test_1_metrics.py rename to test/test_task_metrics.py From 46f67302286883e0f0b0c8d5c2470a6fb1786f4e Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 2 Dec 2022 11:56:42 -0500 Subject: [PATCH 165/266] Remove raw values for all-nan metrics (#734) * publish both raw and scaled scores * publish to PR branch # publish * need to checkout ref # publish * fix typo # publish * sort metrics * # publish * sort formatted outputs too # publish * temp # publish * log some stuff * warn when skipping * error if skip task * skip if missing * just let it fail * catch earlier * get keys upfront * temp * one more * commit and push * typo * config * remove raw * push results to repro repo * revert temp * one more PR # publish * uncomment * clean up diff * drop raw and scaled nan metrics * # publish * # publish --- workflow/parse_nextflow.py | 29 
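A note on the generate_test_matrix.py change above: the old value "(test_0_ or test_3_)" is pytest keyword-expression syntax, so these suite strings are presumably handed to pytest's -k filter downstream (an assumption; the consuming workflow is not shown in this patch). A sketch of that assumed usage:

import subprocess

# Assumed consumption of the suite names emitted by generate_matrix():
# each matrix entry becomes a pytest keyword filter over the renamed files.
for suite in ["test_core", "test_task"]:
    subprocess.run(["pytest", "-k", suite, "test/"], check=False)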
+++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py index 4d0d3e06e1..3d62c2c773 100644 --- a/workflow/parse_nextflow.py +++ b/workflow/parse_nextflow.py @@ -216,7 +216,6 @@ def normalize_scores(task_name, dataset_results): ) metric_names = list(list(dataset_results.values())[0]["metrics"].keys()) - n_removed = 0 for metric_name in metric_names: try: metric = openproblems.api.utils.get_function( @@ -232,11 +231,6 @@ def normalize_scores(task_name, dataset_results): for method_name in dataset_results ] ) - if np.all(np.isnan(metric_scores)): - n_removed += 1 - for method_name in dataset_results: - del dataset_results[method_name]["metrics"][metric_name] - continue baseline_methods = [] for method_name in list(dataset_results.keys()): try: @@ -267,8 +261,6 @@ def normalize_scores(task_name, dataset_results): metric_scores = 1 - metric_scores for method_name, score in zip(dataset_results, metric_scores): dataset_results[method_name]["metrics"][metric_name] = score - if n_removed > 0: - print(f"[WARN] Removed {n_removed} all-NaN metrics") return dataset_results @@ -291,6 +283,26 @@ def drop_baselines(task_name, dataset_results): return dataset_results +def drop_nan_metrics(dataset_results): + n_removed = 0 + metric_names = list(list(dataset_results.values())[0]["metrics"].keys()) + for metric_name in metric_names: + metric_scores = np.array( + [ + dataset_results[method_name]["metrics"][metric_name] + for method_name in dataset_results + ] + ) + if np.all(np.isnan(metric_scores)): + n_removed += 1 + for method_name in dataset_results: + del dataset_results[method_name]["metrics"][metric_name] + del dataset_results[method_name]["metrics_raw"][metric_name] + if n_removed > 0: + print(f"[WARN] Removed {n_removed} all-NaN metrics") + return dataset_results + + def compute_ranking(dataset_results): """Rank all methods on a specific dataset.""" metric_sums = np.zeros(len(dataset_results)) @@ -331,6 +343,7 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): ) dataset_results_raw = normalize_scores(task_name, dataset_results_raw) dataset_results = drop_baselines(task_name, dataset_results_raw) + dataset_results = drop_nan_metrics(dataset_results) dataset_results, ranking = compute_ranking(dataset_results) metric_names = set() for method_name, rank in ranking.items(): From 6f232362a6c7b21b4659b9c6df37704735fe537e Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 2 Dec 2022 11:58:01 -0500 Subject: [PATCH 166/266] Build images in PR if same repo owner (#735) * run build_images on PR if targeting same repo * dump context * use id * don't dump --- .github/workflows/run_tests.yml | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index f4d9727798..257c76a373 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -21,8 +21,11 @@ jobs: !endsWith(github.event.head_commit.message, '# ci skip') && !startsWith(github.ref, 'refs/heads/test_process') && ( - github.event_name == 'push' || - startsWith(github.ref, 'refs/heads/test_docker') + github.event_name != 'pull_request' || + ( + github.event_name == 'pull_request' && + github.event.pull_request.head.repo.owner.id == github.event.pull_request.base.repo.owner.id + ) ) env: @@ -135,9 +138,9 @@ jobs: !startsWith(github.ref, 'refs/heads/test_process') 
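Stepping back to the parse_nextflow.py change just above: because the all-NaN check now runs after baselines are dropped, a metric is deleted from both the scaled and the raw dictionaries only when it is NaN for every remaining method. A toy illustration, assuming the drop_nan_metrics helper added above is in scope (the method and metric names are made up):

import numpy as np

# Hypothetical results for one dataset: "asw" failed for every method,
# "ari" succeeded, so only "asw" should be removed.
dataset_results = {
    "method_a": {"metrics": {"ari": 0.9, "asw": np.nan},
                 "metrics_raw": {"ari": 0.85, "asw": np.nan}},
    "method_b": {"metrics": {"ari": 0.7, "asw": np.nan},
                 "metrics_raw": {"ari": 0.65, "asw": np.nan}},
}
dataset_results = drop_nan_metrics(dataset_results)
assert all("asw" not in r["metrics"] and "asw" not in r["metrics_raw"]
           for r in dataset_results.values())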
&& !startsWith(github.ref, 'refs/heads/test_website') && ( - startsWith(github.ref, 'refs/heads') || - startsWith(github.ref, 'refs/tags') || + github.event_name != 'pull_request' || ( + github.event_name == 'pull_request' && github.event.pull_request.draft == false && github.actor != 'dependabot[bot]' ) @@ -229,14 +232,14 @@ jobs: - name: Log in to the Container registry uses: docker/login-action@v2 - if: "github.event_name == 'push'" + if: "needs.build_images.result == 'success'" with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Download docker images - if: "github.event_name == 'push'" + if: "needs.build_images.result == 'success'" run: | for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}" @@ -284,7 +287,7 @@ jobs: shell: Rscript {0} - name: Update Docker docker images - if: "github.event_name == 'pull_request'" + if: "needs.build_images.result == 'skipped'" run: | cd workflow snakemake -j $(nproc) docker @@ -344,7 +347,7 @@ jobs: always() && !endsWith(github.event.head_commit.message, '# ci skip') && needs.build_images.result == 'success' && - github.event_name == 'push' && + github.event_name != 'pull_request' && ( needs.run_tester.result == 'success' || needs.run_tester.result == 'skipped' From 73619250c7289541017e1aefdc7d5159bbfcdded Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 2 Dec 2022 12:01:45 -0500 Subject: [PATCH 167/266] Tf -> tensorflow --- .github/dependabot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index db7d0e9586..7b662d1d3f 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -43,7 +43,7 @@ updates: rebase-strategy: "disabled" - package-ecosystem: "pip" - directory: "/docker/openproblems-python-tf2.4" + directory: "/docker/openproblems-python-tensorflow" schedule: interval: "daily" open-pull-requests-limit: 1 From 7ffc855945b2e7ea4306a72d45927487962f4e17 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 2 Dec 2022 21:26:52 -0500 Subject: [PATCH 168/266] Add cell cycle score baseline (#706) * add cc_score baseline * document * Make sure method didn't remove uns * Combat tramples uns * Revert * Scale and hvg trample uns * scanorama clears uns * mnn tramples uns * just copy uns * just copy uns * don't set X_emb if missing; it shouldn't ever be missing * use true features as embedding * compute PCA per batch * Set code version --- .../batch_integration_embed/README.md | 1 + .../batch_integration_embed/api.py | 5 +++ .../methods/__init__.py | 1 + .../methods/baseline.py | 32 +++++++++++++++++++ .../batch_integration_embed/metrics/_utils.py | 4 --- .../metrics/cc_score.py | 6 ++-- .../datasets/immune.py | 1 + .../datasets/pancreas.py | 1 + .../batch_integration_graph/methods/_utils.py | 6 ++++ .../batch_integration_graph/methods/mnn.py | 3 ++ .../methods/scanorama.py | 4 ++- 11 files changed, 57 insertions(+), 7 deletions(-) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/README.md b/openproblems/tasks/_batch_integration/batch_integration_embed/README.md index eb39b62e10..1fab8a1d18 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/README.md @@ -66,6 +66,7 @@ Datasets 
should contain the following attributes: * `adata.obsm['X_uni']` with a pre-integration embedding (PCA) * `adata.layers['log_normalized']` with log-normalized data * `adata.X` with log-normalized data +* `adata.uns["organism"]` with either `"mouse"` or `"human"` Methods should assign output to `adata.obsm['X_emb']`. diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py index 52f26ee0f0..b28df1418d 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py @@ -11,6 +11,8 @@ def check_dataset(adata): assert "batch" in adata.obs assert "labels" in adata.obs assert "log_normalized" in adata.layers + assert "organism" in adata.uns + assert adata.uns["organism"] in ["mouse", "human"] return True @@ -18,6 +20,8 @@ def check_dataset(adata): def check_method(adata, is_baseline=False): """Check that method output fits expected API.""" assert "X_emb" in adata.obsm + # check organism was not removed + assert "organism" in adata.uns return True @@ -27,6 +31,7 @@ def sample_dataset(): import scanpy as sc adata = load_sample_data() + adata.uns["organism"] = "human" adata.var.index = adata.var.gene_short_name.astype(str) sc.pp.normalize_total(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py index abac573ae2..0679d0a530 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py @@ -32,6 +32,7 @@ from .baseline import celltype_random_embedding from .baseline import celltype_random_integration from .baseline import no_integration +from .baseline import no_integration_batch from .baseline import random_integration from .scalex import scalex_full from .scalex import scalex_hvg diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py index 783814fb65..7b28a9267c 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py @@ -3,6 +3,9 @@ from ...batch_integration_graph.methods.baseline import _random_embedding from ...batch_integration_graph.methods.baseline import _randomize_features +import numpy as np +import scanpy as sc + @method( method_name="No Integration", @@ -76,3 +79,32 @@ def batch_random_integration(adata, test=False): ) adata.uns["method_code_version"] = check_version("openproblems") return adata + + +@method( + method_name="No Integration by Batch", + paper_name="No Integration by Batch (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def no_integration_batch(adata, test=False): + """Compute PCA independently on each batch + + See https://github.com/theislab/scib/issues/351 + """ + adata.obsm["X_emb"] = np.zeros((adata.shape[0], 50), dtype=float) + for batch in adata.obs["batch"].unique(): + batch_idx = adata.obs["batch"] == batch + n_comps = min(50, np.sum(batch_idx)) + solver = "full" if n_comps == np.sum(batch_idx) else "arpack" + adata.obsm["X_emb"][batch_idx, :n_comps] = sc.tl.pca( + 
adata[batch_idx], + n_comps=n_comps, + use_highly_variable=False, + svd_solver=solver, + copy=True, + ).obsm["X_pca"] + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/_utils.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/_utils.py index 8a4b33cb72..45c21c2205 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/_utils.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/_utils.py @@ -1,8 +1,4 @@ def _get_split(adata): uni = adata uni.obsm["X_pca"] = uni.obsm["X_uni_pca"] - - if "X_emb" not in adata.obsm: - adata.obsm["X_emb"] = adata.obsm["X_pca"] - return (uni, adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py index 322891b202..61d73337f3 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py @@ -21,14 +21,16 @@ @metric( metric_name="Cell Cycle Score", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-python-batch-integration", ) def cc_score(adata, test=False): from ._utils import _get_split from scib.metrics import cell_cycle try: - cc = cell_cycle(*_get_split(adata), "batch", embed="X_emb", organism="human") + cc = cell_cycle( + *_get_split(adata), "batch", embed="X_emb", organism=adata.uns["organism"] + ) except ValueError: cc = 0 diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py index bee6e7699f..084baac165 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py @@ -15,6 +15,7 @@ def immune_batch(test=False): import scanpy as sc adata = load_immune(test) + adata.uns["organism"] = "human" adata.obs["labels"] = adata.obs["final_annotation"] sc.pp.filter_genes(adata, min_counts=1) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py index 23dcdd016b..01c9af49e4 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py @@ -15,6 +15,7 @@ def pancreas_batch(test=False): import scanpy as sc adata = load_pancreas(test) + adata.uns["organism"] = "human" adata.obs["labels"] = adata.obs["celltype"] adata.obs["batch"] = adata.obs["tech"] diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/_utils.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/_utils.py index 3f34a68054..f549bd31f1 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/_utils.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/_utils.py @@ -4,6 +4,8 @@ def hvg_batch(adata, batch_key, target_genes, adataOut): if adata.n_vars < 2000: return adata else: + # uns and var get trampled + uns = adata.uns.copy() var = adata.var.copy() adata = hvg_batch( adata, @@ -13,13 +15,17 @@ def hvg_batch(adata, batch_key, 
target_genes, adataOut): adataOut=adataOut, ) adata.var = var.loc[adata.var.index] + adata.uns = uns return adata def scale_batch(adata, batch_key): from scib.preprocessing import scale_batch + # uns and var get trampled + uns = adata.uns.copy() var = adata.var.copy() adata = scale_batch(adata, batch_key) adata.var = var.loc[adata.var_names] + adata.uns = uns return adata diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py index 99dab39203..a40a59bb2e 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py @@ -18,7 +18,10 @@ def _mnn(adata): from scib.integration import runMNN from scib.preprocessing import reduce_data + # mnn clears adata.uns + uns = adata.uns adata = runMNN(adata, "batch") + adata.uns = uns reduce_data(adata, umap=False) adata.obsm["X_emb"] = adata.obsm["X_pca"] adata.uns["method_code_version"] = check_version("mnnpy") diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py index d6e80162b5..b48ce4d70b 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py @@ -18,10 +18,12 @@ def _scanorama(adata, use_rep, pca): from scib.integration import scanorama from scib.preprocessing import reduce_data - # scanorama clears adata.layers + # scanorama clears adata.layers and uns layers = adata.layers + uns = adata.uns adata = scanorama(adata, "batch") adata.layers = layers + adata.uns = uns reduce_data(adata, umap=False, use_rep=use_rep, pca=pca) adata.uns["method_code_version"] = check_version("scanorama") return adata From 6db2102b14d727986d4e3471cc1e0a2cc521082a Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Sun, 4 Dec 2022 11:59:16 -0500 Subject: [PATCH 169/266] separate bedtools from everything else (#726) --- .../openproblems-python-bedtools/Dockerfile | 28 +++++++++++++++++++ docker/openproblems-python-bedtools/README.md | 12 ++++++++ .../requirements.txt | 2 ++ docker/openproblems-python-extras/Dockerfile | 13 --------- docker/openproblems-python-extras/README.md | 2 -- .../requirements.txt | 2 -- .../methods/beta.py | 2 +- 7 files changed, 43 insertions(+), 18 deletions(-) create mode 100644 docker/openproblems-python-bedtools/Dockerfile create mode 100644 docker/openproblems-python-bedtools/README.md create mode 100644 docker/openproblems-python-bedtools/requirements.txt diff --git a/docker/openproblems-python-bedtools/Dockerfile b/docker/openproblems-python-bedtools/Dockerfile new file mode 100644 index 0000000000..d0dff3ad54 --- /dev/null +++ b/docker/openproblems-python-bedtools/Dockerfile @@ -0,0 +1,28 @@ +FROM singlecellopenproblems/openproblems:latest + +ARG NB_USER="sagemaker-user" +ARG NB_UID="1000" +ARG NB_GID="100" + +USER root +WORKDIR / + +# Install pybedtools dependency +ARG BUILD_PACKAGES="" +ARG PACKAGE_VERSION=2.27.1 +RUN apt-get update && \ + apt-get install --yes git openssl build-essential zlib1g-dev && \ + cd /tmp && \ + git clone https://github.com/arq5x/bedtools2.git && \ + cd bedtools2 && \ + git checkout v$PACKAGE_VERSION && \ + make && \ + mv bin/* /usr/local/bin && \ + cd / + +# install dependencies and 
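The _utils.py, mnn.py and scanorama.py changes above all repeat one pattern: snapshot adata.uns (and, where needed, var or layers) before a scib call that clears it, then restore the snapshot afterwards. A sketch of the same idea factored into a decorator, purely illustrative since the patch deliberately keeps the save/restore inline:

import functools

def preserves_uns(fn):
    """Wrap an integration step known to trample adata.uns: copy it
    before the call and put the copy back afterwards."""
    @functools.wraps(fn)
    def wrapper(adata, *args, **kwargs):
        uns = adata.uns.copy()
        adata = fn(adata, *args, **kwargs)
        adata.uns = uns
        return adata
    return wrapper

# e.g. integrate = preserves_uns(lambda adata: runMNN(adata, "batch"))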
openproblems +COPY ./docker/openproblems-python-bedtools/requirements.txt ./requirements.txt +RUN pip install --no-cache-dir -r requirements.txt --editable /usr/src/singlecellopenproblems + +USER $NB_UID +WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-bedtools/README.md b/docker/openproblems-python-bedtools/README.md new file mode 100644 index 0000000000..d7b9935395 --- /dev/null +++ b/docker/openproblems-python-bedtools/README.md @@ -0,0 +1,12 @@ +# openproblems-python-bedtools Docker image + +Base image: singlecellopenproblems/openproblems + +OS: Debian Stretch + +Python: 3.8 + +Python packages: + +* pybedtools +* pyensembl diff --git a/docker/openproblems-python-bedtools/requirements.txt b/docker/openproblems-python-bedtools/requirements.txt new file mode 100644 index 0000000000..5f308af4ed --- /dev/null +++ b/docker/openproblems-python-bedtools/requirements.txt @@ -0,0 +1,2 @@ +pybedtools==0.9.* +pyensembl==2.0.* diff --git a/docker/openproblems-python-extras/Dockerfile b/docker/openproblems-python-extras/Dockerfile index e7226456a8..45d26f74f8 100644 --- a/docker/openproblems-python-extras/Dockerfile +++ b/docker/openproblems-python-extras/Dockerfile @@ -7,19 +7,6 @@ ARG NB_GID="100" USER root WORKDIR / -# Install pybedtools dependency -ARG BUILD_PACKAGES="" -ARG PACKAGE_VERSION=2.27.1 -RUN apt-get update && \ - apt-get install --yes git openssl build-essential zlib1g-dev && \ - cd /tmp && \ - git clone https://github.com/arq5x/bedtools2.git && \ - cd bedtools2 && \ - git checkout v$PACKAGE_VERSION && \ - make && \ - mv bin/* /usr/local/bin && \ - cd / - # Install Python packages COPY ./docker/openproblems-python-extras/requirements.txt ./requirements.txt RUN pip install --no-cache-dir -r requirements.txt diff --git a/docker/openproblems-python-extras/README.md b/docker/openproblems-python-extras/README.md index 7e060243fc..1fd2017dd0 100644 --- a/docker/openproblems-python-extras/README.md +++ b/docker/openproblems-python-extras/README.md @@ -9,8 +9,6 @@ Python: 3.8 Python packages: * harmonic-alignment -* pybedtools -* pyensembl * magic-impute * molecular-cross-validation * MulticoreTSNE diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt index c51a5d771a..1dffe451d5 100644 --- a/docker/openproblems-python-extras/requirements.txt +++ b/docker/openproblems-python-extras/requirements.txt @@ -7,8 +7,6 @@ git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix git+https://github.com/scottgigante-immunai/knn-smoothing@python_package magic-impute==3.0.* phate==1.0.* -pybedtools==0.9.* -pyensembl==2.0.* scalex==1.0.2 scvi-tools==0.16.* tangram-sc==1.0.* diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py index e5dffb6fa7..1eeff4992e 100644 --- a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py +++ b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py @@ -235,7 +235,7 @@ def _beta(adata, test=False, top_genes=None, threshold=1): paper_year=2013, code_version="1.0", code_url="http://cistrome.org/BETA", - image="openproblems-python-extras", + image="openproblems-python-bedtools", ) def beta(adata, test=False, top_genes=None, threshold=1): adata = _beta(adata, test=test, top_genes=top_genes, threshold=threshold) From 4bb8a7e04545a06c336d3d9364a1dd84fa2af1a4 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Sun, 4 Dec 
2022 12:06:43 -0500 Subject: [PATCH 170/266] Split torch from non-torch dependencies (#733) * Revert "Revert "upgrade louvain and scib (#725)" (#732)" This reverts commit 9f18bb496f9c5a7b992d093c2cecb7006f9ee32c. * Update requirements.txt * Update requirements.txt * Don't upgrade * Update setup.py * rename * skip if draft --- .github/dependabot.yml | 2 +- .github/workflows/run_tests.yml | 1 + .../Dockerfile | 24 ------------------- .../README.md | 15 ------------ .../requirements.txt | 6 ----- .../requirements.txt | 8 ------- docker/openproblems-python-pytorch/Dockerfile | 15 ++++++++++++ .../README.md | 6 +++++ .../requirements.txt | 11 +++++++++ docker/openproblems-python-scvi/Dockerfile | 15 ------------ .../openproblems-python-scvi/requirements.txt | 5 ---- docker/openproblems-r-base/README.md | 2 +- .../openproblems-r-extras/r_requirements.txt | 1 + docker/openproblems-r-extras/requirements.txt | 2 +- docker/openproblems-r-pytorch/README.md | 14 +++++------ .../openproblems-r-pytorch/requirements.txt | 7 +++++- .../metrics/cc_score.py | 2 +- .../metrics/iso_label_sil.py | 2 +- .../batch_integration_embed/metrics/pcr.py | 2 +- .../metrics/sil_batch.py | 2 +- .../metrics/silhouette.py | 2 +- .../metrics/hvg_conservation.py | 2 +- .../batch_integration_graph/methods/bbknn.py | 2 +- .../batch_integration_graph/methods/combat.py | 2 +- .../batch_integration_graph/methods/mnn.py | 2 +- .../batch_integration_graph/methods/scalex.py | 2 +- .../methods/scanorama.py | 2 +- .../batch_integration_graph/methods/scanvi.py | 2 +- .../batch_integration_graph/methods/scvi.py | 2 +- .../batch_integration_graph/metrics/ari.py | 2 +- .../metrics/graph_connectivity.py | 2 +- .../metrics/iso_label_f1.py | 2 +- .../batch_integration_graph/metrics/nmi.py | 2 +- .../tasks/denoising/datasets/pancreas.py | 2 +- openproblems/tasks/denoising/datasets/pbmc.py | 2 +- .../denoising/datasets/tabula_muris_senis.py | 2 +- .../tasks/denoising/metrics/poisson.py | 2 +- .../methods/neuralee.py | 2 +- .../label_projection/methods/scvi_tools.py | 4 ++-- .../datasets/destvi/generate.py | 2 +- .../methods/cell2location.py | 2 +- .../spatial_decomposition/methods/destvi.py | 2 +- .../methods/stereoscope.py | 2 +- .../spatial_decomposition/methods/tangram.py | 2 +- 44 files changed, 79 insertions(+), 113 deletions(-) delete mode 100644 docker/openproblems-python-batch-integration/Dockerfile delete mode 100644 docker/openproblems-python-batch-integration/README.md delete mode 100644 docker/openproblems-python-batch-integration/requirements.txt create mode 100644 docker/openproblems-python-pytorch/Dockerfile rename docker/{openproblems-python-scvi => openproblems-python-pytorch}/README.md (64%) create mode 100644 docker/openproblems-python-pytorch/requirements.txt delete mode 100644 docker/openproblems-python-scvi/Dockerfile delete mode 100644 docker/openproblems-python-scvi/requirements.txt diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7b662d1d3f..6553e28774 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -22,7 +22,7 @@ updates: rebase-strategy: "disabled" - package-ecosystem: "pip" - directory: "/docker/openproblems-python-batch-integration" + directory: "/docker/openproblems-python-pytorch" schedule: interval: "daily" open-pull-requests-limit: 1 diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 257c76a373..fe250910df 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -24,6 +24,7 @@ jobs: github.event_name != 
'pull_request' || ( github.event_name == 'pull_request' && + github.event.pull_request.draft == false && github.event.pull_request.head.repo.owner.id == github.event.pull_request.base.repo.owner.id ) ) diff --git a/docker/openproblems-python-batch-integration/Dockerfile b/docker/openproblems-python-batch-integration/Dockerfile deleted file mode 100644 index 697905d79c..0000000000 --- a/docker/openproblems-python-batch-integration/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -FROM singlecellopenproblems/openproblems-r-base:latest - -USER root -WORKDIR / - -ARG NB_USER="sagemaker-user" -ARG NB_UID="1000" -ARG NB_GID="100" - -RUN sed -i '$ d' /etc/apt/sources.list -RUN \ -apt-get update --allow-releaseinfo-change && \ -apt-get -y install --no-install-recommends gcc git python3-llvmlite && \ -apt-get autoremove -y && \ -rm -rf /var/lib/apt/lists/* - -# Install Python packages -COPY ./docker/openproblems-python-batch-integration/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r requirements.txt -# force reinstall annoy addresses https://github.com/spotify/annoy/issues/513 -RUN pip install --no-cache-dir --force annoy==1.17.0 - -USER $NB_UID -WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-batch-integration/README.md b/docker/openproblems-python-batch-integration/README.md deleted file mode 100644 index 02a18e1c20..0000000000 --- a/docker/openproblems-python-batch-integration/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# openproblems-python-extras Docker image - -Base image: singlecellopenproblems/openproblems-r-base - -OS: Debian Stretch - -Python: 3.8 - -Python packages: - -* scIB -* mnnpy -* scanorama -* bbknn -* scVI diff --git a/docker/openproblems-python-batch-integration/requirements.txt b/docker/openproblems-python-batch-integration/requirements.txt deleted file mode 100644 index 61bdfd360a..0000000000 --- a/docker/openproblems-python-batch-integration/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -annoy==1.17.1 -bbknn==1.5.* -git+https://github.com/chriscainx/mnnpy@2097dec # master -git+https://github.com/theislab/scib@77ab015 -scanorama==1.7.0 -scvi-tools~=0.16 # pinned in #313 diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt index 1dffe451d5..7d6aee63d5 100644 --- a/docker/openproblems-python-extras/requirements.txt +++ b/docker/openproblems-python-extras/requirements.txt @@ -1,15 +1,7 @@ cmake==3.24.1.1 -git+https://github.com/BayraktarLab/cell2location.git@7e7aa231cc61ff460da14402fa3b9a1fa3ec69ac -git+https://github.com/czbiohub/molecular-cross-validation@04d9df0 git+https://github.com/jorvis/Multicore-TSNE@6832575 git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python -git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix git+https://github.com/scottgigante-immunai/knn-smoothing@python_package magic-impute==3.0.* phate==1.0.* -scalex==1.0.2 -scvi-tools==0.16.* -tangram-sc==1.0.* -tensorflow-cpu==2.9.* -torch==1.12.* xgboost==1.6.* diff --git a/docker/openproblems-python-pytorch/Dockerfile b/docker/openproblems-python-pytorch/Dockerfile new file mode 100644 index 0000000000..cdf852acc0 --- /dev/null +++ b/docker/openproblems-python-pytorch/Dockerfile @@ -0,0 +1,15 @@ +FROM singlecellopenproblems/openproblems:latest + +ARG NB_USER="sagemaker-user" +ARG NB_UID="1000" +ARG NB_GID="100" + +USER root +WORKDIR / + +# install dependencies and openproblems +COPY ./docker/openproblems-python-pytorch/requirements.txt ./requirements.txt +RUN 
pip install --no-cache-dir -r requirements.txt --editable /usr/src/singlecellopenproblems + +USER $NB_UID +WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-scvi/README.md b/docker/openproblems-python-pytorch/README.md similarity index 64% rename from docker/openproblems-python-scvi/README.md rename to docker/openproblems-python-pytorch/README.md index 546cec9bc0..d566a8efd5 100644 --- a/docker/openproblems-python-scvi/README.md +++ b/docker/openproblems-python-pytorch/README.md @@ -9,3 +9,9 @@ Python: 3.8 Python packages: * scvi-tools +* tangram +* torch +* neuralee +* xgboost +* molecular-cross-validation +* cell2location diff --git a/docker/openproblems-python-pytorch/requirements.txt b/docker/openproblems-python-pytorch/requirements.txt new file mode 100644 index 0000000000..56bd2a53dc --- /dev/null +++ b/docker/openproblems-python-pytorch/requirements.txt @@ -0,0 +1,11 @@ +git+https://github.com/BayraktarLab/cell2location.git@7e7aa231cc61ff460da14402fa3b9a1fa3ec69ac +git+https://github.com/czbiohub/molecular-cross-validation@04d9df0 +git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix +jax==0.3.23 +jaxlib==0.3.22 +scalex==1.0.2 +scikit-misc==0.1.* +scvi-tools~=0.17 # pinned in #313 +tangram-sc==1.0.* +torch==1.12.* +xgboost==1.6.* diff --git a/docker/openproblems-python-scvi/Dockerfile b/docker/openproblems-python-scvi/Dockerfile deleted file mode 100644 index f7edd2e4dc..0000000000 --- a/docker/openproblems-python-scvi/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM singlecellopenproblems/openproblems:latest - -ARG NB_USER="sagemaker-user" -ARG NB_UID="1000" -ARG NB_GID="100" - -USER root -WORKDIR / - -# Install Python packages -COPY ./docker/openproblems-python-scvi/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - -USER $NB_UID -WORKDIR /home/$NB_USER diff --git a/docker/openproblems-python-scvi/requirements.txt b/docker/openproblems-python-scvi/requirements.txt deleted file mode 100644 index 87aa041993..0000000000 --- a/docker/openproblems-python-scvi/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -jax==0.3.23 -jaxlib==0.3.22 -scikit-misc==0.1.* -scvi-tools~=0.17 # pinned in #313 -xgboost==1.6.* diff --git a/docker/openproblems-r-base/README.md b/docker/openproblems-r-base/README.md index 785a9ace1b..ebca77780d 100644 --- a/docker/openproblems-r-base/README.md +++ b/docker/openproblems-r-base/README.md @@ -28,4 +28,4 @@ R packages: Python packages: * rpy2 -* anndata2ri>=1.0.6 +* anndata2ri>=1.1 diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index efd4ceeba6..0b8da4bde9 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -46,6 +46,7 @@ shiny@1.4.0.2 sparsesvd@0.2 systemfonts@1.0.4 textshaping@0.3.6 +theislab/kBET@a10ffea # master tibble@3.1.7 tidymodels@0.1.2 tidyverse@1.3.0 diff --git a/docker/openproblems-r-extras/requirements.txt b/docker/openproblems-r-extras/requirements.txt index 9d03431a90..5b8bac668f 100644 --- a/docker/openproblems-r-extras/requirements.txt +++ b/docker/openproblems-r-extras/requirements.txt @@ -1,3 +1,3 @@ git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python -git+https://github.com/theislab/scib@v1.0.2 +git+https://github.com/theislab/scib@77ab015 xgboost==1.6.* diff --git a/docker/openproblems-r-pytorch/README.md b/docker/openproblems-r-pytorch/README.md index 03ccbc07d4..33d3e5a30c 100644 --- 
a/docker/openproblems-r-pytorch/README.md +++ b/docker/openproblems-r-pytorch/README.md @@ -8,12 +8,12 @@ Python: 3.8 R: 4.0 -R packages: - -* batchelor -* sparsesvd -* dplyr - Python packages: -* harmonic-alignment +* harmony-pytorch +* torch +* bbknn +* mnnpy +* scib +* scanorama +* scvi-tools diff --git a/docker/openproblems-r-pytorch/requirements.txt b/docker/openproblems-r-pytorch/requirements.txt index e7c8df42b7..bd357b8b7f 100644 --- a/docker/openproblems-r-pytorch/requirements.txt +++ b/docker/openproblems-r-pytorch/requirements.txt @@ -1,3 +1,8 @@ -git+https://github.com/theislab/scib@v1.0.2 +annoy==1.17.1 +bbknn==1.5.* +git+https://github.com/chriscainx/mnnpy@2097dec # master +git+https://github.com/theislab/scib@77ab015 harmony-pytorch==0.1.* +scanorama==1.7.0 +scvi-tools~=0.16 # pinned in #313 torch==1.13.* diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py index 61d73337f3..1f9d9a7827 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py @@ -21,7 +21,7 @@ @metric( metric_name="Cell Cycle Score", maximize=True, - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) def cc_score(adata, test=False): from ._utils import _get_split diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py index c3575de5b8..c1f8c4be2d 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py @@ -15,7 +15,7 @@ @metric( metric_name="Isolated label Silhouette", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def isolated_labels_sil(adata): from scib.metrics import isolated_labels diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py index 886f26078b..7efca62ffe 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py @@ -18,7 +18,7 @@ @metric( metric_name="PC Regression", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def pcr(adata): from ._utils import _get_split diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py index c02e5e42aa..9f28cd1284 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py @@ -24,7 +24,7 @@ @metric( metric_name="Batch ASW", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def silhouette_batch(adata): from scib.metrics import silhouette_batch diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py index 
36991e1d67..bb2bece193 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py @@ -12,7 +12,7 @@ @metric( metric_name="Silhouette", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def silhouette(adata): from scib.metrics import silhouette diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py index d40b36b740..bb7f90cae8 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py @@ -21,7 +21,7 @@ @metric( metric_name="HVG conservation", maximize=True, - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) def hvg_conservation(adata): from scib.metrics import hvg_overlap diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py index 017ca8f766..97570dccd8 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py @@ -10,7 +10,7 @@ paper_url="https://academic.oup.com/bioinformatics/article/36/3/964/5545955", paper_year=2020, code_url="https://github.com/Teichlab/bbknn", - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py index 3043a552e1..96e53538d3 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py @@ -10,7 +10,7 @@ paper_url="https://academic.oup.com/biostatistics/article/8/1/118/252073", paper_year=2007, code_url="https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html", - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py index a40a59bb2e..4d444b4950 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py @@ -10,7 +10,7 @@ paper_url="https://www.nature.com/articles/nbt.4091", paper_year=2018, code_url="https://github.com/chriscainx/mnnpy", - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py index 36843b81ed..461ea04a94 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py @@ -11,7 +11,7 @@ paper_url="https://doi.org/10.1038/s41467-022-33758-z", paper_year=2022, code_url="https://github.com/jsxlei/SCALEX", - image="openproblems-python-extras", + 
image="openproblems-python-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py index b48ce4d70b..db9aed5caa 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py @@ -10,7 +10,7 @@ paper_url="https://www.nature.com/articles/s41587-019-0113-3", paper_year=2019, code_url="https://github.com/brianhie/scanorama", - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py index d5bf463974..8f98a3c931 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py @@ -11,7 +11,7 @@ paper_url="https://doi.org/10.15252/msb.20209620", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py index 9e9a82a9f2..35f1cd7ac5 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py @@ -10,7 +10,7 @@ paper_url="https://www.nature.com/articles/s41592-018-0229-2", paper_year=2018, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py index 13e7eb8ce1..0d082fff44 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py @@ -16,7 +16,7 @@ @metric( metric_name="ARI", maximize=True, - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) def ari(adata): from scib.metrics import ari diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py index 3a9732d0e2..52dd7c44b2 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py @@ -22,7 +22,7 @@ @metric( metric_name="Graph connectivity", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def graph_connectivity(adata): import scib.metrics diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py index df86b043d9..71cd7ca209 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py @@ -27,7 +27,7 @@ 
@metric( metric_name="Isolated label F1", maximize=True, - image="openproblems-python-batch-integration", # only if required + image="openproblems-r-pytorch", ) def isolated_labels_f1(adata): from scib.metrics import isolated_labels diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py index bbcdc7cd9d..3356507b2e 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py @@ -15,7 +15,7 @@ @metric( metric_name="NMI", maximize=True, - image="openproblems-python-batch-integration", + image="openproblems-r-pytorch", ) def nmi(adata): from scib.metrics.clustering import opt_louvain # isort:skip diff --git a/openproblems/tasks/denoising/datasets/pancreas.py b/openproblems/tasks/denoising/datasets/pancreas.py index c18ddbeee1..bd039ad88e 100644 --- a/openproblems/tasks/denoising/datasets/pancreas.py +++ b/openproblems/tasks/denoising/datasets/pancreas.py @@ -11,7 +11,7 @@ "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, " "and SMARTER-seq). Here we just use the inDrop1 batch, which includes" "1937 cells × 15502 genes.", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def pancreas(test=False): adata = load_pancreas(test=test, keep_techs=["inDrop1"]) diff --git a/openproblems/tasks/denoising/datasets/pbmc.py b/openproblems/tasks/denoising/datasets/pbmc.py index fcf0fc782b..440ebe8a3b 100644 --- a/openproblems/tasks/denoising/datasets/pbmc.py +++ b/openproblems/tasks/denoising/datasets/pbmc.py @@ -11,7 +11,7 @@ "1k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. " "Sequenced on 10X v3 chemistry in November 2018 by 10X Genomics." ), - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def pbmc(test=False): adata = load_tenx_1k_pbmc(test=test) diff --git a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py index a5c62f953e..9524cc4e95 100644 --- a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py +++ b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py @@ -10,7 +10,7 @@ dataset_summary="All lung cells from Tabula Muris Senis, a 500k cell-atlas from 18 " "organs and tissues across the mouse lifespan. Here we use just 10x data from lung." 
" 24540 cells × 16160 genes across 3 time points.", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def tabula_muris_senis_lung_random(test=False): adata = load_tabula_muris_senis( diff --git a/openproblems/tasks/denoising/metrics/poisson.py b/openproblems/tasks/denoising/metrics/poisson.py index ebd2a73378..93db71eee2 100644 --- a/openproblems/tasks/denoising/metrics/poisson.py +++ b/openproblems/tasks/denoising/metrics/poisson.py @@ -1,7 +1,7 @@ from ....tools.decorators import metric -@metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-extras") +@metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-pytorch") def poisson(adata): from molecular_cross_validation.mcv_sweep import poisson_nll_loss diff --git a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py index 41f55dbb65..420132dbf2 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py +++ b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py @@ -18,7 +18,7 @@ paper_url="https://www.frontiersin.org/articles/10.3389/fgene.2020.00786/full", paper_year=2020, code_url="https://github.com/HiBearME/NeuralEE", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 6212f8cd69..1cec698b27 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -10,7 +10,7 @@ paper_url="https://doi.org/10.15252/msb.20209620", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-scvi", + image="openproblems-python-pytorch", ) _scanvi_scarches_method = functools.partial( @@ -19,7 +19,7 @@ paper_url="https://doi.org/10.1101/2020.07.16.205997", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-scvi", + image="openproblems-python-pytorch", ) diff --git a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py index 86005d07f2..e446b07687 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py +++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py @@ -11,7 +11,7 @@ "from the destVI manuscripts leveraging sparsePCA. 
Number of cells and " "cell types present in each spatial spot is computed via combination of " "kernel-based parametrization of a categorical distribution and the NB model.", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def destvi(test=False): from .utils import generate_synthetic_dataset diff --git a/openproblems/tasks/spatial_decomposition/methods/cell2location.py b/openproblems/tasks/spatial_decomposition/methods/cell2location.py index b0645e2b0d..01c4e4d32d 100644 --- a/openproblems/tasks/spatial_decomposition/methods/cell2location.py +++ b/openproblems/tasks/spatial_decomposition/methods/cell2location.py @@ -12,7 +12,7 @@ paper_url="https://doi.org/10.1038/s41587-021-01139-4", paper_year=2022, code_url="https://github.com/BayraktarLab/cell2location", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) diff --git a/openproblems/tasks/spatial_decomposition/methods/destvi.py b/openproblems/tasks/spatial_decomposition/methods/destvi.py index 9330ba1d92..4338a465fd 100644 --- a/openproblems/tasks/spatial_decomposition/methods/destvi.py +++ b/openproblems/tasks/spatial_decomposition/methods/destvi.py @@ -11,7 +11,7 @@ paper_url="https://doi.org/10.1038/s41587-022-01272-8", paper_year=2022, code_url="https://github.com/YosefLab/scvi-tools", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def destvi( adata, diff --git a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py index c2695e4253..f9c025319e 100644 --- a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py +++ b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py @@ -10,7 +10,7 @@ paper_url="https://doi.org/10.1038/s41587-022-01272-8", paper_year=2020, code_url="https://github.com/scverse/scvi-tools", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def stereoscope(adata, test=False, max_epochs_sc=None, max_epochs_sp=None): from scvi.external import RNAStereoscope diff --git a/openproblems/tasks/spatial_decomposition/methods/tangram.py b/openproblems/tasks/spatial_decomposition/methods/tangram.py index 329676c4aa..c05eb0339f 100644 --- a/openproblems/tasks/spatial_decomposition/methods/tangram.py +++ b/openproblems/tasks/spatial_decomposition/methods/tangram.py @@ -10,7 +10,7 @@ paper_url="https://doi.org/10.1038/s41592-021-01264-7", paper_year=2021, code_url="https://github.com/broadinstitute/Tangram", - image="openproblems-python-extras", + image="openproblems-python-pytorch", ) def tangram(adata, test=False, num_epochs=None, n_markers=None): # analysis based on: From e8569df360736f21fae906d8352b7dba15a065a3 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Sun, 4 Dec 2022 17:01:46 -0500 Subject: [PATCH 171/266] Update scib louvain and igraph (#739) * update scib louvain and igraph * pandas 1.3.5 --- docker/openproblems-r-extras/requirements.txt | 2 +- docker/openproblems-r-pytorch/requirements.txt | 2 +- setup.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docker/openproblems-r-extras/requirements.txt b/docker/openproblems-r-extras/requirements.txt index 5b8bac668f..857e39f7ec 100644 --- a/docker/openproblems-r-extras/requirements.txt +++ b/docker/openproblems-r-extras/requirements.txt @@ -1,3 +1,3 @@ git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python -git+https://github.com/theislab/scib@77ab015 
+scib==1.0.5 xgboost==1.6.* diff --git a/docker/openproblems-r-pytorch/requirements.txt b/docker/openproblems-r-pytorch/requirements.txt index bd357b8b7f..65d79f6782 100644 --- a/docker/openproblems-r-pytorch/requirements.txt +++ b/docker/openproblems-r-pytorch/requirements.txt @@ -1,8 +1,8 @@ annoy==1.17.1 bbknn==1.5.* git+https://github.com/chriscainx/mnnpy@2097dec # master -git+https://github.com/theislab/scib@77ab015 harmony-pytorch==0.1.* scanorama==1.7.0 +scib==1.0.5 scvi-tools~=0.16 # pinned in #313 torch==1.13.* diff --git a/setup.py b/setup.py index a6a1c213f4..2ef5a10e58 100644 --- a/setup.py +++ b/setup.py @@ -10,14 +10,15 @@ "scprep>=1.2.1", "scipy>=1.8,<1.10", "scanpy>=1.6", - "louvain==0.7.*", - "python-igraph<0.10", + "louvain==0.8.*", + "python-igraph==0.10.*", "decorator<5.0", # pinned in #324 "memory-profiler==0.60", "colorama==0.4.*", "packaging==21.3", "umap-learn==0.5.*", "requests==2.28.*", + "pandas==1.3.5", ] r_requires = [ From d3b9e8a9084c9b73e3b8e25d27ff48364f614dc4 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Mon, 5 Dec 2022 08:44:30 -0500 Subject: [PATCH 172/266] Revert "Update scib louvain and igraph (#739)" (#740) This reverts commit e8569df360736f21fae906d8352b7dba15a065a3. --- docker/openproblems-r-extras/requirements.txt | 2 +- docker/openproblems-r-pytorch/requirements.txt | 2 +- setup.py | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docker/openproblems-r-extras/requirements.txt b/docker/openproblems-r-extras/requirements.txt index 857e39f7ec..5b8bac668f 100644 --- a/docker/openproblems-r-extras/requirements.txt +++ b/docker/openproblems-r-extras/requirements.txt @@ -1,3 +1,3 @@ git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python -scib==1.0.5 +git+https://github.com/theislab/scib@77ab015 xgboost==1.6.* diff --git a/docker/openproblems-r-pytorch/requirements.txt b/docker/openproblems-r-pytorch/requirements.txt index 65d79f6782..bd357b8b7f 100644 --- a/docker/openproblems-r-pytorch/requirements.txt +++ b/docker/openproblems-r-pytorch/requirements.txt @@ -1,8 +1,8 @@ annoy==1.17.1 bbknn==1.5.* git+https://github.com/chriscainx/mnnpy@2097dec # master +git+https://github.com/theislab/scib@77ab015 harmony-pytorch==0.1.* scanorama==1.7.0 -scib==1.0.5 scvi-tools~=0.16 # pinned in #313 torch==1.13.* diff --git a/setup.py b/setup.py index 2ef5a10e58..a6a1c213f4 100644 --- a/setup.py +++ b/setup.py @@ -10,15 +10,14 @@ "scprep>=1.2.1", "scipy>=1.8,<1.10", "scanpy>=1.6", - "louvain==0.8.*", - "python-igraph==0.10.*", + "louvain==0.7.*", + "python-igraph<0.10", "decorator<5.0", # pinned in #324 "memory-profiler==0.60", "colorama==0.4.*", "packaging==21.3", "umap-learn==0.5.*", "requests==2.28.*", - "pandas==1.3.5", ] r_requires = [ From 6f071460d5b339d93c74be9a7baefc7272bc1242 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Mon, 5 Dec 2022 11:46:57 -0500 Subject: [PATCH 173/266] Update scib louvain and igraph (#741) * Revert "Revert "Update scib louvain and igraph (#739)" (#740)" This reverts commit d3b9e8a9084c9b73e3b8e25d27ff48364f614dc4. 
* Downgrade rpy2 --- docker/openproblems-r-extras/requirements.txt | 2 +- docker/openproblems-r-pytorch/requirements.txt | 2 +- setup.py | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docker/openproblems-r-extras/requirements.txt b/docker/openproblems-r-extras/requirements.txt index 5b8bac668f..857e39f7ec 100644 --- a/docker/openproblems-r-extras/requirements.txt +++ b/docker/openproblems-r-extras/requirements.txt @@ -1,3 +1,3 @@ git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python -git+https://github.com/theislab/scib@77ab015 +scib==1.0.5 xgboost==1.6.* diff --git a/docker/openproblems-r-pytorch/requirements.txt b/docker/openproblems-r-pytorch/requirements.txt index bd357b8b7f..65d79f6782 100644 --- a/docker/openproblems-r-pytorch/requirements.txt +++ b/docker/openproblems-r-pytorch/requirements.txt @@ -1,8 +1,8 @@ annoy==1.17.1 bbknn==1.5.* git+https://github.com/chriscainx/mnnpy@2097dec # master -git+https://github.com/theislab/scib@77ab015 harmony-pytorch==0.1.* scanorama==1.7.0 +scib==1.0.5 scvi-tools~=0.16 # pinned in #313 torch==1.13.* diff --git a/setup.py b/setup.py index a6a1c213f4..6f59655099 100644 --- a/setup.py +++ b/setup.py @@ -10,18 +10,19 @@ "scprep>=1.2.1", "scipy>=1.8,<1.10", "scanpy>=1.6", - "louvain==0.7.*", - "python-igraph<0.10", + "louvain==0.8.*", + "python-igraph==0.10.*", "decorator<5.0", # pinned in #324 "memory-profiler==0.60", "colorama==0.4.*", "packaging==21.3", "umap-learn==0.5.*", "requests==2.28.*", + "pandas==1.3.5", ] r_requires = [ - "rpy2<3.5.6", + "rpy2<3.4.3", "anndata2ri==1.1.*", ] From 8f607df59baaf82cc4eb0f80814be56b3083e28a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 5 Dec 2022 11:58:57 -0500 Subject: [PATCH 174/266] fix double counted tests --- test/test_core_tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_core_tasks.py b/test/test_core_tasks.py index f8851f56ec..a2383dac54 100644 --- a/test/test_core_tasks.py +++ b/test/test_core_tasks.py @@ -40,7 +40,7 @@ def test_members(self): for method in method_list: assert callable(method) - def test_task_api_members(self): + def test_api_members(self): """Test that task.api has the required members""" assert hasattr(self.task.api, "check_dataset") assert hasattr(self.task.api, "check_method") @@ -52,7 +52,7 @@ def test_task_api_members(self): assert callable(self.task.api.sample_method) assert hasattr(self.task.api.sample_dataset, "metadata") - def test_task_api_is_consistent(self): + def test_api_is_consistent(self): """Test that a task's API is self-consistent""" adata = self.task.api.sample_dataset() assert self.task.api.check_dataset(adata) From 51fbc2fe3d4197222f261296504dc579e22e96e7 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 5 Dec 2022 13:11:11 -0500 Subject: [PATCH 175/266] downgrade rpy2 in r-extras --- docker/openproblems-r-extras/requirements.txt | 1 + setup.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/openproblems-r-extras/requirements.txt b/docker/openproblems-r-extras/requirements.txt index 857e39f7ec..38001d42e5 100644 --- a/docker/openproblems-r-extras/requirements.txt +++ b/docker/openproblems-r-extras/requirements.txt @@ -1,3 +1,4 @@ git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python +rpy2<=3.4.3 scib==1.0.5 xgboost==1.6.* diff --git a/setup.py b/setup.py index 6f59655099..2ef5a10e58 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ ] r_requires = [ - "rpy2<3.4.3", + "rpy2<3.5.6", 
"anndata2ri==1.1.*", ] From 7e33c36268a4b62ed1f97c9125b32b8a105e1688 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 5 Dec 2022 16:04:42 -0500 Subject: [PATCH 176/266] rpy2 < 3.4.3 --- setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 2ef5a10e58..40e6c48955 100644 --- a/setup.py +++ b/setup.py @@ -4,11 +4,11 @@ import os install_requires = [ - "numpy>=1.22,<1.24", - "scikit-learn==1.1.*", + "numpy>=1.21,<1.24", + "scikit-learn>=1.0.*,<=1.1.*", "anndata==0.8.*", "scprep>=1.2.1", - "scipy>=1.8,<1.10", + "scipy>=1.7,<1.10", "scanpy>=1.6", "louvain==0.8.*", "python-igraph==0.10.*", @@ -22,8 +22,8 @@ ] r_requires = [ - "rpy2<3.5.6", - "anndata2ri==1.1.*", + "rpy2>=3.4,<3.4.3", + "anndata2ri==1.0.6", ] evaluate_requires = ["snakemake>=7.8,<7.17", "tabulate<0.9"] From becb1f6fe35a0644e17dc7d425e08dba96a5ae43 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 5 Dec 2022 20:49:44 -0500 Subject: [PATCH 177/266] 3.4.2 --- docker/openproblems-r-extras/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-extras/requirements.txt b/docker/openproblems-r-extras/requirements.txt index 38001d42e5..79b89e8987 100644 --- a/docker/openproblems-r-extras/requirements.txt +++ b/docker/openproblems-r-extras/requirements.txt @@ -1,4 +1,4 @@ git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python -rpy2<=3.4.3 +rpy2<3.4.3 scib==1.0.5 xgboost==1.6.* From 9e50f9b0a266de1255bb4a9e21e7d70766404230 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 6 Dec 2022 09:03:10 -0500 Subject: [PATCH 178/266] Prevent pip install conflicts (#745) * upgrade louvain and scib * install concurrently to prevent silent conflicts * Remove accidental diff * Remove accidental diff --- docker/openproblems-github-actions/Dockerfile | 7 ++----- docker/openproblems-python-extras/Dockerfile | 2 +- docker/openproblems-r-extras/Dockerfile | 4 ++-- docker/openproblems-r-pytorch/Dockerfile | 5 ++--- docker/openproblems/Dockerfile | 9 +++------ 5 files changed, 10 insertions(+), 17 deletions(-) diff --git a/docker/openproblems-github-actions/Dockerfile b/docker/openproblems-github-actions/Dockerfile index e5ce9085e1..bce262f0cb 100644 --- a/docker/openproblems-github-actions/Dockerfile +++ b/docker/openproblems-github-actions/Dockerfile @@ -13,13 +13,10 @@ RUN sh -c 'echo \ RUN apt-get update RUN apt-get install -y docker-ce docker-ce-cli containerd.io -# install Python packages +# install dependencies and openproblems COPY ./docker/openproblems-github-actions/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r requirements.txt +RUN pip install --no-cache-dir -r requirements.txt --editable /usr/src/singlecellopenproblems[test,r,evaluate] # Install R packages COPY ./docker/openproblems-github-actions/r_requirements.txt ./r_requirements.txt RUN R -e "source(\"install_renv.R\"); install_renv(\"r_requirements.txt\")" - -# Install Python packages -RUN pip install --no-cache-dir -U /usr/src/singlecellopenproblems[test,r,evaluate] diff --git a/docker/openproblems-python-extras/Dockerfile b/docker/openproblems-python-extras/Dockerfile index 45d26f74f8..ddb8a48542 100644 --- a/docker/openproblems-python-extras/Dockerfile +++ b/docker/openproblems-python-extras/Dockerfile @@ -9,7 +9,7 @@ WORKDIR / # Install Python packages COPY ./docker/openproblems-python-extras/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir 
-r requirements.txt +RUN pip install --no-cache-dir -r requirements.txt --editable /usr/src/singlecellopenproblems USER $NB_UID WORKDIR /home/$NB_USER diff --git a/docker/openproblems-r-extras/Dockerfile b/docker/openproblems-r-extras/Dockerfile index e67fe8eb09..62425ae57c 100644 --- a/docker/openproblems-r-extras/Dockerfile +++ b/docker/openproblems-r-extras/Dockerfile @@ -21,9 +21,9 @@ RUN apt-get clean autoclean && \ COPY ./docker/openproblems-r-extras/r_requirements.txt ./r_requirements.txt RUN R -e "source(\"install_renv.R\"); install_renv(\"r_requirements.txt\")" -# Install Python packages +# install dependencies and openproblems COPY ./docker/openproblems-r-extras/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r requirements.txt +RUN pip install --no-cache-dir -r requirements.txt --editable /usr/src/singlecellopenproblems[r] # Fix permissions RUN chown -R $NB_USER:$NB_GID /home/$NB_USER diff --git a/docker/openproblems-r-pytorch/Dockerfile b/docker/openproblems-r-pytorch/Dockerfile index d8300572f7..55568a6eae 100644 --- a/docker/openproblems-r-pytorch/Dockerfile +++ b/docker/openproblems-r-pytorch/Dockerfile @@ -12,10 +12,9 @@ WORKDIR / COPY ./docker/openproblems-r-pytorch/r_requirements.txt ./r_requirements.txt RUN R -e "source(\"install_renv.R\"); install_renv(\"r_requirements.txt\")" -# Install Python packages +# install dependencies and openproblems COPY ./docker/openproblems-r-pytorch/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - +RUN pip install --no-cache-dir -r requirements.txt --editable /usr/src/singlecellopenproblems[r] USER $NB_UID WORKDIR /home/$NB_USER diff --git a/docker/openproblems/Dockerfile b/docker/openproblems/Dockerfile index 6132ab9208..2434cc5acb 100644 --- a/docker/openproblems/Dockerfile +++ b/docker/openproblems/Dockerfile @@ -27,14 +27,11 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2 RUN unzip -q awscliv2.zip RUN ./aws/install -# install dependencies -COPY ./docker/openproblems/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - -# Install single-cell open problems +# install dependencies and openproblems COPY . /usr/src/singlecellopenproblems RUN cd /usr/src/singlecellopenproblems && git clean -fxdq -RUN pip install --no-cache-dir --editable /usr/src/singlecellopenproblems +COPY ./docker/openproblems/requirements.txt ./requirements.txt +RUN pip install --no-cache-dir -r requirements.txt --editable /usr/src/singlecellopenproblems # Overwrite kernel.json to use system Python install COPY ./docker/openproblems/kernelspec.json /usr/local/share/jupyter/kernels/python3/kernel.json From 106eea54ae876fa2a242d2e46dfa045fb62630bc Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 6 Dec 2022 11:49:38 -0500 Subject: [PATCH 179/266] require that jobs ran successfully --- .github/workflows/run_tests.yml | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index fe250910df..1b37ad1384 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -120,12 +120,17 @@ jobs: id: export-images run: | IMAGES="$(find ./docker -mindepth 1 -type d -exec basename {} \;)" + PIDS=() for image in ${IMAGES}; do GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}" docker tag singlecellopenproblems/$image $GHCR_IMAGE docker push $GHCR_IMAGE & + PIDS+=$! 
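+           # $! expands to the PID of the docker push just backgrounded;
+           # collecting it lets the loop below check each job's exit status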
+           PIDS+=$!
+          done
+          for pid in "$PIDS[@]"; do
+            # ensure exited with status 0
+            wait $pid
           done
-          wait
           # convert to JSON
           echo "images=[\"$(paste -s -d ' ' <(echo $IMAGES) | sed 's/ */\",\"/g')\"]" >> $GITHUB_OUTPUT

  create_matrix:
    runs-on: ubuntu-latest
@@ -242,11 +247,16 @@ jobs:
       - name: Download docker images
         if: "needs.build_images.result == 'success'"
         run: |
+          PIDS=()
           for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
            GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
            (docker pull $GHCR_IMAGE && docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest) &
+           PIDS+=$!
+          done
+          for pid in "$PIDS[@]"; do
+            # ensure exited with status 0
+            wait $pid
           done
-          wait

       - name: Set up environment
         run: |
@@ -408,11 +418,16 @@ jobs:
       - name: Download docker images
         run: |
+          PIDS=()
           for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
            GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
            (docker pull $GHCR_IMAGE && docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest) &
+           PIDS+=$!
+          done
+          for pid in "$PIDS[@]"; do
+            # ensure exited with status 0
+            wait $pid
           done
-          wait

       - name: Set up environment
         id: setup-environment
@@ -443,11 +458,16 @@ jobs:
          ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
          aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \
            docker login --username AWS --password-stdin $ECR_ENDPOINT
+          PIDS=()
          for image in $(cd docker && ls -1d */ | tr -d '/'); do
            docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image}
            docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} &
+           PIDS+=$!
+          done
+          for pid in "$PIDS[@]"; do
+            # ensure exited with status 0
+            wait $pid
          done
-          wait

From 143b67b405c9cfa05bb7b86dc8935fbed0298a00 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Tue, 6 Dec 2022 13:33:09 -0500
Subject: [PATCH 180/266] fix typo

---
 .github/workflows/run_tests.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 1b37ad1384..baf4be4b4d 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -127,7 +127,7 @@ jobs:
            docker push $GHCR_IMAGE &
            PIDS+=$!
          done
-          for pid in "$PIDS[@]"; do
+          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
            wait $pid
          done
@@ -253,7 +253,7 @@ jobs:
            (docker pull $GHCR_IMAGE && docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest) &
            PIDS+=$!
          done
-          for pid in "$PIDS[@]"; do
+          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
            wait $pid
          done
@@ -424,7 +424,7 @@ jobs:
            (docker pull $GHCR_IMAGE && docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest) &
            PIDS+=$!
          done
-          for pid in "$PIDS[@]"; do
+          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
            wait $pid
          done
@@ -464,7 +464,7 @@ jobs:
            docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} &
            PIDS+=$!
          done
-          for pid in "$PIDS[@]"; do
+          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
            wait $pid
          done

From f26f2eb81a6e54b56ab8e6feaab518ee18c53504 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Tue, 6 Dec 2022 15:21:57 -0500
Subject: [PATCH 181/266] another typo

---
 .github/workflows/run_tests.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index baf4be4b4d..ae04e936f7 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -125,7 +125,7 @@ jobs:
            GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
            docker tag singlecellopenproblems/$image $GHCR_IMAGE
            docker push $GHCR_IMAGE &
-           PIDS+=$!
+           PIDS+=( $! )
          done
          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
@@ -251,7 +251,7 @@ jobs:
            GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
            (docker pull $GHCR_IMAGE && docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest) &
-           PIDS+=$!
+           PIDS+=( $! )
          done
          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
@@ -422,7 +422,7 @@ jobs:
            GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
            (docker pull $GHCR_IMAGE && docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest) &
-           PIDS+=$!
+           PIDS+=( $! )
          done
          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
@@ -462,7 +462,7 @@ jobs:
            docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image}
            docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} &
-           PIDS+=$!
+           PIDS+=( $! )
          done
          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0

From 9bc10bc8472727d7e06080473ba6145d4194ee54 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Tue, 6 Dec 2022 19:12:35 -0500
Subject: [PATCH 182/266] don't need to init PIDS

---
 .github/workflows/run_tests.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index ae04e936f7..b91668df03 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -120,7 +120,6 @@ jobs:
         id: export-images
         run: |
           IMAGES="$(find ./docker -mindepth 1 -type d -exec basename {} \;)"
-          PIDS=()
           for image in ${IMAGES}; do
            GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
            docker tag singlecellopenproblems/$image $GHCR_IMAGE
@@ -247,7 +246,6 @@ jobs:
       - name: Download docker images
         if: "needs.build_images.result == 'success'"
         run: |
-          PIDS=()
           for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
            GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
            (docker pull $GHCR_IMAGE && docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest) &
@@ -418,7 +416,6 @@ jobs:
       - name: Download docker images
         run: |
-          PIDS=()
           for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
            GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
            (docker pull $GHCR_IMAGE && docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest) &
@@ -458,7 +455,6 @@ jobs:
          ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
          aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \
            docker login --username AWS --password-stdin $ECR_ENDPOINT
-          PIDS=()
          for image in $(cd docker && ls -1d */ | tr -d '/'); do
            docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image}
            docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} &

From b353a462f6ea353e0fc43d0f9fcbbe621edc3a0b Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Tue, 6 Dec 2022 19:12:49 -0500
Subject: [PATCH 183/266] [Dimensionality reduction] Fix RMSE metric (#743)

* make RMSE invariant to scalar multiplication

* minimize RMSE, not maximise

* add spectral rmse

* document changes

* fix typo

* debug

* n - 2?

* remove print statements

* define random_state
---
 .../tasks/dimensionality_reduction/README.md  |  7 +-
 .../metrics/__init__.py                       |  1 +
 .../metrics/root_mean_square_error.py         | 73 ++++++-----------
 3 files changed, 38 insertions(+), 43 deletions(-)

diff --git a/openproblems/tasks/dimensionality_reduction/README.md b/openproblems/tasks/dimensionality_reduction/README.md
index 0c78cb164d..3d3bf13cd4 100644
--- a/openproblems/tasks/dimensionality_reduction/README.md
+++ b/openproblems/tasks/dimensionality_reduction/README.md
@@ -24,8 +24,11 @@ data for visualization and interpretation.
 ## The metrics

 * **Root mean square error**: the square root of the mean squared difference between
-  Euclidean distances in the high-dimensional data and Euclidean distances in the
-  dimension-reduced data.
+  ground truth distances in the high-dimensional data and Euclidean distances in the
+  dimension-reduced data, invariant to scalar multiplication. *RMSE* computes
+  high-dimensional distances in Euclidean space, while *RMSE (spectral)* computes
+  [diffusion distances](http://dx.doi.org/10.1016/j.acha.2006.04.006) (i.e. Euclidean
+  distances on the [Laplacian Eigenmap](http://dx.doi.org/10.1162/089976603321780317)).
 * **Trustworthiness**: a measurement of similarity between the rank of each point's
   nearest neighbors in the high-dimensional data and the reduced data ([Venna & Kaski,
   2001](http://dx.doi.org/10.1007/3-540-44668-0_68)).
diff --git a/openproblems/tasks/dimensionality_reduction/metrics/__init__.py b/openproblems/tasks/dimensionality_reduction/metrics/__init__.py
index 44f7cb478f..f0b46e6813 100644
--- a/openproblems/tasks/dimensionality_reduction/metrics/__init__.py
+++ b/openproblems/tasks/dimensionality_reduction/metrics/__init__.py
@@ -6,4 +6,5 @@
 from .nn_ranking import qnn
 from .nn_ranking import qnn_auc
 from .root_mean_square_error import rmse
+from .root_mean_square_error import rmse_spectral
 from .trustworthiness import trustworthiness
diff --git a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py b/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py
index 52136ee29b..0963e1787b 100644
--- a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py
+++ b/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py
@@ -1,56 +1,47 @@
 from ....tools.decorators import metric
-import numpy as np
-

-def calculate_squareform_pairwise_distance(data):
-    """Calculate pairwise distances.
-
-    Compute pairwise distance between points in a matrix / vector and then format this
-    into a squareform vector.
-    """
+def _rmse(X, X_emb):
+    import scipy.optimize
     import scipy.spatial

-    return scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(data))
-
-
-def calculate_rmse(adata, n_svd=200):
-    """Calculate dimensional reduction stress via root mean square error."""
-    import sklearn.decomposition
-    import sklearn.metrics
-
-    X = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X)
-    high_dimensional_distance_matrix = calculate_squareform_pairwise_distance(X)
-
-    low_dimensional_distance_matrix = calculate_squareform_pairwise_distance(
-        adata.obsm["X_emb"]
+    high_dimensional_distance_vector = scipy.spatial.distance.pdist(X)
+    low_dimensional_distance_vector = scipy.spatial.distance.pdist(X_emb)
+    scale, rmse = scipy.optimize.nnls(
+        low_dimensional_distance_vector[:, None], high_dimensional_distance_vector
     )
+    return rmse

-    diff = high_dimensional_distance_matrix - low_dimensional_distance_matrix
-    kruskel_matrix = np.sqrt(diff**2 / sum(low_dimensional_distance_matrix**2))
-
-    kruskel_score = np.sqrt(sum(diff**2) / sum(low_dimensional_distance_matrix**2))
-
-    y_actual = high_dimensional_distance_matrix
-    y_predic = low_dimensional_distance_matrix

-    rms = np.sqrt(sklearn.metrics.mean_squared_error(y_actual, y_predic))
+@metric(metric_name="RMSE", maximize=False)
+def rmse(adata, n_svd=200):
+    """Calculate the root mean squared error.

-    return kruskel_matrix, kruskel_score, rms
+    Computes (RMSE) between the full (or processed) data matrix and the
+    dimensionally-reduced matrix, invariant to scalar multiplication
+    """
+    import sklearn.decomposition

+    X = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X)
+    return _rmse(X, adata.obsm["X_emb"])

-@metric(metric_name="root mean squared error", maximize=True)
-def rmse(adata):
-    """Calculate the root mean squared error.
+@metric(metric_name="RMSE (spectral)", maximize=False)
+def rmse_spectral(adata, n_comps=200):
+    """Calculate the spectral root mean squared error

-    Computes (RMSE) between the full (or processed) data matrix and a list of
-    dimensionally-reduced matrices.
+    Computes (RMSE) between high-dimensional Laplacian eigenmaps on the full (or
+    processed) data matrix and the dimensionally-reduced matrix, invariant to scalar
+    multiplication
+    """
+    import numpy as np
+    import umap
+    import umap.spectral

+    n_comps = min(n_comps, min(adata.shape) - 2)
+
+    graph = umap.UMAP(transform_mode="graph").fit_transform(adata.X)
+    X = umap.spectral.spectral_layout(
+        adata.X, graph, n_comps, random_state=np.random.default_rng()
+    )
+    return _rmse(X, adata.obsm["X_emb"])
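The scale-invariance introduced by this patch comes from fitting a single non-negative scale factor between the two pairwise-distance vectors and scoring only the residual. A minimal, self-contained sketch of the same calculation on toy data (array sizes and variable names here are illustrative, not part of the patch):

import numpy as np
import scipy.optimize
import scipy.spatial

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 20))  # toy "high-dimensional" data
X_emb = 5.0 * X[:, :2]  # toy embedding with an arbitrary scale

d_hi = scipy.spatial.distance.pdist(X)
d_lo = scipy.spatial.distance.pdist(X_emb)

# Fit d_hi ~= scale * d_lo with scale >= 0; nnls returns the best scale and
# the 2-norm of the residual, which is what the metric reports as "rmse".
scale, rmse = scipy.optimize.nnls(d_lo[:, None], d_hi)

# Multiplying the embedding by any positive constant changes the fitted
# scale but not the residual, so the score is scale-invariant.
_, rmse_rescaled = scipy.optimize.nnls((10.0 * d_lo)[:, None], d_hi)
assert np.isclose(rmse, rmse_rescaled)

This also explains the flip to maximize=False above: the residual is an error, so smaller is better.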
From fcd5fd8a8da1a2e0a8f523ab63b613f852bb5392 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 7 Dec 2022 09:56:18 -0500
Subject: [PATCH 184/266] another fix

---
 .github/workflows/run_tests.yml | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index b91668df03..c68c3b9e69 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -248,13 +248,18 @@ jobs:
         run: |
           for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
            GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
-           (docker pull $GHCR_IMAGE && docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest) &
+           docker pull $GHCR_IMAGE &
            PIDS+=( $! )
          done
          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
            wait $pid
          done
+          # tag images
+          for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
+           GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
+           docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest
+          done

       - name: Set up environment
         run: |
@@ -418,13 +423,18 @@ jobs:
       - name: Download docker images
         run: |
           for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
            GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
-           (docker pull $GHCR_IMAGE && docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest) &
+           docker pull $GHCR_IMAGE &
            PIDS+=( $! )
          done
          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
            wait $pid
          done
+          # tag images
+          for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
+           GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
+           docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest
+          done

       - name: Set up environment
         id: setup-environment

From 15924e57f0c4bdb69b7ad54549c2fdc5c1233c3f Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Thu, 8 Dec 2022 10:59:28 -0500
Subject: [PATCH 185/266] Fix PID checking (#748)

* temp1

* set -x

* Revert "temp1"

This reverts commit 8347fea1a5bb153502c674e4e960ffd44338e622.

* temp2

* set -x

* temp3

* temp4

* temp5

* fixes

* untemp

* must be bash

* bash
---
 .github/workflows/run_tests.yml | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index c68c3b9e69..3aebda83a0 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -124,7 +124,7 @@ jobs:
            GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
            docker tag singlecellopenproblems/$image $GHCR_IMAGE
            docker push $GHCR_IMAGE &
-           PIDS+=( $! )
+           PIDS+=("${!}")
          done
          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
@@ -132,6 +132,7 @@ jobs:
          done
          # convert to JSON
          echo "images=[\"$(paste -s -d ' ' <(echo $IMAGES) | sed 's/ */\",\"/g')\"]" >> $GITHUB_OUTPUT
+        shell: bash -e {0}

  create_matrix:
    runs-on: ubuntu-latest
@@ -246,13 +247,16 @@ jobs:
       - name: Download docker images
         if: "needs.build_images.result == 'success'"
+        env:
+          REPO_OWNER: ${{ github.repository_owner }}
+          RUN_ID: ${{ github.run_id }}
         run: |
           for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
-           GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
+           GHCR_IMAGE="ghcr.io/${REPO_OWNER}/${image}:${RUN_ID}"
            docker pull $GHCR_IMAGE &
-           PIDS+=( $! )
+           PIDS+=("${!}")
          done
          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
            wait $pid
          done
          # tag images
          for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
-           GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
-           docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest
+           GHCR_IMAGE="ghcr.io/${REPO_OWNER}/${image}:${RUN_ID}"
+           docker tag $GHCR_IMAGE singlecellopenproblems/${image}:latest
          done
+        shell: bash -e {0}

       - name: Set up environment
         run: |
@@ -425,11 +429,14 @@ jobs:
       - name: Download docker images
+        env:
+          REPO_OWNER: ${{ github.repository_owner }}
+          RUN_ID: ${{ github.run_id }}
         run: |
           for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
-           GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
+           GHCR_IMAGE="ghcr.io/${REPO_OWNER}/${image}:${RUN_ID}"
            docker pull $GHCR_IMAGE &
-           PIDS+=( $! )
+           PIDS+=("${!}")
          done
          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
            wait $pid
          done
          # tag images
          for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do
-           GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}"
-           docker tag $GHCR_IMAGE singlecellopenproblems/$image:latest
+           GHCR_IMAGE="ghcr.io/${REPO_OWNER}/${image}:${RUN_ID}"
+           docker tag $GHCR_IMAGE singlecellopenproblems/${image}:latest
          done
+        shell: bash -e {0}

       - name: Set up environment
         id: setup-environment
@@ -460,12 +477,13 @@ jobs:
          ECR_ENDPOINT="490915662541.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
          aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | \
            docker login --username AWS --password-stdin $ECR_ENDPOINT
          for image in $(cd docker && ls -1d */ | tr -d '/'); do
            docker tag singlecellopenproblems/${image} ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image}
            docker push --quiet ${ECR_ENDPOINT}/openproblems:${BRANCH}-${image} &
-           PIDS+=( $! )
+           PIDS+=("${!}")
          done
          for pid in "${PIDS[@]}"; do
            # ensure exited with status 0
            wait $pid
          done
+        shell: bash -e {0}

  run_benchmark:
    needs:

From d31f78c63031b3a6383079210e53c94092d49ebd Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Thu, 8 Dec 2022 21:31:37 +0100
Subject: [PATCH 186/266] rename nbt2022-reproducibility to website-experimental

---
 .github/workflows/process_results.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/process_results.yml b/.github/workflows/process_results.yml
index a7a381799d..d271ee4ad9 100644
--- a/.github/workflows/process_results.yml
+++ b/.github/workflows/process_results.yml
@@ -36,7 +36,7 @@ jobs:
         uses: actions/checkout@v3
         with:
           fetch-depth: 1
-          repository: openproblems-bio/nbt2022-reproducibility
+          repository: openproblems-bio/website-experimental
           path: nbt2022-reproducibility
           token: ${{ secrets.GH_ACTIONS_NBT_REPRODUCIBILITY_PAT }}

From 3d24e8f09520a485ca20ae45fb97ae2b2574dd21 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 12 Dec 2022 10:55:46 -0500
Subject: [PATCH 187/266] Bump jaxlib from 0.3.22 to 0.3.25 in
 /docker/openproblems-python-pytorch (#737)

* Bump jaxlib from 0.3.22 to 0.3.25 in /docker/openproblems-python-pytorch

Bumps [jaxlib](https://github.com/google/jax) from 0.3.22 to 0.3.25.
- [Release notes](https://github.com/google/jax/releases)
- [Changelog](https://github.com/google/jax/blob/main/CHANGELOG.md)
- [Commits](https://github.com/google/jax/compare/jaxlib-v0.3.22...jaxlib-v0.3.25)

---
updated-dependencies:
- dependency-name: jaxlib
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot]

* Bump jax

Signed-off-by: dependabot[bot]

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
---
 docker/openproblems-python-pytorch/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/openproblems-python-pytorch/requirements.txt b/docker/openproblems-python-pytorch/requirements.txt
index 56bd2a53dc..9eed4839ed 100644
--- a/docker/openproblems-python-pytorch/requirements.txt
+++ b/docker/openproblems-python-pytorch/requirements.txt
@@ -1,8 +1,8 @@
 git+https://github.com/BayraktarLab/cell2location.git@7e7aa231cc61ff460da14402fa3b9a1fa3ec69ac
 git+https://github.com/czbiohub/molecular-cross-validation@04d9df0
 git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix
-jax==0.3.23
-jaxlib==0.3.22
+jax==0.3.25
+jaxlib==0.3.25
 scalex==1.0.2
 scikit-misc==0.1.*
 scvi-tools~=0.17 # pinned in #313

From 57e73c560a3036122a2a665640117a6600aab565 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 14 Dec 2022 18:25:33 +1100
Subject: [PATCH 188/266] Bump cmake from 3.24.1.1 to 3.25.0 in
 /docker/openproblems-python-extras (#750)

Bumps [cmake](https://github.com/scikit-build/cmake-python-distributions) from 3.24.1.1 to 3.25.0.
- [Release notes](https://github.com/scikit-build/cmake-python-distributions/releases)
- [Changelog](https://github.com/scikit-build/cmake-python-distributions/blob/master/HISTORY.rst)
- [Commits](https://github.com/scikit-build/cmake-python-distributions/compare/3.24.1.1...3.25.0)

---
updated-dependencies:
- dependency-name: cmake
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot]
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
---
 docker/openproblems-python-extras/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt
index 7d6aee63d5..8b5c6930ec 100644
--- a/docker/openproblems-python-extras/requirements.txt
+++ b/docker/openproblems-python-extras/requirements.txt
@@ -1,4 +1,4 @@
-cmake==3.24.1.1
+cmake==3.25.0
 git+https://github.com/jorvis/Multicore-TSNE@6832575
 git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python
 git+https://github.com/scottgigante-immunai/knn-smoothing@python_package

From a49ae996dc5f34402b1fa5cab9b3a3936126a413 Mon Sep 17 00:00:00 2001
From: Daniel Dimitrov <50865230+dbdimitrov@users.noreply.github.com>
Date: Wed, 14 Dec 2022 10:03:18 +0100
Subject: [PATCH 189/266] Ccc aggregate + ligand-target subtask fix (#752)

* fix issue with random ligand-target, add aggregations

* pre-commit

* replace tag with commit

* retry with release

* add basilisk.utils to requirements

* add dir.expiry to r-extras

* pre-commit

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
---
 docker/openproblems-r-extras/r_requirements.txt |  4 +-
 .../_common/methods/__init__.py                 |  6 ++-
 .../_common/methods/baseline.py                 | 18 +++++----
 .../_common/methods/liana.R                     |  2 +-
 .../_common/methods/liana.py                    | 37 +++++++++++++++----
 .../_common/metrics/odds_ratio.py               |  4 +-
 .../methods/__init__.py                         |  6 ++-
 .../methods/__init__.py                         |  6 ++-
 test/test_task_cell_cell_communication.py       |  8 ++--
 9 files changed, 62 insertions(+), 29 deletions(-)

diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt
index 0b8da4bde9..4950baccce 100644
--- a/docker/openproblems-r-extras/r_requirements.txt
+++ b/docker/openproblems-r-extras/r_requirements.txt
@@ -1,5 +1,6 @@
 bioc::batchelor@1.12.3
 bioc::ComplexHeatmap@2.12.1
+bioc::dir.expiry
 bioc::scater@1.24.0
 bioc::scran@1.24.1
 bioc::scuttle@1.6.3
@@ -23,6 +24,7 @@
 htmltools@0.5.3
 htmlwidgets@1.5.4
 igraph@1.3.5
 lifecycle@1.0.3
+LTLA/basilisk.utils # required for liana0.1.9
 Matrix@1.5-1
 pkgdown@2.0.6
 pkgload@1.3.1
@@ -36,7 +38,7 @@
 rlang@1.0.6
 rliger@1.0.0
 rmarkdown@2.2
 RSQLite@2.2.4
-saezlab/liana@0.1.7
+saezlab/liana@0.1.9
 saezlab/OmnipathR@679bb79 # master
 sass@0.4.2
 sctransform@0.3.4
diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/__init__.py b/openproblems/tasks/_cell_cell_communication/_common/methods/__init__.py
index abd29a6804..cd6c163e4f 100644
--- a/openproblems/tasks/_cell_cell_communication/_common/methods/__init__.py
+++ b/openproblems/tasks/_cell_cell_communication/_common/methods/__init__.py
@@ -4,11 +4,13 @@
 from .liana import cellphonedb_sum
 from .liana import connectome_max
 from .liana import connectome_sum
-from .liana import liana_max
-from .liana import liana_sum
 from .liana import logfc_max
 from .liana import logfc_sum
+from .liana import magnitude_max
+from .liana import magnitude_sum
 from .liana import natmi_max
 from .liana import natmi_sum
 from .liana import sca_max
 from .liana import sca_sum
+from .liana import specificity_max
+from .liana import specificity_sum
diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py b/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py
index 2b8054fc17..ff5939a11f 100644
--- a/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py
+++ b/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py
@@ -14,22 +14,24 @@
     is_baseline=True,
 )
 def random_events(adata, test=False, n_events=1000):
-    adata.uns["ccc_pred"] = pd.DataFrame(
+    rng = np.random.default_rng(seed=1)
+
+    ccc_pred = pd.DataFrame(
         {
-            "ligand": np.random.choice(
+            "ligand": rng.choice(
                 adata.uns["ligand_receptor_resource"]["ligand_genesymbol"], n_events
             ),
             "receptor": np.random.choice(
                 adata.uns["ligand_receptor_resource"]["receptor_genesymbol"], n_events
             ),
-            "source": np.random.choice(adata.obs["label"].cat.categories, n_events),
-            "target": np.random.choice(adata.obs["label"].cat.categories, n_events),
-            "score": np.random.uniform(0, 1, n_events),
+            "source": rng.choice(adata.obs["label"].cat.categories, n_events),
+            "target": rng.choice(adata.obs["label"].cat.categories, n_events),
+            "score": rng.uniform(0, 1, n_events),
         }
     )
-    adata.uns["ccc_pred"] = adata.uns["ccc_pred"].loc[
-        ~adata.uns["ccc_pred"][adata.uns["merge_keys"]].duplicated()
-    ]
+    ccc_pred = ccc_pred.loc[~ccc_pred[adata.uns["merge_keys"]].duplicated()]
+
+    adata.uns["ccc_pred"] = ccc_pred
     adata.uns["method_code_version"] = check_version("openproblems")

     return adata
diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.R b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.R
index 302016bde4..e7dfc41dad 100644
--- a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.R
+++ b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.R
@@ -37,7 +37,7 @@ liana_res <- liana_wrap(sce,
 # Aggregate if a run /w multiple methods
 if (!is.tibble(liana_res)) {
   liana_res <- liana_res %>%
-    liana_aggregate() %>%
+    liana_aggregate(aggregate_how = aggregate_how) %>%
     # inverse distribution
     mutate(aggregate_rank = 1 - aggregate_rank)
 }
diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py
index dc2bba944c..522d5c330f 100644
--- a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py
+++ b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py
@@ -17,7 +17,8 @@ def _p_filt(x, y):


 _r_liana = r_function(
-    "liana.R", args="sce, op_resource, min_expression_prop, idents_col, test, ..."
+    "liana.R",
+    args="sce, op_resource, min_expression_prop, idents_col, test, aggregate_how, ...",
 )

 _liana_method = functools.partial(
@@ -36,6 +37,7 @@ def _liana(
     score_col="aggregate_rank",
     min_expression_prop=0.1,
     test=False,
+    aggregate_how=None,
     **kwargs,
 ):
     # log-normalize
@@ -50,6 +52,7 @@ def _liana(
         min_expression_prop=min_expression_prop,
         idents_col="label",
         test=test,
+        aggregate_how=aggregate_how,
         **kwargs,
     )

@@ -63,20 +66,40 @@ def _liana(


 @_liana_method(
-    method_name="LIANA Rank Aggregate (max)",
+    method_name="Specificity Rank Aggregate (max)",
 )
-def liana_max(adata, test=False):
-    adata = _liana(adata, test=test)
+def specificity_max(adata, test=False):
+    adata = _liana(adata, test=test, aggregate_how="specificity")
     adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="max")

     return adata


 @_liana_method(
-    method_name="LIANA Rank Aggregate (sum)",
+    method_name="Specificity Rank Aggregate (sum)",
 )
-def liana_sum(adata, test=False):
-    adata = _liana(adata, test=test)
+def specificity_sum(adata, test=False):
+    adata = _liana(adata, test=test, aggregate_how="specificity")
+    adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="sum")
+
+    return adata
+
+
+@_liana_method(
+    method_name="Magnitude Rank Aggregate (max)",
+)
+def magnitude_max(adata, test=False):
+    adata = _liana(adata, test=test, aggregate_how="magnitude")
+    adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="max")
+
+    return adata
+
+
+@_liana_method(
+    method_name="Magnitude Rank Aggregate (sum)",
+)
+def magnitude_sum(adata, test=False):
+    adata = _liana(adata, test=test, aggregate_how="magnitude")
     adata.uns["ccc_pred"] = aggregate_method_scores(adata, how="sum")

     return adata
diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py
index 21e9903e1a..084669d110 100644
--- a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py
+++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py
@@ -9,12 +9,12 @@ def _sigmoid_transform(x):


 @metric(metric_name="Odds Ratio", maximize=True)
-def odds_ratio(adata):
+def odds_ratio(adata, top_prop=0.05):
     # Join benchmark (assumed truth) and ccc results
     # Get /w ccc_target and a response [0, 1] column
     gt = join_truth_and_pred(adata)
     gt = gt.sort_values("score", ascending=False)
-    top_n = np.sum(adata.uns["ccc_target"].response)
+    top_n = int(adata.uns["ccc_target"].shape[0] * top_prop)

     # assign the top rank interactions to 1
     a = np.zeros(len(gt["score"]))
diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/methods/__init__.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/methods/__init__.py
index 15e21695aa..7e1180f55b 100644
--- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/methods/__init__.py
+++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/methods/__init__.py
@@ -2,13 +2,15 @@
 from ..._common.methods import cellphonedb_sum
 from ..._common.methods import connectome_max
 from ..._common.methods import connectome_sum
-from ..._common.methods import liana_max
-from ..._common.methods import liana_sum
 from ..._common.methods import logfc_max
 from ..._common.methods import logfc_sum
+from ..._common.methods import magnitude_max
+from ..._common.methods import magnitude_sum
 from ..._common.methods import natmi_max
 from ..._common.methods import natmi_sum
 from ..._common.methods import random_events
 from ..._common.methods import sca_max
 from ..._common.methods import sca_sum
+from ..._common.methods import specificity_max
+from ..._common.methods import specificity_sum
 from ..._common.methods import true_events
diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/methods/__init__.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/methods/__init__.py
index 15e21695aa..7e1180f55b 100644
--- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/methods/__init__.py
+++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/methods/__init__.py
@@ -2,13 +2,15 @@
 from ..._common.methods import cellphonedb_sum
 from ..._common.methods import connectome_max
 from ..._common.methods import connectome_sum
-from ..._common.methods import liana_max
-from ..._common.methods import liana_sum
 from ..._common.methods import logfc_max
 from ..._common.methods import logfc_sum
+from ..._common.methods import magnitude_max
+from ..._common.methods import magnitude_sum
 from ..._common.methods import natmi_max
 from ..._common.methods import natmi_sum
 from ..._common.methods import random_events
 from ..._common.methods import sca_max
 from ..._common.methods import sca_sum
+from ..._common.methods import specificity_max
+from ..._common.methods import specificity_sum
 from ..._common.methods import true_events
diff --git a/test/test_task_cell_cell_communication.py b/test/test_task_cell_cell_communication.py
index fc04d46542..ab22f2a6de 100644
--- a/test/test_task_cell_cell_communication.py
+++ b/test/test_task_cell_cell_communication.py
@@ -121,16 +121,16 @@ def test_odds_ratio_no_match():

     # check expected output
     adata = task.api.sample_method(adata)
-    m = metric(adata)
+    m = metric(adata, top_prop=0.4)
     assert np.issubdtype("float64", m)
-    assert m == 0.813953488372093
+    assert m == 0.7

     # force perfect score
     adata = task.methods.true_events(adata)
-    m = metric(adata)
+    m = metric(adata, top_prop=0.4)
     assert m == 1

     # force exception
     adata.uns["ccc_target"]["response"] = 0
-    m = metric(adata)
+    m = metric(adata, top_prop=0.4)
     assert m is np.nan
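With the change above, the odds-ratio metric calls the top top_prop fraction of ranked predictions positive instead of matching the number of true responses. A toy sketch of the resulting contingency-table calculation (data and variable names are made up for illustration and are not the repository's exact implementation):

import numpy as np

rng = np.random.default_rng(0)
score = rng.uniform(size=200)  # predicted interaction scores
response = rng.integers(0, 2, size=200)  # ground-truth 0/1 labels

top_prop = 0.05
top_n = int(len(score) * top_prop)  # size of the predicted-positive set
order = np.argsort(score)[::-1]  # best predictions first

a = np.zeros(len(score))
a[order[:top_n]] = 1  # call the top-ranked interactions positive

tp = np.sum((a == 1) & (response == 1))
fp = np.sum((a == 1) & (response == 0))
fn = np.sum((a == 0) & (response == 1))
tn = np.sum((a == 0) & (response == 0))
# odds ratio of the 2x2 table; undefined when a denominator cell is zero,
# which is why the forced-exception test above expects NaN
oddsratio = (tp * tn) / (fp * fn)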
From 413aebd35eee2e5a07a6a41946f293f4f038a946 Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Sat, 17 Dec 2022 21:04:14 +1100
Subject: [PATCH 190/266] Fix docker image builds (#758)

* bump deps

* libgsl-dev

* hardhat

* seuratobject

* sctransform

* fix rctd

* fix rctd filtering

* fix typo

* just set the minimum to 1
---
 docker/openproblems-r-extras/Dockerfile         |  3 +-
 docker/openproblems-r-extras/r_requirements.txt | 10 +++---
 .../spatial_decomposition/methods/rctd.R        | 18 ++++++++--
 .../spatial_decomposition/methods/rctd.py       | 35 +++++++++++++++----
 4 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/docker/openproblems-r-extras/Dockerfile b/docker/openproblems-r-extras/Dockerfile
index 62425ae57c..dee7cc83b8 100644
--- a/docker/openproblems-r-extras/Dockerfile
+++ b/docker/openproblems-r-extras/Dockerfile
@@ -10,7 +10,8 @@ WORKDIR /

 RUN apt-get update && apt-get install -y \
     libhdf5-dev hdf5-tools libgeos-dev \
-    libharfbuzz-dev libfribidi-dev
+    libharfbuzz-dev libfribidi-dev \
+    libgsl-dev

 RUN apt-get clean autoclean && \
     apt-get autoremove --yes && \
diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt
index 4950baccce..238573d2a3 100644
--- a/docker/openproblems-r-extras/r_requirements.txt
+++ b/docker/openproblems-r-extras/r_requirements.txt
@@ -11,13 +11,13 @@
 conos@1.5.0
 crayon@1.5.2
 dbplyr@2.2.1
 devtools@2.4.5
-dmcable/spacexr@eeb02a2 # master
+dmcable/spacexr@56787ee # master
 downlit@0.4.2
 dplyr@1.0.10
 e1071@1.7-12
 ellipsis@0.3.2
 forecast@8.18
-hardhat@1.1.0
+hardhat@1.2.0
 here@1.0.1
 hexbin@1.28.2
 htmltools@0.5.3
@@ -41,9 +41,9 @@
 saezlab/liana@0.1.9
 saezlab/OmnipathR@679bb79 # master
 sass@0.4.2
-sctransform@0.3.4
-Seurat@4.1.1
-SeuratObject@4.1.1
+sctransform@0.3.5
+Seurat@4.3.0
+SeuratObject@4.1.3
 shiny@1.4.0.2
 sparsesvd@0.2
 systemfonts@1.0.4
diff --git a/openproblems/tasks/spatial_decomposition/methods/rctd.R b/openproblems/tasks/spatial_decomposition/methods/rctd.R
index 5989b3eb7c..044c87d20e 100644
--- a/openproblems/tasks/spatial_decomposition/methods/rctd.R
+++ b/openproblems/tasks/spatial_decomposition/methods/rctd.R
@@ -4,6 +4,13 @@
 #'
 #' @param sce_sc SingleCellExperiment single-cell dataset
 #' @param sce_sp SingleCellExperiment spatial dataset
+#' @param fc_cutoff minimum log-fold-change (across cell types) for genes to be
+#'   included in the platform effect normalization step.
+#' @param fc_cutoff_reg minimum log-fold-change (across cell types) for genes to
+#'   be included in the RCTD step.
+#' @param max_cores for parallel processing, the number of cores used. If set to
+#'   1, parallel processing is not used. The system will additionally be checked
+#'   for number of available cores.
 #' @return sce_sp SingleCellExperiment spatial dataset with predictions in obs

 library(spacexr)
@@ -30,9 +37,14 @@ colnames(sp_counts) <- colnames(sce_sp)
 puck <- SpatialRNA(sp_coords, sp_counts)
 # create RCTD object from reference and spatialRNA objects
 my_rctd <- create.RCTD(
-  puck, reference,
-  max_cores = 1,
-  test_mode = FALSE, UMI_min_sigma = 100
+  puck,
+  reference,
+  max_cores = max_cores,
+  fc_cutoff = fc_cutoff,
+  fc_cutoff_reg = fc_cutoff_reg,
+  test_mode = FALSE,
+  UMI_min_sigma = 100,
+  CELL_MIN_INSTANCE = 1
 )
 # run analysis and get results
 my_rctd <- run.RCTD(my_rctd)
diff --git a/openproblems/tasks/spatial_decomposition/methods/rctd.py b/openproblems/tasks/spatial_decomposition/methods/rctd.py
index c05b703463..9b73c21c2c 100644
--- a/openproblems/tasks/spatial_decomposition/methods/rctd.py
+++ b/openproblems/tasks/spatial_decomposition/methods/rctd.py
@@ -2,10 +2,12 @@
 from ....tools.decorators import method
 from ....tools.utils import check_r_version
 from ..utils import split_sc_and_sp
+from typing import Optional

+import multiprocessing
 import numpy as np

-_rctd = r_function("rctd.R", args="sce_sc, sce_sp")
+_rctd = r_function("rctd.R", args="sce_sc, sce_sp, fc_cutoff, fc_cutoff_reg, max_cores")


 @method(
@@ -16,20 +18,41 @@
     code_url="https://github.com/dmcable/spacexr",
     image="openproblems-r-extras",
 )
-def rctd(adata, test=False):
-    # exctract single cell reference data
+def rctd(
+    adata,
+    fc_cutoff: Optional[float] = None,
+    fc_cutoff_reg: Optional[float] = None,
+    test=False,
+):
+    if test:
+        fc_cutoff = fc_cutoff or 0.05
+        fc_cutoff_reg = fc_cutoff_reg or 0.075
+    else:  # pragma: nocover
+        fc_cutoff = fc_cutoff or 0.5
+        fc_cutoff_reg = fc_cutoff_reg or 0.75
+    # extract single cell reference data
     adata_sc, adata = split_sc_and_sp(adata)
+    labels = np.unique(adata_sc.obs["label"])

     # set spatial coordinates for the single cell data
     adata_sc.obsm["spatial"] = np.ones((adata_sc.shape[0], 2))
+    # remove rare cell types to prevent RCTD error
+    celltype_counts = adata_sc.obs["label"].value_counts()
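+    # cell types with fewer than 25 reference cells are dropped below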
+ adata_sc = adata_sc[ + ~adata_sc.obs["label"].isin(celltype_counts[celltype_counts < 25].index) + ].copy() # run RCTD - adata = _rctd(adata_sc, adata) + adata = _rctd( + adata_sc, adata, fc_cutoff, fc_cutoff_reg, max_cores=multiprocessing.cpu_count() + ) # get predicted cell type proportions from obs - cell_type_names = [x for x in adata.obs.columns if x.startswith("xCT")] + cell_type_names = [f"xCT_{label}" for label in labels] # add proportions - adata.obsm["proportions_pred"] = adata.obs[cell_type_names].to_numpy() + adata.obsm["proportions_pred"] = ( + adata.obs.reindex(cell_type_names, axis=1).fillna(0).to_numpy() + ) adata.uns["method_code_version"] = check_r_version("spacexr") From 926dfaa583f187c5de01ccf0cb101810bd46024b Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 19 Dec 2022 10:38:12 +0100 Subject: [PATCH 191/266] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5b5a3799a3..e43845212b 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Formalizing and benchmarking open problems in single-cell genomics. **Core** (_alphabetically_): * Daniel Burkhardt (@dburkhardt), Cellarity -* Robrecht Cannoodt (@rcannoodt), Data Intuitive +* Robrecht Cannoodt (@rcannood), Data Intuitive * Scott Gigante (@scottgigante-immunai), Immunai * Christopher Lance (@xlancelottx), Helmholtz Munich * Malte Luecken (@LuckyMD), Helmholtz Munich From 255b23e086c38d921b979c683fb9d9d439de5969 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Wed, 4 Jan 2023 17:32:40 +1100 Subject: [PATCH 192/266] fix normalization in baselines (#760) --- .../methods/__init__.py | 2 + .../methods/baseline.py | 41 +++++++++++++++++++ .../metrics/root_mean_square_error.py | 5 +++ 3 files changed, 48 insertions(+) diff --git a/openproblems/tasks/dimensionality_reduction/methods/__init__.py b/openproblems/tasks/dimensionality_reduction/methods/__init__.py index 715ea1decc..5f18c536af 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/methods/__init__.py @@ -1,5 +1,7 @@ from .baseline import random_features from .baseline import true_features +from .baseline import true_features_log_cpm +from .baseline import true_features_log_cpm_hvg from .densmap import densmap_logCPM_1kHVG from .densmap import densmap_pca_logCPM_1kHVG from .neuralee import neuralee_default diff --git a/openproblems/tasks/dimensionality_reduction/methods/baseline.py b/openproblems/tasks/dimensionality_reduction/methods/baseline.py index 8035bdd403..1c9d4e3bb9 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/baseline.py +++ b/openproblems/tasks/dimensionality_reduction/methods/baseline.py @@ -1,4 +1,6 @@ from ....tools.decorators import method +from ....tools.normalize import log_cpm +from ....tools.normalize import log_cpm_hvg from ....tools.utils import check_version import numpy as np @@ -34,3 +36,42 @@ def true_features(adata, test=False): adata.obsm["X_emb"] = adata.obsm["X_emb"].toarray() adata.uns["method_code_version"] = check_version("openproblems") return adata + + +@method( + method_name="True Features (logCPM)", + paper_name="True Features (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def true_features_log_cpm(adata, test=False): + adata = log_cpm(adata) + adata.obsm["X_emb"] = adata.X + if test: + 
adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100] + + adata.obsm["X_emb"] = adata.obsm["X_emb"].toarray() + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@method( + method_name="True Features (logCPM, 1kHVG)", + paper_name="True Features (baseline)", + paper_url="https://openproblems.bio", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def true_features_log_cpm_hvg(adata, test=False): + adata = log_cpm_hvg(adata) + adata = adata[:, adata.var["highly_variable"]].copy() + adata.obsm["X_emb"] = adata.X + if test: + adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100] + + adata.obsm["X_emb"] = adata.obsm["X_emb"].toarray() + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py b/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py index 0963e1787b..1089583bcd 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py @@ -1,4 +1,5 @@ from ....tools.decorators import metric +from ....tools.normalize import log_cpm def _rmse(X, X_emb): @@ -22,6 +23,8 @@ def rmse(adata, n_svd=200): """ import sklearn.decomposition + adata = log_cpm(adata) + X = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) return _rmse(X, adata.obsm["X_emb"]) @@ -38,6 +41,8 @@ def rmse_spectral(adata, n_comps=200): import umap import umap.spectral + adata = log_cpm(adata) + n_comps = min(n_comps, min(adata.shape) - 2) graph = umap.UMAP(transform_mode="graph").fit_transform(adata.X) From 0a0300c27ffd96b170ba10c95a0504a7cf27ea95 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Thu, 5 Jan 2023 09:34:49 +1100 Subject: [PATCH 193/266] downgrade gtfparse and polars (#766) --- docker/openproblems-python-bedtools/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/openproblems-python-bedtools/requirements.txt b/docker/openproblems-python-bedtools/requirements.txt index 5f308af4ed..3b2a743754 100644 --- a/docker/openproblems-python-bedtools/requirements.txt +++ b/docker/openproblems-python-bedtools/requirements.txt @@ -1,2 +1,4 @@ +gtfparse==1.3.* +polars==0.14.* pybedtools==0.9.* pyensembl==2.0.* From 3d8964a6c02496c0c604f0b1ddadc40589ca43a8 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 6 Jan 2023 11:41:40 +1100 Subject: [PATCH 194/266] Fix output headers order (#769) --- workflow/parse_nextflow.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py index 3d62c2c773..7fe20b9ab7 100644 --- a/workflow/parse_nextflow.py +++ b/workflow/parse_nextflow.py @@ -338,7 +338,9 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): name=dataset.metadata["dataset_name"], data_url=dataset.metadata["data_url"], data_reference=dataset.metadata["data_reference"], - headers=dict(names=["Rank", "Mean score"], fixed=["Name", "Paper", "Library"]), + headers=dict( + names=["Rank", "Name", "Mean score"], fixed=["Name", "Paper", "Library"] + ), results=list(), ) dataset_results_raw = normalize_scores(task_name, dataset_results_raw) @@ -392,7 +394,6 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): "Memory (GB)", "Runtime 
(min)", "CPU (%)", - "Name", "Paper", "Year", "Library", From 29803b95c88b4ec5921df2eec7111fd5d1a95daf Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 10 Jan 2023 00:53:28 +1100 Subject: [PATCH 195/266] Convert references to bib (#720) * convert references to bib * add paper reference to metric decorator * fix missing reference in destvi dataset * load bibtexparser only when required * fix paper_url check * add links to readmes * don't abbreviate 10x genomics * fix tabula muris name * fix van der maaten * fix all bib * update bib * send main.bib to website * add references for denoising baselines * link to bibliography from results table * clean up bib * add pca reference * add contributing guidelines * don't change results spec * bugfixes --- .github/workflows/update_website_content.yml | 1 + .pre-commit-config.yaml | 5 + CONTRIBUTING.md | 17 +- main.bib | 940 ++++++++++++++++++ .../Wagner_2018_zebrafish_embryo_CRISPR.py | 2 +- openproblems/data/allen_brain_atlas.py | 2 +- openproblems/data/cengen.py | 4 +- openproblems/data/immune_cells.py | 2 +- .../data/mouse_blood_olssen_labelled.py | 2 +- openproblems/data/mouse_hspc_nestorowa2016.py | 4 +- openproblems/data/multimodal/citeseq.py | 2 +- openproblems/data/multimodal/scicar/base.py | 2 +- openproblems/data/pancreas.py | 2 +- openproblems/data/tabula_muris_senis.py | 2 +- openproblems/data/tenx.py | 5 +- openproblems/data/tnbc_wu2021.py | 2 +- openproblems/data/zebrafish.py | 2 +- .../tasks/_batch_integration/README.md | 2 +- .../batch_integration_embed/README.md | 5 +- .../methods/baseline.py | 45 +- .../metrics/cc_score.py | 1 + .../metrics/iso_label_sil.py | 1 + .../batch_integration_embed/metrics/kBET.py | 1 + .../batch_integration_embed/metrics/pcr.py | 1 + .../metrics/sil_batch.py | 1 + .../metrics/silhouette.py | 1 + .../batch_integration_feature/README.md | 2 +- .../methods/baseline.py | 8 +- .../metrics/hvg_conservation.py | 1 + .../batch_integration_graph/README.md | 2 +- .../methods/baseline.py | 10 +- .../batch_integration_graph/methods/bbknn.py | 2 +- .../batch_integration_graph/methods/combat.py | 2 +- .../methods/fastmnn.py | 2 +- .../methods/harmony.py | 2 +- .../batch_integration_graph/methods/liger.py | 2 +- .../batch_integration_graph/methods/mnn.py | 2 +- .../batch_integration_graph/methods/scalex.py | 2 +- .../methods/scanorama.py | 2 +- .../batch_integration_graph/methods/scanvi.py | 2 +- .../batch_integration_graph/methods/scvi.py | 2 +- .../batch_integration_graph/metrics/ari.py | 1 + .../metrics/graph_connectivity.py | 1 + .../metrics/iso_label_f1.py | 1 + .../batch_integration_graph/metrics/nmi.py | 1 + .../tasks/_cell_cell_communication/README.md | 4 +- .../_common/methods/baseline.py | 4 +- .../_common/methods/liana.py | 10 +- .../_common/metrics/auprc.py | 4 +- .../_common/metrics/odds_ratio.py | 2 +- .../README.md | 4 +- .../README.md | 4 +- openproblems/tasks/denoising/README.md | 34 +- openproblems/tasks/denoising/methods/alra.py | 2 +- .../tasks/denoising/methods/baseline.py | 4 +- openproblems/tasks/denoising/methods/dca.py | 2 +- .../tasks/denoising/methods/knn_smoothing.py | 2 +- openproblems/tasks/denoising/methods/magic.py | 4 +- openproblems/tasks/denoising/metrics/mse.py | 6 +- .../tasks/denoising/metrics/poisson.py | 7 +- .../tasks/dimensionality_reduction/README.md | 13 +- .../methods/baseline.py | 36 +- .../methods/densmap.py | 2 +- .../methods/neuralee.py | 2 +- .../dimensionality_reduction/methods/pca.py | 2 +- 
.../dimensionality_reduction/methods/phate.py | 5 +- .../dimensionality_reduction/methods/tsne.py | 2 +- .../dimensionality_reduction/methods/umap.py | 2 +- .../metrics/density.py | 7 +- .../metrics/nn_ranking.py | 16 +- .../metrics/root_mean_square_error.py | 12 +- .../metrics/trustworthiness.py | 6 +- openproblems/tasks/label_projection/README.md | 6 +- .../label_projection/methods/baseline.py | 6 +- .../methods/knn_classifier.py | 2 +- .../methods/logistic_regression.py | 2 +- .../tasks/label_projection/methods/mlp.py | 2 +- .../label_projection/methods/scvi_tools.py | 4 +- .../tasks/label_projection/methods/seurat.py | 2 +- .../tasks/label_projection/methods/xgboost.py | 2 +- .../label_projection/metrics/accuracy.py | 2 +- .../tasks/label_projection/metrics/f1.py | 6 +- .../multimodal_data_integration/README.md | 10 +- .../methods/baseline.py | 4 +- .../methods/harmonic_alignment.py | 2 +- .../methods/mnn.py | 2 +- .../methods/procrustes.py | 2 +- .../metrics/knn_auc.py | 6 +- .../metrics/mse.py | 6 +- .../regulatory_effect_prediction/README.md | 8 +- .../methods/baseline.py | 4 +- .../methods/beta.py | 2 +- .../metrics/correlation.py | 12 +- .../datasets/destvi/generate.py | 2 +- .../spatial_decomposition/methods/baseline.py | 4 +- .../methods/cell2location.py | 2 +- .../spatial_decomposition/methods/destvi.py | 2 +- .../spatial_decomposition/methods/nmfreg.py | 2 +- .../spatial_decomposition/methods/nnls.py | 2 +- .../spatial_decomposition/methods/rctd.py | 2 +- .../spatial_decomposition/methods/seuratv3.py | 2 +- .../methods/stereoscope.py | 2 +- .../spatial_decomposition/methods/tangram.py | 2 +- .../methods/vanillanmf.py | 2 +- .../tasks/spatial_decomposition/metrics/r2.py | 2 +- openproblems/tools/decorators.py | 21 +- setup.py | 1 + test/test_task_2_datasets.py | 2 +- test/test_task_methods.py | 7 +- test/test_task_metrics.py | 3 + test/utils/asserts.py | 23 + workflow/parse_nextflow.py | 6 +- 112 files changed, 1255 insertions(+), 225 deletions(-) create mode 100644 main.bib diff --git a/.github/workflows/update_website_content.yml b/.github/workflows/update_website_content.yml index 0d72f23d62..b1ddd4d027 100644 --- a/.github/workflows/update_website_content.yml +++ b/.github/workflows/update_website_content.yml @@ -52,6 +52,7 @@ jobs: run: | rm -r website/content/benchmarks/*/ python openproblems/workflow/generate_website_markdown.py website/content/benchmarks + cp main.bib website/static/bibliography cd website git diff --exit-code --quiet || echo "CHANGED=true" >> $GITHUB_ENV diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 57a3fce70c..3ea32ca8c8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,3 +39,8 @@ repos: hooks: - id: markdownlint-fix exclude: (SAGEMAKER.md|.github/ISSUE_TEMPLATE/bug_report.md|.github/pull_request_template.md) + - repo: https://github.com/FlamingTempura/bibtex-tidy + rev: "8838095" + hooks: + - id: bibtex-tidy + args: ['--omit', 'abstract', '--sort', '--duplicates', '--drop-all-caps', '--sort-fields', '--trailing-commas'] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 152b0224e7..9fd9468ed0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,6 +26,7 @@ website, or simply star it in GitHub to say "I use it". 
* [API](#api)
  * [Writing functions in R](#writing-functions-in-r)
  * [Adding package dependencies](#adding-package-dependencies)
+  * [Adding paper references](#adding-paper-references)
  * [Adding a new dataset](#adding-a-new-dataset)
  * [Adding a dataset / method / metric to a task](#adding-a-dataset--method--metric-to-a-task)

@@ -218,7 +219,7 @@ _pca = r_function("pca.R")

 @method(
     method_name="PCA",
     paper_name="On lines and planes of closest fit to systems of points in space",
-    paper_url="https://www.tandfonline.com/doi/abs/10.1080/14786440109462720",
+    paper_reference="pearson1901pca",
     paper_year=1901,
     code_url="https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/prcomp",
     image="openproblems-r-base",
@@ -252,12 +253,24 @@ def f2(adata):
     import package2
 ```
+
+### Adding paper references
+
+All papers cited in the `openproblems` repository should have an entry in
+[`main.bib`](main.bib) and be referenced in the corresponding dataset / method / metric
+decorator by their BibTeX reference, generally of the form `author1900papername`. BibTeX
+entries should be retrieved from [doi2bib.org](https://www.doi2bib.org/) where feasible,
+except for arXiv and bioRxiv, which provide more accurate BibTeX entries on the paper's
+abstract page.
+
+When referencing a paper in markdown (e.g. in a task README), you should link directly
+to the bibliography entry on the Open Problems website using the BibTeX reference, e.g.
+[`https://openproblems.bio/bibliography#openproblems`](https://openproblems.bio/bibliography#openproblems).
+
 ### Adding a new dataset
 
 Datasets are loaded under `openproblems/data`. Each data loading function should
 download the appropriate dataset from a stable location (e.g. from Figshare) and be
 decorated with `openproblems.data.utils.loader(data_url="https://data.link",
-data_reference="https://doi.org/10.0/123")` in order to cache the result.
+data_reference="author1900papername")` in order to cache the result.
 
 Data should be provided in a raw count format.
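+For example, a minimal loader sketch might look as follows (the URL, file name,
+`load_example`, and the `doe2021example` BibTeX key are hypothetical placeholders
+used purely for illustration):
+
+```python
+import scanpy as sc
+
+from openproblems.data import utils
+
+
+@utils.loader(
+    data_url="https://example.com/example.h5ad",  # hypothetical data location
+    data_reference="doe2021example",  # BibTeX key of the matching main.bib entry
+)
+def load_example(test=False):
+    # sc.read downloads the file once via backup_url, then reads the local copy
+    adata = sc.read("example.h5ad", backup_url="https://example.com/example.h5ad")
+    if test:
+        adata = adata[:100, :100].copy()  # small subset keeps test runs fast
+    return adata
+```
+
+Here the `test=True` branch returns a small subset of the data, mirroring the
+`test` flags used by the existing loaders, so that continuous integration stays
+fast.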
We assume that `adata.X` contains the raw (count) data for the primary modality; this will also be copied to diff --git a/main.bib b/main.bib new file mode 100644 index 0000000000..c040c44913 --- /dev/null +++ b/main.bib @@ -0,0 +1,940 @@ +@misc{10x2018pbmc, + title = {1k PBMCs from a Healthy Donor (v3 chemistry)}, + author = {{10x Genomics}}, + year = {2018}, + url = {https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0}, +} +@misc{10x2019pbmc, + title = {5k Peripheral Blood Mononuclear Cells (PBMCs) from a Healthy Donor with a Panel of TotalSeq-B Antibodies (v3 chemistry)}, + author = {{10x Genomics}}, + year = {2019}, + url = {https://www.10xgenomics.com/resources/datasets/5-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-with-cell-surface-proteins-v-3-chemistry-3-1-standard-3-1-0}, +} +@article{andersson2020single, + title = {Single-cell and spatial transcriptomics enables probabilistic inference of cell type topography}, + author = {Alma Andersson and Joseph Bergenstr{\aa}hle and Michaela Asp and Ludvig Bergenstr{\aa}hle and Aleksandra Jurek and Jos{\'{e}} Fern{\'{a}}ndez Navarro and Joakim Lundeberg}, + year = {2020}, + month = oct, + journal = {Communications Biology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {3}, + number = {1}, + doi = {10.1038/s42003-020-01247-y}, + url = {https://doi.org/10.1038/s42003-020-01247-y}, +} +@article{batson2019molecular, + title = {Molecular Cross-Validation for Single-Cell RNA-seq}, + author = {Batson, Joshua and Royer, Lo{\"\i}c and Webber, James}, + year = {2019}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/786269}, + url = {https://www.biorxiv.org/content/early/2019/09/30/786269}, + elocation-id = {786269}, + eprint = {https://www.biorxiv.org/content/early/2019/09/30/786269.full.pdf}, +} +@article{biancalani2021deep, + title = {Deep learning and alignment of spatially resolved single-cell transcriptomes with Tangram}, + author = {Tommaso Biancalani and Gabriele Scalia and Lorenzo Buffoni and Raghav Avasthi and Ziqing Lu and Aman Sanger and Neriman Tokcan and Charles R. Vanderburg and {\AA}sa Segerstolpe and Meng Zhang and Inbal Avraham-Davidi and Sanja Vickovic and Mor Nitzan and Sai Ma and Ayshwarya Subramanian and Michal Lipinski and Jason Buenrostro and Nik Bear Brown and Duccio Fanelli and Xiaowei Zhuang and Evan Z. Macosko and Aviv Regev}, + year = {2021}, + month = oct, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {18}, + number = {11}, + pages = {1352--1362}, + doi = {10.1038/s41592-021-01264-7}, + url = {https://doi.org/10.1038/s41592-021-01264-7}, +} +@article{bland2000odds, + title = {Statistics Notes: The odds ratio}, + author = {J. M. Bland}, + year = {2000}, + month = may, + journal = {{Bmj}}, + publisher = {{Bmj}}, + volume = {320}, + number = {7247}, + pages = {1468--1468}, + doi = {10.1136/bmj.320.7247.1468}, + url = {https://doi.org/10.1136/bmj.320.7247.1468}, +} +@article{bttner2018test, + title = {A test metric for assessing single-cell {RNA}-seq batch correction}, + author = {Maren B\"{u}ttner and Zhichao Miao and F. Alexander Wolf and Sarah A. Teichmann and Fabian J. 
Theis}, + year = {2018}, + month = dec, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {1}, + pages = {43--49}, + doi = {10.1038/s41592-018-0254-1}, + url = {https://doi.org/10.1038/s41592-018-0254-1}, +} +@article{cabello2020singlecellsignalr, + title = {{SingleCellSignalR}: inference of intercellular networks from single-cell transcriptomics}, + author = {Simon Cabello-Aguilar and M{\'{e}}lissa Alame and Fabien Kon-Sun-Tack and Caroline Fau and Matthieu Lacroix and Jacques Colinge}, + year = {2020}, + month = mar, + journal = {Nucleic Acids Research}, + publisher = {Oxford University Press ({OUP})}, + volume = {48}, + number = {10}, + pages = {e55--e55}, + doi = {10.1093/nar/gkaa183}, + url = {https://doi.org/10.1093/nar/gkaa183}, +} +@article{cable2021robust, + title = {Robust decomposition of cell type mixtures in spatial transcriptomics}, + author = {Dylan M. Cable and Evan Murray and Luli S. Zou and Aleksandrina Goeva and Evan Z. Macosko and Fei Chen and Rafael A. Irizarry}, + year = {2021}, + month = feb, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {4}, + pages = {517--526}, + doi = {10.1038/s41587-021-00830-w}, + url = {https://doi.org/10.1038/s41587-021-00830-w}, +} +@article{cao2018joint, + title = {Joint profiling of chromatin accessibility and gene expression in thousands of single cells}, + author = {Junyue Cao and Darren A. Cusanovich and Vijay Ramani and Delasa Aghamirzaie and Hannah A. Pliner and Andrew J. Hill and Riza M. Daza and Jose L. McFaline-Figueroa and Jonathan S. Packer and Lena Christiansen and Frank J. Steemers and Andrew C. Adey and Cole Trapnell and Jay Shendure}, + year = {2018}, + month = sep, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {361}, + number = {6409}, + pages = {1380--1385}, + doi = {10.1126/science.aau0730}, + url = {https://doi.org/10.1126/science.aau0730}, +} +@article{cao2020human, + title = {A human cell atlas of fetal gene expression}, + author = {Junyue Cao and Diana R. O'Day and Hannah A. Pliner and Paul D. Kingsley and Mei Deng and Riza M. Daza and Michael A. Zager and Kimberly A. Aldinger and Ronnie Blecher-Gonen and Fan Zhang and Malte Spielmann and James Palis and Dan Doherty and Frank J. Steemers and Ian A. 
Glass and Cole Trapnell and Jay Shendure}, + year = {2020}, + month = nov, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {370}, + number = {6518}, + doi = {10.1126/science.aba7721}, + url = {https://doi.org/10.1126/science.aba7721}, +} +@inproceedings{chen2016xgboost, + title = {{XGBoost}}, + author = {Tianqi Chen and Carlos Guestrin}, + year = {2016}, + month = aug, + booktitle = {Proceedings of the 22nd {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining}, + publisher = {{Acm}}, + doi = {10.1145/2939672.2939785}, + url = {https://doi.org/10.1145/2939672.2939785}, +} +@article{cichocki2009fast, + title = {Fast Local Algorithms for Large Scale Nonnegative Matrix and Tensor Factorizations}, + author = {Andrzej Cichocki and Anh-Huy Phan}, + year = {2009}, + journal = {{IEICE} Transactions on Fundamentals of Electronics, Communications and Computer Sciences}, + publisher = {Institute of Electronics, Information and Communications Engineers ({IEICE})}, + volume = {E92-a}, + number = {3}, + pages = {708--721}, + doi = {10.1587/transfun.e92.a.708}, + url = {https://doi.org/10.1587/transfun.e92.a.708}, +} +@article{coifman2006diffusion, + title = {Diffusion maps}, + author = {Ronald R. Coifman and St{\'{e}}phane Lafon}, + year = {2006}, + month = jul, + journal = {Applied and Computational Harmonic Analysis}, + publisher = {Elsevier {BV}}, + volume = {21}, + number = {1}, + pages = {5--30}, + doi = {10.1016/j.acha.2006.04.006}, + url = {https://doi.org/10.1016/j.acha.2006.04.006}, +} +@article{cover1967nearest, + title = {Nearest neighbor pattern classification}, + author = {T. Cover and P. Hart}, + year = {1967}, + month = jan, + journal = {{IEEE} Transactions on Information Theory}, + publisher = {Institute of Electrical and Electronics Engineers ({IEEE})}, + volume = {13}, + number = {1}, + pages = {21--27}, + doi = {10.1109/tit.1967.1053964}, + url = {https://doi.org/10.1109/tit.1967.1053964}, +} +@inproceedings{davis2006prauc, + title = {The relationship between Precision-Recall and {ROC} curves}, + author = {Jesse Davis and Mark Goadrich}, + year = {2006}, + booktitle = {Proceedings of the 23rd international conference on Machine learning - {ICML} {\textquotesingle}06}, + publisher = {{ACM} Press}, + doi = {10.1145/1143844.1143874}, + url = {https://doi.org/10.1145/1143844.1143874}, +} +@article{dimitrov2022comparison, + title = {Comparison of methods and resources for cell-cell communication inference from single-cell {RNA}-Seq data}, + author = {Daniel Dimitrov and D{\'{e}}nes T\"{u}rei and Martin Garrido-Rodriguez and Paul L. Burmedi and James S. Nagai and Charlotte Boys and Ricardo O. Ramirez Flores and Hyojin Kim and Bence Szalai and Ivan G. Costa and Alberto Valdeolivas and Aur{\'{e}}lien Dugourd and Julio Saez-Rodriguez}, + year = {2022}, + month = jun, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {13}, + number = {1}, + doi = {10.1038/s41467-022-30755-0}, + url = {https://doi.org/10.1038/s41467-022-30755-0}, +} +@article{efremova2020cellphonedb, + title = {{CellPhoneDB}: inferring cell{\textendash}cell communication from combined expression of multi-subunit ligand{\textendash}receptor complexes}, + author = {Mirjana Efremova and Miquel Vento-Tormo and Sarah A. 
Teichmann and Roser Vento-Tormo}, + year = {2020}, + month = feb, + journal = {Nature Protocols}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {15}, + number = {4}, + pages = {1484--1506}, + doi = {10.1038/s41596-020-0292-x}, + url = {https://doi.org/10.1038/s41596-020-0292-x}, +} +@article{eraslan2019single, + title = {Single-cell {RNA}-seq denoising using a deep count autoencoder}, + author = {G\"{o}kcen Eraslan and Lukas M. Simon and Maria Mircea and Nikola S. Mueller and Fabian J. Theis}, + year = {2019}, + month = jan, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {10}, + number = {1}, + doi = {10.1038/s41467-018-07931-2}, + url = {https://doi.org/10.1038/s41467-018-07931-2}, +} +@article{gower1975generalized, + title = {Generalized procrustes analysis}, + author = {J. C. Gower}, + year = {1975}, + month = mar, + journal = {Psychometrika}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {1}, + pages = {33--51}, + doi = {10.1007/bf02291478}, + url = {https://doi.org/10.1007/bf02291478}, +} +@article{grandini2020metrics, + title = {Metrics for Multi-Class Classification: an Overview}, + author = {Grandini, Margherita and Bagli, Enrico and Visani, Giorgio}, + year = {2020}, + journal = {arXiv}, + publisher = {Cornell University}, + doi = {10.48550/arxiv.2008.05756}, + url = {https://arxiv.org/abs/2008.05756}, + copyright = {arXiv.org perpetual, non-exclusive license}, + keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, +} +@article{granja2021archr, + title = {{ArchR} is a scalable software package for integrative single-cell chromatin accessibility analysis}, + author = {Jeffrey M. Granja and M. Ryan Corces and Sarah E. Pierce and S. Tansu Bagdatli and Hani Choudhry and Howard Y. Chang and William J. Greenleaf}, + year = {2021}, + month = feb, + journal = {Nature Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {53}, + number = {3}, + pages = {403--411}, + doi = {10.1038/s41588-021-00790-6}, + url = {https://doi.org/10.1038/s41588-021-00790-6}, +} +@article{grn2014validation, + title = {Validation of noise models for single-cell transcriptomics}, + author = {Dominic Gr\"{u}n and Lennart Kester and Alexander van Oudenaarden}, + year = {2014}, + month = apr, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {11}, + number = {6}, + pages = {637--640}, + doi = {10.1038/nmeth.2930}, + url = {https://doi.org/10.1038/nmeth.2930}, +} +@article{haghverdi2018batch, + title = {Batch effects in single-cell {RNA}-sequencing data are corrected by matching mutual nearest neighbors}, + author = {Laleh Haghverdi and Aaron T L Lun and Michael D Morgan and John C Marioni}, + year = {2018}, + month = apr, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {36}, + number = {5}, + pages = {421--427}, + doi = {10.1038/nbt.4091}, + url = {https://doi.org/10.1038/nbt.4091}, +} +@article{hammarlund2018cengen, + title = {The {CeNGEN} Project: The Complete Gene Expression Map of an Entire Nervous System}, + author = {Marc Hammarlund and Oliver Hobert and David M. 
Miller and Nenad Sestan}, + year = {2018}, + month = aug, + journal = {Neuron}, + publisher = {Elsevier {BV}}, + volume = {99}, + number = {3}, + pages = {430--433}, + doi = {10.1016/j.neuron.2018.07.042}, + url = {https://doi.org/10.1016/j.neuron.2018.07.042}, +} +@article{hansen2012removing, + title = {Adjusting batch effects in microarray expression data using empirical Bayes methods}, + author = {W. Evan Johnson and Cheng Li and Ariel Rabinovic}, + year = {2006}, + month = apr, + journal = {Biostatistics}, + publisher = {Oxford University Press ({OUP})}, + volume = {8}, + number = {1}, + pages = {118--127}, + doi = {10.1093/biostatistics/kxj037}, + url = {https://doi.org/10.1093/biostatistics/kxj037}, +} +@article{hao2021integrated, + title = {Integrated analysis of multimodal single-cell data}, + author = {Yuhan Hao and Stephanie Hao and Erica Andersen-Nissen and William M. Mauck and Shiwei Zheng and Andrew Butler and Maddie J. Lee and Aaron J. Wilk and Charlotte Darby and Michael Zager and Paul Hoffman and Marlon Stoeckius and Efthymia Papalexi and Eleni P. Mimitou and Jaison Jain and Avi Srivastava and Tim Stuart and Lamar M. Fleming and Bertrand Yeung and Angela J. Rogers and Juliana M. McElrath and Catherine A. Blish and Raphael Gottardo and Peter Smibert and Rahul Satija}, + year = {2021}, + month = jun, + journal = {Cell}, + publisher = {Elsevier {BV}}, + volume = {184}, + number = {13}, + pages = {3573--3587.e29}, + doi = {10.1016/j.cell.2021.04.048}, + url = {https://doi.org/10.1016/j.cell.2021.04.048}, +} +@article{hie2019efficient, + title = {Efficient integration of heterogeneous single-cell transcriptomes using Scanorama}, + author = {Brian Hie and Bryan Bryson and Bonnie Berger}, + year = {2019}, + month = may, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {37}, + number = {6}, + pages = {685--691}, + doi = {10.1038/s41587-019-0113-3}, + url = {https://doi.org/10.1038/s41587-019-0113-3}, +} +@article{hinton1989connectionist, + title = {Connectionist learning procedures}, + author = {Geoffrey E. Hinton}, + year = {1989}, + month = sep, + journal = {Artificial Intelligence}, + publisher = {Elsevier {BV}}, + volume = {40}, + number = {1-3}, + pages = {185--234}, + doi = {10.1016/0004-3702(89)90049-0}, + url = {https://doi.org/10.1016/0004-3702(89)90049-0}, +} +@book{hosmer2013applied, + title = {Applied logistic regression}, + author = {Hosmer Jr, D.W. and Lemeshow, S. and Sturdivant, R.X.}, + year = {2013}, + publisher = {John Wiley \& Sons}, + volume = {398}, +} +@article{hou2019scmatch, + title = {{scMatch}: a single-cell gene expression profile annotation tool using reference datasets}, + author = {Rui Hou and Elena Denisenko and Alistair R R Forrest}, + year = {2019}, + month = apr, + journal = {Bioinformatics}, + publisher = {Oxford University Press ({OUP})}, + volume = {35}, + number = {22}, + pages = {4688--4695}, + doi = {10.1093/bioinformatics/btz292}, + url = {https://doi.org/10.1093/bioinformatics/btz292}, + editor = {Janet Kelso}, +} +@string{jan = {Jan}} +@string{feb = {Feb.}} +@string{mar = {Mar.}} +@string{apr = {Apr.}} +@string{may = {May}} +@string{jun = {Jun.}} +@string{jul = {Jul.}} +@string{aug = {Aug.}} +@string{sep = {Sept.}} +@string{oct = {Oct.}} +@string{nov = {Nov.}} +@string{dec = {Dec.}} +@article{hou2020predicting, + title = {Predicting cell-to-cell communication networks using {NATMI}}, + author = {Rui Hou and Elena Denisenko and Huan Ting Ong and Jordan A. Ramilowski and Alistair R. 
R. Forrest}, + year = {2020}, + month = oct, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {11}, + number = {1}, + doi = {10.1038/s41467-020-18873-z}, + url = {https://doi.org/10.1038/s41467-020-18873-z}, +} +@article{hou2020systematic, + title = {A systematic evaluation of single-cell {RNA}-sequencing imputation methods}, + author = {Wenpin Hou and Zhicheng Ji and Hongkai Ji and Stephanie C. Hicks}, + year = {2020}, + month = aug, + journal = {Genome Biology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {21}, + number = {1}, + doi = {10.1186/s13059-020-02132-x}, + url = {https://doi.org/10.1186/s13059-020-02132-x}, +} +@article{kiselev2019challenges, + title = {Challenges in unsupervised clustering of single-cell {RNA}-seq data}, + author = {Vladimir Yu Kiselev and Tallulah S. Andrews and Martin Hemberg}, + year = {2019}, + month = jan, + journal = {Nature Reviews Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {20}, + number = {5}, + pages = {273--282}, + doi = {10.1038/s41576-018-0088-9}, + url = {https://doi.org/10.1038/s41576-018-0088-9}, +} +@article{kleshchevnikov2022cell2location, + title = {Cell2location maps fine-grained cell types in spatial transcriptomics}, + author = {Vitalii Kleshchevnikov and Artem Shmatko and Emma Dann and Alexander Aivazidis and Hamish W. King and Tong Li and Rasa Elmentaite and Artem Lomakin and Veronika Kedlian and Adam Gayoso and Mika Sarkin Jain and Jun Sung Park and Lauma Ramona and Elizabeth Tuck and Anna Arutyunyan and Roser Vento-Tormo and Moritz Gerstung and Louisa James and Oliver Stegle and Omer Ali Bayraktar}, + year = {2022}, + month = jan, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {5}, + pages = {661--671}, + doi = {10.1038/s41587-021-01139-4}, + url = {https://doi.org/10.1038/s41587-021-01139-4}, +} +@article{korsunsky2019fast, + title = {Fast, sensitive and accurate integration of single-cell data with Harmony}, + author = {Ilya Korsunsky and Nghia Millard and Jean Fan and Kamil Slowikowski and Fan Zhang and Kevin Wei and Yuriy Baglaenko and Michael Brenner and Po-ru Loh and Soumya Raychaudhuri}, + year = {2019}, + month = nov, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {12}, + pages = {1289--1296}, + doi = {10.1038/s41592-019-0619-0}, + url = {https://doi.org/10.1038/s41592-019-0619-0}, +} +@article{kruskal1964mds, + title = {Multidimensional scaling by optimizing goodness of fit to a nonmetric hypothesis}, + author = {J. B. Kruskal}, + year = {1964}, + month = mar, + journal = {Psychometrika}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {29}, + number = {1}, + pages = {1--27}, + doi = {10.1007/bf02289565}, + url = {https://doi.org/10.1007/bf02289565}, +} +@article{lance2022multimodal, + title = {Multimodal single cell data integration challenge: results and lessons learned}, + author = {Lance, Christopher and Luecken, Malte D. and Burkhardt, Daniel B. 
and Cannoodt, Robrecht and Rautenstrauch, Pia and Laddach, Anna and Ubingazhibov, Aidyn and Cao, Zhi-Jie and Deng, Kaiwen and Khan, Sumeer and Liu, Qiao and Russkikh, Nikolay and Ryazantsev, Gleb and Ohler, Uwe and , and Pisco, Angela Oliveira and Bloom, Jonathan and Krishnaswamy, Smita and Theis, Fabian J.}, + year = {2022}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/2022.04.11.487796}, + url = {https://www.biorxiv.org/content/early/2022/04/12/2022.04.11.487796}, + elocation-id = {2022.04.11.487796}, + eprint = {https://www.biorxiv.org/content/early/2022/04/12/2022.04.11.487796.full.pdf}, +} +@book{lawson1995solving, + title = {Solving Least Squares Problems}, + author = {Charles L. Lawson and Richard J. Hanson}, + year = {1995}, + month = jan, + publisher = {Society for Industrial and Applied Mathematics}, + doi = {10.1137/1.9781611971217}, + url = {https://doi.org/10.1137/1.9781611971217}, +} +@article{linderman2018zero, + title = {Zero-preserving imputation of scRNA-seq data using low-rank approximation}, + author = {Linderman, George C. and Zhao, Jun and Kluger, Yuval}, + year = {2018}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/397588}, + url = {https://www.biorxiv.org/content/early/2018/08/22/397588}, + elocation-id = {397588}, + eprint = {https://www.biorxiv.org/content/early/2018/08/22/397588.full.pdf}, +} +@article{lopez2018deep, + title = {Deep generative modeling for single-cell transcriptomics}, + author = {Romain Lopez and Jeffrey Regier and Michael B. Cole and Michael I. Jordan and Nir Yosef}, + year = {2018}, + month = nov, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {15}, + number = {12}, + pages = {1053--1058}, + doi = {10.1038/s41592-018-0229-2}, + url = {https://doi.org/10.1038/s41592-018-0229-2}, +} +@article{lopez2022destvi, + title = {{DestVI} identifies continuums of cell types in spatial transcriptomics data}, + author = {Romain Lopez and Baoguo Li and Hadas Keren-Shaul and Pierre Boyeau and Merav Kedmi and David Pilzer and Adam Jelinski and Ido Yofe and Eyal David and Allon Wagner and Can Ergen and Yoseph Addadi and Ofra Golani and Franca Ronchese and Michael I. Jordan and Ido Amit and Nir Yosef}, + year = {2022}, + month = apr, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {9}, + pages = {1360--1369}, + doi = {10.1038/s41587-022-01272-8}, + url = {https://doi.org/10.1038/s41587-022-01272-8}, +} +@article{lotfollahi2020query, + title = {Query to reference single-cell integration with transfer learning}, + author = {Lotfollahi, Mohammad and Naghipourfar, Mohsen and Luecken, Malte D. and Khajavi, Matin and B{\"u}ttner, Maren and Avsec, Ziga and Misharin, Alexander V. and Theis, Fabian J.}, + year = {2020}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/2020.07.16.205997}, + url = {https://doi.org/10.1101/2020.07.16.205997}, + elocation-id = {2020.07.16.205997}, + eprint = {https://www.biorxiv.org/content/early/2020/07/16/2020.07.16.205997.full.pdf}, +} +@article{luecken2022benchmarking, + title = {Benchmarking atlas-level data integration in single-cell genomics}, + author = {Malte D. Luecken and M. B\"{u}ttner and K. Chaichoompu and A. Danese and M. Interlandi and M. F. Mueller and D. C. Strobl and L. Zappia and M. Dugas and M. Colom{\'{e}}-Tatch{\'{e}} and Fabian J. 
Theis}, + year = {2021}, + month = dec, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {19}, + number = {1}, + pages = {41--50}, + doi = {10.1038/s41592-021-01336-8}, + url = {https://doi.org/10.1038/s41592-021-01336-8}, +} +@misc{lun2019fastmnn, + title = {A description of the theory behind the fastMNN algorithm}, + author = {Lun, Aaron}, + year = {2019}, + url = {https://marionilab.github.io/FurtherMNN2018/theory/description.html}, +} +@article{mcinnes2018umap, + title = {UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction}, + author = {McInnes, Leland and Healy, John and Melville, James}, + year = {2018}, + journal = {arXiv}, + publisher = {Cornell University}, + doi = {10.48550/arxiv.1802.03426}, + url = {https://arxiv.org/abs/1802.03426}, + copyright = {arXiv.org perpetual, non-exclusive license}, + keywords = {Machine Learning (stat.ML), Computational Geometry (cs.CG), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, +} +@inbook{miles2005rsquared, + title = {Encyclopedia of Statistics in Behavioral Science}, + author = {Jeremy Miles}, + year = {2005}, + month = oct, + publisher = {John Wiley {\&} Sons, Ltd}, + doi = {10.1002/0470013192.bsa526}, + url = {https://doi.org/10.1002/0470013192.bsa526}, + chapter = {{R-Squared}, Adjusted {R-Squared}}, +} +@article{moon2019visualizing, + title = {Visualizing structure and transitions in high-dimensional biological data}, + author = {Kevin R. Moon and David van Dijk and Zheng Wang and Scott Gigante and Daniel B. Burkhardt and William S. Chen and Kristina Yim and Antonia van den Elzen and Matthew J. Hirn and Ronald R. Coifman and Natalia B. Ivanova and Guy Wolf and Smita Krishnaswamy}, + year = {2019}, + month = dec, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {37}, + number = {12}, + pages = {1482--1492}, + doi = {10.1038/s41587-019-0336-3}, + url = {https://doi.org/10.1038/s41587-019-0336-3}, +} +@article{narayan2021assessing, + title = {Assessing single-cell transcriptomic variability through density-preserving data visualization}, + author = {Ashwin Narayan and Bonnie Berger and Hyunghoon Cho}, + year = {2021}, + month = jan, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {39}, + number = {6}, + pages = {765--774}, + doi = {10.1038/s41587-020-00801-7}, + url = {https://doi.org/10.1038/s41587-020-00801-7}, +} +@article{nestorowa2016single, + title = {A single-cell resolution map of mouse hematopoietic stem and progenitor cell differentiation}, + author = {Sonia Nestorowa and Fiona K. Hamey and Blanca Pijuan Sala and Evangelia Diamanti and Mairi Shepherd and Elisa Laurenti and Nicola K. Wilson and David G. Kent and Berthold G\"{o}ttgens}, + year = {2016}, + month = aug, + journal = {Blood}, + publisher = {American Society of Hematology}, + volume = {128}, + number = {8}, + pages = {e20--e31}, + doi = {10.1182/blood-2016-05-716480}, + url = {https://doi.org/10.1182/blood-2016-05-716480}, +} +@article{olsson2016single, + title = {Single-cell analysis of mixed-lineage states leading to a binary cell fate choice}, + author = {Andre Olsson and Meenakshi Venkatasubramanian and Viren K. Chaudhri and Bruce J. Aronow and Nathan Salomonis and Harinder Singh and H. 
Leighton Grimes}, + year = {2016}, + month = aug, + journal = {Nature}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {537}, + number = {7622}, + pages = {698--702}, + doi = {10.1038/nature19348}, + url = {https://doi.org/10.1038/nature19348}, +} +@misc{openproblems, + title = {Open Problems}, + author = {{Open Problems for Single Cell Analysis Consortium}}, + year = {2022}, + url = {https://openproblems.bio}, +} +@article{pearson1901pca, + title = {On lines and planes of closest fit to systems of points in space}, + author = {Karl Pearson}, + year = {1901}, + month = nov, + journal = {The London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science}, + publisher = {Informa {UK} Limited}, + volume = {2}, + number = {11}, + pages = {559--572}, + doi = {10.1080/14786440109462720}, + url = {https://doi.org/10.1080/14786440109462720}, +} +@article{pliner2019supervised, + title = {Supervised classification enables rapid annotation of cell atlases}, + author = {Hannah A. Pliner and Jay Shendure and Cole Trapnell}, + year = {2019}, + month = sep, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {10}, + pages = {983--986}, + doi = {10.1038/s41592-019-0535-3}, + url = {https://doi.org/10.1038/s41592-019-0535-3}, +} +@article{polanski2020bbknn, + title = {{BBKNN}: fast batch alignment of single cell transcriptomes}, + author = {Krzysztof Pola{\'{n}}ski and Matthew D Young and Zhichao Miao and Kerstin B Meyer and Sarah A Teichmann and Jong-Eun Park}, + year = {2019}, + month = aug, + journal = {Bioinformatics}, + publisher = {Oxford University Press ({OUP})}, + doi = {10.1093/bioinformatics/btz625}, + url = {https://doi.org/10.1093/bioinformatics/btz625}, + editor = {Bonnie Berger}, +} +@article{raredon2022computation, + title = {Computation and visualization of cell{\textendash}cell signaling topologies in single-cell systems data using Connectome}, + author = {Micha Sam Brickman Raredon and Junchen Yang and James Garritano and Meng Wang and Dan Kushnir and Jonas Christian Schupp and Taylor S. Adams and Allison M. Greaney and Katherine L. Leiby and Naftali Kaminski and Yuval Kluger and Andre Levchenko and Laura E. Niklason}, + year = {2022}, + month = mar, + journal = {Scientific Reports}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {12}, + number = {1}, + doi = {10.1038/s41598-022-07959-x}, + url = {https://doi.org/10.1038/s41598-022-07959-x}, +} +@article{rodriques2019slide, + title = {Slide-seq: A scalable technology for measuring genome-wide expression at high spatial resolution}, + author = {Samuel G. Rodriques and Robert R. Stickels and Aleksandrina Goeva and Carly A. Martin and Evan Murray and Charles R. Vanderburg and Joshua Welch and Linlin M. Chen and Fei Chen and Evan Z. 
Macosko},
+  year = {2019},
+  month = mar,
+  journal = {Science},
+  publisher = {American Association for the Advancement of Science ({AAAS})},
+  volume = {363},
+  number = {6434},
+  pages = {1463--1467},
+  doi = {10.1126/science.aaw1219},
+  url = {https://doi.org/10.1126/science.aaw1219},
+}
+@article{sarkar2021separating,
+  title = {Separating measurement and expression models clarifies confusion in single-cell {RNA} sequencing analysis},
+  author = {Abhishek Sarkar and Matthew Stephens},
+  year = {2021},
+  month = may,
+  journal = {Nature Genetics},
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {53},
+  number = {6},
+  pages = {770--777},
+  doi = {10.1038/s41588-021-00873-4},
+  url = {https://doi.org/10.1038/s41588-021-00873-4},
+}
+@article{schober2018correlation,
+  title = {Correlation Coefficients},
+  author = {Patrick Schober and Christa Boer and Lothar A. Schwarte},
+  year = {2018},
+  month = may,
+  journal = {Anesthesia \& Analgesia},
+  publisher = {Ovid Technologies (Wolters Kluwer Health)},
+  volume = {126},
+  number = {5},
+  pages = {1763--1768},
+  doi = {10.1213/ane.0000000000002864},
+  url = {https://doi.org/10.1213/ane.0000000000002864},
+}
+@inproceedings{stanley2020harmonic,
+  title = {Harmonic Alignment},
+  author = {Jay S. Stanley and Scott Gigante and Guy Wolf and Smita Krishnaswamy},
+  year = {2020},
+  month = jan,
+  booktitle = {Proceedings of the 2020 {SIAM} International Conference on Data Mining},
+  publisher = {Society for Industrial and Applied Mathematics},
+  pages = {316--324},
+  doi = {10.1137/1.9781611976236.36},
+  url = {https://doi.org/10.1137/1.9781611976236.36},
+}
+@article{stoeckius2017simultaneous,
+  title = {Simultaneous epitope and transcriptome measurement in single cells},
+  author = {Marlon Stoeckius and Christoph Hafemeister and William Stephenson and Brian Houck-Loomis and Pratip K Chattopadhyay and Harold Swerdlow and Rahul Satija and Peter Smibert},
+  year = {2017},
+  month = jul,
+  journal = {Nature Methods},
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {14},
+  number = {9},
+  pages = {865--868},
+  doi = {10.1038/nmeth.4380},
+  url = {https://doi.org/10.1038/nmeth.4380},
+}
+@article{stuart2019comprehensive,
+  title = {Comprehensive Integration of Single-Cell Data},
+  author = {Stuart, T. and Butler, A. and Hoffman, P. and Hafemeister, C. and Papalexi, E. and Mauck, W.M. and Hao, Y. and Stoeckius, M. and Smibert, P.
and Satija, R.}, + year = {2019}, + journal = {Cell}, + volume = {177}, + number = {7}, + pages = {1888--1902.e21}, + doi = {10.1016/j.cell.2019.05.031}, +} +@article{tabula2018single, + title = {Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris}, + author = {{Tabula Muris Consortium}}, + year = {2018}, + month = oct, + journal = {Nature}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {562}, + number = {7727}, + pages = {367--372}, + doi = {10.1038/s41586-018-0590-4}, + url = {https://doi.org/10.1038/s41586-018-0590-4}, +} +@article{tabula2020single, + title = {A single-cell transcriptomic atlas characterizes ageing tissues in the mouse}, + author = {{Tabula Muris Consortium}}, + year = {2020}, + month = jul, + journal = {Nature}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {583}, + number = {7817}, + pages = {590--595}, + doi = {10.1038/s41586-020-2496-1}, + url = {https://doi.org/10.1038/s41586-020-2496-1}, +} +@article{tasic2016adult, + title = {Adult mouse cortical cell taxonomy revealed by single cell transcriptomics}, + author = {Bosiljka Tasic and Vilas Menon and Thuc Nghi Nguyen and Tae Kyung Kim and Tim Jarsky and Zizhen Yao and Boaz Levi and Lucas T Gray and Staci A Sorensen and Tim Dolbeare and Darren Bertagnolli and Jeff Goldy and Nadiya Shapovalova and Sheana Parry and Changkyu Lee and Kimberly Smith and Amy Bernard and Linda Madisen and Susan M Sunkin and Michael Hawrylycz and Christof Koch and Hongkui Zeng}, + year = {2016}, + month = jan, + journal = {Nature Neuroscience}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {19}, + number = {2}, + pages = {335--346}, + doi = {10.1038/nn.4216}, + url = {https://doi.org/10.1038/nn.4216}, +} +@article{tian2019benchmarking, + title = {Benchmarking single cell {RNA}-sequencing analysis pipelines using mixture control experiments}, + author = {Luyi Tian and Xueyi Dong and Saskia Freytag and Kim-Anh L{\^{e}} Cao and Shian Su and Abolfazl JalalAbadi and Daniela Amann-Zalcenstein and Tom S. Weber and Azadeh Seidi and Jafar S. Jabbari and Shalin H. Naik and Matthew E. Ritchie}, + year = {2019}, + month = may, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {6}, + pages = {479--487}, + doi = {10.1038/s41592-019-0425-8}, + url = {https://doi.org/10.1038/s41592-019-0425-8}, +} +@article{van2018recovering, + title = {Recovering Gene Interactions from Single-Cell Data Using Data Diffusion}, + author = {David van Dijk and Roshan Sharma and Juozas Nainys and Kristina Yim and Pooja Kathail and Ambrose J. Carr and Cassandra Burdziak and Kevin R. Moon and Christine L. 
Chaffer and Diwakar Pattabiraman and Brian Bierie and Linas Mazutis and Guy Wolf and Smita Krishnaswamy and Dana Pe'er}, + year = {2018}, + month = jul, + journal = {Cell}, + publisher = {Elsevier {BV}}, + volume = {174}, + number = {3}, + pages = {716--729.e27}, + doi = {10.1016/j.cell.2018.05.061}, + url = {https://doi.org/10.1016/j.cell.2018.05.061}, +} +@article{vandermaaten2008visualizing, + title = {Visualizing Data using t-SNE}, + author = {{van der} Maaten, Laurens and Hinton, Geoffrey}, + year = {2008}, + journal = {Journal of Machine Learning Research}, + volume = {9}, + number = {86}, + pages = {2579--2605}, + url = {http://jmlr.org/papers/v9/vandermaaten08a.html}, +} +@inproceedings{venna2001neighborhood, + title = {Neighborhood Preservation in Nonlinear Projection Methods: An Experimental Study}, + author = {Jarkko Venna and Samuel Kaski}, + year = {2001}, + booktitle = {Artificial Neural Networks {\textemdash} {ICANN} 2001}, + publisher = {Springer Berlin Heidelberg}, + pages = {485--491}, + doi = {{10.1007/3-540-44668-0\_68}}, + url = {{https://doi.org/10.1007/3-540-44668-0\_68}}, +} +@article{wagner2018knearest, + title = {K-nearest neighbor smoothing for high-throughput single-cell RNA-Seq data}, + author = {Wagner, Florian and Yan, Yun and Yanai, Itai}, + year = {2018}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/217737}, + url = {https://www.biorxiv.org/content/early/2018/04/09/217737}, + elocation-id = {217737}, + eprint = {https://www.biorxiv.org/content/early/2018/04/09/217737.full.pdf}, +} +@article{wagner2018single, + title = {Single-cell mapping of gene expression landscapes and lineage in the zebrafish embryo}, + author = {Daniel E. Wagner and Caleb Weinreb and Zach M. Collins and James A. Briggs and Sean G. Megason and Allon M. Klein}, + year = {2018}, + month = jun, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {360}, + number = {6392}, + pages = {981--987}, + doi = {10.1126/science.aar4362}, + url = {https://doi.org/10.1126/science.aar4362}, +} +@article{wang2013target, + title = {Target analysis by integration of transcriptome and {ChIP}-seq data with {BETA}}, + author = {Su Wang and Hanfei Sun and Jian Ma and Chongzhi Zang and Chenfei Wang and Juan Wang and Qianzi Tang and Clifford A Meyer and Yong Zhang and X Shirley Liu}, + year = {2013}, + month = nov, + journal = {Nature Protocols}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {8}, + number = {12}, + pages = {2502--2515}, + doi = {10.1038/nprot.2013.150}, + url = {https://doi.org/10.1038/nprot.2013.150}, +} +@article{welch2019single, + title = {Single-Cell Multi-omic Integration Compares and Contrasts Features of Brain Cell Identity}, + author = {Joshua D. Welch and Velina Kozareva and Ashley Ferreira and Charles Vanderburg and Carly Martin and Evan Z. Macosko}, + year = {2019}, + month = jun, + journal = {Cell}, + publisher = {Elsevier {BV}}, + volume = {177}, + number = {7}, + pages = {1873--1887.e17}, + doi = {10.1016/j.cell.2019.05.006}, + url = {https://doi.org/10.1016/j.cell.2019.05.006}, +} +@article{wu2021single, + title = {A single-cell and spatially resolved atlas of human breast cancers}, + author = {Sunny Z. Wu and Ghamdan Al-Eryani and Daniel Lee Roden and Simon Junankar and Kate Harvey and Alma Andersson and Aatish Thennavan and Chenfei Wang and James R. Torpy and Nenad Bartonicek and Taopeng Wang and Ludvig Larsson and Dominik Kaczorowski and Neil I. 
Weisenfeld and Cedric R. Uytingco and Jennifer G. Chew and Zachary W. Bent and Chia-Ling Chan and Vikkitharan Gnanasambandapillai and Charles-Antoine Dutertre and Laurence Gluch and Mun N. Hui and Jane Beith and Andrew Parker and Elizabeth Robbins and Davendra Segara and Caroline Cooper and Cindy Mak and Belinda Chan and Sanjay Warrier and Florent Ginhoux and Ewan Millar and Joseph E. Powell and Stephen R. Williams and X. Shirley Liu and Sandra O'Toole and Elgene Lim and Joakim Lundeberg and Charles M. Perou and Alexander Swarbrick}, + year = {2021}, + month = sep, + journal = {Nature Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {53}, + number = {9}, + pages = {1334--1347}, + doi = {10.1038/s41588-021-00911-1}, + url = {https://doi.org/10.1038/s41588-021-00911-1}, +} +@article{xiong2020neuralee, + title = {{NeuralEE}: A {GPU}-Accelerated Elastic Embedding Dimensionality Reduction Method for Visualizing Large-Scale {scRNA}-Seq Data}, + author = {Jiankang Xiong and Fuzhou Gong and Lin Wan and Liang Ma}, + year = {2020}, + month = oct, + journal = {Frontiers in Genetics}, + publisher = {Frontiers Media {SA}}, + volume = {11}, + doi = {10.3389/fgene.2020.00786}, + url = {https://doi.org/10.3389/fgene.2020.00786}, +} +@article{xiong2021online, + title = {Online single-cell data integration through projecting heterogeneous datasets into a common cell-embedding space}, + author = {Lei Xiong and Kang Tian and Yuzhe Li and Weixi Ning and Xin Gao and Qiangfeng Cliff Zhang}, + year = {2022}, + month = oct, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {13}, + number = {1}, + doi = {10.1038/s41467-022-33758-z}, + url = {https://doi.org/10.1038/s41467-022-33758-z}, +} +@article{xu2021probabilistic, + title = {Probabilistic harmonization and annotation of single-cell transcriptomics data with deep generative models}, + author = {Chenling Xu and Romain Lopez and Edouard Mehlman and Jeffrey Regier and Michael I Jordan and Nir Yosef}, + year = {2021}, + month = jan, + journal = {Molecular Systems Biology}, + publisher = {{Embo}}, + volume = {17}, + number = {1}, + doi = {10.15252/msb.20209620}, + url = {https://doi.org/10.15252/msb.20209620}, +} +@article{zhang2021pydrmetrics, + title = {{pyDRMetrics} - A Python toolkit for dimensionality reduction quality assessment}, + author = {Yinsheng Zhang and Qian Shang and Guoming Zhang}, + year = {2021}, + month = feb, + journal = {Heliyon}, + publisher = {Elsevier {BV}}, + volume = {7}, + number = {2}, + pages = {e06199}, + doi = {10.1016/j.heliyon.2021.e06199}, + url = {https://doi.org/10.1016/j.heliyon.2021.e06199}, +} diff --git a/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py b/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py index ef7d56a343..d87d2c5fd8 100644 --- a/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py +++ b/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py @@ -6,7 +6,7 @@ @utils.loader( data_url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE112294", - data_reference="https://doi.org/10.1126/science.aar4362", + data_reference="wagner2018single", ) def load_zebrafish_chd_tyr(test=False): """Download zebrafish data from GEO accession GSE112294""" diff --git a/openproblems/data/allen_brain_atlas.py b/openproblems/data/allen_brain_atlas.py index fecbca85d1..65e9d6822e 100644 --- a/openproblems/data/allen_brain_atlas.py +++ b/openproblems/data/allen_brain_atlas.py @@ -8,7 +8,7 @@ URL = 
"https://figshare.com/ndownloader/files/36509385" -@utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/nn.4216") +@utils.loader(data_url=URL, data_reference="tasic2016adult") def load_mouse_brain_atlas(test=False): """Download Allen Brain (Taisc et al.,2016) data from Figshare. diff --git a/openproblems/data/cengen.py b/openproblems/data/cengen.py index 859755f558..7cdf923527 100644 --- a/openproblems/data/cengen.py +++ b/openproblems/data/cengen.py @@ -11,9 +11,7 @@ ) -@utils.loader( - data_url=URL, data_reference="https://doi.org/10.1016/j.neuron.2018.07.042" -) +@utils.loader(data_url=URL, data_reference="hammarlund2018cengen") def load_cengen(test=False): """Download CeNGEN data from GitHub. diff --git a/openproblems/data/immune_cells.py b/openproblems/data/immune_cells.py index 81b1f6cd58..b053e27597 100644 --- a/openproblems/data/immune_cells.py +++ b/openproblems/data/immune_cells.py @@ -8,7 +8,7 @@ URL = "https://ndownloader.figshare.com/files/36086786" -@utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/s41592-021-01336-8") +@utils.loader(data_url=URL, data_reference="luecken2022benchmarking") def load_immune(test=False): """Download immune human data from figshare.""" import scanpy as sc diff --git a/openproblems/data/mouse_blood_olssen_labelled.py b/openproblems/data/mouse_blood_olssen_labelled.py index 9deb09a4e1..dbc6b40daa 100644 --- a/openproblems/data/mouse_blood_olssen_labelled.py +++ b/openproblems/data/mouse_blood_olssen_labelled.py @@ -9,7 +9,7 @@ URL = "https://figshare.com/ndownloader/files/36872214" -@utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/nature19348") +@utils.loader(data_url=URL, data_reference="olsson2016single") def load_olsson_2016_mouse_blood(test=False): """Download Olsson, 2016_mouse_blood, Nature, 2016 data from Figshare.""" import scanpy as sc diff --git a/openproblems/data/mouse_hspc_nestorowa2016.py b/openproblems/data/mouse_hspc_nestorowa2016.py index d409c2ce70..f39254287f 100644 --- a/openproblems/data/mouse_hspc_nestorowa2016.py +++ b/openproblems/data/mouse_hspc_nestorowa2016.py @@ -9,9 +9,7 @@ URL = "https://ndownloader.figshare.com/files/36088649" -@utils.loader( - data_url=URL, data_reference="https://doi.org/10.1182/blood-2016-05-716480" -) +@utils.loader(data_url=URL, data_reference="nestorowa2016single") def load_mouse_hspc_nestorowa2016(test=False): """Download Nesterova data from Figshare.""" import scanpy as sc diff --git a/openproblems/data/multimodal/citeseq.py b/openproblems/data/multimodal/citeseq.py index 7bf3a509c6..23f57a6aa3 100644 --- a/openproblems/data/multimodal/citeseq.py +++ b/openproblems/data/multimodal/citeseq.py @@ -17,7 +17,7 @@ @loader( data_url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866", - data_reference="https://doi.org/10.1038/nmeth.4380", + data_reference="stoeckius2017simultaneous", ) def load_citeseq_cbmc(test=False): """Download CITEseq data from GEO.""" diff --git a/openproblems/data/multimodal/scicar/base.py b/openproblems/data/multimodal/scicar/base.py index 4ff018f649..b0930ef91e 100644 --- a/openproblems/data/multimodal/scicar/base.py +++ b/openproblems/data/multimodal/scicar/base.py @@ -6,7 +6,7 @@ import scprep import tempfile -DATA_REFERENCE = "https://doi.org/10.1126/science.aau0730" +DATA_REFERENCE = "cao2018joint" def load_scicar( diff --git a/openproblems/data/pancreas.py b/openproblems/data/pancreas.py index 98303becfc..1e1663eebc 100644 --- a/openproblems/data/pancreas.py +++ b/openproblems/data/pancreas.py @@ -9,7 +9,7 
@@ URL = "https://ndownloader.figshare.com/files/36086813" -@utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/s41592-021-01336-8") +@utils.loader(data_url=URL, data_reference="luecken2022benchmarking") def load_pancreas(test=False, keep_techs=None): """Download pancreas data from figshare.""" import scanpy as sc diff --git a/openproblems/data/tabula_muris_senis.py b/openproblems/data/tabula_muris_senis.py index c634e85d43..78afe934d8 100644 --- a/openproblems/data/tabula_muris_senis.py +++ b/openproblems/data/tabula_muris_senis.py @@ -85,7 +85,7 @@ def load_raw_counts(dataset): @utils.loader( data_url="https://tabula-muris-senis.ds.czbiohub.org/", - data_reference="https://doi.org/10.1038/s41586-020-2496-1", + data_reference="tabula2020single", ) def load_tabula_muris_senis(test=False, method_list=None, organ_list=None): """Load tabula_muris_senis datasets into 1 anndata object based on user input. diff --git a/openproblems/data/tenx.py b/openproblems/data/tenx.py index 59f9cb93da..1d9f9c3621 100644 --- a/openproblems/data/tenx.py +++ b/openproblems/data/tenx.py @@ -10,10 +10,9 @@ # TODO(@LuckyMD): document relevant link at figshare.com/articles/* PBMC_5K_URL = "https://ndownloader.figshare.com/files/25555739" -REFERENCE_URL = "https://www.10xgenomics.com/resources/datasets" -@utils.loader(data_url=PBMC_1K_URL, data_reference=REFERENCE_URL) +@utils.loader(data_url=PBMC_1K_URL, data_reference="10x2018pbmc") def load_tenx_1k_pbmc(test=False): """Download PBMC data from Figshare.""" import scanpy as sc @@ -32,7 +31,7 @@ def load_tenx_1k_pbmc(test=False): return adata -@utils.loader(data_url=PBMC_5K_URL, data_reference=REFERENCE_URL) +@utils.loader(data_url=PBMC_5K_URL, data_reference="10x2019pbmc") def load_tenx_5k_pbmc(test=False): """Download 5k PBMCs from 10x Genomics.""" import scanpy as sc diff --git a/openproblems/data/tnbc_wu2021.py b/openproblems/data/tnbc_wu2021.py index c0a3e6a941..3f27220138 100644 --- a/openproblems/data/tnbc_wu2021.py +++ b/openproblems/data/tnbc_wu2021.py @@ -9,7 +9,7 @@ URL = "https://figshare.com/ndownloader/files/37593188" -@utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/s41588-021-00911-1") +@utils.loader(data_url=URL, data_reference="wu2021single") def load_tnbc_data(test=False): """Download TNBC data (Wu et al., 2021) from Figshare. diff --git a/openproblems/data/zebrafish.py b/openproblems/data/zebrafish.py index 630b9be0a2..23e8540f72 100644 --- a/openproblems/data/zebrafish.py +++ b/openproblems/data/zebrafish.py @@ -11,7 +11,7 @@ ) -@utils.loader(data_url=URL, data_reference="https://doi.org/10.1126/science.aar4362") +@utils.loader(data_url=URL, data_reference="wagner2018single") def load_zebrafish(test=False): """Download zebrafish data from figshare.""" with tempfile.TemporaryDirectory() as tempdir: diff --git a/openproblems/tasks/_batch_integration/README.md b/openproblems/tasks/_batch_integration/README.md index f67783341d..c17078df0a 100644 --- a/openproblems/tasks/_batch_integration/README.md +++ b/openproblems/tasks/_batch_integration/README.md @@ -28,5 +28,5 @@ Metrics for this task can be divided into those that assess the removal of batch effects, and assessments of the conservation of biological variation. This can be a helpful distinction when devising new metrics. This task, including the subtask structure, was taken from a [benchmarking study of data integration -methods](https://www.biorxiv.org/content/10.1101/2020.05.22.111161v2).
This is a useful +methods](https://openproblems.bio/bibliography#luecken2022benchmarking). This is a useful reference for more background reading on the task and the above concepts. diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/README.md b/openproblems/tasks/_batch_integration/batch_integration_embed/README.md index 1fab8a1d18..77f5845c71 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/README.md @@ -18,7 +18,7 @@ sub-tasks for batch integration can be found for: This sub-task was taken from a [benchmarking study of data integration -methods](https://www.biorxiv.org/content/10.1101/2020.05.22.111161v2). +methods](https://openproblems.bio/bibliography#luecken2022benchmarking). ## The metrics @@ -30,7 +30,8 @@ variance conservation metrics. * **kBET**: kBET determines whether the label composition of a k nearest neighborhood of a cell is similar to the expected (global) label composition -(Buettner et al., Nat Meth 2019). The test is repeated for a random subset of cells, +([Buettner et al., Nat Meth 2019](https://openproblems.bio/bibliography#bttner2018test)). + The test is repeated for a random subset of cells, and the results are summarized as a rejection rate over all tested neighborhoods. * **Silhouette batch score**: The absolute silhouette width is computed over batch labels per cell. As 0 then indicates that batches are well mixed and any deviation from diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py index 7b28a9267c..e219707e41 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py @@ -3,31 +3,31 @@ from ...batch_integration_graph.methods.baseline import _random_embedding from ...batch_integration_graph.methods.baseline import _randomize_features +import functools import numpy as np import scanpy as sc - -@method( - method_name="No Integration", - paper_name="No Integration (baseline)", - paper_url="https://openproblems.bio", +_baseline_method = functools.partial( + method, + paper_name="Open Problems for Single Cell Analysis", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, ) + + +@_baseline_method( + method_name="No Integration", +) def no_integration(adata, test=False): adata.obsm["X_emb"] = adata.obsm["X_uni_pca"] adata.uns["method_code_version"] = check_version("openproblems") return adata -@method( +@_baseline_method( method_name="Random Integration", - paper_name="Random Integration (baseline)", - paper_url="https://openproblems.bio", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, ) def random_integration(adata, test=False): adata.obsm["X_emb"] = _randomize_features(adata.obsm["X_uni_pca"]) @@ -38,7 +38,7 @@ def random_integration(adata, test=False): @method( method_name="Random Integration by Celltype", paper_name="Random Integration by Celltype (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -51,13 +51,8 @@ def celltype_random_integration(adata, test=False): return adata -@method( +@_baseline_method( method_name="Random 
Embedding by Celltype", - paper_name="Random Embedding by Celltype (baseline)", - paper_url="https://openproblems.bio", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, ) def celltype_random_embedding(adata, test=False): adata.obsm["X_emb"] = _random_embedding(partition=adata.obs["labels"]) @@ -65,13 +60,8 @@ def celltype_random_embedding(adata, test=False): return adata -@method( +@_baseline_method( method_name="Random Integration by Batch", - paper_name="Random Integration by Batch (baseline)", - paper_url="https://openproblems.bio", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, ) def batch_random_integration(adata, test=False): adata.obsm["X_emb"] = _randomize_features( @@ -81,13 +71,8 @@ def batch_random_integration(adata, test=False): return adata -@method( +@_baseline_method( method_name="No Integration by Batch", - paper_name="No Integration by Batch (baseline)", - paper_url="https://openproblems.bio", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, ) def no_integration_batch(adata, test=False): """Compute PCA independently on each batch diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py index 1f9d9a7827..8d7ee9101f 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py @@ -20,6 +20,7 @@ @metric( metric_name="Cell Cycle Score", + paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py index c1f8c4be2d..746e5851fe 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py @@ -14,6 +14,7 @@ @metric( metric_name="Isolated label Silhouette", + paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/kBET.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/kBET.py index 271eeadbd6..1655a43c9a 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/kBET.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/kBET.py @@ -26,6 +26,7 @@ @metric( metric_name="kBET", + paper_reference="bttner2018test", maximize=True, image="openproblems-r-extras", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py index 7efca62ffe..3e68a8ac27 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py @@ -17,6 +17,7 @@ @metric( metric_name="PC Regression", + paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py index 
9f28cd1284..efe8775252 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py @@ -23,6 +23,7 @@ @metric( metric_name="Batch ASW", + paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py index bb2bece193..6275a08927 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py @@ -11,6 +11,7 @@ @metric( metric_name="Silhouette", + paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md index 022f1aed65..209d3f0dfb 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md @@ -16,7 +16,7 @@ for: * [embeddings](../batch_integration_embed/) This sub-task was taken from a [benchmarking study of data integration -methods](https://www.biorxiv.org/content/10.1101/2020.05.22.111161v2). +methods](https://openproblems.bio/bibliography#luecken2022benchmarking). ## The metrics diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/baseline.py index 97053a2d16..7f71fdb686 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/baseline.py @@ -6,7 +6,7 @@ @method( method_name="No Integration", paper_name="No Integration (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -19,7 +19,7 @@ def no_integration(adata, test=False): @method( method_name="Random Integration", paper_name="Random Integration (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -33,7 +33,7 @@ def random_integration(adata, test=False): @method( method_name="Random Integration by Celltype", paper_name="Random Integration by Celltype (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -47,7 +47,7 @@ def celltype_random_integration(adata, test=False): @method( method_name="Random Integration by Batch", paper_name="Random Integration by Batch (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py index bb7f90cae8..f7779037fa 100644 --- 
a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py @@ -20,6 +20,7 @@ @metric( metric_name="HVG conservation", + paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/README.md b/openproblems/tasks/_batch_integration/batch_integration_graph/README.md index e3ab961389..f407e8d349 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/README.md @@ -17,7 +17,7 @@ sub-tasks for batch integration can be found for: * [corrected features](../batch_integration_feature/) This sub-task was taken from a [benchmarking study of data integration -methods](https://www.biorxiv.org/content/10.1101/2020.05.22.111161v2). +methods](https://openproblems.bio/bibliography#luecken2022benchmarking). ## The metrics diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py index e95153e25a..8a3236795f 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py @@ -14,7 +14,7 @@ def _set_uns(adata): @method( method_name="No Integration", paper_name="No Integration (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -65,7 +65,7 @@ def _random_embedding(partition): @method( method_name="Random Integration", paper_name="Random Integration (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -79,7 +79,7 @@ def random_integration(adata, test=False): @method( method_name="Random Integration by Celltype", paper_name="Random Integration by Celltype (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -96,7 +96,7 @@ def celltype_random_integration(adata, test=False): @method( method_name="Random Integration by Batch", paper_name="Random Integration by Batch (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -113,7 +113,7 @@ def batch_random_integration(adata, test=False): @method( method_name="Random Graph by Celltype", paper_name="Random Graph by Celltype (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py index 97570dccd8..0190b60b4a 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py @@ -7,7 +7,7 @@ _bbknn_method = functools.partial( method, paper_name="BBKNN: fast batch alignment of single 
cell transcriptomes", - paper_url="https://academic.oup.com/bioinformatics/article/36/3/964/5545955", + paper_reference="polanski2020bbknn", paper_year=2020, code_url="https://github.com/Teichlab/bbknn", image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py index 96e53538d3..1cce2908b4 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py @@ -7,7 +7,7 @@ method, paper_name="Adjusting batch effects in microarray expression data using " "empirical Bayes methods", - paper_url="https://academic.oup.com/biostatistics/article/8/1/118/252073", + paper_reference="hansen2012removing", paper_year=2007, code_url="https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html", image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/fastmnn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/fastmnn.py index 1391d076e9..cd617d975f 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/fastmnn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/fastmnn.py @@ -8,7 +8,7 @@ _fastmnn_method = functools.partial( method, paper_name="A description of the theory behind the fastMNN algorithm", - paper_url="https://marionilab.github.io/FurtherMNN2018/theory/description.html", + paper_reference="lun2019fastmnn", paper_year=2019, code_url="https://doi.org/doi:10.18129/B9.bioc.batchelor", image="openproblems-r-extras", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/harmony.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/harmony.py index 90edc8d495..981e46e739 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/harmony.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/harmony.py @@ -8,7 +8,7 @@ method, paper_name="Fast, sensitive and accurate integration " "of single-cell data with Harmony", - paper_url="https://www.nature.com/articles/s41592-019-0619-0", + paper_reference="korsunsky2019fast", paper_year=2019, code_url="https://github.com/lilab-bcb/harmony-pytorch", image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/liger.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/liger.py index 6617f586ed..5077e9f34e 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/liger.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/liger.py @@ -9,7 +9,7 @@ method, paper_name="Single-Cell Multi-omic Integration Compares and " "Contrasts Features of Brain Cell Identity", - paper_url="https://doi.org/10.1016/j.cell.2019.05.006", + paper_reference="welch2019single", paper_year=2019, code_url="https://github.com/welch-lab/liger", image="openproblems-r-extras", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py index 4d444b4950..48919504f2 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py @@ -7,7 +7,7 @@ method, 
paper_name="Batch effects in single-cell RNA-sequencing " "data are corrected by matching mutual nearest neighbors", - paper_url="https://www.nature.com/articles/nbt.4091", + paper_reference="haghverdi2018batch", paper_year=2018, code_url="https://github.com/chriscainx/mnnpy", image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py index 461ea04a94..796a786375 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py @@ -8,7 +8,7 @@ method, paper_name="Online single-cell data integration through projecting heterogeneous " "datasets into a common cell-embedding space", - paper_url="https://doi.org/10.1038/s41467-022-33758-z", + paper_reference="xiong2021online", paper_year=2022, code_url="https://github.com/jsxlei/SCALEX", image="openproblems-python-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py index db9aed5caa..e04c305cd7 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py @@ -7,7 +7,7 @@ method, paper_name="Efficient integration of heterogeneous single-cell " "transcriptomes using Scanorama", - paper_url="https://www.nature.com/articles/s41587-019-0113-3", + paper_reference="hie2019efficient", paper_year=2019, code_url="https://github.com/brianhie/scanorama", image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py index 8f98a3c931..91e3c92722 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py @@ -8,7 +8,7 @@ method, paper_name="Probabilistic harmonization and annotation of single‐cell " "transcriptomics data with deep generative models", - paper_url="https://doi.org/10.15252/msb.20209620", + paper_reference="xu2021probabilistic", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py index 35f1cd7ac5..26d5f1d0d0 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py @@ -7,7 +7,7 @@ _scvi_method = functools.partial( method, paper_name="Deep generative modeling for single-cell transcriptomics", - paper_url="https://www.nature.com/articles/s41592-018-0229-2", + paper_reference="lopez2018deep", paper_year=2018, code_url="https://github.com/YosefLab/scvi-tools", image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py index 0d082fff44..e6b69a228c 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py +++ 
b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py @@ -16,6 +16,7 @@ @metric( metric_name="ARI", maximize=True, + paper_reference="luecken2022benchmarking", image="openproblems-r-pytorch", ) def ari(adata): diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py index 52dd7c44b2..1d7a7780c1 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py @@ -21,6 +21,7 @@ @metric( metric_name="Graph connectivity", + paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py index 71cd7ca209..0caac73f2b 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py @@ -26,6 +26,7 @@ @metric( metric_name="Isolated label F1", + paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py index 3356507b2e..0bce4a5eaf 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py @@ -14,6 +14,7 @@ @metric( metric_name="NMI", + paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", ) diff --git a/openproblems/tasks/_cell_cell_communication/README.md b/openproblems/tasks/_cell_cell_communication/README.md index 3d40f17c0a..34a89cf32c 100644 --- a/openproblems/tasks/_cell_cell_communication/README.md +++ b/openproblems/tasks/_cell_cell_communication/README.md @@ -10,8 +10,8 @@ Different tools propose distinct preprocessing steps with diverse scoring functions, that are challenging to compare and evaluate. Furthermore, each tool typically comes with its own set of prior knowledge. To harmonize these, [Dimitrov et -al, 2022](https://doi.org/10.1038/s41467-022-30755-0) recently developed the -[LIANA](https://github.com/saezlab/liana) framework, which was used +al, 2022](https://openproblems.bio/bibliography#dimitrov2022comparison) recently +developed the [LIANA](https://github.com/saezlab/liana) framework, which was used as a foundation for this task. 
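Looping back to the clustering-agreement metrics (ARI, NMI) patched just above: a minimal sketch of how such scores can be computed, assuming the integrated result already carries a neighbors graph on `adata` and ground-truth cell types in `adata.obs["labels"]`. The task's actual implementation (via scib, with an optimized resolution sweep) may differ; this is illustrative only.

```python
# Minimal sketch of a clustering-agreement score in the spirit of the ARI/NMI
# metrics above. Assumes a neighbors graph has already been computed on the
# integrated data and ground-truth cell types live in adata.obs["labels"].
import scanpy as sc
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

def clustering_agreement(adata, resolution=1.0):
    sc.tl.leiden(adata, resolution=resolution, key_added="clusters")
    return {
        "ARI": adjusted_rand_score(adata.obs["labels"], adata.obs["clusters"]),
        "NMI": normalized_mutual_info_score(adata.obs["labels"], adata.obs["clusters"]),
    }
```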
The challenges in evaluating the tools are further exacerbated by the diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py b/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py index ff5939a11f..27d950c233 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py +++ b/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py @@ -8,7 +8,7 @@ @method( method_name="Random Events", paper_name="Random Events (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -39,7 +39,7 @@ def random_events(adata, test=False, n_events=1000): @method( method_name="True Events", paper_name="True Events (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py index 522d5c330f..87856a10a6 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py +++ b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py @@ -25,7 +25,7 @@ def _p_filt(x, y): method, paper_name="Comparison of methods and resources for cell-cell " "communication inference from single-cell RNA-Seq data", - paper_url="https://www.nature.com/articles/s41467-022-30755-0", + paper_reference="dimitrov2022comparison", paper_year=2022, code_url="https://github.com/saezlab/liana", image="openproblems-r-extras", @@ -109,7 +109,7 @@ def magnitude_sum(adata, test=False): method, paper_name="CellPhoneDB: inferring cell–cell communication from " "combined expression of multi-subunit ligand–receptor complexes", - paper_url="https://www.nature.com/articles/s41596-020-0292-x", + paper_reference="efremova2020cellphonedb", paper_year=2020, code_url="https://github.com/saezlab/liana", image="openproblems-r-extras", @@ -156,7 +156,7 @@ def cellphonedb_sum(adata, test=False): method, paper_name="Computation and visualization of cell–cell signaling " "topologies in single-cell systems data using Connectome", - paper_url="https://www.nature.com/articles/s41598-022-07959-x", + paper_reference="raredon2022computation", paper_year=2022, code_url="https://github.com/saezlab/liana", image="openproblems-r-extras", @@ -214,7 +214,7 @@ def logfc_sum(adata, test=False): _natmi_method = functools.partial( method, paper_name="Predicting cell-to-cell communication networks using NATMI", - paper_url="https://www.nature.com/articles/s41467-020-18873-z", + paper_reference="hou2020predicting", paper_year=2021, code_url="https://github.com/saezlab/liana", image="openproblems-r-extras", @@ -249,7 +249,7 @@ def natmi_sum(adata, test=False): method, paper_name="SingleCellSignalR: inference of intercellular networks " "from single-cell transcriptomics", - paper_url="https://academic.oup.com/nar/article/48/10/e55/5810485", + paper_reference="cabello2020singlecellsignalr", paper_year=2021, code_url="https://github.com/saezlab/liana", image="openproblems-r-extras", diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py index 9365998a30..593f537041 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py +++ 
b/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py @@ -2,7 +2,9 @@ from ..utils import join_truth_and_pred -@metric(metric_name="Precision-recall AUC", maximize=True) +@metric( + metric_name="Precision-recall AUC", paper_reference="davis2006prauc", maximize=True +) def auprc(adata): from sklearn.metrics import auc from sklearn.metrics import precision_recall_curve diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py index 084669d110..5cce8bb1fb 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py +++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py @@ -8,7 +8,7 @@ def _sigmoid_transform(x): return 1 - 1 / (1 + x / 2) -@metric(metric_name="Odds Ratio", maximize=True) +@metric(metric_name="Odds Ratio", paper_reference="bland2000odds", maximize=True) def odds_ratio(adata, top_prop=0.05): # Join benchmark (assumed truth) and ccc results # Get /w ccc_target and a response [0, 1] column diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md index f1c19872cf..98634fb384 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md @@ -10,8 +10,8 @@ Different tools propose distinct preprocessing steps with diverse scoring functions, that are challenging to compare and evaluate. Furthermore, each tool typically comes with its own set of prior knowledge. To harmonize these, [Dimitrov et -al, 2022](https://doi.org/10.1038/s41467-022-30755-0) recently developed the -[LIANA](https://github.com/saezlab/liana) framework, which was used +al, 2022](https://openproblems.bio/bibliography#dimitrov2022comparison) recently +developed the [LIANA](https://github.com/saezlab/liana) framework, which was used as a foundation for this task. The challenges in evaluating the tools are further exacerbated by the diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md index 1d8f0e9ef7..7f8fb76cae 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md @@ -10,8 +10,8 @@ Different tools propose distinct preprocessing steps with diverse scoring functions, that are challenging to compare and evaluate. Furthermore, each tool typically comes with its own set of prior knowledge. To harmonize these, [Dimitrov et -al, 2022](https://doi.org/10.1038/s41467-022-30755-0) recently developed the -[LIANA](https://github.com/saezlab/liana) framework, which was used +al, 2022](https://openproblems.bio/bibliography#dimitrov2022comparison) recently +developed the [LIANA](https://github.com/saezlab/liana) framework, which was used as a foundation for this task. 
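As a minimal sketch of the precision-recall AUC metric patched above (the odds-ratio metric follows the same pattern, with its own transform): given a 0/1 `response` marking assumed-true interactions and a per-event `score` produced by a method, the metric reduces to sklearn's PR curve. The toy vectors below are stand-ins; the `join_truth_and_pred` bookkeeping from the task is omitted.

```python
# Minimal sketch of the PR-AUC computation used by the `auprc` metric above.
# `response` (0/1 assumed-true interactions) and `score` (a method's ranking
# of ligand-receptor events) are toy stand-ins.
import numpy as np
from sklearn.metrics import auc, precision_recall_curve

response = np.array([1, 0, 1, 1, 0, 0])
score = np.array([0.9, 0.4, 0.8, 0.35, 0.5, 0.1])

precision, recall, _ = precision_recall_curve(response, score)
print(auc(recall, precision))  # area under the PR curve; higher is better
```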
The challenges in evaluating the tools are further exacerbated by the diff --git a/openproblems/tasks/denoising/README.md b/openproblems/tasks/denoising/README.md index 9bfe698781..6a5eb5e1b3 100644 --- a/openproblems/tasks/denoising/README.md +++ b/openproblems/tasks/denoising/README.md @@ -5,30 +5,30 @@ Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present in each cell. As a result, the measurements (UMI counts) observed for each gene and each cell are associated with generally high levels of technical noise ([Grün et al., -2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of -estimating the true expression level of each gene in each cell. In the single-cell -literature, this task is also referred to as *imputation*, a term which is typically -used for missing data problems in statistics. Similar to the use of the terms "dropout", -"missing data", and "technical zeros", this terminology can create confusion about the -underlying measurement process ([Sarkar and Stephens, -2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)). +2014](https://openproblems.bio/bibliography#grn2014validation)). Denoising describes the +task of estimating the true expression level of each gene in each cell. In the +single-cell literature, this task is also referred to as *imputation*, a term which is +typically used for missing data problems in statistics. Similar to the use of the terms +"dropout", "missing data", and "technical zeros", this terminology can create confusion +about the underlying measurement process ([Sarkar and Stephens, +2021](https://openproblems.bio/bibliography#sarkar2021separating)). A key challenge in evaluating denoising methods is the general lack of a ground truth. A recent benchmark study ([Hou et al., -2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x)) +2020](https://openproblems.bio/bibliography#hou2020systematic)) relied on flow-sorted datasets, mixture control experiments ([Tian et al., -2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk -RNA-Seq data. Since each of these approaches suffers from specific limitations, it is -difficult to combine these different approaches into a single quantitative measure of +2019](https://openproblems.bio/bibliography#tian2019benchmarking)), and comparisons with +bulk RNA-Seq data. Since each of these approaches suffers from specific limitations, it +is difficult to combine these different approaches into a single quantitative measure of denoising accuracy. Here, we instead rely on an approach termed molecular cross-validation (MCV), which was specifically developed to quantify denoising accuracy in the absence of a ground truth ([Batson et al., -2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules -in a given scRNA-Seq dataset are first partitioned between a *training* and a *test* -dataset. Next, a denoising method is applied to the training dataset. Finally, denoising -accuracy is measured by comparing the result to the test dataset. The authors show that -both in theory and in practice, the measured denoising accuracy is representative of the -accuracy that would be obtained on a ground truth dataset. +2019](https://openproblems.bio/bibliography#batson2019molecular)). In MCV, the observed +molecules in a given scRNA-Seq dataset are first partitioned between a *training* and a +*test* dataset. Next, a denoising method is applied to the training dataset. 
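A minimal sketch of this train/test split of molecules, assuming raw integer UMI counts (the task itself relies on the molecular-cross-validation package rather than this toy partition):

```python
# Minimal sketch of a molecular cross-validation split: every observed UMI is
# assigned binomially to either a training or a test matrix. Assumes raw
# integer counts; a toy per-gene-mean "denoiser" stands in for a real method.
import numpy as np

rng = np.random.default_rng(0)
counts = rng.poisson(2.0, size=(100, 50))   # stand-in for adata.X (cells x genes)

train = rng.binomial(counts, 0.9)           # ~90% of molecules go to training
test = counts - train                       # the held-out molecules

denoised = np.broadcast_to(train.mean(axis=0), train.shape)  # toy denoiser
scaled = denoised * test.sum() / train.sum()  # rescale to the test sequencing depth
mcv_mse = ((scaled - test) ** 2).mean()       # compare denoised training data to test
```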
Finally, +denoising accuracy is measured by comparing the result to the test dataset. The authors +show that both in theory and in practice, the measured denoising accuracy is +representative of the accuracy that would be obtained on a ground truth dataset. ## The metrics diff --git a/openproblems/tasks/denoising/methods/alra.py b/openproblems/tasks/denoising/methods/alra.py index a4d7058e3a..d1946395e5 100644 --- a/openproblems/tasks/denoising/methods/alra.py +++ b/openproblems/tasks/denoising/methods/alra.py @@ -12,7 +12,7 @@ method_name="ALRA", paper_name="Zero-preserving imputation of scRNA-seq data using " "low-rank approximation", - paper_url="https://doi.org/10.1101/397588", + paper_reference="linderman2018zero", paper_year=2018, code_url="https://github.com/KlugerLab/ALRA", image="openproblems-r-extras", diff --git a/openproblems/tasks/denoising/methods/baseline.py b/openproblems/tasks/denoising/methods/baseline.py index b31f0781fa..79a03fa859 100644 --- a/openproblems/tasks/denoising/methods/baseline.py +++ b/openproblems/tasks/denoising/methods/baseline.py @@ -5,7 +5,7 @@ @method( method_name="No denoising", paper_name="Molecular Cross-Validation for Single-Cell RNA-seq", - paper_url="https://doi.org/10.1101/786269", + paper_reference="batson2019molecular", paper_year=2019, code_url="https://github.com/czbiohub/molecular-cross-validation", is_baseline=True, @@ -20,7 +20,7 @@ def no_denoising(adata, test=False): @method( method_name="Perfect denoising", paper_name="Molecular Cross-Validation for Single-Cell RNA-seq", - paper_url="https://doi.org/10.1101/786269", + paper_reference="batson2019molecular", paper_year=2019, code_url="https://github.com/czbiohub/molecular-cross-validation", is_baseline=True, diff --git a/openproblems/tasks/denoising/methods/dca.py b/openproblems/tasks/denoising/methods/dca.py index 458f9ecb05..3f9d9cb5cf 100644 --- a/openproblems/tasks/denoising/methods/dca.py +++ b/openproblems/tasks/denoising/methods/dca.py @@ -27,7 +27,7 @@ def _dca(adata, test=False, epochs=None): @method( method_name="DCA", paper_name="Single-cell RNA-seq denoising using a deep count autoencoder", - paper_url="https://www.nature.com/articles/s41467-018-07931-2", + paper_reference="https://www.nature.com/articles/s41467-018-07931-2", paper_year=2019, code_url="https://github.com/theislab/dca", image="openproblems-python-tensorflow", diff --git a/openproblems/tasks/denoising/methods/knn_smoothing.py b/openproblems/tasks/denoising/methods/knn_smoothing.py index dd5da6e941..31cd4d1d7a 100644 --- a/openproblems/tasks/denoising/methods/knn_smoothing.py +++ b/openproblems/tasks/denoising/methods/knn_smoothing.py @@ -6,7 +6,7 @@ method_name="Iterative KNN smoothing", paper_name="K-nearest neighbor smoothing for high-throughput " "single-cell RNA-Seq data", - paper_url="https://doi.org/10.1101/217737", + paper_reference="wagner2018knearest", paper_year=2018, code_url="https://github.com/yanailab/knn-smoothing", image="openproblems-python-extras", diff --git a/openproblems/tasks/denoising/methods/magic.py b/openproblems/tasks/denoising/methods/magic.py index 53a6f437bb..68fcfd9e38 100644 --- a/openproblems/tasks/denoising/methods/magic.py +++ b/openproblems/tasks/denoising/methods/magic.py @@ -9,7 +9,7 @@ method, paper_name="Recovering Gene Interactions from Single-Cell Data " "Using Data Diffusion", - paper_url="https://doi.org/10.1016/j.cell.2018.05.061", + paper_reference="https://doi.org/10.1016/j.cell.2018.05.061", paper_year=2018, code_url="https://github.com/KrishnaswamyLab/MAGIC", 
image="openproblems-python-extras", @@ -62,7 +62,7 @@ def magic_approx(adata, test=False): @method( method_name="KNN smoothing", paper_name="KNN Smoothing (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", image="openproblems-python-extras", diff --git a/openproblems/tasks/denoising/metrics/mse.py b/openproblems/tasks/denoising/metrics/mse.py index 4a663ed6e6..104aec9fc8 100644 --- a/openproblems/tasks/denoising/metrics/mse.py +++ b/openproblems/tasks/denoising/metrics/mse.py @@ -1,7 +1,11 @@ from ....tools.decorators import metric -@metric(metric_name="Mean-squared error", maximize=False) +@metric( + metric_name="Mean-squared error", + paper_reference="batson2019molecular", + maximize=False, +) def mse(adata): import anndata import scanpy as sc diff --git a/openproblems/tasks/denoising/metrics/poisson.py b/openproblems/tasks/denoising/metrics/poisson.py index 93db71eee2..f1460627a3 100644 --- a/openproblems/tasks/denoising/metrics/poisson.py +++ b/openproblems/tasks/denoising/metrics/poisson.py @@ -1,7 +1,12 @@ from ....tools.decorators import metric -@metric(metric_name="Poisson loss", maximize=False, image="openproblems-python-pytorch") +@metric( + metric_name="Poisson loss", + paper_reference="batson2019molecular", + maximize=False, + image="openproblems-python-pytorch", +) def poisson(adata): from molecular_cross_validation.mcv_sweep import poisson_nll_loss diff --git a/openproblems/tasks/dimensionality_reduction/README.md b/openproblems/tasks/dimensionality_reduction/README.md index 3d3bf13cd4..0ad3d2f5d6 100644 --- a/openproblems/tasks/dimensionality_reduction/README.md +++ b/openproblems/tasks/dimensionality_reduction/README.md @@ -9,8 +9,8 @@ other functional elements encoded in mRNA such as lncRNAs). Since its inception, scRNA-seq experiments have been growing in terms of the number of cells measured. Originally, cutting-edge SmartSeq experiments would yield a few hundred cells, at best. Now, it is not uncommon to see experiments that yield over [100,000 -cells]() or even [> 1 million -cells.](https://doi.org/10.1126/science.aba7721) +cells](https://openproblems.bio/bibliography#tabula2018single) or even [> 1 million +cells.](https://openproblems.bio/bibliography#cao2020human) Each *feature* in a dataset functions as a single dimension. While each of the ~30,000 dimensions measured in each cell contribute to an underlying data structure, the overall @@ -31,13 +31,14 @@ data for visualization and interpretation. distances on the [Laplacian Eigenmap](http://dx.doi.org/10.1162/089976603321780317)). * **Trustworthiness**: a measurement of similarity between the rank of each point's nearest neighbors in the high-dimensional data and the reduced data ([Venna & Kaski, - 2001](http://dx.doi.org/10.1007/3-540-44668-0_68)). + 2001](https://openproblems.bio/bibliography#venna2001neighborhood)). * **Density preservation**: similarity between local densities in the high-dimensional data and the reduced data ([Narayan, Berger & Cho, - 2020](https://doi.org/10.1038/s41587-020-00801-7)) + 2020](https://openproblems.bio/bibliography#narayan2021assessing)) * **NN Ranking**: a set of metrics from - [pyDRMetrics](https://doi.org/10.17632/jbjd5fmggh.2) relating to the preservation - of nearest neighbors in the high-dimensional data and the reduced data. 
+ [pyDRMetrics](https://openproblems.bio/bibliography#zhang2021pydrmetrics) relating to + the preservation of nearest neighbors in the high-dimensional data and the reduced + data. ## API diff --git a/openproblems/tasks/dimensionality_reduction/methods/baseline.py b/openproblems/tasks/dimensionality_reduction/methods/baseline.py index 1c9d4e3bb9..8e2c523cbc 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/baseline.py +++ b/openproblems/tasks/dimensionality_reduction/methods/baseline.py @@ -3,30 +3,30 @@ from ....tools.normalize import log_cpm_hvg from ....tools.utils import check_version +import functools import numpy as np - -@method( - method_name="Random Features", - paper_name="Random Features (baseline)", - paper_url="https://openproblems.bio", +_baseline_method = functools.partial( + method, + paper_name="Open Problems for Single Cell Analysis", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, ) + + +@_baseline_method( + method_name="Random Features", +) def random_features(adata, test=False): adata.obsm["X_emb"] = np.random.normal(0, 1, (adata.shape[0], 2)) adata.uns["method_code_version"] = check_version("openproblems") return adata -@method( +@_baseline_method( method_name="True Features", - paper_name="True Features (baseline)", - paper_url="https://openproblems.bio", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, ) def true_features(adata, test=False): adata.obsm["X_emb"] = adata.X @@ -38,13 +38,8 @@ def true_features(adata, test=False): return adata -@method( +@_baseline_method( method_name="True Features (logCPM)", - paper_name="True Features (baseline)", - paper_url="https://openproblems.bio", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, ) def true_features_log_cpm(adata, test=False): adata = log_cpm(adata) @@ -57,13 +52,8 @@ def true_features_log_cpm(adata, test=False): return adata -@method( +@_baseline_method( method_name="True Features (logCPM, 1kHVG)", - paper_name="True Features (baseline)", - paper_url="https://openproblems.bio", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, ) def true_features_log_cpm_hvg(adata, test=False): adata = log_cpm_hvg(adata) diff --git a/openproblems/tasks/dimensionality_reduction/methods/densmap.py b/openproblems/tasks/dimensionality_reduction/methods/densmap.py index a31567d3cc..d76e2375ff 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/densmap.py +++ b/openproblems/tasks/dimensionality_reduction/methods/densmap.py @@ -8,7 +8,7 @@ method, paper_name="Assessing single-cell transcriptomic variability through" " density-preserving data visualization", - paper_url="https://www.nature.com/articles/s41587-020-00801-7", + paper_reference="narayan2021assessing", paper_year=2021, code_url="https://github.com/lmcinnes/umap", image="openproblems-python-extras", diff --git a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py index 420132dbf2..fc8fcff2fd 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py +++ b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py @@ -15,7 +15,7 @@ paper_name="NeuralEE: A GPU-Accelerated Elastic Embedding " "Dimensionality Reduction Method for " "Visualizing Large-Scale scRNA-Seq Data", - 
paper_url="https://www.frontiersin.org/articles/10.3389/fgene.2020.00786/full", + paper_reference="xiong2020neuralee", paper_year=2020, code_url="https://github.com/HiBearME/NeuralEE", image="openproblems-python-pytorch", diff --git a/openproblems/tasks/dimensionality_reduction/methods/pca.py b/openproblems/tasks/dimensionality_reduction/methods/pca.py index 20a9d08902..aa3491e7cc 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/pca.py +++ b/openproblems/tasks/dimensionality_reduction/methods/pca.py @@ -6,7 +6,7 @@ @method( method_name="Principle Component Analysis (PCA) (logCPM, 1kHVG)", paper_name="On lines and planes of closest fit to systems of points in space", - paper_url="https://doi.org/10.1080/14786440109462720", + paper_reference="pearson1901pca", paper_year=1901, code_url="https://scikit-learn.org/stable/modules/generated/" "sklearn.decomposition.PCA.html", diff --git a/openproblems/tasks/dimensionality_reduction/methods/phate.py b/openproblems/tasks/dimensionality_reduction/methods/phate.py index 43009bd34d..496c5d93ba 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/phate.py +++ b/openproblems/tasks/dimensionality_reduction/methods/phate.py @@ -8,8 +8,9 @@ _phate_method = functools.partial( method, - paper_name="Visualizing Transitions and Structure for Biological Data Exploration", - paper_url="https://www.nature.com/articles/s41587-019-0336-3", + paper_name="Visualizing Structure and Transitions in High-Dimensional Biological" + " Data", + paper_reference="moon2019visualizing", paper_year=2019, code_url="https://github.com/KrishnaswamyLab/PHATE/", image="openproblems-python-extras", diff --git a/openproblems/tasks/dimensionality_reduction/methods/tsne.py b/openproblems/tasks/dimensionality_reduction/methods/tsne.py index 0a9bf7a44e..0e210804c3 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/tsne.py +++ b/openproblems/tasks/dimensionality_reduction/methods/tsne.py @@ -6,7 +6,7 @@ @method( method_name="“t-Distributed Stochastic Neighbor Embedding (t-SNE) (logCPM, 1kHVG)", paper_name="Visualizing Data using t-SNE", - paper_url="https://www.jmlr.org/papers/v9/vandermaaten08a.html", + paper_reference="vandermaaten2008visualizing", paper_year=2008, code_url="https://scikit-learn.org/stable/modules/generated/" "sklearn.manifold.TSNE.html#sklearn.manifold.TSNE", diff --git a/openproblems/tasks/dimensionality_reduction/methods/umap.py b/openproblems/tasks/dimensionality_reduction/methods/umap.py index 6a8f96bf43..ede81e50f3 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/umap.py +++ b/openproblems/tasks/dimensionality_reduction/methods/umap.py @@ -8,7 +8,7 @@ "as implemented by scanpy (logCPM, 1kHVG)", paper_name="UMAP: Uniform Manifold Approximation and Projection for " "Dimension Reduction", - paper_url="https://arxiv.org/abs/1802.03426", + paper_reference="mcinnes2018umap", paper_year=2018, code_url="https://github.com/lmcinnes/umap", ) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/density.py b/openproblems/tasks/dimensionality_reduction/metrics/density.py index 680fd00ec0..f782ae7cda 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/density.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/density.py @@ -96,7 +96,12 @@ def _calculate_radii( return np.log(epsilon + (re / mu_sum)) -@metric("density preservation", maximize=True, image="openproblems-python-extras") +@metric( + "density preservation", + paper_reference="narayan2021assessing", + maximize=True, + 
image="openproblems-python-extras", +) def density_preservation(adata: AnnData) -> float: from scipy.sparse import issparse from scipy.stats import pearsonr diff --git a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py index 9ee479dbb7..d7526d1ef7 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py @@ -180,32 +180,36 @@ def _fit( return T[_K], C[_K], QNN[_K], AUC, LCMC[_K], Qlocal, Qglobal -@metric("continuity", maximize=True) +@metric("continuity", paper_reference="zhang2021pydrmetrics", maximize=True) def continuity(adata: AnnData) -> float: _, C, _, *_ = _fit(_high_dim(adata), adata.obsm["X_emb"]) return float(np.clip(C, 0.0, 1.0)) # in [0, 1] -@metric("co-KNN size", maximize=True) +@metric("co-KNN size", paper_reference="zhang2021pydrmetrics", maximize=True) def qnn(adata: AnnData) -> float: _, _, QNN, *_ = _fit(_high_dim(adata), adata.obsm["X_emb"]) # normalized in the code to [0, 1] return float(np.clip(QNN, 0.0, 1.0)) -@metric("co-KNN AUC", maximize=True) +@metric("co-KNN AUC", paper_reference="zhang2021pydrmetrics", maximize=True) def qnn_auc(adata: AnnData) -> float: _, _, _, AUC, *_ = _fit(_high_dim(adata), adata.obsm["X_emb"]) return float(np.clip(AUC, 0.5, 1.0)) # in [0.5, 1] -@metric("local continuity meta criterion", maximize=True) +@metric( + "local continuity meta criterion", + paper_reference="zhang2021pydrmetrics", + maximize=True, +) def lcmc(adata: AnnData) -> float: *_, LCMC, _, _ = _fit(_high_dim(adata), adata.obsm["X_emb"]) return LCMC -@metric("local property", maximize=True) +@metric("local property", paper_reference="zhang2021pydrmetrics", maximize=True) def qlocal(adata: AnnData) -> float: # according to authors, this is usually preferred to # qglobal, because human are more sensitive to nearer neighbors @@ -213,7 +217,7 @@ def qlocal(adata: AnnData) -> float: return Qlocal -@metric("global property", maximize=True) +@metric("global property", paper_reference="zhang2021pydrmetrics", maximize=True) def qglobal(adata: AnnData) -> float: *_, Qglobal = _fit(_high_dim(adata), adata.obsm["X_emb"]) return Qglobal diff --git a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py b/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py index 1089583bcd..b48fa09870 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py @@ -14,7 +14,11 @@ def _rmse(X, X_emb): return rmse -@metric(metric_name="RMSE", maximize=False) +@metric( + metric_name="RMSE", + maximize=False, + paper_reference="kruskal1964mds", +) def rmse(adata, n_svd=200): """Calculate the root mean squared error. 
@@ -29,7 +33,11 @@ def rmse(adata, n_svd=200): return _rmse(X, adata.obsm["X_emb"]) -@metric(metric_name="RMSE (spectral)", maximize=False) +@metric( + metric_name="RMSE (spectral)", + maximize=False, + paper_reference="coifman2006diffusion", +) def rmse_spectral(adata, n_comps=200): """Calculate the spectral root mean squared error diff --git a/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py b/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py index d308ef4486..157e71fb55 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py @@ -4,7 +4,11 @@ import numpy as np -@metric(metric_name="trustworthiness", maximize=True) +@metric( + metric_name="trustworthiness", + paper_reference="venna2001neighborhood", + maximize=True, +) def trustworthiness(adata: AnnData) -> float: from sklearn import manifold diff --git a/openproblems/tasks/label_projection/README.md b/openproblems/tasks/label_projection/README.md index 5a9dbc4951..0fb854c852 100644 --- a/openproblems/tasks/label_projection/README.md +++ b/openproblems/tasks/label_projection/README.md @@ -5,17 +5,17 @@ A major challenge for integrating single cell datasets is creating matching cell type annotations for each cell. One of the most common strategies for annotating cell types is referred to as -["cluster-then-annotate"](https://www.nature.com/articles/s41576-018-0088-9) whereby +["cluster-then-annotate"](https://openproblems.bio/bibliography#kiselev2019challenges) whereby cells are aggregated into clusters based on feature similarity and then manually characterized based on differential gene expression or previously identified marker genes. Recently, methods have emerged to build on this strategy and annotate cells -using [known marker genes](https://www.nature.com/articles/s41592-019-0535-3). However, +using [known marker genes](https://openproblems.bio/bibliography#pliner2019supervised). However, these strategies pose a difficulty for integrating atlas-scale datasets as the particular annotations may not match. To ensure that the cell type labels in newly generated datasets match existing reference datasets, some methods align cells to a previously annotated [reference -dataset](https://academic.oup.com/bioinformatics/article/35/22/4688/54802990) and then +dataset](https://openproblems.bio/bibliography#hou2019scmatch) and then _project_ labels from the reference to the new dataset. Here, we compare methods for annotation based on a reference dataset. 
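To make the projection step concrete, here is a minimal sketch of reference-to-query label transfer with a k-nearest-neighbor classifier (illustrative only; the function and variable names are not from this repository):

```python
from sklearn.neighbors import KNeighborsClassifier


def project_labels(X_ref, labels_ref, X_query, n_neighbors=15):
    # fit on the annotated reference cells ...
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(X_ref, labels_ref)
    # ... then project their labels onto the unannotated query cells
    return classifier.predict(X_query)
```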
The datasets diff --git a/openproblems/tasks/label_projection/methods/baseline.py b/openproblems/tasks/label_projection/methods/baseline.py index fd4d292815..513abc24b4 100644 --- a/openproblems/tasks/label_projection/methods/baseline.py +++ b/openproblems/tasks/label_projection/methods/baseline.py @@ -7,7 +7,7 @@ @method( method_name="Majority Vote", paper_name="Majority Vote (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", ) @@ -23,7 +23,7 @@ def majority_vote(adata, test=False): @method( method_name="Random Labels", paper_name="Random Labels (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -46,7 +46,7 @@ def random_labels(adata, test=False): @method( method_name="True Labels", paper_name="True Labels (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, diff --git a/openproblems/tasks/label_projection/methods/knn_classifier.py b/openproblems/tasks/label_projection/methods/knn_classifier.py index 62109c89b5..bc60082019 100644 --- a/openproblems/tasks/label_projection/methods/knn_classifier.py +++ b/openproblems/tasks/label_projection/methods/knn_classifier.py @@ -8,7 +8,7 @@ _knn_classifier_method = functools.partial( method, paper_name="Nearest neighbor pattern classification", - paper_url="https://doi.org/10.1109/TIT.1967.1053964", + paper_reference="cover1967nearest", paper_year=1967, code_url="https://scikit-learn.org/stable/modules/generated/" "sklearn.neighbors.KNeighborsClassifier.html", diff --git a/openproblems/tasks/label_projection/methods/logistic_regression.py b/openproblems/tasks/label_projection/methods/logistic_regression.py index 8c393b5387..6bed5d9758 100644 --- a/openproblems/tasks/label_projection/methods/logistic_regression.py +++ b/openproblems/tasks/label_projection/methods/logistic_regression.py @@ -8,7 +8,7 @@ _logistic_regression_method = functools.partial( method, paper_name="Applied Logistic Regression", - paper_url="https://books.google.com/books?id=64JYAwAAQBAJ", + paper_reference="hosmer2013applied", paper_year=2013, code_url="https://scikit-learn.org/stable/modules/generated/" "sklearn.linear_model.LogisticRegression.html", diff --git a/openproblems/tasks/label_projection/methods/mlp.py b/openproblems/tasks/label_projection/methods/mlp.py index 71d1dcea9b..294d487652 100644 --- a/openproblems/tasks/label_projection/methods/mlp.py +++ b/openproblems/tasks/label_projection/methods/mlp.py @@ -8,7 +8,7 @@ _mlp_method = functools.partial( method, paper_name="Connectionist learning procedures", - paper_url="https://doi.org/10.1016/0004-3702(89)90049-0", + paper_reference="hinton1989connectionist", paper_year=1990, code_url="https://scikit-learn.org/stable/modules/generated/" "sklearn.neural_network.MLPClassifier.html", diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 1cec698b27..b220b8bea8 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -7,7 +7,7 @@ method, paper_name="Probabilistic harmonization and annotation of single-cell" " transcriptomics data with deep generative models", - 
paper_url="https://doi.org/10.15252/msb.20209620", + paper_reference="xu2021probabilistic", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", image="openproblems-python-pytorch", @@ -16,7 +16,7 @@ _scanvi_scarches_method = functools.partial( method, paper_name="Query to reference single-cell integration with transfer learning", - paper_url="https://doi.org/10.1101/2020.07.16.205997", + paper_reference="lotfollahi2020query", paper_year=2021, code_url="https://github.com/YosefLab/scvi-tools", image="openproblems-python-pytorch", diff --git a/openproblems/tasks/label_projection/methods/seurat.py b/openproblems/tasks/label_projection/methods/seurat.py index 874f2c9b54..2871c06acf 100644 --- a/openproblems/tasks/label_projection/methods/seurat.py +++ b/openproblems/tasks/label_projection/methods/seurat.py @@ -13,7 +13,7 @@ @method( method_name="Seurat reference mapping (SCTransform)", paper_name="Integrated analysis of multimodal single-cell data", - paper_url="https://doi.org/10.1016/j.cell.2021.04.048", + paper_reference="hao2021integrated", paper_year=2021, code_url="https://github.com/satijalab/seurat", image="openproblems-r-extras", diff --git a/openproblems/tasks/label_projection/methods/xgboost.py b/openproblems/tasks/label_projection/methods/xgboost.py index b489f98bce..92ec3c7e6b 100644 --- a/openproblems/tasks/label_projection/methods/xgboost.py +++ b/openproblems/tasks/label_projection/methods/xgboost.py @@ -10,7 +10,7 @@ _xgboost_method = functools.partial( method, paper_name="XGBoost: A Scalable Tree Boosting System", - paper_url="https://doi.org/10.1145/2939672.2939785", + paper_reference="chen2016xgboost", paper_year=2016, code_url="https://xgboost.readthedocs.io/en/stable/index.html", ) diff --git a/openproblems/tasks/label_projection/metrics/accuracy.py b/openproblems/tasks/label_projection/metrics/accuracy.py index d86bf8ec48..bed7b3e4cf 100644 --- a/openproblems/tasks/label_projection/metrics/accuracy.py +++ b/openproblems/tasks/label_projection/metrics/accuracy.py @@ -3,7 +3,7 @@ import numpy as np -@metric(metric_name="Accuracy", maximize=True) +@metric(metric_name="Accuracy", paper_reference="grandini2020metrics", maximize=True) def accuracy(adata): import sklearn.preprocessing diff --git a/openproblems/tasks/label_projection/metrics/f1.py b/openproblems/tasks/label_projection/metrics/f1.py index e2e870a6ee..e893e588d0 100644 --- a/openproblems/tasks/label_projection/metrics/f1.py +++ b/openproblems/tasks/label_projection/metrics/f1.py @@ -16,11 +16,13 @@ def _f1(adata, average="weighted"): ) -@metric(metric_name="F1 score", maximize=True) +@metric(metric_name="F1 score", paper_reference="grandini2020metrics", maximize=True) def f1(adata): return _f1(adata, average="weighted") -@metric(metric_name="Macro F1 score", maximize=True) +@metric( + metric_name="Macro F1 score", paper_reference="grandini2020metrics", maximize=True +) def f1_macro(adata): return _f1(adata, average="macro") diff --git a/openproblems/tasks/multimodal_data_integration/README.md b/openproblems/tasks/multimodal_data_integration/README.md index 88884cdc74..fd7e32ad44 100644 --- a/openproblems/tasks/multimodal_data_integration/README.md +++ b/openproblems/tasks/multimodal_data_integration/README.md @@ -5,12 +5,12 @@ Cellular function is regulated by the complex interplay of different types of biological molecules (DNA, RNA, proteins, etc.), which determine the state of a cell. 
Several recently described technologies allow for simultaneous measurement of different aspects -of cellular state. For example, [sci-CAR](https://doi.org/10.1126/science.aau0730) +of cellular state. For example, [sci-CAR](https://openproblems.bio/bibliography#cao2018joint) jointly profiles RNA expression and chromatin accessibility on the same cell and -[CITE-seq](https://doi.org/10.1038/nmeth.4380) measures surface protein abundance and -RNA expression from each cell. These technologies enable us to better understand -cellular function, however datasets are still rare and there are tradeoffs that these -measurements make for to profile multiple modalities. +[CITE-seq](https://openproblems.bio/bibliography#stoeckius2017simultaneous) measures +surface protein abundance and RNA expression from each cell. These technologies enable +us to better understand cellular function; however, datasets are still rare and there are +tradeoffs that these measurements make in order to profile multiple modalities. Joint methods can be more expensive or lower throughput or more noisy than measuring a single modality at a time. Therefore it is useful to develop methods that are capable diff --git a/openproblems/tasks/multimodal_data_integration/methods/baseline.py b/openproblems/tasks/multimodal_data_integration/methods/baseline.py index 49e9a3c55e..01f682aedb 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/baseline.py +++ b/openproblems/tasks/multimodal_data_integration/methods/baseline.py @@ -8,7 +8,7 @@ @method( method_name="Random Features", paper_name="Random Features (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -30,7 +30,7 @@ def random_features(adata, test=False, n_svd=20): @method( method_name="True Features", paper_name="True Features (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, diff --git a/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py b/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py index c60b689ec3..65f1572ea1 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py +++ b/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py @@ -9,7 +9,7 @@ _harmonic_alignment_method = functools.partial( method, paper_name="Harmonic Alignment", - paper_url="https://doi.org/10.1137/1.9781611976236.36", + paper_reference="stanley2020harmonic", paper_year=2020, code_url="https://github.com/KrishnaswamyLab/harmonic-alignment", ) diff --git a/openproblems/tasks/multimodal_data_integration/methods/mnn.py b/openproblems/tasks/multimodal_data_integration/methods/mnn.py index 92eeeddf8f..c0bd27e501 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/mnn.py +++ b/openproblems/tasks/multimodal_data_integration/methods/mnn.py @@ -12,7 +12,7 @@ method, paper_name="Batch effects in single-cell RNA-sequencing data are corrected by " "matching mutual nearest neighbors", - paper_url="https://www.nature.com/articles/nbt.4091", + paper_reference="haghverdi2018batch", paper_year=2018, code_url="https://github.com/LTLA/batchelor", image="openproblems-r-extras", diff --git a/openproblems/tasks/multimodal_data_integration/methods/procrustes.py b/openproblems/tasks/multimodal_data_integration/methods/procrustes.py
index 82ce06fa72..55f8587a27 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/procrustes.py +++ b/openproblems/tasks/multimodal_data_integration/methods/procrustes.py @@ -6,7 +6,7 @@ @method( method_name="Procrustes", paper_name="Generalized Procrustes analysis", - paper_url="https://link.springer.com/content/pdf/10.1007/BF02291478.pdf", + paper_reference="gower1975generalized", paper_year=1975, code_url="https://docs.scipy.org/doc/scipy/reference/generated/" "scipy.spatial.procrustes.html", diff --git a/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py b/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py index 7a458ee386..3a50fbb178 100644 --- a/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py +++ b/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py @@ -3,7 +3,11 @@ import numpy as np -@metric(metric_name="kNN Area Under the Curve", maximize=True) +@metric( + metric_name="kNN Area Under the Curve", + paper_reference="stanley2020harmonic", + maximize=True, +) def knn_auc(adata, proportion_neighbors=0.1, n_svd=100): import sklearn.decomposition import sklearn.neighbors diff --git a/openproblems/tasks/multimodal_data_integration/metrics/mse.py b/openproblems/tasks/multimodal_data_integration/metrics/mse.py index ed8a1563a9..ffdfd1dfee 100644 --- a/openproblems/tasks/multimodal_data_integration/metrics/mse.py +++ b/openproblems/tasks/multimodal_data_integration/metrics/mse.py @@ -13,7 +13,11 @@ def _square(X): return scprep.utils.toarray(X) ** 2 -@metric(metric_name="Mean squared error", maximize=False) +@metric( + metric_name="Mean squared error", + paper_reference="lance2022multimodal", + maximize=False, +) def mse(adata): X = scprep.utils.toarray(adata.obsm["aligned"]) Y = scprep.utils.toarray(adata.obsm["mode2_aligned"]) diff --git a/openproblems/tasks/regulatory_effect_prediction/README.md b/openproblems/tasks/regulatory_effect_prediction/README.md index 1bd06f809c..0735377a26 100644 --- a/openproblems/tasks/regulatory_effect_prediction/README.md +++ b/openproblems/tasks/regulatory_effect_prediction/README.md @@ -1,10 +1,10 @@ # Chromatin accessibility prediction Chromatin accessibility prediction refers to the gene expression prediction of a cell or -cell type from ATAC-seq peaks. For a summary or all relevant models, see gene score -method in [Jeffrey M. Granja et -al.](https://www.biorxiv.org/content/10.1101/2020.04.28.066498v1), [Su Wang et -al.](https://pubmed.ncbi.nlm.nih.gov/24263090/) et al. +cell type from ATAC-seq peaks. For a summary of all relevant models, see the gene score +methods in [Jeffrey M. Granja et +al.](https://openproblems.bio/bibliography#granja2021archr) and [Su Wang et +al.](https://openproblems.bio/bibliography#wang2013target).
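As a rough illustration of the "gene score" idea, the following hypothetical sketch sums peak accessibility within a fixed window around each gene's transcription start site; all names and the windowing rule here are illustrative assumptions, and the models cited above are considerably more sophisticated:

```python
import numpy as np


def gene_score(peak_counts, peak_start, peak_end, tss, window=100_000):
    # peak_counts: cells-by-peaks count matrix; peak_start/peak_end: peak
    # coordinates on the gene's chromosome; tss: transcription start site
    near = (peak_end >= tss - window) & (peak_start <= tss + window)
    # sum accessibility over nearby peaks as a proxy for expression
    return np.asarray(peak_counts[:, near].sum(axis=1)).ravel()
```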
## API diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/baseline.py b/openproblems/tasks/regulatory_effect_prediction/methods/baseline.py index eb5afbfd71..a91bdc60d4 100644 --- a/openproblems/tasks/regulatory_effect_prediction/methods/baseline.py +++ b/openproblems/tasks/regulatory_effect_prediction/methods/baseline.py @@ -7,7 +7,7 @@ @method( method_name="Random Scores", paper_name="Random Scores (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -23,7 +23,7 @@ def random_scores(adata, test=False): @method( method_name="True Scores", paper_name="True Scores (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py index 1eeff4992e..5689c5a5a2 100644 --- a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py +++ b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py @@ -231,7 +231,7 @@ def _beta(adata, test=False, top_genes=None, threshold=1): method_name="BETA", paper_name="Target analysis by integration of transcriptome " "and ChIP-seq data with BETA", - paper_url="https://pubmed.ncbi.nlm.nih.gov/24263090/", + paper_reference="wang2013target", paper_year=2013, code_version="1.0", code_url="http://cistrome.org/BETA", diff --git a/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py b/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py index e586a15e8a..4993d58cff 100644 --- a/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py +++ b/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py @@ -26,11 +26,19 @@ def _correlation(adata, method="pearson"): return np.median(cors[~np.isnan(cors)]) -@metric(metric_name="Median Pearson correlation", maximize=True) +@metric( + metric_name="Median Pearson correlation", + paper_reference="schober2018correlation", + maximize=True, +) def pearson_correlation(adata): return _correlation(adata) -@metric(metric_name="Median Spearman correlation", maximize=True) +@metric( + metric_name="Median Spearman correlation", + paper_reference="schober2018correlation", + maximize=True, +) def spearman_correlation(adata): return _correlation(adata, method="spearman") diff --git a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py index e446b07687..3bdfb77031 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py +++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py @@ -6,7 +6,7 @@ "DestVI", data_url="https://github.com/romain-lopez/DestVI-reproducibility/" "blob/master/simulations/make_dataset.py", - data_reference="https://doi.org/10.1038/s41587-022-01272-8", + data_reference="lopez2022destvi", dataset_summary="scRNA-seq is generated based on learn NB parameters " "from the destVI manuscripts leveraging sparsePCA. 
Number of cells and " "cell types present in each spatial spot is computed via combination of " diff --git a/openproblems/tasks/spatial_decomposition/methods/baseline.py b/openproblems/tasks/spatial_decomposition/methods/baseline.py index 6c048326c5..727ec1bf3d 100644 --- a/openproblems/tasks/spatial_decomposition/methods/baseline.py +++ b/openproblems/tasks/spatial_decomposition/methods/baseline.py @@ -8,7 +8,7 @@ @method( method_name="Random Proportions", paper_name="Random Proportions (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, @@ -28,7 +28,7 @@ def random_proportions(adata, test=False): @method( method_name="True Proportions", paper_name="True Proportions (baseline)", - paper_url="https://openproblems.bio", + paper_reference="openproblems", paper_year=2022, code_url="https://github.com/openproblems-bio/openproblems", is_baseline=True, diff --git a/openproblems/tasks/spatial_decomposition/methods/cell2location.py b/openproblems/tasks/spatial_decomposition/methods/cell2location.py index 01c4e4d32d..3c55a44eeb 100644 --- a/openproblems/tasks/spatial_decomposition/methods/cell2location.py +++ b/openproblems/tasks/spatial_decomposition/methods/cell2location.py @@ -9,7 +9,7 @@ _cell2location_method = functools.partial( method, paper_name="Cell2location maps fine-grained cell types in spatial transcriptomics", - paper_url="https://doi.org/10.1038/s41587-021-01139-4", + paper_reference="kleshchevnikov2022cell2location", paper_year=2022, code_url="https://github.com/BayraktarLab/cell2location", image="openproblems-python-pytorch", diff --git a/openproblems/tasks/spatial_decomposition/methods/destvi.py b/openproblems/tasks/spatial_decomposition/methods/destvi.py index 4338a465fd..0c6ca2b7dd 100644 --- a/openproblems/tasks/spatial_decomposition/methods/destvi.py +++ b/openproblems/tasks/spatial_decomposition/methods/destvi.py @@ -8,7 +8,7 @@ method_name="DestVI", paper_name="DestVI identifies continuums of cell types in spatial " "transcriptomics data", - paper_url="https://doi.org/10.1038/s41587-022-01272-8", + paper_reference="lopez2022destvi", paper_year=2022, code_url="https://github.com/YosefLab/scvi-tools", image="openproblems-python-pytorch", diff --git a/openproblems/tasks/spatial_decomposition/methods/nmfreg.py b/openproblems/tasks/spatial_decomposition/methods/nmfreg.py index 0876cb37a1..dd5d0dfe91 100644 --- a/openproblems/tasks/spatial_decomposition/methods/nmfreg.py +++ b/openproblems/tasks/spatial_decomposition/methods/nmfreg.py @@ -9,7 +9,7 @@ method_name="NMF-reg", paper_name="Slide-seq: A scalable technology for measuring genome-wide" " expression at high spatial resolution", - paper_url="https://doi.org/10.1126/science.aaw1219", + paper_reference="rodriques2019slide", paper_year=2019, code_url="https://github.com/tudaga/NMFreg_tutorial", ) diff --git a/openproblems/tasks/spatial_decomposition/methods/nnls.py b/openproblems/tasks/spatial_decomposition/methods/nnls.py index d1caf7b532..23cfd7b5e1 100644 --- a/openproblems/tasks/spatial_decomposition/methods/nnls.py +++ b/openproblems/tasks/spatial_decomposition/methods/nnls.py @@ -10,7 +10,7 @@ @method( method_name="Non-Negative Least Squares", paper_name="Solving Least Squares Problems", - paper_url="https://doi.org/10.1137/1.9781611971217", + paper_reference="lawson1995solving", paper_year=1987, code_url="https://docs.scipy.org/doc/scipy/" "reference/generated/scipy.optimize.nnls.html", diff 
--git a/openproblems/tasks/spatial_decomposition/methods/rctd.py b/openproblems/tasks/spatial_decomposition/methods/rctd.py index 9b73c21c2c..50b6d8414c 100644 --- a/openproblems/tasks/spatial_decomposition/methods/rctd.py +++ b/openproblems/tasks/spatial_decomposition/methods/rctd.py @@ -13,7 +13,7 @@ @method( method_name="RCTD", paper_name="Robust decomposition of cell type mixtures in spatial transcriptomics", - paper_url="https://doi.org/10.1038/s41587-021-00830-w", + paper_reference="cable2021robust", paper_year=2020, code_url="https://github.com/dmcable/spacexr", image="openproblems-r-extras", diff --git a/openproblems/tasks/spatial_decomposition/methods/seuratv3.py b/openproblems/tasks/spatial_decomposition/methods/seuratv3.py index 98f1516089..0aaa92f4de 100644 --- a/openproblems/tasks/spatial_decomposition/methods/seuratv3.py +++ b/openproblems/tasks/spatial_decomposition/methods/seuratv3.py @@ -15,7 +15,7 @@ @method( method_name="SeuratV3", paper_name="Comprehensive Integration of Single-Cell Data", - paper_url="https://doi.org/10.1016/j.cell.2019.05.031", + paper_reference="stuart2019comprehensive", paper_year=2019, code_url="https://satijalab.org/seurat/archive/v3.2/spatial_vignette.html", image="openproblems-r-extras", diff --git a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py index f9c025319e..d046376daa 100644 --- a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py +++ b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py @@ -7,7 +7,7 @@ method_name="Stereoscope", paper_name="Single-cell and spatial transcriptomics enables probabilistic " "inference of cell type topography", - paper_url="https://doi.org/10.1038/s41587-022-01272-8", + paper_reference="andersson2020single", paper_year=2020, code_url="https://github.com/scverse/scvi-tools", image="openproblems-python-pytorch", diff --git a/openproblems/tasks/spatial_decomposition/methods/tangram.py b/openproblems/tasks/spatial_decomposition/methods/tangram.py index c05eb0339f..a1a9e92f82 100644 --- a/openproblems/tasks/spatial_decomposition/methods/tangram.py +++ b/openproblems/tasks/spatial_decomposition/methods/tangram.py @@ -7,7 +7,7 @@ method_name="Tangram", paper_name="Deep learning and alignment of spatially resolved single-cell " "transcriptomes with Tangram", - paper_url="https://doi.org/10.1038/s41592-021-01264-7", + paper_reference="biancalani2021deep", paper_year=2021, code_url="https://github.com/broadinstitute/Tangram", image="openproblems-python-pytorch", diff --git a/openproblems/tasks/spatial_decomposition/methods/vanillanmf.py b/openproblems/tasks/spatial_decomposition/methods/vanillanmf.py index e9ff3e4d16..d561ff14cf 100644 --- a/openproblems/tasks/spatial_decomposition/methods/vanillanmf.py +++ b/openproblems/tasks/spatial_decomposition/methods/vanillanmf.py @@ -8,7 +8,7 @@ method_name="Non-Negative Matrix Factorization (NMF)", paper_name="Fast local algorithms for large scale nonnegative " "matrix and tensor factorizations", - paper_url="https://doi.org/10.1587/transfun.E92.A.708", + paper_reference="cichocki2009fast", paper_year=2009, code_url="https://scikit-learn.org/stable/modules/generated/" "sklearn.decomposition.NMF.html", diff --git a/openproblems/tasks/spatial_decomposition/metrics/r2.py b/openproblems/tasks/spatial_decomposition/metrics/r2.py index 083f23ffde..13d7564ae5 100644 --- a/openproblems/tasks/spatial_decomposition/metrics/r2.py +++ 
b/openproblems/tasks/spatial_decomposition/metrics/r2.py @@ -1,7 +1,7 @@ from ....tools.decorators import metric -@metric(metric_name="r2", maximize=True) +@metric(metric_name="r2", maximize=True, paper_reference="miles2005rsquared") def r2(adata): import sklearn.metrics diff --git a/openproblems/tools/decorators.py b/openproblems/tools/decorators.py index 9157403414..c4b4872e3b 100644 --- a/openproblems/tools/decorators.py +++ b/openproblems/tools/decorators.py @@ -48,7 +48,7 @@ def _backport_code_version(apply_method, code_version): def method( method_name, paper_name, - paper_url, + paper_reference, paper_year, code_url, code_version=None, @@ -63,8 +63,8 @@ def method( Unique human readable name of the method paper_name : str Title of the seminal paper describing the method - paper_url : str - Link to the paper, preferably a DOI URL + paper_reference : str + BibTex key from `main.bib` referring to the paper paper_year : int Year the paper was published code_url : str @@ -84,7 +84,7 @@ def apply_method(*args, **kwargs): apply_method.metadata = dict( method_name=method_name, paper_name=paper_name, - paper_url=paper_url, + paper_reference=paper_reference, paper_year=paper_year, code_url=code_url, image=image, @@ -96,7 +96,7 @@ def apply_method(*args, **kwargs): return decorator -def metric(metric_name, maximize, image="openproblems"): +def metric(metric_name, maximize, paper_reference, image="openproblems"): """Decorate a metric function. Parameters @@ -108,6 +108,9 @@ def metric(metric_name, maximize, image="openproblems"): ---------- metric_name : str Unique human readable name of the metric + paper_reference : str + BibTex key from `main.bib` referring to the seminal paper in which the metric + was defined maximize : bool If True, the metric should be maximized. If False, it should be minimized. 
image : str, optional (default: "openproblems") @@ -121,7 +124,10 @@ def apply_metric(*args, **kwargs): return func(*args, **kwargs) apply_metric.metadata = dict( - metric_name=metric_name, maximize=maximize, image=image + metric_name=metric_name, + paper_reference=paper_reference, + maximize=maximize, + image=image, ) return apply_metric @@ -144,7 +150,8 @@ def dataset( data_url : str Link to the original source of the dataset data_reference : str - Link to the paper describing how the dataset was generated + BibTex key from `main.bib` referring to the paper describing how the dataset was + generated dataset_summary : str Short (<80 character) summary of the dataset image : str, optional (default: "openproblems") diff --git a/setup.py b/setup.py index 40e6c48955..f0083cc5c9 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,7 @@ "codecov==2.1.*", "parameterized==0.8.*", "requests==2.28.*", + "bibtexparser==1.4.*", ] version_py = os.path.join(os.path.dirname(__file__), "openproblems", "version.py") diff --git a/test/test_task_2_datasets.py b/test/test_task_2_datasets.py index f35aaa7713..2787554747 100644 --- a/test/test_task_2_datasets.py +++ b/test/test_task_2_datasets.py @@ -149,4 +149,4 @@ def test_dataset_metadata(dataset): assert isinstance(dataset.metadata["data_url"], str) assert utils.asserts.assert_url_accessible(dataset.metadata["data_url"]) assert isinstance(dataset.metadata["data_reference"], str) - assert utils.asserts.assert_url_accessible(dataset.metadata["data_reference"]) + assert utils.asserts.assert_valid_reference(dataset.metadata["data_reference"]) diff --git a/test/test_task_methods.py b/test/test_task_methods.py index 84f0c220bb..f92ce7d57d 100644 --- a/test/test_task_methods.py +++ b/test/test_task_methods.py @@ -1,6 +1,7 @@ import openproblems import os import parameterized +import utils.asserts import utils.docker import utils.git import utils.name @@ -60,7 +61,7 @@ def test_method_metadata(method): for attr in [ "method_name", "paper_name", - "paper_url", + "paper_reference", "paper_year", "code_url", "image", @@ -73,8 +74,8 @@ def test_method_metadata(method): assert isinstance(method.metadata["method_name"], str) assert isinstance(method.metadata["paper_name"], str) assert isinstance(method.metadata["paper_year"], int) - assert isinstance(method.metadata["paper_url"], str) - assert utils.asserts.assert_url_accessible(method.metadata["paper_url"]) + assert isinstance(method.metadata["paper_reference"], str) + assert utils.asserts.assert_valid_reference(method.metadata["paper_reference"]) assert isinstance(method.metadata["code_url"], str) assert utils.asserts.assert_url_accessible(method.metadata["code_url"]) assert isinstance(method.metadata["is_baseline"], bool) diff --git a/test/test_task_metrics.py b/test/test_task_metrics.py index 55075ad785..8232bb958b 100644 --- a/test/test_task_metrics.py +++ b/test/test_task_metrics.py @@ -1,5 +1,6 @@ import openproblems import parameterized +import utils.asserts import utils.docker import utils.name @@ -17,6 +18,8 @@ def test_metric_metadata(metric): assert isinstance(metric.metadata["metric_name"], str) assert isinstance(metric.metadata["image"], str) assert metric.metadata["image"].startswith("openproblems") + assert isinstance(metric.metadata["paper_reference"], str) + assert utils.asserts.assert_valid_reference(metric.metadata["paper_reference"]) @parameterized.parameterized.expand( diff --git a/test/utils/asserts.py b/test/utils/asserts.py index d5663eed29..06f85b69ab 100644 --- a/test/utils/asserts.py +++ 
b/test/utils/asserts.py @@ -2,12 +2,16 @@ import functools import numpy as np +import pathlib import scipy.sparse _REQUEST_HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) " "Gecko/20100101 Firefox/71.0" } +FILEPATH = pathlib.Path(__file__) + +_MISSING_DOIS = ["vandermaaten2008visualizing", "hosmer2013applied"] def assert_array_equal(X, Y): @@ -43,3 +47,22 @@ def assert_url_accessible(url): with requests.head(url, headers=_REQUEST_HEADERS) as response: assert _response_ok(response), (url, response.status_code) return True + + +@functools.lru_cache(None) +def _load_bibliography(): + import bibtexparser + + bib_path = FILEPATH.parents[2].joinpath("main.bib") + with open(bib_path, "r") as handle: + return bibtexparser.load(handle) + + +def assert_valid_reference(ref): + bib = _load_bibliography() + assert ref in bib.entries_dict + bibentry = bib.entries_dict[ref] + if not bibentry["ENTRYTYPE"] == "misc" or ref in _MISSING_DOIS: + assert "doi" in bibentry + assert assert_url_accessible(f"https://doi.org/{bibentry['doi']}") + return True diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py index 7fe20b9ab7..defab2fff1 100644 --- a/workflow/parse_nextflow.py +++ b/workflow/parse_nextflow.py @@ -337,7 +337,8 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): output = dict( name=dataset.metadata["dataset_name"], data_url=dataset.metadata["data_url"], - data_reference=dataset.metadata["data_reference"], + data_reference="https://openproblems.bio/" + f"bibliography#{dataset.metadata['data_reference']}", headers=dict( names=["Rank", "Name", "Mean score"], fixed=["Name", "Paper", "Library"] ), @@ -358,7 +359,8 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): result = { "Name": method.metadata["method_name"], "Paper": method.metadata["paper_name"], - "Paper URL": method.metadata["paper_url"], + "Paper URL": "https://openproblems.bio/" + f"bibliography#{method.metadata['paper_reference']}", "Year": method.metadata["paper_year"], "Library": method.metadata["code_url"], "Implementation": "https://github.com/openproblems-bio/openproblems/" From 6bf3124d2322cc225a4cb37b8c6e0bde91dd6d79 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 10 Jan 2023 01:03:34 +1100 Subject: [PATCH 196/266] fix typo in bibliography path (#774) --- .github/workflows/update_website_content.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_website_content.yml b/.github/workflows/update_website_content.yml index b1ddd4d027..cd270325b2 100644 --- a/.github/workflows/update_website_content.yml +++ b/.github/workflows/update_website_content.yml @@ -52,7 +52,7 @@ jobs: run: | rm -r website/content/benchmarks/*/ python openproblems/workflow/generate_website_markdown.py website/content/benchmarks - cp main.bib website/static/bibliography + cp openproblems/main.bib website/static/bibliography cd website git diff --exit-code --quiet || echo "CHANGED=true" >> $GITHUB_ENV From 80111a2fbcaf3ed125f8343fdaa09539f38f527c Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 10 Jan 2023 01:17:10 +1100 Subject: [PATCH 197/266] More bibliography typos (#775) * Fix bibliography typos * fix another typo Co-authored-by: Scott Gigante --- .pre-commit-config.yaml | 2 +- main.bib | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3ea32ca8c8..6efbfe6ee4 100644 --- 
a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,4 +43,4 @@ repos: rev: "8838095" hooks: - id: bibtex-tidy - args: ['--omit', 'abstract', '--sort', '--duplicates', '--drop-all-caps', '--sort-fields', '--trailing-commas'] + args: ['--omit', 'abstract', '--sort', '--duplicates', '--sort-fields', '--trailing-commas'] diff --git a/main.bib b/main.bib index c040c44913..836fe90a95 100644 --- a/main.bib +++ b/main.bib @@ -51,8 +51,8 @@ @article{bland2000odds author = {J. M. Bland}, year = {2000}, month = may, - journal = {{Bmj}}, - publisher = {{Bmj}}, + journal = {{BMJ}}, + publisher = {{BMJ}}, volume = {320}, number = {7247}, pages = {1468--1468}, @@ -824,8 +824,8 @@ @inproceedings{venna2001neighborhood booktitle = {Artificial Neural Networks {\textemdash} {ICANN} 2001}, publisher = {Springer Berlin Heidelberg}, pages = {485--491}, - doi = {{10.1007/3-540-44668-0\_68}}, - url = {{https://doi.org/10.1007/3-540-44668-0\_68}}, + doi = {10.1007/3-540-44668-0\_68}, + url = {https://doi.org/10.1007/3-540-44668-0\_68}, } @article{wagner2018knearest, title = {K-nearest neighbor smoothing for high-throughput single-cell RNA-Seq data}, From 14d70b330cae09527a6d4c4e552db240601e31cf Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 10 Jan 2023 20:00:01 +1100 Subject: [PATCH 198/266] Pre-normalize dimensionality reduction datasets (#768) * Convert dimensionality reduction to use log_cpm by default * bugfixes * more bugfixes * create fake counts for obsm normalizer * docs * document uns["n_genes"] * fix densmap ref * fix tests --- .../tasks/dimensionality_reduction/README.md | 27 +++--- .../dimensionality_reduction/__init__.py | 2 +- .../tasks/dimensionality_reduction/api.py | 12 ++- .../datasets/mouse_blood_olssen_labelled.py | 5 +- .../datasets/mouse_hspc_nestorowa2016.py | 5 +- .../datasets/tenx_5k_pbmc.py | 5 +- .../methods/__init__.py | 12 ++- .../methods/baseline.py | 3 +- .../methods/densmap.py | 44 ---------- .../methods/neuralee.py | 21 +++-- .../dimensionality_reduction/methods/pca.py | 32 +++++-- .../dimensionality_reduction/methods/phate.py | 33 +++++-- .../dimensionality_reduction/methods/tsne.py | 35 ++++++-- .../dimensionality_reduction/methods/umap.py | 88 ++++++++++++++++--- .../metrics/nn_ranking.py | 4 - .../metrics/root_mean_square_error.py | 2 +- openproblems/tools/decorators.py | 8 +- test/test_core_tasks.py | 2 +- test/test_task_dimensionality_reduction.py | 2 +- 19 files changed, 236 insertions(+), 106 deletions(-) delete mode 100644 openproblems/tasks/dimensionality_reduction/methods/densmap.py diff --git a/openproblems/tasks/dimensionality_reduction/README.md b/openproblems/tasks/dimensionality_reduction/README.md index 0ad3d2f5d6..5cc118bbf5 100644 --- a/openproblems/tasks/dimensionality_reduction/README.md +++ b/openproblems/tasks/dimensionality_reduction/README.md @@ -42,10 +42,18 @@ data for visualization and interpretation. ## API -**Datasets** should provide un-normalized raw counts in `adata.X`. +WARNING: unlike most tasks, `adata.X` should contain log CPM-normalized data. + This is because we are computing ground-truth metrics on normalized data, + which means methods that use this same normalization are likely to score more + highly on these metrics. + +**Datasets** should provide *log CPM normalized counts* in `adata.X` and store the +original number of genes (i.e., `adata.shape[1]`) in `adata.uns["n_genes"]`.
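The dataset loaders updated in this patch follow this contract directly; as a sketch, a conforming loader looks like the following (`load_my_data` and all metadata values are placeholders, not code from this repository):

```python
from ....data.my_data import load_my_data  # hypothetical loader
from ....tools.decorators import dataset
from ....tools.normalize import log_cpm


@dataset(
    "My dataset",
    data_url="https://example.com/my_data",  # placeholder
    data_reference="mykey2022mydata",  # placeholder BibTeX key in main.bib
    dataset_summary="A placeholder dataset conforming to the task API",
)
def my_dataset(test=False):
    adata = load_my_data(test=test)
    adata.uns["n_genes"] = adata.shape[1]  # record the original gene count
    return log_cpm(adata)  # adata.X now holds log CPM values
```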
**Methods** should assign dimensionally-reduced 2D embedding coordinates to -`adata.obsm['X_emb']`. +`adata.obsm['X_emb']`. They *should not* modify the dimensionality of `adata.X` (e.g., +by subsetting to highly variable features, which should be done on a local copy of the +data without modifying the AnnData object that is returned). **Metrics** should calculate the quality or "goodness of fit" of a dimensional reduction **method**. If the un-normalized input counts matrix is required by the matrix it can be @@ -57,14 +65,13 @@ Different methods can require different pre-processing of the data. Standard pre-processing functions are available as part of the `tools` module. Where possible each **method** should first call one of these functions and use the processed `adata.X` slot as the input to the method. Raw counts are also stored in `adata.layers["counts"]` -by the standard pre-processing functions, if a method performs its own pre-processing it -should also do this for use by metrics. For most methods a standard pre-processing with -the `log_cpm_hvg()` function is used which normalizes the expression matrix to counts -per million (CPM), performs a log transformation and annotates highly-variable -genes (HVGs) (as selected by scanpy's `high_variable_genes(adata, n_top_genes=1000, -flavor="cell_ranger")`) to `adata.var["highly_variable"]`. Variants of methods can be -created by applying different pre-processing prior to the method itself (see `phate.py` -for an example). +by the standard pre-processing functions in case a method performs its own pre-processing. +For most methods, the standard `log_cpm()` pre-processing, which normalizes the +expression matrix to counts per million (CPM), has already been applied and `adata.X` +can be used directly. Variants of methods can be created by applying different +pre-processing prior to the method itself (see `phate.py` for an example).
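To show that variant pattern in miniature, here is a sketch of two wrappers around one shared stub implementation; `_my_method` and both wrapper names are placeholders, and the sqrt-CPM variant restores the expected log CPM `adata.X` afterwards, mirroring the updated `phate.py` in this patch:

```python
from ....tools.normalize import log_cpm
from ....tools.normalize import sqrt_cpm


def _my_method(adata):
    # stub implementation: a real method would compute a 2D embedding here
    adata.obsm["X_emb"] = adata.X[:, :2]
    return adata


def my_method_logCPM(adata, test=False):
    # datasets already provide log CPM, so adata.X is used as-is
    return _my_method(log_cpm(adata))


def my_method_sqrtCPM(adata, test=False):
    adata = _my_method(sqrt_cpm(adata))
    return log_cpm(adata)  # restore the expected log CPM adata.X
```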
*Note that using a normalization method +different from that used for the metrics (log CPM) may lead to artificially poor method +performance.* ## The methods diff --git a/openproblems/tasks/dimensionality_reduction/__init__.py b/openproblems/tasks/dimensionality_reduction/__init__.py index c32292b3b7..12920cfa9e 100644 --- a/openproblems/tasks/dimensionality_reduction/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/__init__.py @@ -9,7 +9,7 @@ "Reduction of high-dimensional datasets to 2D for visualization & interpretation" ) -DEFAULT_LAYER = "counts" +DEFAULT_LAYER = "log_cpm" DATASETS = utils.get_callable_members(datasets) METHODS = utils.get_callable_members(methods) diff --git a/openproblems/tasks/dimensionality_reduction/api.py b/openproblems/tasks/dimensionality_reduction/api.py index d217c2df58..57d2a8645a 100644 --- a/openproblems/tasks/dimensionality_reduction/api.py +++ b/openproblems/tasks/dimensionality_reduction/api.py @@ -1,16 +1,23 @@ from ...data.sample import load_sample_data from ...tools.decorators import dataset +from ...tools.normalize import log_cpm import numpy as np def check_dataset(adata): """Check that dataset output fits expected API.""" + assert "n_genes" in adata.uns + assert adata.uns["n_genes"] == adata.shape[1] return True def check_method(adata, is_baseline=False): """Check that method output fits expected API.""" + # check adata.X has not changed + assert adata.uns["n_genes"] == adata.shape[1] + assert adata.X is adata.layers["log_cpm"] + # check output assert "X_emb" in adata.obsm if not is_baseline: assert adata.obsm["X_emb"].shape[1] == 2 @@ -21,7 +28,10 @@ def check_method(adata, is_baseline=False): @dataset() def sample_dataset(): """Create a simple dataset to use for testing methods in this task.""" - return load_sample_data() + adata = load_sample_data() + adata = log_cpm(adata) + adata.uns["n_genes"] = adata.shape[1] + return adata def sample_method(adata): diff --git a/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olssen_labelled.py b/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olssen_labelled.py index 563d3278af..d789477710 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olssen_labelled.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olssen_labelled.py @@ -1,5 +1,6 @@ from ....data.mouse_blood_olssen_labelled import load_olsson_2016_mouse_blood from ....tools.decorators import dataset +from ....tools.normalize import log_cpm @dataset( @@ -11,4 +12,6 @@ "660 cells x 112815 features with 4 cell type labels", ) def olsson_2016_mouse_blood(test=False): - return load_olsson_2016_mouse_blood(test=test) + adata = load_olsson_2016_mouse_blood(test=test) + adata.uns["n_genes"] = adata.shape[1] + return log_cpm(adata) diff --git a/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py b/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py index a4b7a783cd..1f24565835 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py @@ -1,5 +1,6 @@ from ....data.mouse_hspc_nestorowa2016 import load_mouse_hspc_nestorowa2016 from ....tools.decorators import dataset +from ....tools.normalize import log_cpm @dataset( @@ -11,4 +12,6 @@ "1920 cells x 43258 features with 3 cell type labels", ) def mouse_hspc_nestorowa2016(test=False): - return load_mouse_hspc_nestorowa2016(test=test) + 
adata = load_mouse_hspc_nestorowa2016(test=test) + adata.uns["n_genes"] = adata.shape[1] + return log_cpm(adata) diff --git a/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py b/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py index 758d0dc78b..d8487ee535 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py @@ -1,5 +1,6 @@ from ....data.tenx import load_tenx_5k_pbmc from ....tools.decorators import dataset +from ....tools.normalize import log_cpm @dataset( @@ -13,4 +14,6 @@ ), ) def tenx_5k_pbmc(test=False): - return load_tenx_5k_pbmc(test=test) + adata = load_tenx_5k_pbmc(test=test) + adata.uns["n_genes"] = adata.shape[1] + return log_cpm(adata) diff --git a/openproblems/tasks/dimensionality_reduction/methods/__init__.py b/openproblems/tasks/dimensionality_reduction/methods/__init__.py index 5f18c536af..e983901f78 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/methods/__init__.py @@ -2,13 +2,21 @@ from .baseline import true_features from .baseline import true_features_log_cpm from .baseline import true_features_log_cpm_hvg -from .densmap import densmap_logCPM_1kHVG -from .densmap import densmap_pca_logCPM_1kHVG from .neuralee import neuralee_default from .neuralee import neuralee_logCPM_1kHVG +from .pca import pca_logCPM from .pca import pca_logCPM_1kHVG from .phate import phate_default +from .phate import phate_logCPM from .phate import phate_logCPM_1kHVG from .phate import phate_sqrt +from .tsne import tsne_logCPM from .tsne import tsne_logCPM_1kHVG +from .umap import densmap_logCPM +from .umap import densmap_logCPM_1kHVG +from .umap import densmap_pca_logCPM +from .umap import densmap_pca_logCPM_1kHVG +from .umap import umap_logCPM from .umap import umap_logCPM_1kHVG +from .umap import umap_pca_logCPM +from .umap import umap_pca_logCPM_1kHVG diff --git a/openproblems/tasks/dimensionality_reduction/methods/baseline.py b/openproblems/tasks/dimensionality_reduction/methods/baseline.py index 8e2c523cbc..50b1a30acf 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/baseline.py +++ b/openproblems/tasks/dimensionality_reduction/methods/baseline.py @@ -57,8 +57,7 @@ def true_features_log_cpm(adata, test=False): ) def true_features_log_cpm_hvg(adata, test=False): adata = log_cpm_hvg(adata) - adata = adata[:, adata.var["highly_variable"]].copy() - adata.obsm["X_emb"] = adata.X + adata.obsm["X_emb"] = adata[:, adata.var["highly_variable"]].copy().X if test: adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100] diff --git a/openproblems/tasks/dimensionality_reduction/methods/densmap.py b/openproblems/tasks/dimensionality_reduction/methods/densmap.py deleted file mode 100644 index d76e2375ff..0000000000 --- a/openproblems/tasks/dimensionality_reduction/methods/densmap.py +++ /dev/null @@ -1,44 +0,0 @@ -from ....tools.decorators import method -from ....tools.normalize import log_cpm_hvg -from ....tools.utils import check_version - -import functools - -_densmap_method = functools.partial( - method, - paper_name="Assessing single-cell transcriptomic variability through" - " density-preserving data visualization", - paper_reference="narayan2021assessing", - paper_year=2021, - code_url="https://github.com/lmcinnes/umap", - image="openproblems-python-extras", -) - - -def _densmap(adata, obsm=None): - from umap import UMAP - - if obsm: - X = adata.obsm[obsm] - else: - X = 
adata.X - adata.obsm["X_emb"] = UMAP(densmap=True, random_state=42).fit_transform(X) - adata.uns["method_code_version"] = check_version("umap-learn") - return adata - - -@_densmap_method(method_name="densMAP (logCPM, 1kHVG)") -def densmap_logCPM_1kHVG(adata, test: bool = False): - adata = log_cpm_hvg(adata) - adata = adata[:, adata.var["highly_variable"]].copy() - return _densmap(adata) - - -@_densmap_method(method_name="densMAP PCA (logCPM, 1kHVG)") -def densmap_pca_logCPM_1kHVG(adata, test: bool = False): - import scanpy as sc - - adata = log_cpm_hvg(adata) - adata = adata[:, adata.var["highly_variable"]].copy() - sc.tl.pca(adata, n_comps=50, svd_solver="arpack") - return _densmap(adata, obsm="X_pca") diff --git a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py index fc8fcff2fd..cb445a52ac 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py +++ b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py @@ -49,6 +49,7 @@ def _create_neuralee_dataset( def _neuralee( adata, + genes=None, d: int = 2, test: bool = False, subsample_genes: Optional[int] = None, @@ -58,17 +59,22 @@ def _neuralee( import torch + if genes is not None: + adata_input = adata[:, genes].copy() + else: + adata_input = adata + # this can fail due to sparseness of data; if so, retry with more genes # note that this is a deviation from the true default behavior, which fails # see https://github.com/openproblems-bio/openproblems/issues/375 while True: try: dataset = _create_neuralee_dataset( - adata, normalize=normalize, subsample_genes=subsample_genes + adata_input, normalize=normalize, subsample_genes=subsample_genes ) except ValueError: - if subsample_genes is not None and subsample_genes < adata.n_vars: - subsample_genes = min(adata.n_vars, int(subsample_genes * 1.2)) + if subsample_genes is not None and subsample_genes < adata_input.n_vars: + subsample_genes = min(adata_input.n_vars, int(subsample_genes * 1.2)) log.warning( "ValueError in neuralee_default. 
" f"Increased subsample_genes to {subsample_genes}" @@ -97,5 +103,10 @@ def neuralee_default(adata: AnnData, test: bool = False) -> AnnData: @_neuralee_method(method_name="NeuralEE (CPU) (logCPM, 1kHVG)") def neuralee_logCPM_1kHVG(adata: AnnData, test: bool = False) -> AnnData: adata = log_cpm_hvg(adata) - adata = adata[:, adata.var["highly_variable"]].copy() - return _neuralee(adata, test=test, normalize=False, subsample_genes=None) + return _neuralee( + adata, + genes=adata.var["highly_variable"], + test=test, + normalize=False, + subsample_genes=None, + ) diff --git a/openproblems/tasks/dimensionality_reduction/methods/pca.py b/openproblems/tasks/dimensionality_reduction/methods/pca.py index aa3491e7cc..f87e7423f7 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/pca.py +++ b/openproblems/tasks/dimensionality_reduction/methods/pca.py @@ -1,22 +1,40 @@ from ....tools.decorators import method +from ....tools.normalize import log_cpm from ....tools.normalize import log_cpm_hvg from ....tools.utils import check_version +import functools -@method( - method_name="Principle Component Analysis (PCA) (logCPM, 1kHVG)", +_pca_method = functools.partial( + method, paper_name="On lines and planes of closest fit to systems of points in space", paper_reference="pearson1901pca", paper_year=1901, code_url="https://scikit-learn.org/stable/modules/generated/" "sklearn.decomposition.PCA.html", ) -def pca_logCPM_1kHVG(adata, test: bool = False): + + +def _pca(adata, genes=None): import scanpy as sc - adata = log_cpm_hvg(adata) - adata = adata[:, adata.var["highly_variable"]].copy() - sc.tl.pca(adata, n_comps=50, svd_solver="arpack") - adata.obsm["X_emb"] = adata.obsm["X_pca"][:, :2] + if genes is not None: + X = adata[:, genes].copy().X + else: + X = adata.X + + adata.obsm["X_emb"] = sc.tl.pca(X, n_comps=2, svd_solver="arpack") adata.uns["method_code_version"] = check_version("scikit-learn") return adata + + +@_pca_method(method_name="Principle Component Analysis (PCA) (logCPM)") +def pca_logCPM(adata, test: bool = False): + adata = log_cpm(adata) + return _pca(adata) + + +@_pca_method(method_name="Principle Component Analysis (PCA) (logCPM, 1kHVG)") +def pca_logCPM_1kHVG(adata, test: bool = False): + adata = log_cpm_hvg(adata) + return _pca(adata, genes=adata.var["highly_variable"]) diff --git a/openproblems/tasks/dimensionality_reduction/methods/phate.py b/openproblems/tasks/dimensionality_reduction/methods/phate.py index 496c5d93ba..c5d6817969 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/phate.py +++ b/openproblems/tasks/dimensionality_reduction/methods/phate.py @@ -1,4 +1,5 @@ from ....tools.decorators import method +from ....tools.normalize import log_cpm from ....tools.normalize import log_cpm_hvg from ....tools.normalize import sqrt_cpm from ....tools.utils import check_version @@ -17,7 +18,9 @@ ) -def _phate(adata, test: bool = False, n_pca: Optional[int] = None, gamma: float = 1): +def _phate( + adata, test: bool = False, genes=None, n_pca: Optional[int] = None, gamma: float = 1 +): from phate import PHATE if test: @@ -25,8 +28,13 @@ def _phate(adata, test: bool = False, n_pca: Optional[int] = None, gamma: float else: # pragma: no cover n_pca = n_pca or 100 + if genes is not None: + X = adata[:, genes].copy().X + else: + X = adata.X + phate_op = PHATE(n_pca=n_pca, verbose=False, n_jobs=-1, gamma=gamma) - adata.obsm["X_emb"] = phate_op.fit_transform(adata.X) + adata.obsm["X_emb"] = phate_op.fit_transform(X) adata.uns["method_code_version"] = check_version("phate") 
return adata @@ -34,17 +42,28 @@ def _phate(adata, test: bool = False, n_pca: Optional[int] = None, gamma: float @_phate_method(method_name="PHATE (default)") def phate_default(adata, test: bool = False, n_pca: Optional[int] = None): adata = sqrt_cpm(adata) - return _phate(adata, test=test, n_pca=n_pca) + adata = _phate(adata, test=test, n_pca=n_pca) + # revert to expected adata.X + adata = log_cpm(adata) + return adata @_phate_method(method_name="PHATE (gamma=0)") def phate_sqrt(adata, test: bool = False, n_pca: Optional[int] = None): adata = sqrt_cpm(adata) - return _phate(adata, test=test, n_pca=n_pca, gamma=0) + adata = _phate(adata, test=test, n_pca=n_pca, gamma=0) + # revert to expected adata.X + adata = log_cpm(adata) + return adata -@_phate_method(method_name="PHATE (logCPM, 1kHVG)") +@_phate_method(method_name="PHATE (logCPM)") def phate_logCPM_1kHVG(adata, test: bool = False, n_pca: Optional[int] = None): - adata = log_cpm_hvg(adata) - adata = adata[:, adata.var["highly_variable"]].copy() + adata = log_cpm(adata) return _phate(adata, test=test, n_pca=n_pca) + + +@_phate_method(method_name="PHATE (logCPM, 1kHVG)") +def phate_logCPM(adata, test: bool = False, n_pca: Optional[int] = None): + adata = log_cpm_hvg(adata) + return _phate(adata, test=test, genes=adata.var["highly_variable"], n_pca=n_pca) diff --git a/openproblems/tasks/dimensionality_reduction/methods/tsne.py b/openproblems/tasks/dimensionality_reduction/methods/tsne.py index 0e210804c3..bb636980f8 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/tsne.py +++ b/openproblems/tasks/dimensionality_reduction/methods/tsne.py @@ -1,10 +1,12 @@ from ....tools.decorators import method +from ....tools.normalize import log_cpm from ....tools.normalize import log_cpm_hvg from ....tools.utils import check_version +import functools -@method( - method_name="“t-Distributed Stochastic Neighbor Embedding (t-SNE) (logCPM, 1kHVG)", +_tsne_method = functools.partial( + method, paper_name="Visualizing Data using t-SNE", paper_reference="vandermaaten2008visualizing", paper_year=2008, @@ -12,13 +14,34 @@ "sklearn.manifold.TSNE.html#sklearn.manifold.TSNE", image="openproblems-python-extras", ) -def tsne_logCPM_1kHVG(adata, test: bool = False, n_pca=50): + + +def _tsne(adata, genes=None, test=False, n_pca=50): import scanpy as sc - adata = log_cpm_hvg(adata) - adata = adata[:, adata.var["highly_variable"]].copy() - sc.tl.pca(adata, n_comps=n_pca, svd_solver="arpack") + if genes is not None: + X = adata[:, genes].copy().X + else: + X = adata.X + + adata.obsm["X_pca"] = sc.tl.pca(X, n_comps=n_pca, svd_solver="arpack") sc.tl.tsne(adata, use_rep="X_pca", n_pcs=n_pca) adata.obsm["X_emb"] = adata.obsm["X_tsne"] adata.uns["method_code_version"] = check_version("MulticoreTSNE") return adata + + +@_tsne_method( + method_name="t-Distributed Stochastic Neighbor Embedding (t-SNE) (logCPM, 1kHVG)" +) +def tsne_logCPM_1kHVG(adata, test: bool = False, n_pca=50): + adata = log_cpm_hvg(adata) + return _tsne(adata, genes=adata.var["highly_variable"], test=test, n_pca=n_pca) + + +@_tsne_method( + method_name="t-Distributed Stochastic Neighbor Embedding (t-SNE) (logCPM)" +) +def tsne_logCPM(adata, test: bool = False, n_pca=50): + adata = log_cpm(adata) + return _tsne(adata, test=test, n_pca=n_pca) diff --git a/openproblems/tasks/dimensionality_reduction/methods/umap.py b/openproblems/tasks/dimensionality_reduction/methods/umap.py index ede81e50f3..31f42e229f 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/umap.py +++ 
b/openproblems/tasks/dimensionality_reduction/methods/umap.py @@ -1,25 +1,93 @@ from ....tools.decorators import method +from ....tools.normalize import log_cpm from ....tools.normalize import log_cpm_hvg from ....tools.utils import check_version +import functools -@method( - method_name="Uniform Manifold Approximation and Projection (UMAP), " - "as implemented by scanpy (logCPM, 1kHVG)", +_umap_method = functools.partial( + method, paper_name="UMAP: Uniform Manifold Approximation and Projection for " "Dimension Reduction", paper_reference="mcinnes2018umap", paper_year=2018, code_url="https://github.com/lmcinnes/umap", ) -def umap_logCPM_1kHVG(adata, test: bool = False, n_pca=50): +_densmap_method = functools.partial( + method, + paper_name="Assessing single-cell transcriptomic variability through" + " density-preserving data visualization", + paper_reference="narayan2021assessing", + paper_year=2021, + code_url="https://github.com/lmcinnes/umap", + image="openproblems-python-extras", +) + + +def _umap(adata, n_comps=None, genes=None, densmap=False): + from umap import UMAP + import scanpy as sc - adata = log_cpm_hvg(adata) - adata = adata[:, adata.var["highly_variable"]].copy() - sc.tl.pca(adata, n_comps=50, svd_solver="arpack") - sc.pp.neighbors(adata, use_rep="X_pca", n_pcs=n_pca) - sc.tl.umap(adata) - adata.obsm["X_emb"] = adata.obsm["X_umap"] + if genes is not None: + adata_input = adata[:, genes].copy() + else: + adata_input = adata + + if n_comps is not None: + sc.tl.pca(adata_input, n_comps=n_comps, svd_solver="arpack") + X = adata_input.obsm["X_pca"] + else: + X = adata_input.X + + adata.obsm["X_emb"] = UMAP(densmap=densmap, random_state=42).fit_transform(X) adata.uns["method_code_version"] = check_version("umap-learn") return adata + + +@_umap_method(method_name="UMAP (logCPM, 1kHVG)") +def umap_logCPM_1kHVG(adata, test: bool = False): + adata = log_cpm_hvg(adata) + return _umap(adata, genes=adata.var["highly_variable"]) + + +@_umap_method(method_name="UMAP PCA (logCPM, 1kHVG)") +def umap_pca_logCPM_1kHVG(adata, test: bool = False): + adata = log_cpm_hvg(adata) + return _umap(adata, n_comps=50, genes=adata.var["highly_variable"]) + + +@_umap_method(method_name="UMAP (logCPM)") +def umap_logCPM(adata, test: bool = False): + adata = log_cpm(adata) + return _umap(adata) + + +@_umap_method(method_name="UMAP PCA (logCPM)") +def umap_pca_logCPM(adata, test: bool = False): + adata = log_cpm(adata) + return _umap(adata, n_comps=50) + + +@_densmap_method(method_name="densMAP (logCPM, 1kHVG)") +def densmap_logCPM_1kHVG(adata, test: bool = False): + adata = log_cpm_hvg(adata) + return _umap(adata, densmap=True, genes=adata.var["highly_variable"]) + + +@_densmap_method(method_name="densMAP PCA (logCPM, 1kHVG)") +def densmap_pca_logCPM_1kHVG(adata, test: bool = False): + adata = log_cpm_hvg(adata) + return _umap(adata, densmap=True, n_comps=50, genes=adata.var["highly_variable"]) + + +@_densmap_method(method_name="densMAP (logCPM)") +def densmap_logCPM(adata, test: bool = False): + adata = log_cpm(adata) + return _umap(adata, densmap=True) + + +@_densmap_method(method_name="densMAP PCA (logCPM)") +def densmap_pca_logCPM(adata, test: bool = False): + adata = log_cpm(adata) + return _umap(adata, densmap=True, n_comps=50) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py index d7526d1ef7..684eb6f1a0 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py +++ 
b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py
@@ -15,7 +15,6 @@
 """
 from ....tools.decorators import metric
-from ....tools.normalize import log_cpm_hvg
 from anndata import AnnData
 from numba import njit
 from typing import Tuple
 
@@ -155,9 +154,6 @@ def _metrics(
 def _high_dim(adata: AnnData) -> np.ndarray:
     from scipy.sparse import issparse
 
-    adata.X = adata.layers["counts"]
-    adata = log_cpm_hvg(adata)
-    adata = adata[:, adata.var["highly_variable"]].copy()
     high_dim = adata.X
     return high_dim.A if issparse(high_dim) else high_dim
 
diff --git a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py b/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py
index b48fa09870..feedbda787 100644
--- a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py
+++ b/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py
@@ -8,7 +8,7 @@ def _rmse(X, X_emb):
     high_dimensional_distance_vector = scipy.spatial.distance.pdist(X)
     low_dimensional_distance_vector = scipy.spatial.distance.pdist(X_emb)
 
-    scale, rmse = scipy.optimize.nnls(
+    _, rmse = scipy.optimize.nnls(
         low_dimensional_distance_vector[:, None], high_dimensional_distance_vector
     )
     return rmse
diff --git a/openproblems/tools/decorators.py b/openproblems/tools/decorators.py
index c4b4872e3b..b32f2f3a0e 100644
--- a/openproblems/tools/decorators.py
+++ b/openproblems/tools/decorators.py
@@ -24,13 +24,19 @@ def normalize(adata, *args, obsm=None, obs=None, var=None, **kwargs):
         else:
             obs = adata.uns[obs] if obs else adata.obs
             var = adata.uns[var] if var else adata.var
-            adata_temp = anndata.AnnData(adata.obsm[obsm], obs=obs, var=var)
+            adata_temp = anndata.AnnData(
+                adata.obsm[obsm],
+                obs=obs,
+                var=var,
+                layers={"counts": adata.obsm[obsm]},
+            )
             adata_temp = func(adata_temp, *args, **kwargs)
             adata.obsm[obsm] = adata.obsm[cache_name] = adata_temp.X
     else:
         if func.__name__ in adata.layers:
             adata.X = adata.layers[func.__name__]
         else:
+            adata.X = adata.layers["counts"]
             adata = func(adata, *args, **kwargs)
             adata.layers[func.__name__] = adata.X
diff --git a/test/test_core_tasks.py b/test/test_core_tasks.py
index a2383dac54..2d724f6e40 100644
--- a/test/test_core_tasks.py
+++ b/test/test_core_tasks.py
@@ -29,7 +29,7 @@ def test_members(self):
         assert len(self.task._task_summary) < TASK_SUMMARY_MAXLEN
         assert hasattr(self.task, "DEFAULT_LAYER")
         assert isinstance(self.task.DEFAULT_LAYER, str)
-        assert self.task.DEFAULT_LAYER in ["counts", "log_normalized"]
+        assert self.task.DEFAULT_LAYER in ["counts", "log_normalized", "log_cpm"]
         assert hasattr(self.task, "api")
         assert isinstance(self.task.api, MODULE)
         for list_name in ["DATASETS", "METHODS", "METRICS"]:
diff --git a/test/test_task_dimensionality_reduction.py b/test/test_task_dimensionality_reduction.py
index a933e4f7c9..290e9cda1b 100644
--- a/test/test_task_dimensionality_reduction.py
+++ b/test/test_task_dimensionality_reduction.py
@@ -52,4 +52,4 @@ def test_density_preservation_matches_densmap():
     adata.obsm["X_emb"] = emb
 
     actual = metric(adata)
-    np.testing.assert_allclose(expected, actual)
+    np.testing.assert_allclose(expected, actual, rtol=1e-5)
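The decorators.py hunk above changes the `normalize` wrapper in two ways: every
normalizer now starts from the raw "counts" layer instead of whatever happens to be
in adata.X, and its result stays cached in a layer named after the normalizer. A
condensed sketch of just that caching branch (simplified; the real decorator also
handles obsm inputs, as the first hunk shows):

    import functools

    def normalize(func):
        """Cache a normalization under the normalizer's name (simplified sketch)."""

        @functools.wraps(func)
        def wrapper(adata, *args, **kwargs):
            if func.__name__ in adata.layers:
                adata.X = adata.layers[func.__name__]  # cached: reuse
            else:
                adata.X = adata.layers["counts"]  # always normalize from raw counts
                adata = func(adata, *args, **kwargs)
                adata.layers[func.__name__] = adata.X  # cache for next time
            return adata

        return wrapper

From 2dba827095e1ae81c9a2fda887aa38042bbddaa0 Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Thu, 12 Jan 2023 02:58:30 +1100
Subject: [PATCH 199/266] Add pymde to dimensionality reduction (#767)

* Add pymde to dimensionality reduction
* Install pymde
* fix typo
* bugfix
* use partial for pymde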
decorator * update pymde reference * nocover --- .../requirements.txt | 1 + main.bib | 12 ++ .../methods/__init__.py | 4 + .../dimensionality_reduction/methods/pymde.py | 130 ++++++++++++++++++ 4 files changed, 147 insertions(+) create mode 100644 openproblems/tasks/dimensionality_reduction/methods/pymde.py diff --git a/docker/openproblems-python-pytorch/requirements.txt b/docker/openproblems-python-pytorch/requirements.txt index 9eed4839ed..b1ff654ff9 100644 --- a/docker/openproblems-python-pytorch/requirements.txt +++ b/docker/openproblems-python-pytorch/requirements.txt @@ -3,6 +3,7 @@ git+https://github.com/czbiohub/molecular-cross-validation@04d9df0 git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix jax==0.3.25 jaxlib==0.3.25 +pymde==0.1.15 scalex==1.0.2 scikit-misc==0.1.* scvi-tools~=0.17 # pinned in #313 diff --git a/main.bib b/main.bib index 836fe90a95..fa7abe01c0 100644 --- a/main.bib +++ b/main.bib @@ -10,6 +10,18 @@ @misc{10x2019pbmc year = {2019}, url = {https://www.10xgenomics.com/resources/datasets/5-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-with-cell-surface-proteins-v-3-chemistry-3-1-standard-3-1-0}, } +@article{agrawal2021mde, + title = {Minimum-Distortion Embedding}, + author = {Akshay Agrawal and Alnur Ali and Stephen Boyd}, + year = {2021}, + journal = {Foundations and Trends{\textregistered} in Machine Learning}, + publisher = {Now Publishers}, + volume = {14}, + number = {3}, + pages = {211--378}, + doi = {10.1561/2200000090}, + url = {https://doi.org/10.1561/2200000090}, +} @article{andersson2020single, title = {Single-cell and spatial transcriptomics enables probabilistic inference of cell type topography}, author = {Alma Andersson and Joseph Bergenstr{\aa}hle and Michaela Asp and Ludvig Bergenstr{\aa}hle and Aleksandra Jurek and Jos{\'{e}} Fern{\'{a}}ndez Navarro and Joakim Lundeberg}, diff --git a/openproblems/tasks/dimensionality_reduction/methods/__init__.py b/openproblems/tasks/dimensionality_reduction/methods/__init__.py index e983901f78..5480405af4 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/methods/__init__.py @@ -10,6 +10,10 @@ from .phate import phate_logCPM from .phate import phate_logCPM_1kHVG from .phate import phate_sqrt +from .pymde import pymde_distances_log_cpm +from .pymde import pymde_distances_log_cpm_hvg +from .pymde import pymde_neighbors_log_cpm +from .pymde import pymde_neighbors_log_cpm_hvg from .tsne import tsne_logCPM from .tsne import tsne_logCPM_1kHVG from .umap import densmap_logCPM diff --git a/openproblems/tasks/dimensionality_reduction/methods/pymde.py b/openproblems/tasks/dimensionality_reduction/methods/pymde.py new file mode 100644 index 0000000000..2892def1c1 --- /dev/null +++ b/openproblems/tasks/dimensionality_reduction/methods/pymde.py @@ -0,0 +1,130 @@ +from ....tools.decorators import method +from ....tools.normalize import log_cpm +from ....tools.normalize import log_cpm_hvg +from ....tools.utils import check_version +from typing import Optional + +import functools +import scanpy as sc + +_pymde_method = functools.partial( + method, + paper_name="Minimum-Distortion Embedding", + paper_reference="agrawal2021mde", + paper_year=2021, + code_url="https://pymde.org/", + image="openproblems-python-pytorch", +) + + +def _pymde( + adata, + method: str = "neighbors", + genes=None, + test: bool = False, + max_iter: Optional[int] = None, + memory_size: Optional[int] = None, +): + import pymde + + if 
genes is not None:
+        adata_input = adata[:, genes].copy()
+    else:
+        adata_input = adata
+
+    embed_kwargs = {}
+    if test:
+        sc.tl.pca(adata_input, n_comps=20, svd_solver="arpack")
+        X = adata_input.obsm["X_pca"]
+        embed_kwargs["max_iter"] = max_iter or 20
+        embed_kwargs["memory_size"] = memory_size or 2
+    else:  # pragma: nocover
+        X = adata_input.X
+        if max_iter is not None:
+            embed_kwargs["max_iter"] = max_iter
+        if memory_size is not None:
+            embed_kwargs["memory_size"] = memory_size
+    if method == "neighbors":
+        mde_fn = pymde.preserve_neighbors
+    elif method == "distances":
+        mde_fn = pymde.preserve_distances
+    else:
+        raise NotImplementedError
+    adata.obsm["X_emb"] = (
+        mde_fn(X, embedding_dim=2, verbose=True)
+        .embed(**embed_kwargs, verbose=True)
+        .detach()
+        .numpy()
+    )
+    adata.uns["method_code_version"] = check_version("pymde")
+    return adata
+
+
+@_pymde_method(
+    method_name="PyMDE Preserve Neighbors (logCPM)",
+)
+def pymde_neighbors_log_cpm(
+    adata,
+    test: bool = False,
+    max_iter: Optional[int] = None,
+    memory_size: Optional[int] = None,
+):
+    adata = log_cpm(adata)
+    return _pymde(
+        adata, method="neighbors", test=test, max_iter=max_iter, memory_size=memory_size
+    )
+
+
+@_pymde_method(
+    method_name="PyMDE Preserve Neighbors (logCPM, 1kHVG)",
+)
+def pymde_neighbors_log_cpm_hvg(
+    adata,
+    test: bool = False,
+    max_iter: Optional[int] = None,
+    memory_size: Optional[int] = None,
+):
+    adata = log_cpm_hvg(adata)
+    return _pymde(
+        adata,
+        method="neighbors",
+        genes=adata.var["highly_variable"],
+        test=test,
+        max_iter=max_iter,
+        memory_size=memory_size,
+    )
+
+
+@_pymde_method(
+    method_name="PyMDE Preserve Distances (logCPM)",
+)
+def pymde_distances_log_cpm(
+    adata,
+    test: bool = False,
+    max_iter: Optional[int] = None,
+    memory_size: Optional[int] = None,
+):
+    adata = log_cpm(adata)
+    return _pymde(
+        adata, method="distances", test=test, max_iter=max_iter, memory_size=memory_size
+    )
+
+
+@_pymde_method(
+    method_name="PyMDE Preserve Distances (logCPM, 1kHVG)",
+)
+def pymde_distances_log_cpm_hvg(
+    adata,
+    test: bool = False,
+    max_iter: Optional[int] = None,
+    memory_size: Optional[int] = None,
+):
+    adata = log_cpm_hvg(adata)
+    return _pymde(
+        adata,
+        method="distances",
+        genes=adata.var["highly_variable"],
+        test=test,
+        max_iter=max_iter,
+        memory_size=memory_size,
+    )
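Because `_pymde` above wraps the call in openproblems plumbing, a bare-bones sketch
of the underlying pymde usage may be easier to review. This assumes only the public
pymde API as pinned above (pymde==0.1.15) and invented toy data:

    import numpy as np
    import pymde

    # toy data standing in for a PCA-reduced expression matrix
    X = np.random.default_rng(0).normal(size=(500, 50)).astype(np.float32)
    mde = pymde.preserve_neighbors(X, embedding_dim=2, verbose=True)
    emb = mde.embed(max_iter=40, memory_size=2, verbose=True).detach().numpy()
    print(emb.shape)  # (500, 2)

From 2154bf70482e0104b4c037620e31708444ea86e3 Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Tue, 24 Jan 2023 00:00:39 +1100
Subject: [PATCH 200/266] Fix flaky R installations in docker build (#783)

* print
* resolve with retries
* scran 1.24.0
* scran 1.24.0
* bump to bioc 3.16
* more fixes
* complexheatmap
* scater
* basilisk
* set biocversion in R install
* master
* more backoff for github failures
* force full rebuild
* increase backoff
* sleep for a whole minute
* memoise renv_remotes_resolve
* patch properly
---
 .dockerignore                                 |  9 ++++
 .github/workflows/run_tests.yml               |  4 +-
 docker/openproblems-r-base/Dockerfile         |  2 +-
 docker/openproblems-r-base/r_requirements.txt |  2 +-
 docker/openproblems-r-extras/Dockerfile       |  1 +
 .../openproblems-r-extras/r_requirements.txt  | 16 +++----
 openproblems/version.py                       |  2 +-
 scripts/install_renv.R                        | 48 ++++++++++++++-----
 workflow/Snakefile                            |  4 +-
 9 files changed, 63 insertions(+), 25 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index 4e785b3bf1..a40704eba5 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -2,3 +2,12 @@ nf-openproblems
 workflow
 website
 .github
+.snakemake
+static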
+test +*.egg-info +.coverage* +.pytest_cache +.idea +.vscode +*.md diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 3aebda83a0..12e13820d7 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -297,6 +297,8 @@ jobs: ${{ env.LINUX_VERSION }}-renv-${{ env.R_VERSION }}- - name: Install R packages + env: + BIOCVERSION: '3.16' run: | if (!requireNamespace("renv", quietly = TRUE)) install.packages("renv") renv::restore() @@ -305,7 +307,7 @@ jobs: install_renv("docker/openproblems-github-actions/r_requirements.txt") shell: Rscript {0} - - name: Update Docker docker images + - name: Update Docker images if: "needs.build_images.result == 'skipped'" run: | cd workflow diff --git a/docker/openproblems-r-base/Dockerfile b/docker/openproblems-r-base/Dockerfile index 6e9a2c1baa..71d909d34d 100644 --- a/docker/openproblems-r-base/Dockerfile +++ b/docker/openproblems-r-base/Dockerfile @@ -27,7 +27,7 @@ RUN apt-get update -qq RUN apt-get install -yq --no-install-suggests --no-install-recommends r-base-dev=4.2\* RUN apt-get clean -y && apt-get autoremove -y ENV R_HOME=/usr/lib/R -ENV BIOCVERSION="3.15" +ENV BIOCVERSION="3.16" # Install R packages RUN R -e "install.packages('renv'); renv::consent(TRUE)" diff --git a/docker/openproblems-r-base/r_requirements.txt b/docker/openproblems-r-base/r_requirements.txt index 6ee9be204a..382bf5ced3 100644 --- a/docker/openproblems-r-base/r_requirements.txt +++ b/docker/openproblems-r-base/r_requirements.txt @@ -1,3 +1,3 @@ -bioc::scran@1.24.1 +bioc::scran@1.26.1 IRkernel@1.3.1 RcppAnnoy@0.0.20 diff --git a/docker/openproblems-r-extras/Dockerfile b/docker/openproblems-r-extras/Dockerfile index dee7cc83b8..115b056678 100644 --- a/docker/openproblems-r-extras/Dockerfile +++ b/docker/openproblems-r-extras/Dockerfile @@ -20,6 +20,7 @@ RUN apt-get clean autoclean && \ # Install R packages COPY ./docker/openproblems-r-extras/r_requirements.txt ./r_requirements.txt +COPY ./scripts/install_renv.R ./install_renv.R RUN R -e "source(\"install_renv.R\"); install_renv(\"r_requirements.txt\")" # install dependencies and openproblems diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index 238573d2a3..fea6a4b4fa 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -1,9 +1,9 @@ -bioc::batchelor@1.12.3 -bioc::ComplexHeatmap@2.12.1 +bioc::basilisk@1.10.2 +bioc::batchelor@1.14.1 +bioc::ComplexHeatmap@2.14.0 bioc::dir.expiry -bioc::scater@1.24.0 -bioc::scran@1.24.1 -bioc::scuttle@1.6.3 +bioc::scater@1.26.1 +bioc::scuttle@1.8.3 bslib@0.4.0 caret@6.0-93 cli@3.4.1 @@ -11,7 +11,7 @@ conos@1.5.0 crayon@1.5.2 dbplyr@2.2.1 devtools@2.4.5 -dmcable/spacexr@56787ee # master +dmcable/spacexr@9461a8d # master downlit@0.4.2 dplyr@1.0.10 e1071@1.7-12 @@ -24,7 +24,7 @@ htmltools@0.5.3 htmlwidgets@1.5.4 igraph@1.3.5 lifecycle@1.0.3 -LTLA/basilisk.utils # required for liana0.1.9 +LTLA/basilisk.utils@411502f # required for liana0.1.9 Matrix@1.5-1 pkgdown@2.0.6 pkgload@1.3.1 @@ -39,7 +39,7 @@ rliger@1.0.0 rmarkdown@2.2 RSQLite@2.2.4 saezlab/liana@0.1.9 -saezlab/OmnipathR@679bb79 # master +saezlab/OmnipathR@edf276b # master sass@0.4.2 sctransform@0.3.5 Seurat@4.3.0 diff --git a/openproblems/version.py b/openproblems/version.py index 906d362f7d..49e0fc1e09 100644 --- a/openproblems/version.py +++ b/openproblems/version.py @@ -1 +1 @@ -__version__ = "0.6.0" +__version__ = "0.7.0" diff --git a/scripts/install_renv.R 
b/scripts/install_renv.R
index f1aaf9a553..b3a24d58ad 100644
--- a/scripts/install_renv.R
+++ b/scripts/install_renv.R
@@ -19,7 +19,7 @@ compare_version <- function(v1, v2) {
 }
 
 check_available <- function(remote) {
-  remote <- renv:::renv_remotes_resolve(remote)
+  remote <- with_retries(renv:::renv_remotes_resolve, spec = remote)
   tryCatch(
     {
       version <- packageVersion(remote$Package)
@@ -39,33 +39,59 @@ strip_comments <- function(remote) {
   gsub("\\s*#.*", "", remote)
 }
 
-install_with_retries <- function(remotes,
-                                 attempts = 3,
-                                 sleep = 3,
-                                 backoff = 2,
-                                 ...) {
+with_retries <- function(func,
+                         attempts = 5,
+                         sleep_once = 3,
+                         sleep_multiple = 60,
+                         backoff = 2,
+                         ...) {
   result <- NULL
   attempt <- 1
-  while (is.null(result) && attempt <= attempts - 1) {
+  sleep <- sleep_once
+  while (is.null(result) && attempt < attempts) {
     attempt <- attempt + 1
     try(
-      result <- renv::install(remotes, ...)
+      result <- func(...)
    )
+    closeAllConnections()
     Sys.sleep(sleep)
-    sleep <- sleep * backoff
+    if (sleep == sleep_once) {
+      sleep <- sleep_multiple
+    } else {
+      sleep <- sleep * backoff
+    }
   }
   if (is.null(result)) {
     # last attempt
-    renv::install(remotes, ...)
+    result <- func(...)
+  }
+  result
+}
+
+patch_renv <- function() {
+  if (!requireNamespace("memoise", quietly = TRUE)) install.packages("memoise")
+  # memoise renv_remotes_resolve inside the renv namespace, only if not already done
+  if (!is(renv:::renv_remotes_resolve, "memoised")) {
+    # memoize renv_remotes_resolve
+    renv_remotes_resolve_memoised <- memoise::memoise(
+      renv:::renv_remotes_resolve
+    )
+    assignInNamespace(
+      "renv_remotes_resolve",
+      renv_remotes_resolve_memoised,
+      "renv"
+    )
   }
 }
 
 install_renv <- function(requirements_file, ...) {
+  patch_renv()
   remotes <- scan(requirements_file, what = character(), sep = "\n")
   remotes <- sapply(remotes, strip_comments)
   remotes_installed <- sapply(remotes, check_available)
   remotes_to_install <- remotes[!remotes_installed]
+  message(paste0("Installing ", length(remotes_to_install), " packages"))
   if (length(remotes_to_install) > 0) {
-    install_with_retries(remotes_to_install, ...)
+    with_retries(renv::install, packages = remotes_to_install, ...)
   }
 }
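The new `with_retries` above generalizes the old install-only retry loop: a short
pause after the first failure, a full minute after the second ("sleep for a whole
minute"), then exponential backoff, with the final attempt allowed to raise. For
readers more at home in Python, a rough sketch of the same control flow (an
illustration only, not part of the codebase):

    import time

    def with_retries(func, *args, attempts=5, sleep_once=3,
                     sleep_multiple=60, backoff=2, **kwargs):
        # first failure: short sleep; second: a full minute; then exponential backoff
        sleep = sleep_once
        for _ in range(attempts - 1):
            try:
                return func(*args, **kwargs)
            except Exception:
                time.sleep(sleep)
                sleep = sleep_multiple if sleep == sleep_once else sleep * backoff
        return func(*args, **kwargs)  # last attempt: let any error propagate

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 6f02c7fe2b..d67769e8e9 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -40,7 +40,7 @@ rule refresh_docker_image:
         label = tools.build_type,
         hash = tools.build_hash,
     shell:
-        "docker build --label bio.openproblems.build={params.label} --label bio.openproblems.hash={params.hash} -f {input.dockerfile} -t {params.user}/{wildcards.image} .."
+        "docker build --progress=plain --label bio.openproblems.build={params.label} --label bio.openproblems.hash={params.hash} -f {input.dockerfile} -t {params.user}/{wildcards.image} .."
 
 rule refresh_dockerfile:
     priority: 50
@@ -92,7 +92,7 @@ rule build_docker_image:
         label = tools.build_type,
         hash = tools.build_hash,
     shell:
-        "docker build --label bio.openproblems.build={params.label} --label bio.openproblems.hash={params.hash} -f {input.dockerfile} -t {params.user}/{wildcards.image} .."
+        "docker build --progress=plain --label bio.openproblems.build={params.label} --label bio.openproblems.hash={params.hash} -f {input.dockerfile} -t {params.user}/{wildcards.image} .."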
rule password_docker: output: From 32014d432e657792c3223352721a11e14fc19220 Mon Sep 17 00:00:00 2001 From: Daniel Strobl <50872326+danielStrobl@users.noreply.github.com> Date: Mon, 23 Jan 2023 14:56:30 +0100 Subject: [PATCH 201/266] save initial layer in X for adata_pre (#784) Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- .../_batch_integration/batch_integration_embed/metrics/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/_utils.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/_utils.py index 45c21c2205..8fe3be43aa 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/_utils.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/_utils.py @@ -1,4 +1,5 @@ def _get_split(adata): uni = adata uni.obsm["X_pca"] = uni.obsm["X_uni_pca"] + uni.X = uni.layers["log_normalized"] return (uni, adata) From b133b741adc6ee6fc7a6160ac2a57129a686f386 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 24 Jan 2023 01:30:42 +1100 Subject: [PATCH 202/266] Filter datasets by celltype (#770) * Filter datasets by celltype * filter sample dataset * more bugfixes * fewer labels -> more cells * Require same api of all subtask datasets --- .../_batch_integration/_common/__init__.py | 0 .../tasks/_batch_integration/_common/api.py | 65 +++++++++++++++++++ .../_common/datasets/__init__.py | 2 + .../datasets/immune.py | 6 +- .../datasets/pancreas.py | 6 +- .../tasks/_batch_integration/_common/utils.py | 11 ++++ .../batch_integration_embed/api.py | 34 ++-------- .../datasets/__init__.py | 4 +- .../batch_integration_feature/api.py | 38 +---------- .../datasets/__init__.py | 4 +- .../batch_integration_graph/api.py | 39 +++-------- .../datasets/__init__.py | 4 +- 12 files changed, 109 insertions(+), 104 deletions(-) create mode 100644 openproblems/tasks/_batch_integration/_common/__init__.py create mode 100644 openproblems/tasks/_batch_integration/_common/api.py create mode 100644 openproblems/tasks/_batch_integration/_common/datasets/__init__.py rename openproblems/tasks/_batch_integration/{batch_integration_graph => _common}/datasets/immune.py (82%) rename openproblems/tasks/_batch_integration/{batch_integration_graph => _common}/datasets/pancreas.py (82%) create mode 100644 openproblems/tasks/_batch_integration/_common/utils.py diff --git a/openproblems/tasks/_batch_integration/_common/__init__.py b/openproblems/tasks/_batch_integration/_common/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/openproblems/tasks/_batch_integration/_common/api.py b/openproblems/tasks/_batch_integration/_common/api.py new file mode 100644 index 0000000000..5bec47bc27 --- /dev/null +++ b/openproblems/tasks/_batch_integration/_common/api.py @@ -0,0 +1,65 @@ +from ....data.sample import load_sample_data +from ....tools.decorators import dataset +from .utils import filter_celltypes + +import numpy as np + +MIN_CELLS_PER_CELLTYPE = 50 + + +def check_neighbors(adata, neighbors_key, connectivities_key, distances_key): + assert neighbors_key in adata.uns + assert adata.uns[neighbors_key]["connectivities_key"] == connectivities_key + assert adata.uns[neighbors_key]["distances_key"] == distances_key + assert connectivities_key in adata.obsp + assert distances_key in adata.obsp + + +def check_dataset(adata, do_check_pca=False, do_check_neighbors=False): + """Check 
that dataset output fits expected API."""
+
+    assert "batch" in adata.obs
+    assert "labels" in adata.obs
+    assert (adata.obs["labels"].value_counts() >= MIN_CELLS_PER_CELLTYPE).all()
+
+    assert "log_normalized" in adata.layers
+    assert "counts" in adata.layers
+
+    assert adata.var_names.is_unique
+    assert adata.obs_names.is_unique
+
+    assert "organism" in adata.uns
+    assert adata.uns["organism"] in ["mouse", "human"]
+
+    if do_check_pca:
+        assert "X_uni_pca" in adata.obsm
+
+    if do_check_neighbors:
+        check_neighbors(adata, "uni", "uni_connectivities", "uni_distances")
+
+    return True
+
+
+@dataset()
+def sample_dataset(run_pca: bool = False, run_neighbors: bool = False):
+    """Create a simple dataset to use for testing methods in this task."""
+    import scanpy as sc
+
+    adata = load_sample_data()
+    adata.uns["organism"] = "human"
+
+    adata.var.index = adata.var.gene_short_name.astype(str)
+    adata.var_names_make_unique()
+    adata.obs_names_make_unique()
+
+    sc.pp.normalize_total(adata)
+    sc.pp.log1p(adata)
+    adata.layers["log_normalized"] = adata.X
+
+    adata.obs["batch"] = np.random.choice(2, adata.shape[0], replace=True).astype(str)
+    adata.obs["labels"] = np.random.choice(3, adata.shape[0], replace=True).astype(str)
+    adata = filter_celltypes(adata)
+    if run_pca:
+        adata.obsm["X_uni_pca"] = sc.pp.pca(adata.X)
+    if run_neighbors:
+        sc.pp.neighbors(adata, use_rep="X_uni_pca", key_added="uni")
+    return adata
diff --git a/openproblems/tasks/_batch_integration/_common/datasets/__init__.py b/openproblems/tasks/_batch_integration/_common/datasets/__init__.py
new file mode 100644
index 0000000000..3369c29cd4
--- /dev/null
+++ b/openproblems/tasks/_batch_integration/_common/datasets/__init__.py
@@ -0,0 +1,2 @@
+from .immune import immune_batch
+from .pancreas import pancreas_batch
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py b/openproblems/tasks/_batch_integration/_common/datasets/immune.py
similarity index 82%
rename from openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py
rename to openproblems/tasks/_batch_integration/_common/datasets/immune.py
index 084baac165..c6563177df 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/immune.py
+++ b/openproblems/tasks/_batch_integration/_common/datasets/immune.py
@@ -1,5 +1,7 @@
 from .....data.immune_cells import load_immune
 from .....tools.decorators import dataset
+from ..utils import filter_celltypes
+from typing import Optional
 
 
 @dataset(
@@ -11,13 +13,15 @@
     "Smart-seq2).",
     image="openproblems",
 )
-def immune_batch(test=False):
+def immune_batch(test: bool = False, min_celltype_count: Optional[int] = None):
     import scanpy as sc
 
     adata = load_immune(test)
     adata.uns["organism"] = "human"
     adata.obs["labels"] = adata.obs["final_annotation"]
 
+    adata = filter_celltypes(adata, min_celltype_count=min_celltype_count)
+
     sc.pp.filter_genes(adata, min_counts=1)
     sc.pp.filter_genes(adata, min_cells=1)
     adata.var_names_make_unique()
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py b/openproblems/tasks/_batch_integration/_common/datasets/pancreas.py
similarity index 82%
rename from openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py
rename to openproblems/tasks/_batch_integration/_common/datasets/pancreas.py
index 01c9af49e4..9ebe868cff 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/pancreas.py
+++ b/openproblems/tasks/_batch_integration/_common/datasets/pancreas.py
@@ -1,5 +1,7 @@
 from .....data.pancreas import load_pancreas
 from .....tools.decorators import dataset
+from ..utils import filter_celltypes
+from typing import Optional
 
 
 @dataset(
@@ -11,7 +13,7 @@
     "and SMARTER-seq).",
     image="openproblems",
 )
-def pancreas_batch(test=False):
+def pancreas_batch(test: bool = False, min_celltype_count: Optional[int] = None):
     import scanpy as sc
 
     adata = load_pancreas(test)
@@ -19,6 +21,8 @@ def pancreas_batch(test=False):
     adata.obs["labels"] = adata.obs["celltype"]
     adata.obs["batch"] = adata.obs["tech"]
 
+    adata = filter_celltypes(adata, min_celltype_count=min_celltype_count)
+
     sc.pp.filter_genes(adata, min_counts=1)
     sc.pp.filter_genes(adata, min_cells=1)
 
diff --git a/openproblems/tasks/_batch_integration/_common/utils.py b/openproblems/tasks/_batch_integration/_common/utils.py
new file mode 100644
index 0000000000..99523b6253
--- /dev/null
+++ b/openproblems/tasks/_batch_integration/_common/utils.py
@@ -0,0 +1,11 @@
+from typing import Optional
+
+
+def filter_celltypes(adata, min_celltype_count: Optional[int] = None):
+
+    min_celltype_count = min_celltype_count or 50
+
+    celltype_counts = adata.obs["labels"].value_counts()
+    keep_celltypes = celltype_counts[celltype_counts >= min_celltype_count].index
+    keep_cells = adata.obs["labels"].isin(keep_celltypes)
+    return adata[keep_cells].copy()
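The `filter_celltypes` helper above implements this commit's "fewer labels -> more
cells" idea: any cell type with fewer than 50 cells (by default) is dropped outright
before integration. A toy illustration of the same value_counts/isin logic, with
invented counts, using anndata and pandas only:

    import anndata
    import numpy as np
    import pandas as pd

    # 120 cells of type "A", 10 cells of the rare type "B"
    adata = anndata.AnnData(
        X=np.zeros((130, 5)),
        obs=pd.DataFrame({"labels": ["A"] * 120 + ["B"] * 10}),
    )
    counts = adata.obs["labels"].value_counts()
    keep = adata.obs["labels"].isin(counts[counts >= 50].index)
    print(adata[keep].shape)  # (120, 5): the 10 "B" cells are removed

diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py
index b28df1418d..d13a8d0e07 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py
@@ -1,20 +1,9 @@
-from ....data.sample import load_sample_data
 from ....tools.decorators import dataset
+from .._common import api
 
-import numpy as np
+import functools
 
-
-def check_dataset(adata):
-    """Check that dataset output fits expected API."""
-
-    assert "X_uni_pca" in adata.obsm
-    assert "batch" in adata.obs
-    assert "labels" in adata.obs
-    assert "log_normalized" in adata.layers
-    assert "organism" in adata.uns
-    assert adata.uns["organism"] in ["mouse", "human"]
-
-    return True
+check_dataset = functools.partial(api.check_dataset, do_check_pca=True)
 
 
 def check_method(adata, is_baseline=False):
@@ -27,22 +16,7 @@
 
 @dataset()
 def sample_dataset():
-    """Create a simple dataset to use for testing methods in this task."""
-    import scanpy as sc
-
-    adata = load_sample_data()
-    adata.uns["organism"] = "human"
-
-    adata.var.index = adata.var.gene_short_name.astype(str)
-    sc.pp.normalize_total(adata)
-    sc.pp.log1p(adata)
-    adata.layers["log_normalized"] = adata.X
-    adata.obsm["X_uni_pca"] = sc.pp.pca(adata.X)
-    adata.obs["batch"] = np.random.choice(2, adata.shape[0], replace=True).astype(str)
-    adata.obs["labels"] = np.random.choice(5, adata.shape[0], replace=True).astype(str)
-    adata.var_names_make_unique()
-    adata.obs_names_make_unique()
-    return adata
+    return api.sample_dataset(run_pca=True)
 
 
 def sample_method(adata):
diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/datasets/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_embed/datasets/__init__.py
index 4b86a1c17c..0ad80a4782 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_embed/datasets/__init__.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_embed/datasets/__init__.py
@@ -1,2 +1,2 @@
-from ...batch_integration_graph.datasets.immune import immune_batch
-from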
...batch_integration_graph.datasets.pancreas import pancreas_batch +from ..._common.datasets.immune import immune_batch +from ..._common.datasets.pancreas import pancreas_batch diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py index 8baf5ff663..6d0f1ae781 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py @@ -1,20 +1,6 @@ -from ....data.sample import load_sample_data -from ....tools.decorators import dataset +from .._common import api -import numpy as np - - -def check_dataset(adata): - """Check that dataset output fits expected API.""" - - assert "batch" in adata.obs - assert "labels" in adata.obs - assert "log_normalized" in adata.layers - assert "counts" in adata.layers - assert adata.var_names.is_unique - assert adata.obs_names.is_unique - - return True +check_dataset = api.check_dataset def check_method(adata, is_baseline=False): @@ -25,25 +11,7 @@ def check_method(adata, is_baseline=False): return True -@dataset() -def sample_dataset(): - """Create a simple dataset to use for testing methods in this task.""" - import scanpy as sc - - adata = load_sample_data() - - adata.var.index = adata.var.gene_short_name.astype(str) - sc.pp.normalize_total(adata) - - adata.obsm["X_uni"] = sc.pp.pca(adata.X) - adata.obs["batch"] = np.random.choice(2, adata.shape[0], replace=True).astype(str) - adata.obs["labels"] = np.random.choice(5, adata.shape[0], replace=True).astype(str) - adata.layers["log_normalized"] = adata.X.multiply( - 10000 / adata.X.sum(axis=1) - ).tocsr() - adata.var_names_make_unique() - adata.obs_names_make_unique() - return adata +sample_dataset = api.sample_dataset def sample_method(adata): diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/datasets/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_feature/datasets/__init__.py index 4b86a1c17c..0ad80a4782 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/datasets/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/datasets/__init__.py @@ -1,2 +1,2 @@ -from ...batch_integration_graph.datasets.immune import immune_batch -from ...batch_integration_graph.datasets.pancreas import pancreas_batch +from ..._common.datasets.immune import immune_batch +from ..._common.datasets.pancreas import pancreas_batch diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/api.py b/openproblems/tasks/_batch_integration/batch_integration_graph/api.py index 5cc9831f99..e708faef5a 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/api.py @@ -1,48 +1,25 @@ -from ....data.sample import load_sample_data from ....tools.decorators import dataset +from .._common import api -import numpy as np +import functools +MIN_CELLS_PER_CELLTYPE = 50 -def check_dataset(adata): - """Check that dataset output fits expected API.""" - assert "X_uni_pca" in adata.obsm - assert "batch" in adata.obs - assert "labels" in adata.obs - assert "uni" in adata.uns - assert adata.uns["uni"]["connectivities_key"] == "uni_connectivities" - assert adata.uns["uni"]["distances_key"] == "uni_distances" - assert "uni_connectivities" in adata.obsp - assert "uni_distances" in adata.obsp - assert "log_normalized" in adata.layers - - return True +check_dataset = 
functools.partial( + api.check_dataset, do_check_pca=True, do_check_neighbors=True +) def check_method(adata, is_baseline=False): """Check that method output fits expected API.""" - assert "neighbors" in adata.uns - assert adata.uns["neighbors"]["connectivities_key"] == "connectivities" - assert adata.uns["neighbors"]["distances_key"] == "distances" - assert "connectivities" in adata.obsp - assert "distances" in adata.obsp + api.check_neighbors(adata, "neighbors", "connectivities", "distances") return True @dataset() def sample_dataset(): - """Create a simple dataset to use for testing methods in this task.""" - import scanpy as sc - - adata = load_sample_data() - adata.obsm["X_uni_pca"] = sc.pp.pca(adata.X) - adata.layers["log_normalized"] = adata.X - adata.obs["batch"] = np.random.choice(2, adata.shape[0], replace=True).astype(str) - adata.obs["labels"] = np.random.choice(5, adata.shape[0], replace=True).astype(str) - - sc.pp.neighbors(adata, use_rep="X_uni_pca", key_added="uni") - return adata + return api.sample_dataset(run_pca=True, run_neighbors=True) def sample_method(adata): diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/__init__.py index 3369c29cd4..0ad80a4782 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/__init__.py @@ -1,2 +1,2 @@ -from .immune import immune_batch -from .pancreas import pancreas_batch +from ..._common.datasets.immune import immune_batch +from ..._common.datasets.pancreas import pancreas_batch From 9a7485c2e9f18a7d5fd117d1fdd637816b4aee3f Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 24 Jan 2023 02:19:44 +1100 Subject: [PATCH 203/266] pass raw counts to neuralee (#779) --- .../tasks/dimensionality_reduction/methods/neuralee.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py index cb445a52ac..dd47933dfb 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py +++ b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py @@ -97,7 +97,12 @@ def _neuralee( @_neuralee_method(method_name="NeuralEE (CPU) (Default)") def neuralee_default(adata: AnnData, test: bool = False) -> AnnData: - return _neuralee(adata, test=test, normalize=True, subsample_genes=500) + # neuralee needs raw counts + adata.X = adata.layers["counts"] + adata = _neuralee(adata, test=test, normalize=True, subsample_genes=500) + # revert to expected values + adata.X = adata.layers["log_cpm"] + return adata @_neuralee_method(method_name="NeuralEE (CPU) (logCPM, 1kHVG)") From e5f0bc663de6d5130a21aca0e5a46fa27fb2c966 Mon Sep 17 00:00:00 2001 From: Nikolay Markov Date: Mon, 23 Jan 2023 11:52:54 -0600 Subject: [PATCH 204/266] Label projection describe datasets (#776) * Rename zebrafish_labels to zebrafish_labs * Add label pr dataset dimensions & cell type counts * pre-commit * pre-commit * Fix flake 88 line length Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- CONTRIBUTING.md | 2 +- openproblems/api/README.md | 6 +++--- .../tasks/label_projection/datasets/__init__.py | 2 +- 
.../tasks/label_projection/datasets/cengen.py | 10 +++++++--- .../tasks/label_projection/datasets/pancreas.py | 12 +++++++++--- .../label_projection/datasets/tabula_muris_senis.py | 4 +++- .../tasks/label_projection/datasets/zebrafish.py | 12 ++++++++---- test/test_core_cli.py | 4 ++-- 8 files changed, 34 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9fd9468ed0..072851ce8f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -333,7 +333,7 @@ docker run \ -it singlecellopenproblems/openproblems-python-extras bash openproblems-cli test \ --task label_projection \ - --dataset zebrafish_labels \ + --dataset zebrafish_labs \ --method logistic_regression_log_cpm \ --metric f1 ``` diff --git a/openproblems/api/README.md b/openproblems/api/README.md index 69e9dd5284..2d14fefe81 100644 --- a/openproblems/api/README.md +++ b/openproblems/api/README.md @@ -63,7 +63,7 @@ You can then list the avaiable datasets, methods, and metrics for a partiular ta > openproblems-cli list --datasets --task label_projection pancreas_batch pancreas_random -zebrafish_labels +zebrafish_labs zebrafish_random > openproblems-cli list --methods --task label_projection @@ -96,7 +96,7 @@ multimodal_data_integration $ openproblems-cli list --datasets --task label_projection pancreas_batch pancreas_random -zebrafish_labels +zebrafish_labs zebrafish_random $ openproblems-cli load --task label_projection --output dataset.h5ad pancreas_batch $ openproblems-cli list --methods --task label_projection @@ -141,7 +141,7 @@ multimodal_data_integration $ openproblems-cli list --datasets --task label_projection pancreas_batch pancreas_random -zebrafish_labels +zebrafish_labs zebrafish_random $ openproblems-cli image --datasets --task label_projection pancreas_batch openproblems diff --git a/openproblems/tasks/label_projection/datasets/__init__.py b/openproblems/tasks/label_projection/datasets/__init__.py index ecd67d700d..5b9afe98b5 100644 --- a/openproblems/tasks/label_projection/datasets/__init__.py +++ b/openproblems/tasks/label_projection/datasets/__init__.py @@ -4,5 +4,5 @@ from .pancreas import pancreas_random from .pancreas import pancreas_random_label_noise from .tabula_muris_senis import tabula_muris_senis_lung_random -from .zebrafish import zebrafish_labels +from .zebrafish import zebrafish_labs from .zebrafish import zebrafish_random diff --git a/openproblems/tasks/label_projection/datasets/cengen.py b/openproblems/tasks/label_projection/datasets/cengen.py index 6089b2fad6..6575688b3e 100644 --- a/openproblems/tasks/label_projection/datasets/cengen.py +++ b/openproblems/tasks/label_projection/datasets/cengen.py @@ -5,11 +5,13 @@ @dataset( - "CeNGEN (by batch)", + "CeNGEN (split by batch)", data_url=load_cengen.metadata["data_url"], data_reference=load_cengen.metadata["data_reference"], dataset_summary="100k FACS-isolated C. elegans neurons from 17 experiments " - "sequenced on 10x Genomics. Split into train/test by experimental batch.", + "sequenced on 10x Genomics. Split into train/test by experimental batch. " + "Dimensions: 100955 cells, 22469 genes. 169 cell types " + "(avg. 597±800 cells per cell type).", ) def cengen_batch(test=False): adata = load_cengen(test=test) @@ -31,7 +33,9 @@ def cengen_batch(test=False): data_url=load_cengen.metadata["data_url"], data_reference=load_cengen.metadata["data_reference"], dataset_summary="100k FACS-isolated C. elegans neurons from 17 experiments " - "sequenced on 10x Genomics. Split into train/test randomly.", + "sequenced on 10x Genomics. 
Split into train/test randomly. "
+    "Dimensions: 100955 cells, 22469 genes. 169 cell types "
+    "(avg. 597±800 cells per cell type).",
 )
 def cengen_random(test=False):
     adata = load_cengen(test=test)
diff --git a/openproblems/tasks/label_projection/datasets/pancreas.py b/openproblems/tasks/label_projection/datasets/pancreas.py
index 0981ed8ed0..e303f9ac22 100644
--- a/openproblems/tasks/label_projection/datasets/pancreas.py
+++ b/openproblems/tasks/label_projection/datasets/pancreas.py
@@ -11,7 +11,9 @@
     data_reference=load_pancreas.metadata["data_reference"],
     dataset_summary="Human pancreatic islet scRNA-seq data from 6 datasets "
     "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, "
-    "and SMARTER-seq). Split into train/test by experimental batch.",
+    "and SMARTER-seq). Split into train/test by experimental batch. "
+    "Dimensions: 16382 cells, 18771 genes. 14 cell types "
+    "(avg. 1170±1703 cells per cell type).",
 )
 def pancreas_batch(test=False):
     adata = load_pancreas(test=test)
@@ -34,7 +36,9 @@
     data_reference=load_pancreas.metadata["data_reference"],
     dataset_summary="Human pancreatic islet scRNA-seq data from 6 datasets "
     "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, "
-    "and SMARTER-seq). Split into train/test randomly.",
+    "and SMARTER-seq). Split into train/test randomly. "
+    "Dimensions: 16382 cells, 18771 genes. 14 cell types "
+    "(avg. 1170±1703 cells per cell type).",
 )
 def pancreas_random(test=False):
     adata = load_pancreas(test=test)
@@ -55,7 +59,9 @@
     data_reference=load_pancreas.metadata["data_reference"],
     dataset_summary="Human pancreatic islet scRNA-seq data from 6 datasets "
     "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, "
-    "and SMARTER-seq). Split into train/test randomly with 20% label noise.",
+    "and SMARTER-seq). Split into train/test randomly with 20% label noise. "
+    "Dimensions: 16382 cells, 18771 genes. 14 cell types "
+    "(avg. 1170±1703 cells per cell type).",
 )
 def pancreas_random_label_noise(test=False):
     adata = load_pancreas(test=test)
diff --git a/openproblems/tasks/label_projection/datasets/tabula_muris_senis.py b/openproblems/tasks/label_projection/datasets/tabula_muris_senis.py
index ec9274a4cc..3f927a83a8 100644
--- a/openproblems/tasks/label_projection/datasets/tabula_muris_senis.py
+++ b/openproblems/tasks/label_projection/datasets/tabula_muris_senis.py
@@ -9,7 +9,9 @@
     data_url=load_tabula_muris_senis.metadata["data_url"],
     data_reference=load_tabula_muris_senis.metadata["data_reference"],
     dataset_summary="All lung cells from Tabula Muris Senis, a 500k cell-atlas from 18 "
-    "organs and tissues across the mouse lifespan. Split into train/test randomly.",
+    "organs and tissues across the mouse lifespan. Split into train/test randomly. "
+    "Dimensions: 24540 cells, 17985 genes. 39 cell types "
+    "(avg.
629±999 cells per cell type).", ) def tabula_muris_senis_lung_random(test=False): adata = load_tabula_muris_senis( diff --git a/openproblems/tasks/label_projection/datasets/zebrafish.py b/openproblems/tasks/label_projection/datasets/zebrafish.py index 30999ed261..ca1935b23a 100644 --- a/openproblems/tasks/label_projection/datasets/zebrafish.py +++ b/openproblems/tasks/label_projection/datasets/zebrafish.py @@ -5,14 +5,16 @@ @dataset( - "Zebrafish (by labels)", + "Zebrafish (by laboratory)", data_url=load_zebrafish.metadata["data_url"], data_reference=load_zebrafish.metadata["data_reference"], dataset_summary="90k cells from zebrafish embryos throughout the first day of " "development, with and without a knockout of chordin, an important developmental " - "gene. Split into train/test by laboratory.", + "gene. Split into train/test by laboratory. " + "Dimensions: 26022 cells, 25258 genes. 24 cell types " + "(avg. 1084±1156 cells per cell type).", ) -def zebrafish_labels(test=False): +def zebrafish_labs(test=False): adata = load_zebrafish(test=test) adata.obs["labels"] = adata.obs["cell_type"] adata.obs["batch"] = adata.obs["lab"] @@ -26,7 +28,9 @@ def zebrafish_labels(test=False): data_reference=load_zebrafish.metadata["data_reference"], dataset_summary="90k cells from zebrafish embryos throughout the first day of " "development, with and without a knockout of chordin, an important developmental " - "gene. Split into train/test randomly.", + "gene. Split into train/test randomly. " + "Dimensions: 26022 cells, 25258 genes. 24 cell types " + "(avg. 1084±1156 cells per cell type).", ) def zebrafish_random(test=False): adata = load_zebrafish(test=test) diff --git a/test/test_core_cli.py b/test/test_core_cli.py index 17d7924fa7..8bd90c0e3a 100644 --- a/test/test_core_cli.py +++ b/test/test_core_cli.py @@ -160,7 +160,7 @@ def test_hash_docker_api(): @parameterized.parameterized.expand( [ (dataset, method, metric) - for dataset in ["zebrafish_labels", None] + for dataset in ["zebrafish_labs", None] for method in ["logistic_regression_log_cpm", None] for metric in ["accuracy", None] ], @@ -232,7 +232,7 @@ def test_pipeline(): "--test", "--output", dataset_file, - "zebrafish_labels", + "zebrafish_labs", ], do_print=False, ) From a2d20759e87b9a9c3fdd9172428974698c999757 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 24 Jan 2023 17:25:03 +0100 Subject: [PATCH 205/266] Add missing DR references (#782) * Add missing DR references * pre-commit * Fix comments * fix whitespace Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- main.bib | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/main.bib b/main.bib index fa7abe01c0..0162461a18 100644 --- a/main.bib +++ b/main.bib @@ -135,6 +135,19 @@ @article{cao2020human doi = {10.1126/science.aba7721}, url = {https://doi.org/10.1126/science.aba7721}, } +@article{chen2009local, + title = {Local Multidimensional Scaling for Nonlinear Dimension Reduction, Graph Drawing, and Proximity Analysis}, + author = {Lisha Chen and Andreas Buja}, + year = {2009}, + month = mar, + journal = {Journal of the American Statistical Association}, + publisher = {Informa {UK} Limited}, + volume = {104}, + number = {485}, + pages = {209--219}, + doi = {10.1198/jasa.2009.0111}, + url = {https://doi.org/10.1198/jasa.2009.0111}, +} @inproceedings{chen2016xgboost, title = {{XGBoost}}, author = 
{Tianqi Chen and Carlos Guestrin},
@@ -453,6 +466,18 @@ @article{korsunsky2019fast
   doi = {10.1038/s41592-019-0619-0},
   url = {https://doi.org/10.1038/s41592-019-0619-0},
 }
+@article{kraemer2018dimred,
+  title = {{dimRed} and {coRanking} - Unifying Dimensionality Reduction in R},
+  author = {Guido Kraemer and Markus Reichstein and Miguel D. Mahecha},
+  year = {2018},
+  journal = {The R Journal},
+  publisher = {The R Foundation},
+  volume = {10},
+  number = {1},
+  pages = {342},
+  doi = {10.32614/rj-2018-039},
+  url = {https://doi.org/10.32614/rj-2018-039},
+}
 @article{kruskal1964mds,
   title = {Multidimensional scaling by optimizing goodness of fit to a nonmetric hypothesis},
   author = {J. B. Kruskal},
@@ -486,6 +511,19 @@ @book{lawson1995solving
   doi = {10.1137/1.9781611971217},
   url = {https://doi.org/10.1137/1.9781611971217},
 }
+@article{lee2009quality,
+  title = {Quality assessment of dimensionality reduction: Rank-based criteria},
+  author = {John A. Lee and Michel Verleysen},
+  year = {2009},
+  month = mar,
+  journal = {Neurocomputing},
+  publisher = {Elsevier {BV}},
+  volume = {72},
+  number = {7-9},
+  pages = {1431--1443},
+  doi = {10.1016/j.neucom.2008.12.017},
+  url = {https://doi.org/10.1016/j.neucom.2008.12.017},
+}
 @article{linderman2018zero,
   title = {Zero-preserving imputation of scRNA-seq data using low-rank approximation},
   author = {Linderman, George C. and Zhao, Jun and Kluger, Yuval},
@@ -547,6 +585,16 @@ @article{luecken2022benchmarking
   doi = {10.1038/s41592-021-01336-8},
   url = {https://doi.org/10.1038/s41592-021-01336-8},
 }
+@article{lueks2011evaluate,
+  title = {How to Evaluate Dimensionality Reduction? - Improving the Co-ranking Matrix},
+  author = {Lueks, Wouter and Mokbel, Bassam and Biehl, Michael and Hammer, Barbara},
+  year = {2011},
+  journal = {arXiv},
+  doi = {10.48550/ARXIV.1110.3917},
+  url = {https://arxiv.org/abs/1110.3917},
+  copyright = {arXiv.org perpetual, non-exclusive license},
+  keywords = {Machine Learning (cs.LG), Information Retrieval (cs.IR), FOS: Computer and information sciences, FOS: Computer and information sciences},
+}
 @misc{lun2019fastmnn,
   title = {A description of the theory behind the fastMNN algorithm},
   author = {Lun, Aaron},
@@ -754,6 +802,18 @@ @article{stuart2019comprehensive
   pages = {1888--1902.e21},
   doi = {10.1016/j.cell.2019.05.031},
 }
+@article{szubert2019structurepreserving,
+  title = {Structure-preserving visualisation of high dimensional single-cell datasets},
+  author = {Benjamin Szubert and Jennifer E.
Cole and Claudia Monaco and Ignat Drozdov}, + year = {2019}, + month = jun, + journal = {Scientific Reports}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {9}, + number = {1}, + doi = {10.1038/s41598-019-45301-0}, + url = {https://doi.org/10.1038/s41598-019-45301-0}, +} @article{tabula2018single, title = {Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris}, author = {{Tabula Muris Consortium}}, @@ -839,6 +899,19 @@ @inproceedings{venna2001neighborhood doi = {10.1007/3-540-44668-0\_68}, url = {https://doi.org/10.1007/3-540-44668-0\_68}, } +@article{venna2006local, + title = {Local multidimensional scaling}, + author = {Jarkko Venna and Samuel Kaski}, + year = {2006}, + month = jul, + journal = {Neural Networks}, + publisher = {Elsevier {BV}}, + volume = {19}, + number = {6-7}, + pages = {889--899}, + doi = {10.1016/j.neunet.2006.05.014}, + url = {https://doi.org/10.1016/j.neunet.2006.05.014}, +} @article{wagner2018knearest, title = {K-nearest neighbor smoothing for high-throughput single-cell RNA-Seq data}, author = {Wagner, Florian and Yan, Yun and Yanai, Itai}, From 1b84d26d15f989006e24a99be199ca0a7b484a5c Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 31 Jan 2023 02:48:50 +1100 Subject: [PATCH 206/266] Bugfix/lowercase GitHub repo owner (#794) * Allow for uppercase repo owner * Fix sklearn req * bash not sh --- .github/workflows/run_tests.yml | 82 ++++++++++++++++++--------------- setup.py | 2 +- 2 files changed, 46 insertions(+), 38 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 12e13820d7..40816da195 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -51,6 +51,9 @@ jobs: - name: Set up environment run: | echo "LINUX_VERSION=$(uname -rs)" >> $GITHUB_ENV + echo "REPO_OWNER_LOWER=${OWNER,,}" >> ${GITHUB_ENV} + env: + OWNER: '${{ github.repository_owner }}' - name: Cache Python packages uses: actions/cache@v3 @@ -113,7 +116,7 @@ jobs: uses: docker/login-action@v2 with: registry: ghcr.io - username: ${{ github.repository_owner }} + username: ${{ env.REPO_OWNER_LOWER }} password: ${{ secrets.GITHUB_TOKEN }} - name: Export docker images @@ -121,7 +124,7 @@ jobs: run: | IMAGES="$(find ./docker -mindepth 1 -type d -exec basename {} \;)" for image in ${IMAGES}; do - GHCR_IMAGE="ghcr.io/${{ github.repository_owner }}/${image}:${{ github.run_id }}" + GHCR_IMAGE="ghcr.io/${REPO_OWNER_LOWER}/${image}:${{ github.run_id }}" docker tag singlecellopenproblems/$image $GHCR_IMAGE docker push $GHCR_IMAGE & PIDS+=("${!}") @@ -236,22 +239,32 @@ jobs: with: fetch-depth: 0 + - name: Set up environment + run: | + echo "LINUX_VERSION=$(uname -rs)" >> $GITHUB_ENV + echo "pythonLocation=$(which python)" >> $GITHUB_ENV + echo "PYTHON_VERSION=$(python --version)" >> $GITHUB_ENV + echo "R_VERSION=$(R --version | head -n 1)" >> $GITHUB_ENV + echo "REPO_OWNER_LOWER=${OWNER,,}" >> ${GITHUB_ENV} + env: + OWNER: '${{ github.repository_owner }}' + shell: bash -e {0} + - name: Log in to the Container registry uses: docker/login-action@v2 if: "needs.build_images.result == 'success'" with: registry: ghcr.io - username: ${{ github.repository_owner }} + username: ${{ env.REPO_OWNER_LOWER }} password: ${{ secrets.GITHUB_TOKEN }} - name: Download docker images if: "needs.build_images.result == 'success'" env: - REPO_OWNER: ${{ github.repository_owner }} RUN_ID: ${{ github.run_id }} run: | for image in $(find ./docker -mindepth 1 -type d 
-exec basename {} \;); do - GHCR_IMAGE="ghcr.io/${REPO_OWNER}/${image}:${RUN_ID}" + GHCR_IMAGE="ghcr.io/${REPO_OWNER_LOWER}/${image}:${RUN_ID}" docker pull $GHCR_IMAGE & PIDS+=("${!}") done @@ -261,18 +274,11 @@ jobs: done # tag images for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do - GHCR_IMAGE="ghcr.io/${REPO_OWNER}/${image}:${RUN_ID}" + GHCR_IMAGE="ghcr.io/${REPO_OWNER_LOWER}/${image}:${RUN_ID}" docker tag $GHCR_IMAGE singlecellopenproblems/${image}:latest done shell: bash -e {0} - - name: Set up environment - run: | - echo "LINUX_VERSION=$(uname -rs)" >> $GITHUB_ENV - echo "pythonLocation=$(which python)" >> $GITHUB_ENV - echo "PYTHON_VERSION=$(python --version)" >> $GITHUB_ENV - echo "R_VERSION=$(R --version | head -n 1)" >> $GITHUB_ENV - - name: Cache Python packages uses: actions/cache@v3 with: @@ -419,20 +425,42 @@ jobs: - name: Clear space on runner run: ./scripts/clear_runner_diskspace.sh + - name: Set up environment + id: setup-environment + run: | + # If not on the base repository, append first 6 characters of username to the image name + # to avoid clashes on ECR + REPO_PARSED=$(echo ${{ github.repository }} | awk '{print $1}' FS=/ | head -c 6) + BRANCH_PARSED=$(echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[^a-zA-Z0-9]:-:g') + if [[ ${{ startsWith(github.ref, 'refs/tags') || startsWith(github.ref, 'refs/heads/test_full_benchmark') }} == true ]]; then + BRANCH="prod" + elif [[ "${{ github.repository }}" == "openproblems-bio/openproblems" ]]; then + BRANCH=`echo $BRANCH_PARSED | head -c 40` + else + BRANCH="${REPO_PARSED}-`echo $BRANCH_PARSED | head -c 33`" + fi + BRANCH=`echo $BRANCH | sed 's/[^a-zA-Z0-9]*$//'` + echo "BRANCH=${BRANCH}" >> $GITHUB_ENV + echo "branch=${BRANCH}" >> $GITHUB_OUTPUT + RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" + echo "run_name=${RUN_NAME}" >> $GITHUB_OUTPUT + echo "REPO_OWNER_LOWER=${OWNER,,}" >> ${GITHUB_ENV} + env: + OWNER: '${{ github.repository_owner }}' + - name: Log in to the Container registry uses: docker/login-action@v2 with: registry: ghcr.io - username: ${{ github.repository_owner }} + username: ${{ env.REPO_OWNER_LOWER }} password: ${{ secrets.GITHUB_TOKEN }} - name: Download docker images env: - REPO_OWNER: ${{ github.repository_owner }} RUN_ID: ${{ github.run_id }} run: | for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do - GHCR_IMAGE="ghcr.io/${REPO_OWNER}/${image}:${RUN_ID}" + GHCR_IMAGE="ghcr.io/${REPO_OWNER_LOWER}/${image}:${RUN_ID}" docker pull $GHCR_IMAGE & PIDS+=("${!}") done @@ -442,31 +470,11 @@ jobs: done # tag images for image in $(find ./docker -mindepth 1 -type d -exec basename {} \;); do - GHCR_IMAGE="ghcr.io/${REPO_OWNER}/${image}:${RUN_ID}" + GHCR_IMAGE="ghcr.io/${REPO_OWNER_LOWER}/${image}:${RUN_ID}" docker tag $GHCR_IMAGE singlecellopenproblems/${image}:latest done shell: bash -e {0} - - name: Set up environment - id: setup-environment - run: | - # If not on the base repository, append first 6 characters of username to the image name - # to avoid clashes on ECR - REPO_PARSED=$(echo ${{ github.repository }} | awk '{print $1}' FS=/ | head -c 6) - BRANCH_PARSED=$(echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[^a-zA-Z0-9]:-:g') - if [[ ${{ startsWith(github.ref, 'refs/tags') || startsWith(github.ref, 'refs/heads/test_full_benchmark') }} == true ]]; then - BRANCH="prod" - elif [[ "${{ github.repository }}" == "openproblems-bio/openproblems" ]]; then - BRANCH=`echo $BRANCH_PARSED | head 
-c 40` - else - BRANCH="${REPO_PARSED}-`echo $BRANCH_PARSED | head -c 33`" - fi - BRANCH=`echo $BRANCH | sed 's/[^a-zA-Z0-9]*$//'` - echo "BRANCH=${BRANCH}" >> $GITHUB_ENV - echo "branch=${BRANCH}" >> $GITHUB_OUTPUT - RUN_NAME="$(echo "$BRANCH" | sed "s/[^a-zA-Z0-9]/_/g")_$(git rev-parse --short HEAD)_${GITHUB_RUN_ATTEMPT}" - echo "run_name=${RUN_NAME}" >> $GITHUB_OUTPUT - - name: Upload Docker images env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} diff --git a/setup.py b/setup.py index f0083cc5c9..53881a202f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ install_requires = [ "numpy>=1.21,<1.24", - "scikit-learn>=1.0.*,<=1.1.*", + "scikit-learn>=1.0,<1.2", "anndata==0.8.*", "scprep>=1.2.1", "scipy>=1.7,<1.10", From 3dae495a752f3421b2b7304daea8900be7dcb542 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 31 Jan 2023 04:08:41 +1100 Subject: [PATCH 207/266] Upgrade isort (#795) * Upgrade pip * skip 22.3.1 * skip 22.3.0 * It's not pip * Upgrade isort --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6efbfe6ee4..e722941c34 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,7 +9,7 @@ repos: - id: requirements-txt-fixer files: .*requirements.*\.txt - repo: https://github.com/timothycrosley/isort - rev: 5.10.1 + rev: 5.11.5 hooks: - id: isort - repo: https://github.com/psf/black From 8d90bc527f1db963a3bb551eb5e30ccde415d0af Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 31 Jan 2023 04:13:15 +1100 Subject: [PATCH 208/266] Include README --- .dockerignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.dockerignore b/.dockerignore index a40704eba5..f42541ded2 100644 --- a/.dockerignore +++ b/.dockerignore @@ -11,3 +11,4 @@ test .idea .vscode *.md +!./README.md From a38f45bd94815141c5cc388a5d1d134b1d937188 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 31 Jan 2023 06:46:32 +1100 Subject: [PATCH 209/266] Update styler to 1.9.0 # ci skip (#787) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-github-actions/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-github-actions/r_requirements.txt b/docker/openproblems-github-actions/r_requirements.txt index 23dafb913f..764418061e 100644 --- a/docker/openproblems-github-actions/r_requirements.txt +++ b/docker/openproblems-github-actions/r_requirements.txt @@ -2,5 +2,5 @@ backports@1.4.1 docopt@0.7.1 git2r@0.30.1 lintr@3.0.2 -styler@1.8.1 +styler@1.9.0 tibble@3.1.8 From f93d4961890e6de162270fc09199119dd144fec6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 30 Jan 2023 15:56:55 -0500 Subject: [PATCH 210/266] Update docker version # ci skip (#798) Co-authored-by: openproblems-bio --- docker/.version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/.version b/docker/.version index a918a2aa18..faef31a435 100644 --- a/docker/.version +++ b/docker/.version @@ -1 +1 @@ -0.6.0 +0.7.0 From 32a189ec98fa23d7c425d6e845c15269aa106f52 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 31 Jan 2023 09:39:05 +1100 Subject: [PATCH 
211/266] Update bslib to 0.4.2 # ci skip (#759) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-r-extras/r_requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/openproblems-r-extras/r_requirements.txt b/docker/openproblems-r-extras/r_requirements.txt index fea6a4b4fa..42a8df1066 100644 --- a/docker/openproblems-r-extras/r_requirements.txt +++ b/docker/openproblems-r-extras/r_requirements.txt @@ -4,7 +4,7 @@ bioc::ComplexHeatmap@2.14.0 bioc::dir.expiry bioc::scater@1.26.1 bioc::scuttle@1.8.3 -bslib@0.4.0 +bslib@0.4.2 caret@6.0-93 cli@3.4.1 conos@1.5.0 @@ -20,7 +20,7 @@ forecast@8.18 hardhat@1.2.0 here@1.0.1 hexbin@1.28.2 -htmltools@0.5.3 +htmltools@0.5.4 htmlwidgets@1.5.4 igraph@1.3.5 lifecycle@1.0.3 From 620e751eedfa8c97e1d44e166e1205a2948a980a Mon Sep 17 00:00:00 2001 From: Daniel Dimitrov <50865230+dbdimitrov@users.noreply.github.com> Date: Tue, 31 Jan 2023 02:31:49 +0100 Subject: [PATCH 212/266] add missing logfc decorator (#796) --- .../_common/methods/liana.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py index 87856a10a6..b4a9545b84 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py +++ b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py @@ -187,11 +187,22 @@ def connectome_sum(adata, test=False): return adata +_logfc_method = functools.partial( + method, + paper_name="Comparison of methods and resources for cell-cell " + "communication inference from single-cell RNA-Seq data", + paper_reference="dimitrov2022comparison", + paper_year=2022, + code_url="https://github.com/saezlab/liana", + image="openproblems-r-extras", +) + + def _logfc(adata, test=False): return _liana(adata, method="logfc", score_col="logfc_comb", test=test) -@_connectome_method( +@_logfc_method( method_name="Log2FC (max)", ) def logfc_max(adata, test=False): @@ -201,7 +212,7 @@ def logfc_max(adata, test=False): return adata -@_connectome_method( +@_logfc_method( method_name="Log2FC (sum)", ) def logfc_sum(adata, test=False): From 5f8fd17e63771996a040735813c662a542e5c157 Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Mon, 30 Jan 2023 22:29:30 -0500 Subject: [PATCH 213/266] Add ALRA preprocessing identical to literature (#763) * Adds a log normalized version log normalized version is identical to ALRA paper preprocessing, whereas sqrt normalized version mimics MAGIC preprocessing * Renames alra methods to alra_sqrt and alra_log * Update __init__.py * pre-commit * Update alra.py * pre-commit * Update alra.py * Update alra.py * Update alra.py * Update alra.py * Update __init__.py * pre-commit * Update alra.py * pre-commit * Update alra.py * pre-commit * Update alra.py * Update alra.py * pre-commit * Update alra.py * pre-commit * Update alra.py * Update alra.py * Update alra.py * pre-commit * Update alra.py * Update alra.py * Update alra.py --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- .../tasks/denoising/methods/__init__.py | 3 +- openproblems/tasks/denoising/methods/alra.py | 53 ++++++++++++++++++- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git 
a/openproblems/tasks/denoising/methods/__init__.py b/openproblems/tasks/denoising/methods/__init__.py index 1bfa91d5c6..8dbff51e8d 100644 --- a/openproblems/tasks/denoising/methods/__init__.py +++ b/openproblems/tasks/denoising/methods/__init__.py @@ -1,4 +1,5 @@ -from .alra import alra +from .alra import alra_log +from .alra import alra_sqrt from .baseline import no_denoising from .baseline import perfect_denoising from .dca import dca diff --git a/openproblems/tasks/denoising/methods/alra.py b/openproblems/tasks/denoising/methods/alra.py index d1946395e5..9a2f54849d 100644 --- a/openproblems/tasks/denoising/methods/alra.py +++ b/openproblems/tasks/denoising/methods/alra.py @@ -9,7 +9,7 @@ @method( - method_name="ALRA", + method_name="ALRA (sqrt norm)", paper_name="Zero-preserving imputation of scRNA-seq data using " "low-rank approximation", paper_reference="linderman2018zero", @@ -17,7 +17,7 @@ code_url="https://github.com/KlugerLab/ALRA", image="openproblems-r-extras", ) -def alra(adata, test=False): +def alra_sqrt(adata, test=False): import numpy as np import rpy2.rinterface_lib.embedded import scprep @@ -45,9 +45,58 @@ def alra(adata, test=False): raise # transform back into original space + # functions are reversed! Y = scprep.utils.matrix_transform(Y, np.square) Y = scprep.utils.matrix_vector_elementwise_multiply(Y, libsize, axis=0) adata.obsm["denoised"] = Y adata.uns["method_code_version"] = "1.0.0" return adata + + +@method( + method_name="ALRA (log norm)", + paper_name="Zero-preserving imputation of scRNA-seq data using " + "low-rank approximation", + paper_reference="linderman2018zero", + paper_year=2018, + code_url="https://github.com/KlugerLab/ALRA", + image="openproblems-r-extras", +) +def alra_log(adata, test=False): + import numpy as np + import rpy2.rinterface_lib.embedded + import scprep + + # libsize and log norm + # lib norm + adata.obsm["train_norm"], libsize = scprep.normalize.library_size_normalize( + adata.obsm["train"], rescale=1, return_library_size=True + ) + # log + adata.obsm["train_norm"] = scprep.utils.matrix_transform( + adata.obsm["train_norm"], np.log1p + ) + # to csr + adata.obsm["train_norm"] = adata.obsm["train_norm"].tocsr() + # run alra + # _alra takes sparse array, returns dense array + Y = None + attempts = 0 + while Y is None: + try: + Y = _alra(adata) + except rpy2.rinterface_lib.embedded.RRuntimeError: # pragma: no cover + if attempts < 10: + attempts += 1 + log.warning(f"alra.R failed (attempt {attempts})") + else: + raise + + # transform back into original space + Y = scprep.utils.matrix_transform(Y, np.expm1) + Y = scprep.utils.matrix_vector_elementwise_multiply(Y, libsize, axis=0) + adata.obsm["denoised"] = Y + + adata.uns["method_code_version"] = "1.0.0" + return adata From 672650ba691332f2f205a721e8f148b90cb1b26e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 31 Jan 2023 09:32:39 -0500 Subject: [PATCH 214/266] fix github actions badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e43845212b..2db00acf91 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Open Problems in Single-Cell Analysis -[![GitHub Workflow Status](https://img.shields.io/github/workflow/status/singlecellopenproblems/singlecellopenproblems/Run%20Tests/master?label=Github%20Actions)](https://github.com/openproblems-bio/openproblems/actions) +[![GitHub Workflow 
Status](https://img.shields.io/github/actions/workflow/status/openproblems-bio/openproblems/run_tests.yml?branch=main)](https://github.com/openproblems-bio/openproblems/actions) [![Coverage Status](https://codecov.io/gh/openproblems-bio/openproblems/branch/main/graph/badge.svg?token=S1ZIME1ZZR)](https://codecov.io/gh/openproblems-bio/openproblems) [![Netlify Status](https://api.netlify.com/api/v1/badges/83b92388-53c7-4fef-9003-e14d94c6ac6f/deploy-status)](https://app.netlify.com/sites/openproblems/deploys) [![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) From 5beb8f9c4f183b06da0735217d7a1f33306346c3 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 31 Jan 2023 12:46:32 -0500 Subject: [PATCH 215/266] run CI on PRs only with approving review (#804) * run CI on PRs only with approving review * bugfix * debug * tojson * just run on approved review * submitted / approved * fix pre-commit * always run pre-commit --- .github/pull_request_template.md | 8 -------- .github/workflows/pre-commit.yml | 6 +----- .github/workflows/run_tests.yml | 20 ++++++++------------ 3 files changed, 9 insertions(+), 25 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 4ff20f797a..b59ced7833 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -15,9 +15,6 @@ * [ ] This submission was written on a forked copy of openproblems * [ ] Nextflow test pipeline is passing on this base branch of this pull request (include link to passed test on NF Tower found in GitHub Actions summary: ) -* [ ] If this pull request is not ready for review (including passing the Nextflow test - pipeline), I will open this PR as a draft (click on the down arrow next to the - "Create Pull Request" button) ### Submission guidelines @@ -31,11 +28,6 @@ This PR will be evaluated on the basis of the following checks: * [ ] The task addresses a valid open problem in single-cell analysis -* [ ] The latest version of master is merged and tested -* [ ] The methods/metrics are imported to `__init__.py` and were tested in the pipeline -* [ ] Method and metric decorators are annotated with paper title, year, author, code - version, and date -* [ ] The README gives an outline of the methods, metrics and datasets in the folder * [ ] The README provides a satisfactory task explanation (for new tasks) * [ ] The sample test data is appropriate to test implementation of all methods and metrics (for new tasks) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index c2e9331f64..89f579de00 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -19,11 +19,7 @@ jobs: options: --user root if: >- - !endsWith(github.event.head_commit.message, '# ci skip') && - ( - startsWith(github.ref, 'refs/heads') || - github.event.pull_request.draft == false - ) + !endsWith(github.event.head_commit.message, '# ci skip') steps: diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 40816da195..e332d0b379 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -6,8 +6,9 @@ on: - '*' branches: - '**' - pull_request: - types: [opened, synchronize, reopened, ready_for_review] + pull_request_review: + types: + - 'submitted' concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -21,10 +22,9 @@ jobs: !endsWith(github.event.head_commit.message, '# ci 
skip') && !startsWith(github.ref, 'refs/heads/test_process') && ( - github.event_name != 'pull_request' || + (github.event_name != 'pull_request_review') || ( - github.event_name == 'pull_request' && - github.event.pull_request.draft == false && + github.event.review.state == 'approved' && github.event.pull_request.head.repo.owner.id == github.event.pull_request.base.repo.owner.id ) ) @@ -147,12 +147,8 @@ jobs: !startsWith(github.ref, 'refs/heads/test_process') && !startsWith(github.ref, 'refs/heads/test_website') && ( - github.event_name != 'pull_request' || - ( - github.event_name == 'pull_request' && - github.event.pull_request.draft == false && - github.actor != 'dependabot[bot]' - ) + github.event_name != 'pull_request_review' || + github.event.review.state == 'approved' ) outputs: @@ -374,7 +370,7 @@ jobs: always() && !endsWith(github.event.head_commit.message, '# ci skip') && needs.build_images.result == 'success' && - github.event_name != 'pull_request' && + github.event_name != 'pull_request_review' && ( needs.run_tester.result == 'success' || needs.run_tester.result == 'skipped' From 57041f2a3b83fd40bb2011d85ec45010d1057134 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 31 Jan 2023 13:08:00 -0500 Subject: [PATCH 216/266] add new workflow to add status (#805) * add new workflow to add status * set as target * temp * debug # ci skip * typo # ci skip * repo full_name * only on new PRs --- .github/workflows/comment_pull_request.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .github/workflows/comment_pull_request.yml diff --git a/.github/workflows/comment_pull_request.yml b/.github/workflows/comment_pull_request.yml new file mode 100644 index 0000000000..fc9d971de5 --- /dev/null +++ b/.github/workflows/comment_pull_request.yml @@ -0,0 +1,16 @@ +name: Comment on Pull Request Status + +on: + pull_request: + types: [opened] + +jobs: + comment_pr: + + runs-on: ubuntu-latest + + steps: + - uses: thollander/actions-comment-pull-request@v2 + with: + message: | + [![Current build status](https://img.shields.io/github/actions/workflow/status/${{ github.event.pull_request.head.repo.full_name }}/run_tests.yml?branch=${{ github.event.pull_request.head.ref }})](https://github.com/${{ github.event.pull_request.head.repo.full_name }}/actions/workflows/run_tests.yml?query=branch%3A${{ github.event.pull_request.head.ref }}) From f5e4778cfa18b8a13dbd9e99c66d955fed157def Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 31 Jan 2023 13:12:50 -0500 Subject: [PATCH 217/266] move comment to PR target --- .github/workflows/comment_pull_request.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .github/workflows/comment_pull_request.yml diff --git a/.github/workflows/comment_pull_request.yml b/.github/workflows/comment_pull_request.yml new file mode 100644 index 0000000000..4b47fe19f6 --- /dev/null +++ b/.github/workflows/comment_pull_request.yml @@ -0,0 +1,17 @@ +name: Comment on Pull Request Status + +on: + pull_request_target: + types: [opened, synchronize, reopened, ready_for_review] + +jobs: + comment_pr: + + runs-on: ubuntu-latest + + steps: + - uses: thollander/actions-comment-pull-request@v2 + with: + message: | + [![Current build status](https://img.shields.io/github/actions/workflow/status/${{ github.event.pull_request.head.repo.full_name }}/run_tests.yml?branch=${{ github.event.pull_request.head.ref }})](https://github.com/${{ 
github.event.pull_request.head.repo.full_name }}/actions/workflows/run_tests.yml?query=branch%3A${{ github.event.pull_request.head.ref }}) + comment_tag: build_status From 386dfb7c483515830c763ec61b7880562f82a6e7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 1 Feb 2023 05:20:56 +1100 Subject: [PATCH 218/266] Update bioc/scran to 1.26.2 # ci skip (#799) Co-authored-by: openproblems-bio Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-r-base/r_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-r-base/r_requirements.txt b/docker/openproblems-r-base/r_requirements.txt index 382bf5ced3..11d15839fd 100644 --- a/docker/openproblems-r-base/r_requirements.txt +++ b/docker/openproblems-r-base/r_requirements.txt @@ -1,3 +1,3 @@ -bioc::scran@1.26.1 +bioc::scran@1.26.2 IRkernel@1.3.1 RcppAnnoy@0.0.20 From 6c1d4dd7a9a421744f2edc0530fe953fed30c78d Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 31 Jan 2023 16:52:56 -0500 Subject: [PATCH 219/266] Specify PR number (#808) * specify number * temp --- .github/workflows/run_tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index e332d0b379..e7d132483b 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -313,7 +313,7 @@ jobs: if: "needs.build_images.result == 'skipped'" run: | cd workflow - snakemake -j $(nproc) docker + snakemake -j $(nproc) docker_pull cd .. - name: Run tests @@ -355,6 +355,7 @@ jobs: flags: unittests fail_ci_if_error: ${{ github.repository == 'openproblems-bio/openproblems' }} verbose: true + override_pr: ${{ github.event.pull_request.number }} - name: Delete coverage artifacts uses: geekyeggo/delete-artifact@v2 From 458fd8d84be5587833660daff0def6ff22248351 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 31 Jan 2023 16:53:11 -0500 Subject: [PATCH 220/266] add magic with reverse norm order (#797) --- .../tasks/denoising/methods/__init__.py | 2 ++ openproblems/tasks/denoising/methods/magic.py | 32 ++++++++++++++++--- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/openproblems/tasks/denoising/methods/__init__.py b/openproblems/tasks/denoising/methods/__init__.py index 8dbff51e8d..4afcf70031 100644 --- a/openproblems/tasks/denoising/methods/__init__.py +++ b/openproblems/tasks/denoising/methods/__init__.py @@ -7,3 +7,5 @@ from .magic import knn_naive from .magic import magic from .magic import magic_approx +from .magic import magic_approx_reverse_norm +from .magic import magic_reverse_norm diff --git a/openproblems/tasks/denoising/methods/magic.py b/openproblems/tasks/denoising/methods/magic.py index 68fcfd9e38..5e06b5ca5f 100644 --- a/openproblems/tasks/denoising/methods/magic.py +++ b/openproblems/tasks/denoising/methods/magic.py @@ -16,7 +16,7 @@ ) -def _magic(adata, solver, normtype="sqrt", **kwargs): +def _magic(adata, solver, normtype="sqrt", reverse_norm_order=False, **kwargs): from magic import MAGIC if normtype == "sqrt": @@ -28,11 +28,19 @@ def _magic(adata, solver, normtype="sqrt", **kwargs): else: raise NotImplementedError - X, libsize = scprep.normalize.library_size_normalize( - adata.obsm["train"], rescale=1, return_library_size=True - ) + X = adata.obsm["train"] + if 
reverse_norm_order: + # inexplicably, this sometimes performs better + X = scprep.utils.matrix_transform(X, norm_fn) + X, libsize = scprep.normalize.library_size_normalize( + X, rescale=1, return_library_size=True + ) + else: + X, libsize = scprep.normalize.library_size_normalize( + X, rescale=1, return_library_size=True + ) + X = scprep.utils.matrix_transform(X, norm_fn) - X = scprep.utils.matrix_transform(X, norm_fn) Y = MAGIC(solver=solver, **kwargs, verbose=False).fit_transform( X, genes="all_genes" ) @@ -52,6 +60,13 @@ def magic(adata, test=False): return _magic(adata, solver="exact", normtype="sqrt") +@_magic_method( + method_name="MAGIC (reversed normalization)", +) +def magic_reverse_norm(adata, test=False): + return _magic(adata, solver="exact", normtype="sqrt", reverse_norm_order=True) + + @_magic_method( method_name="MAGIC (approximate)", ) @@ -59,6 +74,13 @@ def magic_approx(adata, test=False): return _magic(adata, solver="approximate", normtype="sqrt") +@_magic_method( + method_name="MAGIC (approximate, reversed normalization)", +) +def magic_approx_reverse_norm(adata, test=False): + return _magic(adata, solver="approximate", normtype="sqrt", reverse_norm_order=True) + + @method( method_name="KNN smoothing", paper_name="KNN Smoothing (baseline)", From 61be1cf3c16f2a8dd8f3687b246c2b2e44d045b3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 1 Feb 2023 08:53:54 +1100 Subject: [PATCH 221/266] Bump pymde from 0.1.15 to 0.1.18 in /docker/openproblems-python-pytorch (#801) Bumps [pymde](https://github.com/cvxgrp/pymde) from 0.1.15 to 0.1.18. - [Release notes](https://github.com/cvxgrp/pymde/releases) - [Commits](https://github.com/cvxgrp/pymde/compare/v0.1.15...v0.1.18) --- updated-dependencies: - dependency-name: pymde dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-python-pytorch/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/openproblems-python-pytorch/requirements.txt b/docker/openproblems-python-pytorch/requirements.txt index b1ff654ff9..d416272787 100644 --- a/docker/openproblems-python-pytorch/requirements.txt +++ b/docker/openproblems-python-pytorch/requirements.txt @@ -3,7 +3,7 @@ git+https://github.com/czbiohub/molecular-cross-validation@04d9df0 git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix jax==0.3.25 jaxlib==0.3.25 -pymde==0.1.15 +pymde==0.1.18 scalex==1.0.2 scikit-misc==0.1.* scvi-tools~=0.17 # pinned in #313 From 940ed0b1c6eb1a24200886ffb1758e7826f2a4e6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Feb 2023 04:02:00 +1100 Subject: [PATCH 222/266] Update scvi-tools requirement from ~=0.16 to ~=0.19 in /docker/openproblems-r-pytorch (#731) * Update scvi-tools requirement in /docker/openproblems-r-pytorch Updates the requirements on [scvi-tools](https://github.com/scverse/scvi-tools) to permit the latest version. - [Release notes](https://github.com/scverse/scvi-tools/releases) - [Commits](https://github.com/scverse/scvi-tools/compare/0.16.0...0.19.0) --- updated-dependencies: - dependency-name: scvi-tools dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * cell2location --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-python-pytorch/requirements.txt | 4 ++-- docker/openproblems-r-pytorch/requirements.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/openproblems-python-pytorch/requirements.txt b/docker/openproblems-python-pytorch/requirements.txt index d416272787..c615e671ae 100644 --- a/docker/openproblems-python-pytorch/requirements.txt +++ b/docker/openproblems-python-pytorch/requirements.txt @@ -1,4 +1,4 @@ -git+https://github.com/BayraktarLab/cell2location.git@7e7aa231cc61ff460da14402fa3b9a1fa3ec69ac +git+https://github.com/BayraktarLab/cell2location.git@47c8d6dc90dd3f1ab639861e8617c6ef0b62bb89 git+https://github.com/czbiohub/molecular-cross-validation@04d9df0 git+https://github.com/michalk8/neuralee@8946abf # contains gradient error fix jax==0.3.25 @@ -6,7 +6,7 @@ jaxlib==0.3.25 pymde==0.1.18 scalex==1.0.2 scikit-misc==0.1.* -scvi-tools~=0.17 # pinned in #313 +scvi-tools~=0.19 tangram-sc==1.0.* torch==1.12.* xgboost==1.6.* diff --git a/docker/openproblems-r-pytorch/requirements.txt b/docker/openproblems-r-pytorch/requirements.txt index 65d79f6782..c8b7a846d0 100644 --- a/docker/openproblems-r-pytorch/requirements.txt +++ b/docker/openproblems-r-pytorch/requirements.txt @@ -4,5 +4,5 @@ git+https://github.com/chriscainx/mnnpy@2097dec # master harmony-pytorch==0.1.* scanorama==1.7.0 scib==1.0.5 -scvi-tools~=0.16 # pinned in #313 +scvi-tools~=0.19 # pinned in #313 torch==1.13.* From fe18dfbc22da22e9d60db1b7b503610b1e3eee3b Mon Sep 17 00:00:00 2001 From: Daniel Strobl <50872326+danielStrobl@users.noreply.github.com> Date: Wed, 1 Feb 2023 20:02:22 +0100 Subject: [PATCH 223/266] Use graph and embedding metrics for feature and embedding subtask (#807) * wrappers for output generation * pre-commit * add pca to sample feature task dataset * pre-commit * Update api.py * bugfixes * pre-commit * flake8 import * pre-commit * test other syntax * pre-commit * disable flake8 for long import * pre-commit * added whitespace * pre-commit * Address flake8 * pre-commit * address flake8 * pre-commit * flake8 * Fix syntax * pre-commit * pre-commit * graph conn flake8 * pre-commit * clean up gitignore * refactor for readability * require uncorrected PCA for feature task * pre-commit --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Co-authored-by: Scott Gigante --- .gitignore | 6 +-- .../metrics/__init__.py | 4 ++ .../batch_integration_embed/metrics/ari.py | 27 +++++++++++++ .../metrics/graph_connectivity.py | 33 +++++++++++++++ .../metrics/iso_label_f1.py | 38 ++++++++++++++++++ .../batch_integration_embed/metrics/nmi.py | 26 ++++++++++++ .../batch_integration_feature/README.md | 1 + .../batch_integration_feature/api.py | 9 ++++- .../metrics/__init__.py | 10 +++++ .../batch_integration_feature/metrics/ari.py | 29 ++++++++++++++ .../metrics/cc_score.py | 32 +++++++++++++++ .../metrics/graph_connectivity.py | 35 ++++++++++++++++ .../metrics/iso_label_f1.py | 40 +++++++++++++++++++ .../metrics/iso_label_sil.py | 26 ++++++++++++ .../batch_integration_feature/metrics/kBET.py | 38 ++++++++++++++++++ .../batch_integration_feature/metrics/nmi.py | 28 +++++++++++++ 
.../batch_integration_feature/metrics/pcr.py | 29 ++++++++++++++ .../metrics/sil_batch.py | 35 ++++++++++++++++ .../metrics/silhouette.py | 23 +++++++++++ .../metrics/iso_label_f1.py | 2 +- 20 files changed, 464 insertions(+), 7 deletions(-) create mode 100644 openproblems/tasks/_batch_integration/batch_integration_embed/metrics/ari.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_embed/metrics/graph_connectivity.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_f1.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_embed/metrics/nmi.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/metrics/ari.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/metrics/cc_score.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/metrics/graph_connectivity.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_f1.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_sil.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/metrics/kBET.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/metrics/nmi.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/metrics/pcr.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/metrics/sil_batch.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/metrics/silhouette.py diff --git a/.gitignore b/.gitignore index f9bee9a858..ccfb3f5b6a 100644 --- a/.gitignore +++ b/.gitignore @@ -146,14 +146,12 @@ nf-openproblems # Editor .idea +.vscode scratch/ openproblems/results/ openproblems/work/ batch_embed.txt -immune.h5ad +*.h5ad -immune.h5ad -batch_embed.txt -.vscode/launch.json run_bbknn.py diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/__init__.py index 215ac44937..95bc254069 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/__init__.py @@ -1,6 +1,10 @@ +from .ari import ari from .cc_score import cc_score +from .graph_connectivity import graph_connectivity +from .iso_label_f1 import isolated_labels_f1 from .iso_label_sil import isolated_labels_sil from .kBET import kBET +from .nmi import nmi from .pcr import pcr from .sil_batch import silhouette_batch from .silhouette import silhouette diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/ari.py new file mode 100644 index 0000000000..e63e80df52 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/ari.py @@ -0,0 +1,27 @@ +from .....tools.decorators import metric +from ...batch_integration_graph import metrics as graph_metrics + +""" +The Rand index compares the overlap of two clusterings; +it considers both correct clustering overlaps while also counting correct +disagreements between two clusterings. +Similar to NMI, we compared the cell-type labels with the NMI-optimized +Louvain clustering computed on the integrated dataset. 
+The adjustment of the Rand index corrects for randomly correct labels. +An ARI of 0 or 1 corresponds to random labeling or a perfect match, +respectively. +We also used the scikit-learn (v.0.22.1) implementation of the ARI. +""" + + +@metric( + metric_name="ARI", + maximize=True, + paper_reference="luecken2022benchmarking", + image="openproblems-r-pytorch", +) +def ari(adata): + from scanpy.pp import neighbors + + neighbors(adata, use_rep="X_emb") + return graph_metrics.ari(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/graph_connectivity.py new file mode 100644 index 0000000000..0fb72d7dd7 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/graph_connectivity.py @@ -0,0 +1,33 @@ +from .....tools.decorators import metric +from ...batch_integration_graph import metrics as graph_metrics + +""" +The graph connectivity metric assesses whether the kNN graph representation, +G, of the integrated data directly connects all cells with the same cell +identity label. For each cell identity label c, we created the subset kNN +graph G(Nc;Ec) to contain only cells from a given label. Using these subset +kNN graphs, we computed the graph connectivity score using the equation: + +gc =1/|C| Σc∈C |LCC(G(Nc;Ec))|/|Nc|. + +Here, C represents the set of cell identity labels, |LCC()| is the number +of nodes in the largest connected component of the graph, and |Nc| is the +number of nodes with cell identity c. The resultant score has a range +of (0;1], where 1 indicates that all cells with the same cell identity +are connected in the integrated kNN graph, and the lowest possible score +indicates a graph where no cell is connected. As this score is computed +on the kNN graph, it can be used to evaluate all integration outputs. +""" + + +@metric( + metric_name="Graph connectivity", + paper_reference="luecken2022benchmarking", + maximize=True, + image="openproblems-r-pytorch", +) +def graph_connectivity(adata): + from scanpy.pp import neighbors + + neighbors(adata, use_rep="X_emb") + return graph_metrics.graph_connectivity(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_f1.py new file mode 100644 index 0000000000..04dc79c498 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_f1.py @@ -0,0 +1,38 @@ +from .....tools.decorators import metric +from ...batch_integration_graph import metrics as graph_metrics + +""" +We developed two isolated label scores to evaluate how well the data integration methods +dealt with cell identity labels shared by few batches. Specifically, we identified +isolated cell labels as the labels present in the least number of batches in the +integration task. +The score evaluates how well these isolated labels separate from other cell identities. +We implemented the isolated label metric in two versions: +(1) the best clustering of the isolated label (F1 score) and +(2) the global ASW of the isolated label. For the cluster-based score, +we first optimize the cluster assignment of the isolated label using the F1 score +across louvain clustering resolutions ranging from 0.1 to 2 in resolution steps of 0.1. +The optimal F1 score for the isolated label is then used as the metric score. 
+The F1 score is a weighted mean of precision and recall given by the equation: +𝐹1=2×(precision×recall)/(precision+recall). + +It returns a value between 0 and 1, +where 1 shows that all of the isolated label cells and no others are captured in +the cluster. For the isolated label ASW score, we compute the ASW of isolated +versus nonisolated labels on the PCA embedding (ASW metric above) and scale this +score to be between 0 and 1. The final score for each metric version consists of +the mean isolated score of all isolated labels. +""" + + +@metric( + metric_name="Isolated label F1", + paper_reference="luecken2022benchmarking", + maximize=True, + image="openproblems-r-pytorch", +) +def isolated_labels_f1(adata): + from scanpy.pp import neighbors + + neighbors(adata, use_rep="X_emb") + return graph_metrics.isolated_labels_f1(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/nmi.py new file mode 100644 index 0000000000..438e2f9198 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/nmi.py @@ -0,0 +1,26 @@ +from .....tools.decorators import metric +from ...batch_integration_graph import metrics as graph_metrics + +"""NMI compares the overlap of two clusterings. +We used NMI to compare the cell-type labels with Louvain clusters computed on +the integrated dataset. The overlap was scaled using the mean of the entropy terms +for cell-type and cluster labels. Thus, NMI scores of 0 or 1 correspond to uncorrelated +clustering or a perfect match, respectively. We performed optimized Louvain clustering +for this metric to obtain the best match between clusters and labels. +Louvain clustering was performed at a resolution range of 0.1 to 2 in steps of 0.1, +and the clustering output with the highest NMI with the label set was used. We used +the scikit-learn27 (v.0.22.1) implementation of NMI. 
+""" + + +@metric( + metric_name="NMI", + paper_reference="luecken2022benchmarking", + maximize=True, + image="openproblems-r-pytorch", +) +def nmi(adata): + from scanpy.pp import neighbors + + neighbors(adata, use_rep="X_emb") + return graph_metrics.nmi(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md index 209d3f0dfb..1235c93c30 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md @@ -42,6 +42,7 @@ Datasets should contain the following attributes: * `adata.obs["batch"]` with the batch covariate, and * `adata.obs["label"]` with the cell identity label +* `adata.obs["X_uni_pca"]` with a PCA embedding of the uncorrected data * `adata.layers['counts']` with raw, integer UMI count data, * `adata.layers['log_normalized']` with log-normalized data and * `adata.X` with log-normalized data diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py index 6d0f1ae781..637eea141c 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py @@ -1,6 +1,9 @@ +from ....tools.decorators import dataset from .._common import api -check_dataset = api.check_dataset +import functools + +check_dataset = functools.partial(api.check_dataset, do_check_pca=True) def check_method(adata, is_baseline=False): @@ -11,7 +14,9 @@ def check_method(adata, is_baseline=False): return True -sample_dataset = api.sample_dataset +@dataset() +def sample_dataset(): + return api.sample_dataset(run_pca=True) def sample_method(adata): diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/__init__.py index fde16aa8c0..8bd5a56992 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/__init__.py @@ -1 +1,11 @@ +from .ari import ari +from .cc_score import cc_score +from .graph_connectivity import graph_connectivity from .hvg_conservation import hvg_conservation +from .iso_label_f1 import isolated_labels_f1 +from .iso_label_sil import isolated_labels_sil +from .kBET import kBET +from .nmi import nmi +from .pcr import pcr +from .sil_batch import silhouette_batch +from .silhouette import silhouette diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/ari.py new file mode 100644 index 0000000000..1c22dc5f5b --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/ari.py @@ -0,0 +1,29 @@ +from .....tools.decorators import metric +from ...batch_integration_graph import metrics as graph_metrics + +""" +The Rand index compares the overlap of two clusterings; +it considers both correct clustering overlaps while also counting correct +disagreements between two clusterings. +Similar to NMI, we compared the cell-type labels with the NMI-optimized +Louvain clustering computed on the integrated dataset. +The adjustment of the Rand index corrects for randomly correct labels. 
+An ARI of 0 or 1 corresponds to random labeling or a perfect match, +respectively. +We also used the scikit-learn (v.0.22.1) implementation of the ARI. +""" + + +@metric( + metric_name="ARI", + maximize=True, + paper_reference="luecken2022benchmarking", + image="openproblems-r-pytorch", +) +def ari(adata): + from scanpy.pp import neighbors + from scanpy.tl import pca + + adata.obsm["X_emb"] = pca(adata.X) + neighbors(adata, use_rep="X_emb") + return graph_metrics.ari(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/cc_score.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/cc_score.py new file mode 100644 index 0000000000..541502daf4 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/cc_score.py @@ -0,0 +1,32 @@ +from .....tools.decorators import metric +from ...batch_integration_embed import metrics as embed_metrics + +""" +The cell-cycle conservation score evaluates how well the cell-cycle effect can be +captured before and after integration. We computed cell-cycle scores using Scanpy’s +score_cell_cycle function with a reference gene set from Tirosh et al for the +respective cell-cycle phases. We used the same set of cell-cycle genes for mouse and +human data (using capitalization to convert between the gene symbols). We then computed +the variance contribution of the resulting S and G2/M phase scores using principal +component regression (Principal component regression), which was performed for each +batch separately. The differences in variance before, Varbefore, and after, Varafter, +integration were aggregated into a final score between 0 and 1, using the equation: +CCconservation=1−|Varafter−Varbefore|/Varbefore. + +In this equation, values close to 0 indicate lower conservation and 1 indicates complete +conservation of the variance explained by cell cycle. In other words, the variance +remains unchanged within each batch for complete conservation, while any deviation from +the preintegration variance contribution reduces the score.""" + + +@metric( + metric_name="Cell Cycle Score", + paper_reference="luecken2022benchmarking", + maximize=True, + image="openproblems-r-pytorch", +) +def cc_score(adata, test=False): + from scanpy.tl import pca + + adata.obsm["X_emb"] = pca(adata.X) + return embed_metrics.cc_score(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/graph_connectivity.py new file mode 100644 index 0000000000..f574335598 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/graph_connectivity.py @@ -0,0 +1,35 @@ +from .....tools.decorators import metric +from ...batch_integration_graph import metrics as graph_metrics + +""" +The graph connectivity metric assesses whether the kNN graph representation, +G, of the integrated data directly connects all cells with the same cell +identity label. For each cell identity label c, we created the subset kNN +graph G(Nc;Ec) to contain only cells from a given label. Using these subset +kNN graphs, we computed the graph connectivity score using the equation: + +gc =1/|C| Σc∈C |LCC(G(Nc;Ec))|/|Nc|. + +Here, C represents the set of cell identity labels, |LCC()| is the number +of nodes in the largest connected component of the graph, and |Nc| is the +number of nodes with cell identity c. 
The resultant score has a range +of (0;1], where 1 indicates that all cells with the same cell identity +are connected in the integrated kNN graph, and the lowest possible score +indicates a graph where no cell is connected. As this score is computed +on the kNN graph, it can be used to evaluate all integration outputs. +""" + + +@metric( + metric_name="Graph connectivity", + paper_reference="luecken2022benchmarking", + maximize=True, + image="openproblems-r-pytorch", +) +def graph_connectivity(adata): + from scanpy.pp import neighbors + from scanpy.tl import pca + + adata.obsm["X_emb"] = pca(adata.X) + neighbors(adata, use_rep="X_emb") + return graph_metrics.graph_connectivity(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_f1.py new file mode 100644 index 0000000000..241d25f9e4 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_f1.py @@ -0,0 +1,40 @@ +from .....tools.decorators import metric +from ...batch_integration_graph import metrics as graph_metrics + +""" +We developed two isolated label scores to evaluate how well the data integration methods +dealt with cell identity labels shared by few batches. Specifically, we identified +isolated cell labels as the labels present in the least number of batches in the +integration task. +The score evaluates how well these isolated labels separate from other cell identities. +We implemented the isolated label metric in two versions: +(1) the best clustering of the isolated label (F1 score) and +(2) the global ASW of the isolated label. For the cluster-based score, +we first optimize the cluster assignment of the isolated label using the F1 score˚ +across louvain clustering resolutions ranging from 0.1 to 2 in resolution steps of 0.1. +The optimal F1 score for the isolated label is then used as the metric score. +The F1 score is a weighted mean of precision and recall given by the equation: +𝐹1=2×(precision×recall)/(precision+recall). + +It returns a value between 0 and 1, +where 1 shows that all of the isolated label cells and no others are captured in +the cluster. For the isolated label ASW score, we compute the ASW of isolated +versus nonisolated labels on the PCA embedding (ASW metric above) and scale this +score to be between 0 and 1. The final score for each metric version consists of +the mean isolated score of all isolated labels. +""" + + +@metric( + metric_name="Isolated label F1", + paper_reference="luecken2022benchmarking", + maximize=True, + image="openproblems-r-pytorch", +) +def isolated_labels_f1(adata): + from scanpy.pp import neighbors + from scanpy.tl import pca + + adata.obsm["X_emb"] = pca(adata.X) + neighbors(adata, use_rep="X_emb") + return graph_metrics.isolated_labels_f1(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_sil.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_sil.py new file mode 100644 index 0000000000..81c5119328 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_sil.py @@ -0,0 +1,26 @@ +from .....tools.decorators import metric +from ...batch_integration_embed import metrics as embed_metrics + +""" +Isolated cell labels are defined as the labels present in the least number +of batches in the integration task. 
The score evaluates how well these isolated labels +separate from other cell identities. + +The isolated label ASW score is obtained by computing the +ASW of isolated versus non-isolated labels on the PCA embedding (ASW metric above) and +scaling this score to be between 0 and 1. The final score for each metric version +consists of the mean isolated score of all isolated labels. +""" + + +@metric( + metric_name="Isolated label Silhouette", + paper_reference="luecken2022benchmarking", + maximize=True, + image="openproblems-r-pytorch", +) +def isolated_labels_sil(adata): + from scanpy.tl import pca + + adata.obsm["X_emb"] = pca(adata.X) + return embed_metrics.isolated_labels_sil(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/kBET.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/kBET.py new file mode 100644 index 0000000000..dec11bfb25 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/kBET.py @@ -0,0 +1,38 @@ +from .....tools.decorators import metric +from ...batch_integration_embed import metrics as embed_metrics + +""" +The kBET algorithm (v.0.99.6, release 4c9dafa) determines whether the label composition +of a k nearest neighborhood of a cell is similar to the expected (global) label +composition (Buettner et al., Nat Meth 2019). The test is repeated for a random subset +of cells, and the results are summarized as a rejection rate over all tested +neighborhoods. Thus, kBET works on a kNN graph. + +We compute kNN graphs where k = 50 for joint embeddings and corrected feature outputs +via Scanpy preprocessing steps. To test for technical effects and to account for +cell-type frequency shifts across datasets, we applied kBET +separately on the batch variable for each cell identity label. Using the kBET defaults, +a k equal to the median of the number of cells per batch within each label is used for +this computation. Additionally, we set the minimum and maximum thresholds of k to 10 and +100, respectively. As kNN graphs that have been subset by cell identity labels may no +longer be connected, we compute kBET per connected component. If >25% of cells were +assigned to connected components too small for kBET computation (smaller than k × 3), +we assigned a kBET score of 1 to denote poor batch removal. Subsequently, kBET scores +for each label were averaged and subtracted from 1 to give a final kBET score. + +In Open Problems we do not run kBET on graph outputs to avoid computation-intensive +diffusion processes being run. +""" + + +@metric( + metric_name="kBET", + paper_reference="bttner2018test", + maximize=True, + image="openproblems-r-extras", +) +def kBET(adata): + from scanpy.tl import pca + + adata.obsm["X_emb"] = pca(adata.X) + return embed_metrics.kBET(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/nmi.py new file mode 100644 index 0000000000..b35daf2856 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/nmi.py @@ -0,0 +1,28 @@ +from .....tools.decorators import metric +from ...batch_integration_graph import metrics as graph_metrics + +"""NMI compares the overlap of two clusterings. +We used NMI to compare the cell-type labels with Louvain clusters computed on +the integrated dataset. The overlap was scaled using the mean of the entropy terms +for cell-type and cluster labels. 
Thus, NMI scores of 0 or 1 correspond to uncorrelated +clustering or a perfect match, respectively. We performed optimized Louvain clustering +for this metric to obtain the best match between clusters and labels. +Louvain clustering was performed at a resolution range of 0.1 to 2 in steps of 0.1, +and the clustering output with the highest NMI with the label set was used. We used +the scikit-learn27 (v.0.22.1) implementation of NMI. +""" + + +@metric( + metric_name="NMI", + paper_reference="luecken2022benchmarking", + maximize=True, + image="openproblems-r-pytorch", +) +def nmi(adata): + from scanpy.pp import neighbors + from scanpy.tl import pca + + adata.obsm["X_emb"] = pca(adata.X) + neighbors(adata, use_rep="X_emb") + return graph_metrics.nmi(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/pcr.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/pcr.py new file mode 100644 index 0000000000..5b30bdd8a1 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/pcr.py @@ -0,0 +1,29 @@ +from .....tools.decorators import metric +from ...batch_integration_embed import metrics as embed_metrics + +""" +Principal component regression, derived from PCA, has previously been used to quantify +batch removal. Briefly, the R2 was calculated from a linear regression of the +covariate of interest (for example, the batch variable B) onto each principal component. +The variance contribution of the batch effect per principal component was then +calculated as the product of the variance explained by the ith principal component (PC) +and the corresponding R2(PCi|B). The sum across all variance contributions by the batch +effects in all principal components gives the total variance explained by the batch +variable as follows: +Var(𝐶|𝐵)=∑𝑖=1𝐺Var(𝐶|PC𝑖)×𝑅2(PC𝑖|𝐵), + +where Var(C|PCi) is the variance of the data matrix C explained by the ith principal +component.""" + + +@metric( + metric_name="PC Regression", + paper_reference="luecken2022benchmarking", + maximize=True, + image="openproblems-r-pytorch", +) +def pcr(adata): + from scanpy.tl import pca + + adata.obsm["X_emb"] = pca(adata.X) + return embed_metrics.pcr(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/sil_batch.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/sil_batch.py new file mode 100644 index 0000000000..118b9b7863 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/sil_batch.py @@ -0,0 +1,35 @@ +from .....tools.decorators import metric +from ...batch_integration_embed import metrics as embed_metrics + +""" +We consider the absolute silhouette width, s(i), on +batch labels per cell i. Here, 0 indicates that batches are well mixed, and any +deviation from 0 indicates a batch effect: +𝑠batch(𝑖)=|𝑠(𝑖)|. + +To ensure higher scores indicate better batch mixing, these scores are scaled by +subtracting them from 1. As we expect batches to integrate within cell identity +clusters, we compute the batchASWj score for each cell label j separately, +using the equation: +batchASW𝑗=1|𝐶𝑗|∑𝑖∈𝐶𝑗1−𝑠batch(𝑖), + +where Cj is the set of cells with the cell label j and |Cj| denotes the number of cells +in that set. + +To obtain the final batchASW score, the label-specific batchASWj scores are averaged: +batchASW=1|𝑀|∑𝑗∈𝑀batchASW𝑗. 
+ +Here, M is the set of unique cell labels.""" + + +@metric( + metric_name="Batch ASW", + paper_reference="luecken2022benchmarking", + maximize=True, + image="openproblems-r-pytorch", +) +def silhouette_batch(adata): + from scanpy.tl import pca + + adata.obsm["X_emb"] = pca(adata.X) + return embed_metrics.silhouette_batch(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/silhouette.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/silhouette.py new file mode 100644 index 0000000000..b248a592cf --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/silhouette.py @@ -0,0 +1,23 @@ +from .....tools.decorators import metric +from ...batch_integration_embed import metrics as embed_metrics + +""" +For the bio-conservation score, the ASW was computed on cell identity labels and +scaled to a value between 0 and 1 using the equation: +celltypeASW=(ASW_C+1)/2, + +where C denotes the set of all cell identity labels. +For information about the batch silhouette score, check sil_batch.""" + + +@metric( + metric_name="Silhouette", + paper_reference="luecken2022benchmarking", + maximize=True, + image="openproblems-r-pytorch", +) +def silhouette(adata): + from scanpy.tl import pca + + adata.obsm["X_emb"] = pca(adata.X) + return embed_metrics.silhouette(adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py index 0caac73f2b..f4195e3c4d 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py @@ -37,7 +37,7 @@ def isolated_labels_f1(adata): adata, label_key="labels", batch_key="batch", - embed="X_pca", + embed=None, cluster=True, verbose=False, ) From 85b3d19e9d077efc2d1115ee4c417f4afe804f4b Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 1 Feb 2023 20:36:34 +0100 Subject: [PATCH 224/266] Fix typo in dimensionality reduction dataset names (#802) * Fix type in name of mouse_blood_olsson_labelled.py Change the incorrect 'e' to an 'o' * Fix typo in Nestorowa * Rename main Olsson dataset file --------- Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- ..._blood_olssen_labelled.py => mouse_blood_olsson_labelled.py} | 0 openproblems/data/mouse_hspc_nestorowa2016.py | 2 +- .../tasks/dimensionality_reduction/datasets/__init__.py | 2 +- ..._blood_olssen_labelled.py => mouse_blood_olsson_labelled.py} | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename openproblems/data/{mouse_blood_olssen_labelled.py => mouse_blood_olsson_labelled.py} (100%) rename openproblems/tasks/dimensionality_reduction/datasets/{mouse_blood_olssen_labelled.py => mouse_blood_olsson_labelled.py} (90%) diff --git a/openproblems/data/mouse_blood_olssen_labelled.py b/openproblems/data/mouse_blood_olsson_labelled.py similarity index 100% rename from openproblems/data/mouse_blood_olssen_labelled.py rename to openproblems/data/mouse_blood_olsson_labelled.py diff --git a/openproblems/data/mouse_hspc_nestorowa2016.py b/openproblems/data/mouse_hspc_nestorowa2016.py index f39254287f..a43c397146 100644 --- a/openproblems/data/mouse_hspc_nestorowa2016.py +++ b/openproblems/data/mouse_hspc_nestorowa2016.py @@ -11,7 +11,7 @@ @utils.loader(data_url=URL, data_reference="nestorowa2016single") def 
load_mouse_hspc_nestorowa2016(test=False): - """Download Nesterova data from Figshare.""" + """Download Nestorowa data from Figshare.""" import scanpy as sc if test: diff --git a/openproblems/tasks/dimensionality_reduction/datasets/__init__.py b/openproblems/tasks/dimensionality_reduction/datasets/__init__.py index 0c41aaa00b..e9ea18289c 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/__init__.py @@ -1,3 +1,3 @@ -from .mouse_blood_olssen_labelled import olsson_2016_mouse_blood +from .mouse_blood_olsson_labelled import olsson_2016_mouse_blood from .mouse_hspc_nestorowa2016 import mouse_hspc_nestorowa2016 from .tenx_5k_pbmc import tenx_5k_pbmc diff --git a/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olssen_labelled.py b/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py similarity index 90% rename from openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olssen_labelled.py rename to openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py index d789477710..c480455b1c 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olssen_labelled.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py @@ -1,4 +1,4 @@ -from ....data.mouse_blood_olssen_labelled import load_olsson_2016_mouse_blood +from ....data.mouse_blood_olsson_labelled import load_olsson_2016_mouse_blood from ....tools.decorators import dataset from ....tools.normalize import log_cpm From c366d94fd90827516b8eaa15a5ad9b0cdec7531a Mon Sep 17 00:00:00 2001 From: Daniel Strobl <50872326+danielStrobl@users.noreply.github.com> Date: Wed, 1 Feb 2023 22:14:21 +0100 Subject: [PATCH 225/266] add new dataloaders (#792) * add new dataloaders * task dataloaders * address comments * correct docstring * change dataset name * remove whitespace * add to init * pre-commit * filter celltypes * pre-commit * filter celltypes * pre-commit * update urllib requirement * pre-commit * urllib * Use scprep 1.2.2 * remove immune human mouse for now * Remove urllib dep * Remove broken import --------- Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- openproblems/data/lung.py | 45 +++++++++++++++++++ .../_common/datasets/__init__.py | 1 + .../_common/datasets/lung.py | 40 +++++++++++++++++ .../datasets/__init__.py | 1 + .../datasets/__init__.py | 1 + .../datasets/__init__.py | 1 + setup.py | 2 +- 7 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 openproblems/data/lung.py create mode 100644 openproblems/tasks/_batch_integration/_common/datasets/lung.py diff --git a/openproblems/data/lung.py b/openproblems/data/lung.py new file mode 100644 index 0000000000..1f1d409eef --- /dev/null +++ b/openproblems/data/lung.py @@ -0,0 +1,45 @@ +from . 
import utils
+
+import os
+import scprep
+import tempfile
+
+# sparsified from https://figshare.com/articles/dataset/Benchmarking_atlas-level_data_integration_in_single-cell_genomics_-_integration_task_datasets_Immune_and_pancreas_/12420968/2 # noqa: E501
+URL = "https://figshare.com/ndownloader/files/24539942"
+
+
+@utils.loader(data_url=URL, data_reference="luecken2022benchmarking")
+def load_lung(test=False):
+    """Download lung data from figshare."""
+    import scanpy as sc
+
+    if test:
+        # load full data first, cached if available
+        adata = load_lung(test=False)
+
+        # Subsample lung data to two batches with 250 cells each
+        adata = adata[:, :500].copy()
+        batch1 = adata[adata.obs.batch == "4"][:250]
+        batch2 = adata[adata.obs.batch == "A6"][:250]
+        adata = batch1.concatenate(batch2)
+        # Note: could also use 200-500 HVGs rather than the first 500 genes
+
+        # Ensure there are no cells or genes with 0 counts
+        utils.filter_genes_cells(adata)
+
+        return adata
+
+    else:
+        with tempfile.TemporaryDirectory() as tempdir:
+            filepath = os.path.join(tempdir, "Lung_atlas_public.h5ad")
+            scprep.io.download.download_url(URL, filepath)
+            adata = sc.read(filepath)
+
+        # NOTE: adata.X contains log-normalized data, so we're moving it
+        adata.layers["log_normalized"] = adata.X
+        adata.X = adata.layers["counts"]
+
+        # Ensure there are no cells or genes with 0 counts
+        utils.filter_genes_cells(adata)
+
+        return adata
diff --git a/openproblems/tasks/_batch_integration/_common/datasets/__init__.py b/openproblems/tasks/_batch_integration/_common/datasets/__init__.py
index 3369c29cd4..f2fbcbc05f 100644
--- a/openproblems/tasks/_batch_integration/_common/datasets/__init__.py
+++ b/openproblems/tasks/_batch_integration/_common/datasets/__init__.py
@@ -1,2 +1,3 @@
 from .immune import immune_batch
+from .lung import lung_batch
 from .pancreas import pancreas_batch
diff --git a/openproblems/tasks/_batch_integration/_common/datasets/lung.py b/openproblems/tasks/_batch_integration/_common/datasets/lung.py
new file mode 100644
index 0000000000..6b7049aa26
--- /dev/null
+++ b/openproblems/tasks/_batch_integration/_common/datasets/lung.py
@@ -0,0 +1,40 @@
+from .....data.lung import load_lung
+from .....tools.decorators import dataset
+from ..utils import filter_celltypes
+from typing import Optional
+
+
+@dataset(
+    dataset_name="Lung (Viera Braga et al.)",
+    data_url=load_lung.metadata["data_url"],
+    data_reference=load_lung.metadata["data_reference"],
+    dataset_summary="Human lung scRNA-seq data from 3 datasets with 32,472 cells."
+    "From Vieira Braga et al. 
Technologies: 10X and Drop-seq.", + image="openproblems", +) +def lung_batch(test: bool = False, min_celltype_count: Optional[int] = None): + import scanpy as sc + + adata = load_lung(test) + adata.uns["organism"] = "human" + adata.obs["labels"] = adata.obs["cell_type"] + # No need to rename batch column as it already exists + + adata = filter_celltypes(adata, min_celltype_count=min_celltype_count) + + sc.pp.filter_genes(adata, min_counts=1) + sc.pp.filter_genes(adata, min_cells=1) + + adata.X = adata.layers["log_normalized"] + + sc.tl.pca( + adata, + svd_solver="arpack", + return_info=True, + ) + adata.obsm["X_uni_pca"] = adata.obsm["X_pca"] + + sc.pp.neighbors(adata, use_rep="X_uni_pca", key_added="uni") + + adata.var_names_make_unique() + return adata diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/datasets/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_embed/datasets/__init__.py index 0ad80a4782..bac200686e 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/datasets/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/datasets/__init__.py @@ -1,2 +1,3 @@ from ..._common.datasets.immune import immune_batch +from ..._common.datasets.lung import lung_batch from ..._common.datasets.pancreas import pancreas_batch diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/datasets/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_feature/datasets/__init__.py index 0ad80a4782..bac200686e 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/datasets/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/datasets/__init__.py @@ -1,2 +1,3 @@ from ..._common.datasets.immune import immune_batch +from ..._common.datasets.lung import lung_batch from ..._common.datasets.pancreas import pancreas_batch diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/__init__.py index 0ad80a4782..bac200686e 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/__init__.py @@ -1,2 +1,3 @@ from ..._common.datasets.immune import immune_batch +from ..._common.datasets.lung import lung_batch from ..._common.datasets.pancreas import pancreas_batch diff --git a/setup.py b/setup.py index 53881a202f..1b0fb142aa 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ "numpy>=1.21,<1.24", "scikit-learn>=1.0,<1.2", "anndata==0.8.*", - "scprep>=1.2.1", + "scprep>=1.2.2", "scipy>=1.7,<1.10", "scanpy>=1.6", "louvain==0.8.*", From 8dd58b66ce4417b4142ec2f430ec056aa332bbdc Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Wed, 1 Feb 2023 16:15:05 -0500 Subject: [PATCH 226/266] rmse -> distance correlation (#811) * rmse -> distance correlation * import stats --- .../tasks/dimensionality_reduction/README.md | 10 +++---- .../metrics/__init__.py | 4 +-- ...quare_error.py => distance_correlation.py} | 29 +++++++++---------- 3 files changed, 21 insertions(+), 22 deletions(-) rename openproblems/tasks/dimensionality_reduction/metrics/{root_mean_square_error.py => distance_correlation.py} (68%) diff --git a/openproblems/tasks/dimensionality_reduction/README.md b/openproblems/tasks/dimensionality_reduction/README.md index 5cc118bbf5..21c92622c9 100644 --- 
a/openproblems/tasks/dimensionality_reduction/README.md +++ b/openproblems/tasks/dimensionality_reduction/README.md @@ -23,12 +23,12 @@ data for visualization and interpretation. ## The metrics -* **Root mean square error**: the square root of the mean squared difference between +* **Distance correlation**: the Spearman correlation between ground truth distances in the high-dimensional data and Euclidean distances in the - dimension-reduced data, invariant to scalar multiplication. *RMSE* computes - high-dimensional distances in Euclidean space, while *RMSE (spectral)* computes - [diffusion distances](http://dx.doi.org/10.1016/j.acha.2006.04.006) (i.e. Euclidean - distances on the [Laplacian Eigenmap](http://dx.doi.org/10.1162/089976603321780317)). + dimension-reduced data, invariant to scalar multiplication. *Distance correlation* + computes high-dimensional distances in Euclidean space, while *Distance correlation + (spectral)* computes [diffusion distances](http://dx.doi.org/10.1016/j.acha.2006.04.006) + (i.e. Euclidean distances on the [Laplacian Eigenmap](http://dx.doi.org/10.1162/089976603321780317)). * **Trustworthiness**: a measurement of similarity between the rank of each point's nearest neighbors in the high-dimensional data and the reduced data ([Venna & Kaski, 2001](https://openproblems.bio/bibliography#venna2001neighborhood)). diff --git a/openproblems/tasks/dimensionality_reduction/metrics/__init__.py b/openproblems/tasks/dimensionality_reduction/metrics/__init__.py index f0b46e6813..943ede344b 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/__init__.py @@ -1,10 +1,10 @@ from .density import density_preservation +from .distance_correlation import distance_correlation +from .distance_correlation import distance_correlation_spectral from .nn_ranking import continuity from .nn_ranking import lcmc from .nn_ranking import qglobal from .nn_ranking import qlocal from .nn_ranking import qnn from .nn_ranking import qnn_auc -from .root_mean_square_error import rmse -from .root_mean_square_error import rmse_spectral from .trustworthiness import trustworthiness diff --git a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py similarity index 68% rename from openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py rename to openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py index feedbda787..0ed69e78e2 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py @@ -2,24 +2,23 @@ from ....tools.normalize import log_cpm -def _rmse(X, X_emb): - import scipy.optimize +def _distance_correlation(X, X_emb): import scipy.spatial + import scipy.stats high_dimensional_distance_vector = scipy.spatial.distance.pdist(X) low_dimensional_distance_vector = scipy.spatial.distance.pdist(X_emb) - _, rmse = scipy.optimize.nnls( - low_dimensional_distance_vector[:, None], high_dimensional_distance_vector - ) - return rmse + return scipy.stats.spearmanr( + low_dimensional_distance_vector, high_dimensional_distance_vector + )[0] @metric( - metric_name="RMSE", - maximize=False, - paper_reference="kruskal1964mds", + metric_name="Distance correlation", + maximize=True, + paper_reference="schober2018correlation", ) -def rmse(adata, n_svd=200): +def 
distance_correlation(adata, n_svd=200): """Calculate the root mean squared error. Computes (RMSE) between the full (or processed) data matrix and the @@ -30,15 +29,15 @@ def rmse(adata, n_svd=200): adata = log_cpm(adata) X = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) - return _rmse(X, adata.obsm["X_emb"]) + return _distance_correlation(X, adata.obsm["X_emb"]) @metric( - metric_name="RMSE (spectral)", - maximize=False, + metric_name="Distance correlation (spectral)", + maximize=True, paper_reference="coifman2006diffusion", ) -def rmse_spectral(adata, n_comps=200): +def distance_correlation_spectral(adata, n_comps=200): """Calculate the spectral root mean squared error Computes (RMSE) between high-dimensional Laplacian eigenmaps on the full (or @@ -57,4 +56,4 @@ def rmse_spectral(adata, n_comps=200): X = umap.spectral.spectral_layout( adata.X, graph, n_comps, random_state=np.random.default_rng() ) - return _rmse(X, adata.obsm["X_emb"]) + return _distance_correlation(X, adata.obsm["X_emb"]) From d056a1fc2b05b40ef2a0e80b870f9804ad7b8265 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 1 Feb 2023 16:19:55 -0500 Subject: [PATCH 227/266] fix missing space --- openproblems/tasks/_batch_integration/_common/datasets/lung.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/tasks/_batch_integration/_common/datasets/lung.py b/openproblems/tasks/_batch_integration/_common/datasets/lung.py index 6b7049aa26..e3f20e4daa 100644 --- a/openproblems/tasks/_batch_integration/_common/datasets/lung.py +++ b/openproblems/tasks/_batch_integration/_common/datasets/lung.py @@ -8,7 +8,7 @@ dataset_name="Lung (Viera Braga et al.)", data_url=load_lung.metadata["data_url"], data_reference=load_lung.metadata["data_reference"], - dataset_summary="Human lung scRNA-seq data from 3 datasets with 32,472 cells." + dataset_summary="Human lung scRNA-seq data from 3 datasets with 32,472 cells. " "From Vieira Braga et al. 
Technologies: 10X and Drop-seq.", image="openproblems", ) From 0a0e902bd1482e35418f7816fc91e9bc31a33126 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Wed, 1 Feb 2023 16:36:53 -0500 Subject: [PATCH 228/266] CPM -> CP10k (#812) * CPM -> CP10k * fix in docs * bad diff --- CONTRIBUTING.md | 2 +- openproblems/api/README.md | 22 ++++---- .../_common/methods/liana.py | 8 +-- .../tasks/dimensionality_reduction/README.md | 18 +++---- .../dimensionality_reduction/__init__.py | 2 +- .../tasks/dimensionality_reduction/api.py | 6 +-- .../datasets/mouse_blood_olsson_labelled.py | 4 +- .../datasets/mouse_hspc_nestorowa2016.py | 4 +- .../datasets/tenx_5k_pbmc.py | 4 +- .../methods/__init__.py | 42 +++++++-------- .../methods/baseline.py | 16 +++--- .../methods/neuralee.py | 10 ++-- .../dimensionality_reduction/methods/pca.py | 16 +++--- .../dimensionality_reduction/methods/phate.py | 26 +++++----- .../dimensionality_reduction/methods/pymde.py | 28 +++++----- .../dimensionality_reduction/methods/tsne.py | 16 +++--- .../dimensionality_reduction/methods/umap.py | 52 +++++++++---------- .../metrics/distance_correlation.py | 6 +-- .../label_projection/methods/__init__.py | 8 +-- .../methods/knn_classifier.py | 8 +-- .../methods/logistic_regression.py | 8 +-- .../tasks/label_projection/methods/mlp.py | 8 +-- .../tasks/label_projection/methods/xgboost.py | 8 +-- .../methods/__init__.py | 4 +- .../methods/baseline.py | 6 +-- .../methods/harmonic_alignment.py | 14 ++--- .../methods/mnn.py | 12 ++--- .../methods/procrustes.py | 6 +-- openproblems/tools/normalize.py | 30 +++++------ test/test_core_cli.py | 4 +- test/test_core_tasks.py | 2 +- 31 files changed, 200 insertions(+), 200 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 072851ce8f..b187372df5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -334,7 +334,7 @@ docker run \ openproblems-cli test \ --task label_projection \ --dataset zebrafish_labs \ - --method logistic_regression_log_cpm \ + --method logistic_regression_log_cp10k \ --metric f1 ``` diff --git a/openproblems/api/README.md b/openproblems/api/README.md index 2d14fefe81..ee6cdfd66e 100644 --- a/openproblems/api/README.md +++ b/openproblems/api/README.md @@ -39,7 +39,7 @@ For example: # Download a task-specific dataset and save it to `dataset.h5ad` openproblems-cli load --task label_projection --output dataset.h5ad pancreas_batch # Run a method on a datasets and save output to `method.h5ad` -openproblems-cli run --task label_projection --input dataset.h5ad --output method.h5ad logistic_regression_log_cpm +openproblems-cli run --task label_projection --input dataset.h5ad --output method.h5ad logistic_regression_log_cp10k # Evaluate the performance of a previously run method using the `accuracy` metric openproblems-cli evaluate --task label_projection --input method.h5ad accuracy ``` @@ -67,11 +67,11 @@ zebrafish_labs zebrafish_random > openproblems-cli list --methods --task label_projection -knn_classifier_log_cpm +knn_classifier_log_cp10k knn_classifier_scran -logistic_regression_log_cpm +logistic_regression_log_cp10k logistic_regression_scran -mlp_log_cpm +mlp_log_cp10k mlp_scran > openproblems-cli list --metrics --task label_projection @@ -100,11 +100,11 @@ zebrafish_labs zebrafish_random $ openproblems-cli load --task label_projection --output dataset.h5ad pancreas_batch $ openproblems-cli list --methods --task label_projection -logistic_regression_log_cpm +logistic_regression_log_cp10k logistic_regression_scran 
-mlp_log_cpm +mlp_log_cp10k mlp_scran -$ openproblems-cli run --task label_projection --input dataset.h5ad --output method.h5ad logistic_regression_log_cpm +$ openproblems-cli run --task label_projection --input dataset.h5ad --output method.h5ad logistic_regression_log_cp10k $ openproblems-cli list --metrics --task label_projection $ openproblems-cli evaluate --task label_projection --input method.h5ad accuracy 0.9521233432512848 @@ -121,7 +121,7 @@ openproblems-cli image --datasets --task label_projection pancreas_batch docker run -dt openproblems-cli load --task label_projection --output dataset.h5ad pancreas_batch openproblems-cli list --methods --task label_projection openproblems-cli image --methods --task label_projection logistic_regression_scran -openproblems-cli run --task label_projection --input dataset.h5ad --output method.h5ad logistic_regression_log_cpm +openproblems-cli run --task label_projection --input dataset.h5ad --output method.h5ad logistic_regression_log_cp10k openproblems-cli list --metrics --task label_projection openproblems-cli image --metrics --task label_projection accuracy openproblems-cli evaluate --task label_projection --input method.h5ad accuracy @@ -147,13 +147,13 @@ $ openproblems-cli image --datasets --task label_projection pancreas_batch openproblems $ docker run -dt singlecellopenproblems/openproblems openproblems-cli load --task label_projection --output dataset.h5ad pancreas_batch $ openproblems-cli list --methods --task label_projection -logistic_regression_log_cpm +logistic_regression_log_cp10k logistic_regression_scran -mlp_log_cpm +mlp_log_cp10k mlp_scran $ openproblems-cli image --methods --task label_projection logistic_regression_scran openproblems-r-base -$ docker run -dt singlecellopenproblems/openproblems-r-base openproblems-cli run --task label_projection --input dataset.h5ad --output method.h5ad logistic_regression_log_cpm +$ docker run -dt singlecellopenproblems/openproblems-r-base openproblems-cli run --task label_projection --input dataset.h5ad --output method.h5ad logistic_regression_log_cp10k $ openproblems-cli list --metrics --task label_projection accuracy f1 diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py index b4a9545b84..c297163bbd 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py +++ b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py @@ -1,6 +1,6 @@ from .....tools.conversion import r_function from .....tools.decorators import method -from .....tools.normalize import log_cpm +from .....tools.normalize import log_cp10k from .....tools.utils import check_r_version from ..utils import aggregate_method_scores from ..utils import ligand_receptor_resource @@ -41,9 +41,9 @@ def _liana( **kwargs, ): # log-normalize - adata = log_cpm(adata) - adata.layers["logcounts"] = adata.layers["log_cpm"] - del adata.layers["log_cpm"] + adata = log_cp10k(adata) + adata.layers["logcounts"] = adata.layers["log_cp10k"] + del adata.layers["log_cp10k"] # Run LIANA liana_res = _r_liana( diff --git a/openproblems/tasks/dimensionality_reduction/README.md b/openproblems/tasks/dimensionality_reduction/README.md index 21c92622c9..df07de392c 100644 --- a/openproblems/tasks/dimensionality_reduction/README.md +++ b/openproblems/tasks/dimensionality_reduction/README.md @@ -42,12 +42,12 @@ data for visualization and interpretation. 
## API -WARNING: other than most tasks, `adata.X` should contain log CPM-normalized data, +WARNING: other than most tasks, `adata.X` should contain log CP10k-normalized data, This is the case as we are computing ground truth metrics on normalized data, which means methods which use this same normalization are likely to score more highly on these metrics. -**Datasets** should provide *log CPM normalized counts* in `adata.X` and store the +**Datasets** should provide *log CP10k normalized counts* in `adata.X` and store the original number of genes (i.e., `adata.shape[1]`) in `adata.uns["n_genes"]`. **Methods** should assign dimensionally-reduced 2D embedding coordinates to @@ -66,11 +66,11 @@ pre-processing functions are available as part of the `tools` module. Where poss each **method** should first call one of these functions and use the processed `adata.X` slot as the input to the method. Raw counts are also stored in `adata.layers["counts"]` by the standard pre-processing functions, if a method performs its own pre-processing. -For most methods a standard pre-processing from `log_cpm()`, which normalizes the -expression matrix to counts per million (CPM), can be used directly from `adata.X`. +For most methods a standard pre-processing from `log_cp10k()`, which normalizes the +expression matrix to counts per 10,000 (CP10k), can be used directly from `adata.X`. Variants of methods can be created by applying different pre-processing prior to the method itself (see `phate.py` for an example). *Note that using a normalization method -different from that used for the metrics (log CPM) may lead to artificially poor method +different from that used for the metrics (log CP10k) may lead to artificially poor method performance.* ## The methods @@ -138,7 +138,7 @@ from [umap-learn](https://umap-learn.readthedocs.io/en/latest/densmap_demo.html) **Variants:** -* The (logCPM-normalized, 1000 HVG) expression matrix +* The (logCP10k-normalized, 1000 HVG) expression matrix * 50 principal components ### Potential of heat-diffusion for affinity-based transition embedding (PHATE) @@ -157,8 +157,8 @@ This implementation is from the [phate package](https://phate.readthedocs.io/en/ **Variants:** -* The square-root CPM transformed expression matrix -* 50 principal components of the logCPM-normalised, 1000 HVG expression matrix +* The square-root CP10k transformed expression matrix +* 50 principal components of the logCP10k-normalised, 1000 HVG expression matrix ### ivis @@ -177,7 +177,7 @@ package](https://neuralee.readthedocs.io/en/latest/). 
**Variants:** * Scaled 500 HVGs from a logged expression matrix (no library size normalization) -* LogCPM-normalised, 1000 HVG expression matrix +* LogCP10k-normalised, 1000 HVG expression matrix ### scvis diff --git a/openproblems/tasks/dimensionality_reduction/__init__.py b/openproblems/tasks/dimensionality_reduction/__init__.py index 12920cfa9e..6b6af5d52a 100644 --- a/openproblems/tasks/dimensionality_reduction/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/__init__.py @@ -9,7 +9,7 @@ "Reduction of high-dimensional datasets to 2D for visualization & interpretation" ) -DEFAULT_LAYER = "log_cpm" +DEFAULT_LAYER = "log_cp10k" DATASETS = utils.get_callable_members(datasets) METHODS = utils.get_callable_members(methods) diff --git a/openproblems/tasks/dimensionality_reduction/api.py b/openproblems/tasks/dimensionality_reduction/api.py index 57d2a8645a..7ec44c1676 100644 --- a/openproblems/tasks/dimensionality_reduction/api.py +++ b/openproblems/tasks/dimensionality_reduction/api.py @@ -1,6 +1,6 @@ from ...data.sample import load_sample_data from ...tools.decorators import dataset -from ...tools.normalize import log_cpm +from ...tools.normalize import log_cp10k import numpy as np @@ -16,7 +16,7 @@ def check_method(adata, is_baseline=False): """Check that method output fits expected API.""" # check adata.X has not changed assert adata.uns["n_genes"] == adata.shape[1] - assert adata.X is adata.layers["log_cpm"] + assert adata.X is adata.layers["log_cp10k"] # check output assert "X_emb" in adata.obsm if not is_baseline: @@ -29,7 +29,7 @@ def check_method(adata, is_baseline=False): def sample_dataset(): """Create a simple dataset to use for testing methods in this task.""" adata = load_sample_data() - adata = log_cpm(adata) + adata = log_cp10k(adata) adata.uns["n_genes"] = adata.shape[1] return adata diff --git a/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py b/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py index c480455b1c..9cbf278db0 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py @@ -1,6 +1,6 @@ from ....data.mouse_blood_olsson_labelled import load_olsson_2016_mouse_blood from ....tools.decorators import dataset -from ....tools.normalize import log_cpm +from ....tools.normalize import log_cp10k @dataset( @@ -14,4 +14,4 @@ def olsson_2016_mouse_blood(test=False): adata = load_olsson_2016_mouse_blood(test=test) adata.uns["n_genes"] = adata.shape[1] - return log_cpm(adata) + return log_cp10k(adata) diff --git a/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py b/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py index 1f24565835..10f4d428cb 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py @@ -1,6 +1,6 @@ from ....data.mouse_hspc_nestorowa2016 import load_mouse_hspc_nestorowa2016 from ....tools.decorators import dataset -from ....tools.normalize import log_cpm +from ....tools.normalize import log_cp10k @dataset( @@ -14,4 +14,4 @@ def mouse_hspc_nestorowa2016(test=False): adata = load_mouse_hspc_nestorowa2016(test=test) adata.uns["n_genes"] = adata.shape[1] - return log_cpm(adata) + return log_cp10k(adata) diff --git 
a/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py b/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py index d8487ee535..2811fb58db 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py @@ -1,6 +1,6 @@ from ....data.tenx import load_tenx_5k_pbmc from ....tools.decorators import dataset -from ....tools.normalize import log_cpm +from ....tools.normalize import log_cp10k @dataset( @@ -16,4 +16,4 @@ def tenx_5k_pbmc(test=False): adata = load_tenx_5k_pbmc(test=test) adata.uns["n_genes"] = adata.shape[1] - return log_cpm(adata) + return log_cp10k(adata) diff --git a/openproblems/tasks/dimensionality_reduction/methods/__init__.py b/openproblems/tasks/dimensionality_reduction/methods/__init__.py index 5480405af4..746452572b 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/methods/__init__.py @@ -1,26 +1,26 @@ from .baseline import random_features from .baseline import true_features -from .baseline import true_features_log_cpm -from .baseline import true_features_log_cpm_hvg +from .baseline import true_features_log_cp10k +from .baseline import true_features_log_cp10k_hvg from .neuralee import neuralee_default -from .neuralee import neuralee_logCPM_1kHVG -from .pca import pca_logCPM -from .pca import pca_logCPM_1kHVG +from .neuralee import neuralee_logCP10k_1kHVG +from .pca import pca_logCP10k +from .pca import pca_logCP10k_1kHVG from .phate import phate_default -from .phate import phate_logCPM -from .phate import phate_logCPM_1kHVG +from .phate import phate_logCP10k +from .phate import phate_logCP10k_1kHVG from .phate import phate_sqrt -from .pymde import pymde_distances_log_cpm -from .pymde import pymde_distances_log_cpm_hvg -from .pymde import pymde_neighbors_log_cpm -from .pymde import pymde_neighbors_log_cpm_hvg -from .tsne import tsne_logCPM -from .tsne import tsne_logCPM_1kHVG -from .umap import densmap_logCPM -from .umap import densmap_logCPM_1kHVG -from .umap import densmap_pca_logCPM -from .umap import densmap_pca_logCPM_1kHVG -from .umap import umap_logCPM -from .umap import umap_logCPM_1kHVG -from .umap import umap_pca_logCPM -from .umap import umap_pca_logCPM_1kHVG +from .pymde import pymde_distances_log_cp10k +from .pymde import pymde_distances_log_cp10k_hvg +from .pymde import pymde_neighbors_log_cp10k +from .pymde import pymde_neighbors_log_cp10k_hvg +from .tsne import tsne_logCP10k +from .tsne import tsne_logCP10k_1kHVG +from .umap import densmap_logCP10k +from .umap import densmap_logCP10k_1kHVG +from .umap import densmap_pca_logCP10k +from .umap import densmap_pca_logCP10k_1kHVG +from .umap import umap_logCP10k +from .umap import umap_logCP10k_1kHVG +from .umap import umap_pca_logCP10k +from .umap import umap_pca_logCP10k_1kHVG diff --git a/openproblems/tasks/dimensionality_reduction/methods/baseline.py b/openproblems/tasks/dimensionality_reduction/methods/baseline.py index 50b1a30acf..33621c76aa 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/baseline.py +++ b/openproblems/tasks/dimensionality_reduction/methods/baseline.py @@ -1,6 +1,6 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm -from ....tools.normalize import log_cpm_hvg +from ....tools.normalize import log_cp10k +from ....tools.normalize import log_cp10k_hvg from ....tools.utils import check_version import functools @@ -39,10 +39,10 @@ def 
true_features(adata, test=False): @_baseline_method( - method_name="True Features (logCPM)", + method_name="True Features (logCP10k)", ) -def true_features_log_cpm(adata, test=False): - adata = log_cpm(adata) +def true_features_log_cp10k(adata, test=False): + adata = log_cp10k(adata) adata.obsm["X_emb"] = adata.X if test: adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100] @@ -53,10 +53,10 @@ def true_features_log_cpm(adata, test=False): @_baseline_method( - method_name="True Features (logCPM, 1kHVG)", + method_name="True Features (logCP10k, 1kHVG)", ) -def true_features_log_cpm_hvg(adata, test=False): - adata = log_cpm_hvg(adata) +def true_features_log_cp10k_hvg(adata, test=False): + adata = log_cp10k_hvg(adata) adata.obsm["X_emb"] = adata[:, adata.var["highly_variable"]].copy().X if test: adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100] diff --git a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py index dd47933dfb..2ba2942a59 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py +++ b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py @@ -1,5 +1,5 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm_hvg +from ....tools.normalize import log_cp10k_hvg from ....tools.utils import check_version from anndata import AnnData from typing import Optional @@ -101,13 +101,13 @@ def neuralee_default(adata: AnnData, test: bool = False) -> AnnData: adata.X = adata.layers["counts"] adata = _neuralee(adata, test=test, normalize=True, subsample_genes=500) # revert to expected values - adata.X = adata.layers["log_cpm"] + adata.X = adata.layers["log_cp10k"] return adata -@_neuralee_method(method_name="NeuralEE (CPU) (logCPM, 1kHVG)") -def neuralee_logCPM_1kHVG(adata: AnnData, test: bool = False) -> AnnData: - adata = log_cpm_hvg(adata) +@_neuralee_method(method_name="NeuralEE (CPU) (logCP10k, 1kHVG)") +def neuralee_logCP10k_1kHVG(adata: AnnData, test: bool = False) -> AnnData: + adata = log_cp10k_hvg(adata) return _neuralee( adata, genes=adata.var["highly_variable"], diff --git a/openproblems/tasks/dimensionality_reduction/methods/pca.py b/openproblems/tasks/dimensionality_reduction/methods/pca.py index f87e7423f7..488f684dea 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/pca.py +++ b/openproblems/tasks/dimensionality_reduction/methods/pca.py @@ -1,6 +1,6 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm -from ....tools.normalize import log_cpm_hvg +from ....tools.normalize import log_cp10k +from ....tools.normalize import log_cp10k_hvg from ....tools.utils import check_version import functools @@ -28,13 +28,13 @@ def _pca(adata, genes=None): return adata -@_pca_method(method_name="Principle Component Analysis (PCA) (logCPM)") -def pca_logCPM(adata, test: bool = False): - adata = log_cpm(adata) +@_pca_method(method_name="Principle Component Analysis (PCA) (logCP10k)") +def pca_logCP10k(adata, test: bool = False): + adata = log_cp10k(adata) return _pca(adata) -@_pca_method(method_name="Principle Component Analysis (PCA) (logCPM, 1kHVG)") -def pca_logCPM_1kHVG(adata, test: bool = False): - adata = log_cpm_hvg(adata) +@_pca_method(method_name="Principle Component Analysis (PCA) (logCP10k, 1kHVG)") +def pca_logCP10k_1kHVG(adata, test: bool = False): + adata = log_cp10k_hvg(adata) return _pca(adata, genes=adata.var["highly_variable"]) diff --git a/openproblems/tasks/dimensionality_reduction/methods/phate.py 
b/openproblems/tasks/dimensionality_reduction/methods/phate.py index c5d6817969..1706a5caf8 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/phate.py +++ b/openproblems/tasks/dimensionality_reduction/methods/phate.py @@ -1,7 +1,7 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm -from ....tools.normalize import log_cpm_hvg -from ....tools.normalize import sqrt_cpm +from ....tools.normalize import log_cp10k +from ....tools.normalize import log_cp10k_hvg +from ....tools.normalize import sqrt_cp10k from ....tools.utils import check_version from typing import Optional @@ -41,29 +41,29 @@ def _phate( @_phate_method(method_name="PHATE (default)") def phate_default(adata, test: bool = False, n_pca: Optional[int] = None): - adata = sqrt_cpm(adata) + adata = sqrt_cp10k(adata) adata = _phate(adata, test=test, n_pca=n_pca) # revert to expected adata.X - adata = log_cpm(adata) + adata = log_cp10k(adata) return adata @_phate_method(method_name="PHATE (gamma=0)") def phate_sqrt(adata, test: bool = False, n_pca: Optional[int] = None): - adata = sqrt_cpm(adata) + adata = sqrt_cp10k(adata) adata = _phate(adata, test=test, n_pca=n_pca, gamma=0) # revert to expected adata.X - adata = log_cpm(adata) + adata = log_cp10k(adata) return adata -@_phate_method(method_name="PHATE (logCPM)") -def phate_logCPM_1kHVG(adata, test: bool = False, n_pca: Optional[int] = None): - adata = log_cpm(adata) +@_phate_method(method_name="PHATE (logCP10k)") +def phate_logCP10k_1kHVG(adata, test: bool = False, n_pca: Optional[int] = None): + adata = log_cp10k(adata) return _phate(adata, test=test, n_pca=n_pca) -@_phate_method(method_name="PHATE (logCPM, 1kHVG)") -def phate_logCPM(adata, test: bool = False, n_pca: Optional[int] = None): - adata = log_cpm_hvg(adata) +@_phate_method(method_name="PHATE (logCP10k, 1kHVG)") +def phate_logCP10k(adata, test: bool = False, n_pca: Optional[int] = None): + adata = log_cp10k_hvg(adata) return _phate(adata, test=test, genes=adata.var["highly_variable"], n_pca=n_pca) diff --git a/openproblems/tasks/dimensionality_reduction/methods/pymde.py b/openproblems/tasks/dimensionality_reduction/methods/pymde.py index 2892def1c1..050c61df12 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/pymde.py +++ b/openproblems/tasks/dimensionality_reduction/methods/pymde.py @@ -1,6 +1,6 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm -from ....tools.normalize import log_cpm_hvg +from ....tools.normalize import log_cp10k +from ....tools.normalize import log_cp10k_hvg from ....tools.utils import check_version from typing import Optional @@ -61,30 +61,30 @@ def _pymde( @_pymde_method( - method_name="PyMDE Preserve Neighbors (logCPM)", + method_name="PyMDE Preserve Neighbors (logCP10k)", ) -def pymde_neighbors_log_cpm( +def pymde_neighbors_log_cp10k( adata, test: bool = False, max_iter: Optional[int] = None, memory_size: Optional[int] = None, ): - adata = log_cpm(adata) + adata = log_cp10k(adata) return _pymde( adata, method="neighbors", test=test, max_iter=max_iter, memory_size=memory_size ) @_pymde_method( - method_name="PyMDE Preserve Neighbors (logCPM, 1kHVG)", + method_name="PyMDE Preserve Neighbors (logCP10k, 1kHVG)", ) -def pymde_neighbors_log_cpm_hvg( +def pymde_neighbors_log_cp10k_hvg( adata, test: bool = False, max_iter: Optional[int] = None, memory_size: Optional[int] = None, ): - adata = log_cpm_hvg(adata) + adata = log_cp10k_hvg(adata) return _pymde( adata, method="neighbors", @@ -96,30 +96,30 @@ def 
pymde_neighbors_log_cpm_hvg( @_pymde_method( - method_name="PyMDE Preserve Distances (logCPM)", + method_name="PyMDE Preserve Distances (logCP10k)", ) -def pymde_distances_log_cpm( +def pymde_distances_log_cp10k( adata, test: bool = False, max_iter: Optional[int] = None, memory_size: Optional[int] = None, ): - adata = log_cpm(adata) + adata = log_cp10k(adata) return _pymde( adata, method="distances", test=test, max_iter=max_iter, memory_size=memory_size ) @_pymde_method( - method_name="PyMDE Preserve Distances (logCPM, 1kHVG)", + method_name="PyMDE Preserve Distances (logCP10k, 1kHVG)", ) -def pymde_distances_log_cpm_hvg( +def pymde_distances_log_cp10k_hvg( adata, test: bool = False, max_iter: Optional[int] = None, memory_size: Optional[int] = None, ): - adata = log_cpm_hvg(adata) + adata = log_cp10k_hvg(adata) return _pymde( adata, method="distances", diff --git a/openproblems/tasks/dimensionality_reduction/methods/tsne.py b/openproblems/tasks/dimensionality_reduction/methods/tsne.py index bb636980f8..49e14d3ee2 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/tsne.py +++ b/openproblems/tasks/dimensionality_reduction/methods/tsne.py @@ -1,6 +1,6 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm -from ....tools.normalize import log_cpm_hvg +from ....tools.normalize import log_cp10k +from ....tools.normalize import log_cp10k_hvg from ....tools.utils import check_version import functools @@ -32,16 +32,16 @@ def _tsne(adata, genes=None, test=False, n_pca=50): @_tsne_method( - method_name="t-Distributed Stochastic Neighbor Embedding (t-SNE) (logCPM, 1kHVG)" + method_name="t-Distributed Stochastic Neighbor Embedding (t-SNE) (logCP10k, 1kHVG)" ) -def tsne_logCPM_1kHVG(adata, test: bool = False, n_pca=50): - adata = log_cpm_hvg(adata) +def tsne_logCP10k_1kHVG(adata, test: bool = False, n_pca=50): + adata = log_cp10k_hvg(adata) return _tsne(adata, genes=adata.var["highly_variable"], test=test, n_pca=n_pca) @_tsne_method( - method_name="t-Distributed Stochastic Neighbor Embedding (t-SNE) (logCPM)" + method_name="t-Distributed Stochastic Neighbor Embedding (t-SNE) (logCP10k)" ) -def tsne_logCPM(adata, test: bool = False, n_pca=50): - adata = log_cpm(adata) +def tsne_logCP10k(adata, test: bool = False, n_pca=50): + adata = log_cp10k(adata) return _tsne(adata, test=test, n_pca=n_pca) diff --git a/openproblems/tasks/dimensionality_reduction/methods/umap.py b/openproblems/tasks/dimensionality_reduction/methods/umap.py index 31f42e229f..cea0257323 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/umap.py +++ b/openproblems/tasks/dimensionality_reduction/methods/umap.py @@ -1,6 +1,6 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm -from ....tools.normalize import log_cpm_hvg +from ....tools.normalize import log_cp10k +from ....tools.normalize import log_cp10k_hvg from ....tools.utils import check_version import functools @@ -45,49 +45,49 @@ def _umap(adata, n_comps=None, genes=None, densmap=False): return adata -@_umap_method(method_name="UMAP (logCPM, 1kHVG)") -def umap_logCPM_1kHVG(adata, test: bool = False): - adata = log_cpm_hvg(adata) +@_umap_method(method_name="UMAP (logCP10k, 1kHVG)") +def umap_logCP10k_1kHVG(adata, test: bool = False): + adata = log_cp10k_hvg(adata) return _umap(adata, genes=adata.var["highly_variable"]) -@_umap_method(method_name="UMAP PCA (logCPM, 1kHVG)") -def umap_pca_logCPM_1kHVG(adata, test: bool = False): - adata = log_cpm_hvg(adata) +@_umap_method(method_name="UMAP PCA 
(logCP10k, 1kHVG)") +def umap_pca_logCP10k_1kHVG(adata, test: bool = False): + adata = log_cp10k_hvg(adata) return _umap(adata, n_comps=50, genes=adata.var["highly_variable"]) -@_umap_method(method_name="UMAP (logCPM)") -def umap_logCPM(adata, test: bool = False): - adata = log_cpm(adata) +@_umap_method(method_name="UMAP (logCP10k)") +def umap_logCP10k(adata, test: bool = False): + adata = log_cp10k(adata) return _umap(adata) -@_umap_method(method_name="UMAP PCA (logCPM)") -def umap_pca_logCPM(adata, test: bool = False): - adata = log_cpm(adata) +@_umap_method(method_name="UMAP PCA (logCP10k)") +def umap_pca_logCP10k(adata, test: bool = False): + adata = log_cp10k(adata) return _umap(adata, n_comps=50) -@_densmap_method(method_name="densMAP (logCPM, 1kHVG)") -def densmap_logCPM_1kHVG(adata, test: bool = False): - adata = log_cpm_hvg(adata) +@_densmap_method(method_name="densMAP (logCP10k, 1kHVG)") +def densmap_logCP10k_1kHVG(adata, test: bool = False): + adata = log_cp10k_hvg(adata) return _umap(adata, densmap=True, genes=adata.var["highly_variable"]) -@_densmap_method(method_name="densMAP PCA (logCPM, 1kHVG)") -def densmap_pca_logCPM_1kHVG(adata, test: bool = False): - adata = log_cpm_hvg(adata) +@_densmap_method(method_name="densMAP PCA (logCP10k, 1kHVG)") +def densmap_pca_logCP10k_1kHVG(adata, test: bool = False): + adata = log_cp10k_hvg(adata) return _umap(adata, densmap=True, n_comps=50, genes=adata.var["highly_variable"]) -@_densmap_method(method_name="densMAP (logCPM)") -def densmap_logCPM(adata, test: bool = False): - adata = log_cpm(adata) +@_densmap_method(method_name="densMAP (logCP10k)") +def densmap_logCP10k(adata, test: bool = False): + adata = log_cp10k(adata) return _umap(adata, densmap=True) -@_densmap_method(method_name="densMAP PCA (logCPM)") -def densmap_pca_logCPM(adata, test: bool = False): - adata = log_cpm(adata) +@_densmap_method(method_name="densMAP PCA (logCP10k)") +def densmap_pca_logCP10k(adata, test: bool = False): + adata = log_cp10k(adata) return _umap(adata, densmap=True, n_comps=50) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py index 0ed69e78e2..0542c0bc59 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py @@ -1,5 +1,5 @@ from ....tools.decorators import metric -from ....tools.normalize import log_cpm +from ....tools.normalize import log_cp10k def _distance_correlation(X, X_emb): @@ -26,7 +26,7 @@ def distance_correlation(adata, n_svd=200): """ import sklearn.decomposition - adata = log_cpm(adata) + adata = log_cp10k(adata) X = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) return _distance_correlation(X, adata.obsm["X_emb"]) @@ -48,7 +48,7 @@ def distance_correlation_spectral(adata, n_comps=200): import umap import umap.spectral - adata = log_cpm(adata) + adata = log_cp10k(adata) n_comps = min(n_comps, min(adata.shape) - 2) diff --git a/openproblems/tasks/label_projection/methods/__init__.py b/openproblems/tasks/label_projection/methods/__init__.py index e6d932e2ba..066ba83e8e 100644 --- a/openproblems/tasks/label_projection/methods/__init__.py +++ b/openproblems/tasks/label_projection/methods/__init__.py @@ -1,16 +1,16 @@ from .baseline import majority_vote from .baseline import random_labels from .baseline import true_labels -from .knn_classifier import knn_classifier_log_cpm +from .knn_classifier 
import knn_classifier_log_cp10k from .knn_classifier import knn_classifier_scran -from .logistic_regression import logistic_regression_log_cpm +from .logistic_regression import logistic_regression_log_cp10k from .logistic_regression import logistic_regression_scran -from .mlp import mlp_log_cpm +from .mlp import mlp_log_cp10k from .mlp import mlp_scran from .scvi_tools import scanvi_all_genes from .scvi_tools import scanvi_hvg from .scvi_tools import scarches_scanvi_all_genes from .scvi_tools import scarches_scanvi_hvg from .seurat import seurat -from .xgboost import xgboost_log_cpm +from .xgboost import xgboost_log_cp10k from .xgboost import xgboost_scran diff --git a/openproblems/tasks/label_projection/methods/knn_classifier.py b/openproblems/tasks/label_projection/methods/knn_classifier.py index bc60082019..b32f9d29a4 100644 --- a/openproblems/tasks/label_projection/methods/knn_classifier.py +++ b/openproblems/tasks/label_projection/methods/knn_classifier.py @@ -1,5 +1,5 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm +from ....tools.normalize import log_cp10k from ....tools.normalize import log_scran_pooling from .sklearn import classifier @@ -16,12 +16,12 @@ @_knn_classifier_method( - method_name="K-neighbors classifier (log CPM)", + method_name="K-neighbors classifier (log CP10k)", ) -def knn_classifier_log_cpm(adata, test=False): +def knn_classifier_log_cp10k(adata, test=False): import sklearn.neighbors - adata = log_cpm(adata) + adata = log_cp10k(adata) return classifier(adata, estimator=sklearn.neighbors.KNeighborsClassifier) diff --git a/openproblems/tasks/label_projection/methods/logistic_regression.py b/openproblems/tasks/label_projection/methods/logistic_regression.py index 6bed5d9758..bbd51bf442 100644 --- a/openproblems/tasks/label_projection/methods/logistic_regression.py +++ b/openproblems/tasks/label_projection/methods/logistic_regression.py @@ -1,5 +1,5 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm +from ....tools.normalize import log_cp10k from ....tools.normalize import log_scran_pooling from .sklearn import classifier @@ -28,10 +28,10 @@ def _logistic_regression(adata, test=False, max_iter=None): @_logistic_regression_method( - method_name="Logistic regression (log CPM)", + method_name="Logistic regression (log CP10k)", ) -def logistic_regression_log_cpm(adata, test=False, max_iter=None): - adata = log_cpm(adata) +def logistic_regression_log_cp10k(adata, test=False, max_iter=None): + adata = log_cp10k(adata) return _logistic_regression(adata, test=test, max_iter=max_iter) diff --git a/openproblems/tasks/label_projection/methods/mlp.py b/openproblems/tasks/label_projection/methods/mlp.py index 294d487652..e421b9b3c4 100644 --- a/openproblems/tasks/label_projection/methods/mlp.py +++ b/openproblems/tasks/label_projection/methods/mlp.py @@ -1,5 +1,5 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm +from ....tools.normalize import log_cp10k from ....tools.normalize import log_scran_pooling from .sklearn import classifier @@ -33,10 +33,10 @@ def _mlp(adata, test=False, max_iter=None, hidden_layer_sizes=None): @_mlp_method( - method_name="Multilayer perceptron (log CPM)", + method_name="Multilayer perceptron (log CP10k)", ) -def mlp_log_cpm(adata, test=False, max_iter=None, hidden_layer_sizes=None): - adata = log_cpm(adata) +def mlp_log_cp10k(adata, test=False, max_iter=None, hidden_layer_sizes=None): + adata = log_cp10k(adata) return _mlp( adata, test=test, 
max_iter=max_iter, hidden_layer_sizes=hidden_layer_sizes ) diff --git a/openproblems/tasks/label_projection/methods/xgboost.py b/openproblems/tasks/label_projection/methods/xgboost.py index 92ec3c7e6b..7eef8bcbea 100644 --- a/openproblems/tasks/label_projection/methods/xgboost.py +++ b/openproblems/tasks/label_projection/methods/xgboost.py @@ -1,5 +1,5 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm +from ....tools.normalize import log_cp10k from ....tools.normalize import log_scran_pooling from ....tools.utils import check_version from typing import Optional @@ -55,11 +55,11 @@ def _xgboost(adata, test: bool = False, num_round: Optional[int] = None): @_xgboost_method( - method_name="XGBoost (log CPM)", + method_name="XGBoost (log CP10k)", image="openproblems-python-extras", ) -def xgboost_log_cpm(adata, test: bool = False, num_round: Optional[int] = None): - adata = log_cpm(adata) +def xgboost_log_cp10k(adata, test: bool = False, num_round: Optional[int] = None): + adata = log_cp10k(adata) return _xgboost(adata, test=test, num_round=num_round) diff --git a/openproblems/tasks/multimodal_data_integration/methods/__init__.py b/openproblems/tasks/multimodal_data_integration/methods/__init__.py index ec3fcfa409..dae56bb780 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/__init__.py +++ b/openproblems/tasks/multimodal_data_integration/methods/__init__.py @@ -1,7 +1,7 @@ from .baseline import random_features from .baseline import true_features from .harmonic_alignment import harmonic_alignment_log_scran_pooling -from .harmonic_alignment import harmonic_alignment_sqrt_cpm -from .mnn import mnn_log_cpm +from .harmonic_alignment import harmonic_alignment_sqrt_cp10k +from .mnn import mnn_log_cp10k from .mnn import mnn_log_scran_pooling from .procrustes import procrustes diff --git a/openproblems/tasks/multimodal_data_integration/methods/baseline.py b/openproblems/tasks/multimodal_data_integration/methods/baseline.py index 01f682aedb..042b0ac233 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/baseline.py +++ b/openproblems/tasks/multimodal_data_integration/methods/baseline.py @@ -1,5 +1,5 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm +from ....tools.normalize import log_cp10k from ....tools.utils import check_version import numpy as np @@ -17,7 +17,7 @@ def random_features(adata, test=False, n_svd=20): import sklearn.decomposition n_svd = min([n_svd, min(adata.X.shape) - 1, min(adata.obsm["mode2"].shape) - 1]) - adata = log_cpm(adata) + adata = log_cp10k(adata) X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) adata.obsm["aligned"] = X_pca[np.random.permutation(np.arange(adata.shape[0]))] adata.obsm["mode2_aligned"] = X_pca[ @@ -39,7 +39,7 @@ def true_features(adata, test=False, n_svd=20): import sklearn.decomposition n_svd = min([n_svd, min(adata.X.shape) - 1, min(adata.obsm["mode2"].shape) - 1]) - adata = log_cpm(adata) + adata = log_cp10k(adata) X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) adata.obsm["aligned"] = X_pca adata.obsm["mode2_aligned"] = X_pca diff --git a/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py b/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py index 65f1572ea1..6e28be7445 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py +++ b/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py @@ -1,7 +1,7 @@ 
from ....tools.decorators import method -from ....tools.normalize import log_cpm +from ....tools.normalize import log_cp10k from ....tools.normalize import log_scran_pooling -from ....tools.normalize import sqrt_cpm +from ....tools.normalize import sqrt_cp10k from ....tools.utils import check_version import functools @@ -51,13 +51,13 @@ def _harmonic_alignment( @_harmonic_alignment_method( - method_name="Harmonic Alignment (sqrt CPM)", image="openproblems-python-extras" + method_name="Harmonic Alignment (sqrt CP10k)", image="openproblems-python-extras" ) -def harmonic_alignment_sqrt_cpm( +def harmonic_alignment_sqrt_cp10k( adata, test=False, n_svd=None, n_eigenvectors=None, n_pca_XY=None, n_filters=None ): - adata = sqrt_cpm(adata) - adata = log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") + adata = sqrt_cp10k(adata) + adata = log_cp10k(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") _harmonic_alignment( adata, test=test, @@ -76,7 +76,7 @@ def harmonic_alignment_log_scran_pooling( adata, test=False, n_svd=None, n_eigenvectors=None, n_pca_XY=None, n_filters=None ): adata = log_scran_pooling(adata) - adata = log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") + adata = log_cp10k(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") _harmonic_alignment( adata, test=test, diff --git a/openproblems/tasks/multimodal_data_integration/methods/mnn.py b/openproblems/tasks/multimodal_data_integration/methods/mnn.py index c0bd27e501..6f94b3695c 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/mnn.py +++ b/openproblems/tasks/multimodal_data_integration/methods/mnn.py @@ -1,6 +1,6 @@ from ....tools.conversion import r_function from ....tools.decorators import method -from ....tools.normalize import log_cpm +from ....tools.normalize import log_cp10k from ....tools.normalize import log_scran_pooling from ....tools.utils import check_r_version @@ -20,11 +20,11 @@ @_mnn_method( - method_name="Mutual Nearest Neighbors (log CPM)", + method_name="Mutual Nearest Neighbors (log CP10k)", ) -def mnn_log_cpm(adata, test=False): - adata = log_cpm(adata) - adata = log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") +def mnn_log_cp10k(adata, test=False): + adata = log_cp10k(adata) + adata = log_cp10k(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") adata = _mnn(adata) adata.uns["method_code_version"] = check_r_version("batchelor") return adata @@ -35,7 +35,7 @@ def mnn_log_cpm(adata, test=False): ) def mnn_log_scran_pooling(adata, test=False): adata = log_scran_pooling(adata) - adata = log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") + adata = log_cp10k(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") adata = _mnn(adata) adata.uns["method_code_version"] = check_r_version("batchelor") return adata diff --git a/openproblems/tasks/multimodal_data_integration/methods/procrustes.py b/openproblems/tasks/multimodal_data_integration/methods/procrustes.py index 55f8587a27..db55da4da4 100644 --- a/openproblems/tasks/multimodal_data_integration/methods/procrustes.py +++ b/openproblems/tasks/multimodal_data_integration/methods/procrustes.py @@ -1,5 +1,5 @@ from ....tools.decorators import method -from ....tools.normalize import log_cpm +from ....tools.normalize import log_cp10k from ....tools.utils import check_version @@ -20,8 +20,8 @@ def procrustes(adata, test=False, n_svd=None): else: # pragma: no cover n_svd = n_svd or 100 n_svd = min([n_svd, min(adata.X.shape) - 1, min(adata.obsm["mode2"].shape) - 1]) - adata = log_cpm(adata) - adata = 
log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") + adata = log_cp10k(adata) + adata = log_cp10k(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) Y_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.obsm["mode2"]) X_proc, Y_proc, _ = scipy.spatial.procrustes(X_pca, Y_pca) diff --git a/openproblems/tools/normalize.py b/openproblems/tools/normalize.py index c43b956610..6477c5cd79 100644 --- a/openproblems/tools/normalize.py +++ b/openproblems/tools/normalize.py @@ -41,44 +41,44 @@ def log_scran_pooling(adata: ad.AnnData) -> ad.AnnData: return adata -def _cpm(adata: ad.AnnData): +def _cp10k(adata: ad.AnnData): import scanpy as sc adata.X = sc.pp.normalize_total( - adata, target_sum=1e6, key_added="size_factors", inplace=False + adata, target_sum=1e4, key_added="size_factors", inplace=False )["X"] @decorators.normalizer -def cpm(adata: ad.AnnData) -> ad.AnnData: - """Normalize data to counts per million.""" - _cpm(adata) +def cp10k(adata: ad.AnnData) -> ad.AnnData: + """Normalize data to counts per 10,000.""" + _cp10k(adata) return adata @decorators.normalizer -def log_cpm(adata: ad.AnnData) -> ad.AnnData: - """Normalize data to log counts per million.""" +def log_cp10k(adata: ad.AnnData) -> ad.AnnData: + """Normalize data to log counts per 10,000.""" import scanpy as sc - _cpm(adata) + _cp10k(adata) sc.pp.log1p(adata) return adata @decorators.normalizer -def sqrt_cpm(adata: ad.AnnData) -> ad.AnnData: - """Normalize data to sqrt counts per million.""" - _cpm(adata) +def sqrt_cp10k(adata: ad.AnnData) -> ad.AnnData: + """Normalize data to sqrt counts per 10,000.""" + _cp10k(adata) adata.X = scprep.transform.sqrt(adata.X) return adata @decorators.normalizer -def log_cpm_hvg(adata: ad.AnnData, n_genes: int = 1000) -> ad.AnnData: - """Normalize logCPM HVG +def log_cp10k_hvg(adata: ad.AnnData, n_genes: int = 1000) -> ad.AnnData: + """Normalize logCP10k HVG - Normalize data to log counts per million and annotate n_genes highly + Normalize data to log counts per 10,000 and annotate n_genes highly variable genes. 
In order to subset the data to HVGs, use ``` adata = adata[:, adata.var["highly_variable"]].copy() @@ -86,7 +86,7 @@ def log_cpm_hvg(adata: ad.AnnData, n_genes: int = 1000) -> ad.AnnData: """ import scanpy as sc - adata = log_cpm(adata) + adata = log_cp10k(adata) if adata.n_vars < n_genes: log.warning( diff --git a/test/test_core_cli.py b/test/test_core_cli.py index 8bd90c0e3a..f521d2713e 100644 --- a/test/test_core_cli.py +++ b/test/test_core_cli.py @@ -161,7 +161,7 @@ def test_hash_docker_api(): [ (dataset, method, metric) for dataset in ["zebrafish_labs", None] - for method in ["logistic_regression_log_cpm", None] + for method in ["logistic_regression_log_cp10k", None] for metric in ["accuracy", None] ], name_func=utils.name.name_test, @@ -248,7 +248,7 @@ def test_pipeline(): method_file, "--version-file", version_file, - "logistic_regression_log_cpm", + "logistic_regression_log_cp10k", ], do_print=False, ) diff --git a/test/test_core_tasks.py b/test/test_core_tasks.py index 2d724f6e40..949a371031 100644 --- a/test/test_core_tasks.py +++ b/test/test_core_tasks.py @@ -29,7 +29,7 @@ def test_members(self): assert len(self.task._task_summary) < TASK_SUMMARY_MAXLEN assert hasattr(self.task, "DEFAULT_LAYER") assert isinstance(self.task.DEFAULT_LAYER, str) - assert self.task.DEFAULT_LAYER in ["counts", "log_normalized", "log_cpm"] + assert self.task.DEFAULT_LAYER in ["counts", "log_normalized", "log_cp10k"] assert hasattr(self.task, "api") assert isinstance(self.task.api, MODULE) for list_name in ["DATASETS", "METHODS", "METRICS"]: From 8b6877aac43e2db6f125f14ecb0bdacb7636203f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 1 Feb 2023 16:39:50 -0500 Subject: [PATCH 229/266] fix bib --- main.bib | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.bib b/main.bib index 0162461a18..02356c12c2 100644 --- a/main.bib +++ b/main.bib @@ -896,8 +896,8 @@ @inproceedings{venna2001neighborhood booktitle = {Artificial Neural Networks {\textemdash} {ICANN} 2001}, publisher = {Springer Berlin Heidelberg}, pages = {485--491}, - doi = {10.1007/3-540-44668-0\_68}, - url = {https://doi.org/10.1007/3-540-44668-0\_68}, + doi = {{10.1007/3-540-44668-0\_68}}, + url = {{https://doi.org/10.1007/3-540-44668-0\_68}}, } @article{venna2006local, title = {Local multidimensional scaling}, From d77a94914531882aca05653a4c6efc495c397fc6 Mon Sep 17 00:00:00 2001 From: MalteDLuecken Date: Wed, 1 Feb 2023 22:41:01 +0100 Subject: [PATCH 230/266] change multimodal data integration task name to matching modalities (#778) * change task name to matching modalities in README and folder name * make kNN AUC description modality-agnostic * add other folders in here * removed old multimodal integration folder * removed backup README * Rename multimodal_data_integration in tasks/__init__ * Update procrustes.py * Markdown lint * Rename hardcoded reference --------- Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Co-authored-by: Scott Gigante --- openproblems/tasks/__init__.py | 2 +- .../README.md | 12 ++++++------ .../__init__.py | 0 .../api.py | 0 .../datasets/__init__.py | 0 .../datasets/citeseq.py | 0 .../datasets/scicar.py | 0 .../methods/__init__.py | 0 .../methods/baseline.py | 0 .../methods/harmonic_alignment.py | 0 .../methods/mnn.R | 0 .../methods/mnn.py | 0 .../methods/procrustes.py | 2 +- .../metrics/__init__.py | 0 .../metrics/knn_auc.py | 0 .../metrics/mse.py | 0 test/test_core_cli.py | 2 +- 17 files changed, 9 insertions(+), 9 deletions(-) rename 
openproblems/tasks/{multimodal_data_integration => matching_modalities}/README.md (84%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/__init__.py (100%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/api.py (100%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/datasets/__init__.py (100%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/datasets/citeseq.py (100%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/datasets/scicar.py (100%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/methods/__init__.py (100%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/methods/baseline.py (100%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/methods/harmonic_alignment.py (100%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/methods/mnn.R (100%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/methods/mnn.py (100%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/methods/procrustes.py (96%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/metrics/__init__.py (100%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/metrics/knn_auc.py (100%) rename openproblems/tasks/{multimodal_data_integration => matching_modalities}/metrics/mse.py (100%) diff --git a/openproblems/tasks/__init__.py b/openproblems/tasks/__init__.py index 37fe938346..e576e230dc 100644 --- a/openproblems/tasks/__init__.py +++ b/openproblems/tasks/__init__.py @@ -1,7 +1,7 @@ from . import denoising from . import dimensionality_reduction from . import label_projection -from . import multimodal_data_integration +from . import matching_modalities from . import regulatory_effect_prediction from . import spatial_decomposition from ._batch_integration import batch_integration_embed diff --git a/openproblems/tasks/multimodal_data_integration/README.md b/openproblems/tasks/matching_modalities/README.md similarity index 84% rename from openproblems/tasks/multimodal_data_integration/README.md rename to openproblems/tasks/matching_modalities/README.md index fd7e32ad44..dd69a7803d 100644 --- a/openproblems/tasks/multimodal_data_integration/README.md +++ b/openproblems/tasks/matching_modalities/README.md @@ -1,4 +1,4 @@ -# Multimodal data integration +# Matching modalities ## The task @@ -25,13 +25,13 @@ observations sharing the same coordinates in the latent space. ## The metrics -Metrics for multimodal data integration aim to characterize how well the aligned +Metrics for matching modalities aim to characterize how well the aligned datasets correspond to the ground truth. -* **kNN AUC**: Let $f(i) ∈ F$ be the scRNA-seq measurement of cell $i$, and $g(i) ∈ G$ - be the scATAC- seq measurement of cell $i$. kNN-AUC calculates the average percentage - overlap of neighborhoods of $f(i)$ in $F$ with neighborhoods of $g(i)$ in $G$. Higher - is better. +* **kNN AUC**: Let $f(i) ∈ F$ be the modality 1 (e.g., scRNA-seq) measurement of cell $i$, + and $g(i) ∈ G$ be the modality 2 (e.g., scATAC-seq) measurement of cell $i$. kNN-AUC + calculates the average percentage overlap of neighborhoods of $f(i)$ in $F$ with + neighborhoods of $g(i)$ in $G$. Higher is better. 
* **MSE**: Mean squared error (MSE) is the average distance between each pair of matched observations of the same cell in the learned latent space. Lower is better. diff --git a/openproblems/tasks/multimodal_data_integration/__init__.py b/openproblems/tasks/matching_modalities/__init__.py similarity index 100% rename from openproblems/tasks/multimodal_data_integration/__init__.py rename to openproblems/tasks/matching_modalities/__init__.py diff --git a/openproblems/tasks/multimodal_data_integration/api.py b/openproblems/tasks/matching_modalities/api.py similarity index 100% rename from openproblems/tasks/multimodal_data_integration/api.py rename to openproblems/tasks/matching_modalities/api.py diff --git a/openproblems/tasks/multimodal_data_integration/datasets/__init__.py b/openproblems/tasks/matching_modalities/datasets/__init__.py similarity index 100% rename from openproblems/tasks/multimodal_data_integration/datasets/__init__.py rename to openproblems/tasks/matching_modalities/datasets/__init__.py diff --git a/openproblems/tasks/multimodal_data_integration/datasets/citeseq.py b/openproblems/tasks/matching_modalities/datasets/citeseq.py similarity index 100% rename from openproblems/tasks/multimodal_data_integration/datasets/citeseq.py rename to openproblems/tasks/matching_modalities/datasets/citeseq.py diff --git a/openproblems/tasks/multimodal_data_integration/datasets/scicar.py b/openproblems/tasks/matching_modalities/datasets/scicar.py similarity index 100% rename from openproblems/tasks/multimodal_data_integration/datasets/scicar.py rename to openproblems/tasks/matching_modalities/datasets/scicar.py diff --git a/openproblems/tasks/multimodal_data_integration/methods/__init__.py b/openproblems/tasks/matching_modalities/methods/__init__.py similarity index 100% rename from openproblems/tasks/multimodal_data_integration/methods/__init__.py rename to openproblems/tasks/matching_modalities/methods/__init__.py diff --git a/openproblems/tasks/multimodal_data_integration/methods/baseline.py b/openproblems/tasks/matching_modalities/methods/baseline.py similarity index 100% rename from openproblems/tasks/multimodal_data_integration/methods/baseline.py rename to openproblems/tasks/matching_modalities/methods/baseline.py diff --git a/openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py b/openproblems/tasks/matching_modalities/methods/harmonic_alignment.py similarity index 100% rename from openproblems/tasks/multimodal_data_integration/methods/harmonic_alignment.py rename to openproblems/tasks/matching_modalities/methods/harmonic_alignment.py diff --git a/openproblems/tasks/multimodal_data_integration/methods/mnn.R b/openproblems/tasks/matching_modalities/methods/mnn.R similarity index 100% rename from openproblems/tasks/multimodal_data_integration/methods/mnn.R rename to openproblems/tasks/matching_modalities/methods/mnn.R diff --git a/openproblems/tasks/multimodal_data_integration/methods/mnn.py b/openproblems/tasks/matching_modalities/methods/mnn.py similarity index 100% rename from openproblems/tasks/multimodal_data_integration/methods/mnn.py rename to openproblems/tasks/matching_modalities/methods/mnn.py diff --git a/openproblems/tasks/multimodal_data_integration/methods/procrustes.py b/openproblems/tasks/matching_modalities/methods/procrustes.py similarity index 96% rename from openproblems/tasks/multimodal_data_integration/methods/procrustes.py rename to openproblems/tasks/matching_modalities/methods/procrustes.py index db55da4da4..e144813055 100644 --- 
a/openproblems/tasks/multimodal_data_integration/methods/procrustes.py +++ b/openproblems/tasks/matching_modalities/methods/procrustes.py @@ -4,7 +4,7 @@ @method( - method_name="Procrustes", + method_name="Procrustes superimposition", paper_name="Generalized Procrustes analysis", paper_reference="gower1975generalized", paper_year=1975, diff --git a/openproblems/tasks/multimodal_data_integration/metrics/__init__.py b/openproblems/tasks/matching_modalities/metrics/__init__.py similarity index 100% rename from openproblems/tasks/multimodal_data_integration/metrics/__init__.py rename to openproblems/tasks/matching_modalities/metrics/__init__.py diff --git a/openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py b/openproblems/tasks/matching_modalities/metrics/knn_auc.py similarity index 100% rename from openproblems/tasks/multimodal_data_integration/metrics/knn_auc.py rename to openproblems/tasks/matching_modalities/metrics/knn_auc.py diff --git a/openproblems/tasks/multimodal_data_integration/metrics/mse.py b/openproblems/tasks/matching_modalities/metrics/mse.py similarity index 100% rename from openproblems/tasks/multimodal_data_integration/metrics/mse.py rename to openproblems/tasks/matching_modalities/metrics/mse.py diff --git a/test/test_core_cli.py b/test/test_core_cli.py index f521d2713e..f0c416c328 100644 --- a/test/test_core_cli.py +++ b/test/test_core_cli.py @@ -130,7 +130,7 @@ def test_help(capsys): @parameterized.parameterized.expand( [ ("label_projection", "--datasets", "pancreas_batch"), - ("multimodal_data_integration", "--methods", "mnn_log_scran_pooling"), + ("matching_modalities", "--methods", "mnn_log_scran_pooling"), ], name_func=utils.name.name_test, ) From 32b46e61d06f36a016672dc97af8ddae9ff2e86e Mon Sep 17 00:00:00 2001 From: Daniel Strobl <50872326+danielStrobl@users.noreply.github.com> Date: Thu, 2 Feb 2023 00:15:01 +0100 Subject: [PATCH 231/266] updated scib version (#793) * updated scib version * Bump scib req * use neighborhood graph f1 --------- Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- docker/openproblems-r-extras/requirements.txt | 2 +- docker/openproblems-r-pytorch/requirements.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/openproblems-r-extras/requirements.txt b/docker/openproblems-r-extras/requirements.txt index 79b89e8987..2ca8b9e1b9 100644 --- a/docker/openproblems-r-extras/requirements.txt +++ b/docker/openproblems-r-extras/requirements.txt @@ -1,4 +1,4 @@ git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python rpy2<3.4.3 -scib==1.0.5 +scib==1.1.2 xgboost==1.6.* diff --git a/docker/openproblems-r-pytorch/requirements.txt b/docker/openproblems-r-pytorch/requirements.txt index c8b7a846d0..12ea659f64 100644 --- a/docker/openproblems-r-pytorch/requirements.txt +++ b/docker/openproblems-r-pytorch/requirements.txt @@ -3,6 +3,6 @@ bbknn==1.5.* git+https://github.com/chriscainx/mnnpy@2097dec # master harmony-pytorch==0.1.* scanorama==1.7.0 -scib==1.0.5 -scvi-tools~=0.19 # pinned in #313 +scib==1.1.2 +scvi-tools~=0.19 torch==1.13.* From 814fedc7ab35aeb8a789f1bef2be3aa7f4f0d935 Mon Sep 17 00:00:00 2001 From: Daniel Strobl <50872326+danielStrobl@users.noreply.github.com> Date: Thu, 2 Feb 2023 01:04:02 +0100 Subject: [PATCH 232/266] Daniel strobl hvg conservation fix (#785) * hvg conservation metric fix * pre-commit * Allow for uppercase repo owner * Fix sklearn req * bash not sh * bugfix use index * add to api * pre-commit * list instead of index 
* check number of genes * pre-commit * addressing comments * pre-commit * shorten line * addressing comments * pre-commit * fix checks * remove magic numbers * pre-commit * int -> numbers.Integral * Fix typo * fix dataset size assumption and duck-type hvg_unint --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Co-authored-by: Scott Gigante --- .../tasks/_batch_integration/_common/api.py | 23 ++++++++++++++++++- .../_common/datasets/immune.py | 10 +++++++- .../_common/datasets/pancreas.py | 10 +++++++- .../tasks/_batch_integration/_common/utils.py | 18 ++++++++++++++- .../batch_integration_feature/README.md | 4 ++++ .../batch_integration_feature/api.py | 9 +++++++- .../metrics/hvg_conservation.py | 3 ++- 7 files changed, 71 insertions(+), 6 deletions(-) diff --git a/openproblems/tasks/_batch_integration/_common/api.py b/openproblems/tasks/_batch_integration/_common/api.py index 5bec47bc27..bc5c9aa6b1 100644 --- a/openproblems/tasks/_batch_integration/_common/api.py +++ b/openproblems/tasks/_batch_integration/_common/api.py @@ -1,10 +1,13 @@ from ....data.sample import load_sample_data from ....tools.decorators import dataset from .utils import filter_celltypes +from .utils import precompute_hvg +import numbers import numpy as np MIN_CELLS_PER_CELLTYPE = 50 +N_HVG_UNINT = 2000 def check_neighbors(adata, neighbors_key, connectivities_key, distances_key): @@ -15,7 +18,12 @@ def check_neighbors(adata, neighbors_key, connectivities_key, distances_key): assert distances_key in adata.obsp -def check_dataset(adata, do_check_pca=False, do_check_neighbors=False): +def check_dataset( + adata, + do_check_pca=False, + do_check_neighbors=False, + do_check_hvg=False, +): """Check that dataset output fits expected API.""" assert "batch" in adata.obs @@ -28,12 +36,21 @@ def check_dataset(adata, do_check_pca=False, do_check_neighbors=False): assert adata.var_names.is_unique assert adata.obs_names.is_unique + assert "n_genes_pre" in adata.uns + assert isinstance(adata.uns["n_genes_pre"], numbers.Integral) + assert adata.uns["n_genes_pre"] == adata.n_vars + assert "organism" in adata.uns assert adata.uns["organism"] in ["mouse", "human"] if do_check_pca: assert "X_uni_pca" in adata.obsm + if do_check_hvg: + assert "hvg_unint" in adata.uns + assert len(adata.uns["hvg_unint"]) == min(N_HVG_UNINT, adata.n_vars) + assert np.all(np.isin(adata.uns["hvg_unint"], adata.var.index)) + if do_check_neighbors: check_neighbors(adata, "uni", "uni_connectivities", "uni_distances") @@ -58,6 +75,10 @@ def sample_dataset(run_pca: bool = False, run_neighbors: bool = False): adata.obs["batch"] = np.random.choice(2, adata.shape[0], replace=True).astype(str) adata.obs["labels"] = np.random.choice(3, adata.shape[0], replace=True).astype(str) adata = filter_celltypes(adata) + + adata.uns["hvg_unint"] = precompute_hvg(adata) + adata.uns["n_genes_pre"] = adata.n_vars + if run_pca: adata.obsm["X_uni_pca"] = sc.pp.pca(adata.X) if run_neighbors: diff --git a/openproblems/tasks/_batch_integration/_common/datasets/immune.py b/openproblems/tasks/_batch_integration/_common/datasets/immune.py index c6563177df..efea9dee09 100644 --- a/openproblems/tasks/_batch_integration/_common/datasets/immune.py +++ b/openproblems/tasks/_batch_integration/_common/datasets/immune.py @@ -1,6 +1,7 @@ from .....data.immune_cells import load_immune from .....tools.decorators import dataset from ..utils import filter_celltypes 
+from ..utils import precompute_hvg from typing import Optional @@ -13,7 +14,11 @@ "Smart-seq2).", image="openproblems", ) -def immune_batch(test: bool = False, min_celltype_count: Optional[int] = None): +def immune_batch( + test: bool = False, + min_celltype_count: Optional[int] = None, + n_hvg: Optional[int] = None, +): import scanpy as sc adata = load_immune(test) @@ -38,4 +43,7 @@ def immune_batch(test: bool = False, min_celltype_count: Optional[int] = None): sc.pp.neighbors(adata, use_rep="X_uni_pca", key_added="uni") adata.var_names_make_unique() + + adata.uns["hvg_unint"] = precompute_hvg(adata, n_genes=n_hvg) + adata.uns["n_genes_pre"] = adata.n_vars return adata diff --git a/openproblems/tasks/_batch_integration/_common/datasets/pancreas.py b/openproblems/tasks/_batch_integration/_common/datasets/pancreas.py index 9ebe868cff..7e9aa890cd 100644 --- a/openproblems/tasks/_batch_integration/_common/datasets/pancreas.py +++ b/openproblems/tasks/_batch_integration/_common/datasets/pancreas.py @@ -1,6 +1,7 @@ from .....data.pancreas import load_pancreas from .....tools.decorators import dataset from ..utils import filter_celltypes +from ..utils import precompute_hvg from typing import Optional @@ -13,7 +14,11 @@ "and SMARTER-seq).", image="openproblems", ) -def pancreas_batch(test: bool = False, min_celltype_count: Optional[int] = None): +def pancreas_batch( + test: bool = False, + min_celltype_count: Optional[int] = None, + n_hvg: Optional[int] = None, +): import scanpy as sc adata = load_pancreas(test) @@ -38,4 +43,7 @@ def pancreas_batch(test: bool = False, min_celltype_count: Optional[int] = None) sc.pp.neighbors(adata, use_rep="X_uni_pca", key_added="uni") adata.var_names_make_unique() + + adata.uns["hvg_unint"] = precompute_hvg(adata, n_genes=n_hvg) + adata.uns["n_genes_pre"] = adata.n_vars return adata diff --git a/openproblems/tasks/_batch_integration/_common/utils.py b/openproblems/tasks/_batch_integration/_common/utils.py index 99523b6253..ee5d367ff5 100644 --- a/openproblems/tasks/_batch_integration/_common/utils.py +++ b/openproblems/tasks/_batch_integration/_common/utils.py @@ -1,11 +1,27 @@ +from . 
import api +from scanpy.pp import highly_variable_genes from typing import Optional def filter_celltypes(adata, min_celltype_count: Optional[int] = None): - min_celltype_count = min_celltype_count or 50 + min_celltype_count = min_celltype_count or api.MIN_CELLS_PER_CELLTYPE celltype_counts = adata.obs["labels"].value_counts() keep_celltypes = celltype_counts[celltype_counts >= min_celltype_count].index keep_cells = adata.obs["labels"].isin(keep_celltypes) return adata[keep_cells].copy() + + +def precompute_hvg(adata, n_genes: Optional[int] = None): + + n_genes = n_genes or api.N_HVG_UNINT + hvg_unint = highly_variable_genes( + adata, + n_top_genes=n_genes, + layer="log_normalized", + flavor="cell_ranger", + batch_key="batch", + inplace=False, + ) + return list(hvg_unint[hvg_unint.highly_variable].index) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md index 1235c93c30..6e9de3fb6b 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md @@ -46,8 +46,12 @@ Datasets should contain the following attributes: * `adata.layers['counts']` with raw, integer UMI count data, * `adata.layers['log_normalized']` with log-normalized data and * `adata.X` with log-normalized data +* `adata.uns['n_genes_pre']` with the number of genes present before integration +* `adata.uns['hvg_unint']` with a list of 2000 highly variable genes + prior to integration (for the hvg conservation metric) Methods should store their batch-corrected gene expression matrix in `adata.X`. +The output should contain at least 2000 features. The `openproblems-python-batch-integration` docker container is used for the methods that diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py index 637eea141c..4aabaf94ef 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py @@ -3,12 +3,19 @@ import functools -check_dataset = functools.partial(api.check_dataset, do_check_pca=True) +check_dataset = functools.partial( + api.check_dataset, do_check_hvg=True, do_check_pca=True +) def check_method(adata, is_baseline=False): """Check that method output fits expected API.""" assert "log_normalized" in adata.layers + # check hvg_unint is still there + assert "hvg_unint" in adata.uns + # check n_vars is not too small + assert "n_genes_pre" in adata.uns + assert adata.n_vars >= min(api.N_HVG_UNINT, adata.uns["n_genes_pre"]) if not is_baseline: assert adata.layers["log_normalized"] is not adata.X return True diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py index f7779037fa..5f1160d2b9 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py @@ -29,5 +29,6 @@ def hvg_conservation(adata): adata_unint = adata.copy() adata_unint.X = adata_unint.layers["log_normalized"] + hvg_both = list(set(adata.uns["hvg_unint"]).intersection(adata.var_names)) - return hvg_overlap(adata_unint, adata, "batch") + return hvg_overlap(adata_unint, adata[:,
hvg_both], "batch") From 1a5d74758a09c59eedb473dc85bf1ea5ac3a9dee Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 1 Feb 2023 19:45:09 -0500 Subject: [PATCH 233/266] add hvg_unint and n_genes_pre to lung_batch --- .../tasks/_batch_integration/_common/datasets/lung.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/openproblems/tasks/_batch_integration/_common/datasets/lung.py b/openproblems/tasks/_batch_integration/_common/datasets/lung.py index e3f20e4daa..f5610e1f1d 100644 --- a/openproblems/tasks/_batch_integration/_common/datasets/lung.py +++ b/openproblems/tasks/_batch_integration/_common/datasets/lung.py @@ -1,6 +1,7 @@ from .....data.lung import load_lung from .....tools.decorators import dataset from ..utils import filter_celltypes +from ..utils import precompute_hvg from typing import Optional @@ -12,7 +13,11 @@ "From Vieira Braga et al. Technologies: 10X and Drop-seq.", image="openproblems", ) -def lung_batch(test: bool = False, min_celltype_count: Optional[int] = None): +def lung_batch( + test: bool = False, + min_celltype_count: Optional[int] = None, + n_hvg: Optional[int] = None, +): import scanpy as sc adata = load_lung(test) @@ -37,4 +42,7 @@ def lung_batch(test: bool = False, min_celltype_count: Optional[int] = None): sc.pp.neighbors(adata, use_rep="X_uni_pca", key_added="uni") adata.var_names_make_unique() + + adata.uns["hvg_unint"] = precompute_hvg(adata, n_genes=n_hvg) + adata.uns["n_genes_pre"] = adata.n_vars return adata From 9717065f2eea38510b2ffc1f8eb60ee15c951fff Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 2 Feb 2023 09:09:59 -0500 Subject: [PATCH 234/266] pymde doesn't work on sparse data --- .../tasks/dimensionality_reduction/methods/pymde.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/openproblems/tasks/dimensionality_reduction/methods/pymde.py b/openproblems/tasks/dimensionality_reduction/methods/pymde.py index 050c61df12..1ddc69947e 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/pymde.py +++ b/openproblems/tasks/dimensionality_reduction/methods/pymde.py @@ -21,6 +21,7 @@ def _pymde( adata, method: str = "neighbors", genes=None, + n_pca: Optional[int] = None, test: bool = False, max_iter: Optional[int] = None, memory_size: Optional[int] = None, @@ -34,16 +35,17 @@ def _pymde( embed_kwargs = {} if test: - sc.tl.pca(adata_input, n_comps=20, svd_solver="arpack") - X = adata_input.obsm["X_pca"] + n_pca = n_pca or 20 embed_kwargs["max_iter"] = max_iter or 20 embed_kwargs["memory_size"] = memory_size or 2 else: # pragma: nocover - X = adata_input.X + n_pca = n_pca or 100 if max_iter is not None: embed_kwargs["max_iter"] = max_iter if memory_size is not None: embed_kwargs["memory_size"] = memory_size + sc.tl.pca(adata_input, n_comps=n_pca, svd_solver="arpack") + X = adata_input.obsm["X_pca"] if method == "neighbors": mde_fn = pymde.preserve_neighbors elif method == "distances": From ce8b8c440be44d5dec33410104b7b979a685cdff Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 2 Feb 2023 09:26:20 -0500 Subject: [PATCH 235/266] add timestamps to pytest -v --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1b0fb142aa..05bac0a377 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ test_requires = [ "pytest==7.1.*", "pytest-cov>=3.0,<4.1", + "pytest-timestamper==0.0.9", "black==22.10.0", "coverage>=6.4,<6.6", "codecov==2.1.*", From 5be696e1f576586d91d78741ae4b1d1d5a5edb62 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 2 Feb 2023 
09:55:37 -0500 Subject: [PATCH 236/266] scib 1.1.3 --- docker/openproblems-r-extras/requirements.txt | 2 +- docker/openproblems-r-pytorch/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/openproblems-r-extras/requirements.txt b/docker/openproblems-r-extras/requirements.txt index 2ca8b9e1b9..ec3092587e 100644 --- a/docker/openproblems-r-extras/requirements.txt +++ b/docker/openproblems-r-extras/requirements.txt @@ -1,4 +1,4 @@ git+https://github.com/KrishnaswamyLab/harmonic-alignment@v0.1#subdirectory=python rpy2<3.4.3 -scib==1.1.2 +scib==1.1.3 xgboost==1.6.* diff --git a/docker/openproblems-r-pytorch/requirements.txt b/docker/openproblems-r-pytorch/requirements.txt index 12ea659f64..c35cc20330 100644 --- a/docker/openproblems-r-pytorch/requirements.txt +++ b/docker/openproblems-r-pytorch/requirements.txt @@ -3,6 +3,6 @@ bbknn==1.5.* git+https://github.com/chriscainx/mnnpy@2097dec # master harmony-pytorch==0.1.* scanorama==1.7.0 -scib==1.1.2 +scib==1.1.3 scvi-tools~=0.19 torch==1.13.* From 0401028f9882d1d63e05fd00e1ed2b7b76a7e055 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Thu, 2 Feb 2023 14:07:52 -0500 Subject: [PATCH 237/266] Test all metadata (#813) * make sure metadata tests run * transfer meta * test all metadata * fix docker * fix path * undo rename * fix * separate metadata tests * bump test coverage --- .coveragerc | 8 +- .github/workflows/run_tests.yml | 4 +- .../batch_integration_embed/metrics/ari.py | 7 +- .../metrics/graph_connectivity.py | 7 +- .../metrics/iso_label_f1.py | 7 +- .../batch_integration_embed/metrics/nmi.py | 7 +- .../batch_integration_feature/metrics/ari.py | 7 +- .../metrics/cc_score.py | 7 +- .../metrics/graph_connectivity.py | 7 +- .../metrics/iso_label_f1.py | 7 +- .../metrics/iso_label_sil.py | 7 +- .../batch_integration_feature/metrics/kBET.py | 7 +- .../batch_integration_feature/metrics/nmi.py | 7 +- .../batch_integration_feature/metrics/pcr.py | 7 +- .../metrics/sil_batch.py | 7 +- .../metrics/silhouette.py | 7 +- openproblems/tasks/denoising/methods/dca.py | 2 +- openproblems/tasks/denoising/methods/magic.py | 2 +- scripts/install_renv.R | 2 +- test/test_core_metadata.py | 87 +++++++++++++++++++ test/test_core_utils.py | 4 +- test/test_task_1_load_data.py | 2 +- test/test_task_2_datasets.py | 34 +------- ..._cell_cell_communication_source_target.py} | 12 +-- test/test_task_dimensionality_reduction.py | 4 +- test/test_task_methods.py | 32 +------ test/test_task_metrics.py | 19 +--- test/utils/asserts.py | 2 +- test/utils/name.py | 5 +- 29 files changed, 128 insertions(+), 189 deletions(-) create mode 100644 test/test_core_metadata.py rename test/{test_task_cell_cell_communication.py => test_task_cell_cell_communication_source_target.py} (92%) diff --git a/.coveragerc b/.coveragerc index c6aa107297..28208602d7 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,8 +1,12 @@ [run] parallel = true branch = true -source = openproblems -omit = */__init__.py +source = + openproblems + test +omit = + */__init__.py + test/utils/*.py [report] exclude_lines = diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index e7d132483b..67e70ac3e7 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -313,13 +313,13 @@ jobs: if: "needs.build_images.result == 'skipped'" run: | cd workflow - snakemake -j $(nproc) docker_pull + snakemake -j $(nproc) docker cd .. 
- name: Run tests timeout-minutes: 60 run: | - pytest --cov=openproblems --cov-report=xml -vv --durations=15 --tb=native -k "${{ matrix.tests }}" + pytest --cov=openproblems --cov=test --cov-report=xml -vv --durations=15 --tb=native -k "${{ matrix.tests }}" mkdir -p coverage mv coverage.xml "$(echo 'coverage_${{ matrix.tests }}.xml' | sed 's/[^a-z0-9\.]/_/g')" diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/ari.py index e63e80df52..10822e79ba 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/ari.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/ari.py @@ -14,12 +14,7 @@ """ -@metric( - metric_name="ARI", - maximize=True, - paper_reference="luecken2022benchmarking", - image="openproblems-r-pytorch", -) +@metric(**graph_metrics.ari.metadata) def ari(adata): from scanpy.pp import neighbors diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/graph_connectivity.py index 0fb72d7dd7..e1d2e03cb5 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/graph_connectivity.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/graph_connectivity.py @@ -20,12 +20,7 @@ """ -@metric( - metric_name="Graph connectivity", - paper_reference="luecken2022benchmarking", - maximize=True, - image="openproblems-r-pytorch", -) +@metric(**graph_metrics.graph_connectivity.metadata) def graph_connectivity(adata): from scanpy.pp import neighbors diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_f1.py index 04dc79c498..aa2b9cdafe 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_f1.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_f1.py @@ -25,12 +25,7 @@ """ -@metric( - metric_name="Isolated label F1", - paper_reference="luecken2022benchmarking", - maximize=True, - image="openproblems-r-pytorch", -) +@metric(**graph_metrics.isolated_labels_f1.metadata) def isolated_labels_f1(adata): from scanpy.pp import neighbors diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/nmi.py index 438e2f9198..06fd3dbb5c 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/nmi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/nmi.py @@ -13,12 +13,7 @@ """ -@metric( - metric_name="NMI", - paper_reference="luecken2022benchmarking", - maximize=True, - image="openproblems-r-pytorch", -) +@metric(**graph_metrics.nmi.metadata) def nmi(adata): from scanpy.pp import neighbors diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/ari.py index 1c22dc5f5b..cc0a9541cb 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/ari.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/ari.py @@ -14,12 +14,7 @@ """ -@metric( - metric_name="ARI", - maximize=True, - paper_reference="luecken2022benchmarking", - image="openproblems-r-pytorch", -) 
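+# metadata (name, reference, image) is shared with the graph-task variant of this metric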
+@metric(**graph_metrics.ari.metadata) def ari(adata): from scanpy.pp import neighbors from scanpy.tl import pca diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/cc_score.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/cc_score.py index 541502daf4..d7bd5730d5 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/cc_score.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/cc_score.py @@ -19,12 +19,7 @@ the preintegration variance contribution reduces the score.""" -@metric( - metric_name="Cell Cycle Score", - paper_reference="luecken2022benchmarking", - maximize=True, - image="openproblems-r-pytorch", -) +@metric(**embed_metrics.cc_score.metadata) def cc_score(adata, test=False): from scanpy.tl import pca diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/graph_connectivity.py index f574335598..bc7fbba9cc 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/graph_connectivity.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/graph_connectivity.py @@ -20,12 +20,7 @@ """ -@metric( - metric_name="Graph connectivity", - paper_reference="luecken2022benchmarking", - maximize=True, - image="openproblems-r-pytorch", -) +@metric(**graph_metrics.graph_connectivity.metadata) def graph_connectivity(adata): from scanpy.pp import neighbors from scanpy.tl import pca diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_f1.py index 241d25f9e4..9e5896c064 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_f1.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_f1.py @@ -25,12 +25,7 @@ """ -@metric( - metric_name="Isolated label F1", - paper_reference="luecken2022benchmarking", - maximize=True, - image="openproblems-r-pytorch", -) +@metric(**graph_metrics.isolated_labels_f1.metadata) def isolated_labels_f1(adata): from scanpy.pp import neighbors from scanpy.tl import pca diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_sil.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_sil.py index 81c5119328..72ece6686d 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_sil.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_sil.py @@ -13,12 +13,7 @@ """ -@metric( - metric_name="Isolated label Silhouette", - paper_reference="luecken2022benchmarking", - maximize=True, - image="openproblems-r-pytorch", -) +@metric(**embed_metrics.isolated_labels_sil.metadata) def isolated_labels_sil(adata): from scanpy.tl import pca diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/kBET.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/kBET.py index dec11bfb25..44820545f5 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/kBET.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/kBET.py @@ -25,12 +25,7 @@ """ -@metric( - metric_name="kBET", - paper_reference="bttner2018test", - maximize=True, - image="openproblems-r-extras", -) 
+@metric(**embed_metrics.kBET.metadata) def kBET(adata): from scanpy.tl import pca diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/nmi.py index b35daf2856..3e8e62bde1 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/nmi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/nmi.py @@ -13,12 +13,7 @@ """ -@metric( - metric_name="NMI", - paper_reference="luecken2022benchmarking", - maximize=True, - image="openproblems-r-pytorch", -) +@metric(**graph_metrics.nmi.metadata) def nmi(adata): from scanpy.pp import neighbors from scanpy.tl import pca diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/pcr.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/pcr.py index 5b30bdd8a1..8a5d641f37 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/pcr.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/pcr.py @@ -16,12 +16,7 @@ component.""" -@metric( - metric_name="PC Regression", - paper_reference="luecken2022benchmarking", - maximize=True, - image="openproblems-r-pytorch", -) +@metric(**embed_metrics.pcr.metadata) def pcr(adata): from scanpy.tl import pca diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/sil_batch.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/sil_batch.py index 118b9b7863..35f024ca44 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/sil_batch.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/sil_batch.py @@ -22,12 +22,7 @@ Here, M is the set of unique cell labels.""" -@metric( - metric_name="Batch ASW", - paper_reference="luecken2022benchmarking", - maximize=True, - image="openproblems-r-pytorch", -) +@metric(**embed_metrics.silhouette_batch.metadata) def silhouette_batch(adata): from scanpy.tl import pca diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/silhouette.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/silhouette.py index b248a592cf..aa94c887e0 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/silhouette.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/silhouette.py @@ -10,12 +10,7 @@ For information about the batch silhouette score, check sil_batch.""" -@metric( - metric_name="Silhouette", - paper_reference="luecken2022benchmarking", - maximize=True, - image="openproblems-r-pytorch", -) +@metric(**embed_metrics.silhouette.metadata) def silhouette(adata): from scanpy.tl import pca diff --git a/openproblems/tasks/denoising/methods/dca.py b/openproblems/tasks/denoising/methods/dca.py index 3f9d9cb5cf..8b47d4fcc6 100644 --- a/openproblems/tasks/denoising/methods/dca.py +++ b/openproblems/tasks/denoising/methods/dca.py @@ -27,7 +27,7 @@ def _dca(adata, test=False, epochs=None): @method( method_name="DCA", paper_name="Single-cell RNA-seq denoising using a deep count autoencoder", - paper_reference="https://www.nature.com/articles/s41467-018-07931-2", + paper_reference="eraslan2019single", paper_year=2019, code_url="https://github.com/theislab/dca", image="openproblems-python-tensorflow", diff --git a/openproblems/tasks/denoising/methods/magic.py b/openproblems/tasks/denoising/methods/magic.py index 5e06b5ca5f..a7a1374864 
100644 --- a/openproblems/tasks/denoising/methods/magic.py +++ b/openproblems/tasks/denoising/methods/magic.py @@ -9,7 +9,7 @@ method, paper_name="Recovering Gene Interactions from Single-Cell Data " "Using Data Diffusion", - paper_reference="https://doi.org/10.1016/j.cell.2018.05.061", + paper_reference="van2018recovering", paper_year=2018, code_url="https://github.com/KrishnaswamyLab/MAGIC", image="openproblems-python-extras", diff --git a/scripts/install_renv.R b/scripts/install_renv.R index b3a24d58ad..f9d7f66889 100644 --- a/scripts/install_renv.R +++ b/scripts/install_renv.R @@ -40,7 +40,7 @@ strip_comments <- function(remote) { } with_retries <- function(func, - attempts = 5, + attempts = 10, sleep_once = 3, sleep_multiple = 60, backoff = 2, diff --git a/test/test_core_metadata.py b/test/test_core_metadata.py new file mode 100644 index 0000000000..57139e0083 --- /dev/null +++ b/test/test_core_metadata.py @@ -0,0 +1,87 @@ +import openproblems +import openproblems.utils +import parameterized +import utils +import utils.asserts +import utils.cache +import utils.git +import utils.name + +DATASET_SUMMARY_MINLEN = 40 +DATASET_SUMMARY_MAXLEN = 1000 + + +@parameterized.parameterized.expand( + [(dataset,) for task in openproblems.TASKS for dataset in task.DATASETS], + name_func=utils.name.name_test, +) +def test_dataset_metadata(dataset): + """Test for existence of dataset metadata.""" + assert hasattr(dataset, "metadata") + for attr in [ + "dataset_name", + "data_url", + "data_reference", + "dataset_summary", + "image", + ]: + assert attr in dataset.metadata + assert dataset.metadata[attr] is not None + + assert isinstance(dataset.metadata["dataset_name"], str) + assert isinstance(dataset.metadata["image"], str) + assert dataset.metadata["image"].startswith("openproblems") + assert isinstance(dataset.metadata["dataset_summary"], str) + assert len(dataset.metadata["dataset_summary"]) > DATASET_SUMMARY_MINLEN + assert len(dataset.metadata["dataset_summary"]) < DATASET_SUMMARY_MAXLEN + assert isinstance(dataset.metadata["data_url"], str) + assert utils.asserts.assert_url_accessible(dataset.metadata["data_url"]) + assert isinstance(dataset.metadata["data_reference"], str) + assert utils.asserts.assert_valid_reference(dataset.metadata["data_reference"]) + + +@parameterized.parameterized.expand( + [(method,) for task in openproblems.TASKS for method in task.METHODS], + name_func=utils.name.name_test, +) +def test_method_metadata(method): + """Test for existence of method metadata.""" + assert hasattr(method, "metadata") + for attr in [ + "method_name", + "paper_name", + "paper_reference", + "paper_year", + "code_url", + "image", + "is_baseline", + ]: + assert attr in method.metadata + + assert isinstance(method.metadata["image"], str) + assert method.metadata["image"].startswith("openproblems") + assert isinstance(method.metadata["method_name"], str) + assert isinstance(method.metadata["paper_name"], str) + assert isinstance(method.metadata["paper_year"], int) + assert isinstance(method.metadata["paper_reference"], str) + assert utils.asserts.assert_valid_reference(method.metadata["paper_reference"]) + assert isinstance(method.metadata["code_url"], str) + assert utils.asserts.assert_url_accessible(method.metadata["code_url"]) + assert isinstance(method.metadata["is_baseline"], bool) + + +@parameterized.parameterized.expand( + [(metric,) for task in openproblems.TASKS for metric in task.METRICS], + name_func=utils.name.name_test, +) +def test_metric_metadata(metric): + """Test for existence of 
metric metadata.""" + assert hasattr(metric, "metadata") + for attr in ["metric_name", "maximize", "image"]: + assert attr in metric.metadata + assert isinstance(metric.metadata["maximize"], bool) + assert isinstance(metric.metadata["metric_name"], str) + assert isinstance(metric.metadata["image"], str) + assert metric.metadata["image"].startswith("openproblems") + assert isinstance(metric.metadata["paper_reference"], str) + assert utils.asserts.assert_valid_reference(metric.metadata["paper_reference"]) diff --git a/test/test_core_utils.py b/test/test_core_utils.py index 8ea5084879..0c3ec8b5ee 100644 --- a/test/test_core_utils.py +++ b/test/test_core_utils.py @@ -10,7 +10,7 @@ def test_temporary_version_missing(): """Test temporary decorator behavior with missing version.""" @openproblems.utils.temporary - def test_fn(): + def test_fn(): # pragma: nocover pass np.testing.assert_raises_regex( @@ -27,7 +27,7 @@ def test_temporary_version_future(): temp_version = "{}.{}".format(version.major - 1, 0) @openproblems.utils.temporary(version=temp_version) - def test_fn(): + def test_fn(): # pragma: nocover pass np.testing.assert_raises_regex( diff --git a/test/test_task_1_load_data.py b/test/test_task_1_load_data.py index 18f3422889..125a2f3fc0 100644 --- a/test/test_task_1_load_data.py +++ b/test/test_task_1_load_data.py @@ -21,7 +21,7 @@ skip_on_empty=True, ) @utils.docker.docker_test(retries=2) -def test_load_dataset(task_name, dataset_name, test, tempdir, image): +def test_load_dataset(task_name, dataset_name, test, tempdir, image): # pragma: nocover """Test loading and caching of a dataset.""" import utils.asserts import utils.cache diff --git a/test/test_task_2_datasets.py b/test/test_task_2_datasets.py index 2787554747..6eeb87afd7 100644 --- a/test/test_task_2_datasets.py +++ b/test/test_task_2_datasets.py @@ -12,9 +12,6 @@ import utils.git import utils.name -DATASET_SUMMARY_MINLEN = 40 -DATASET_SUMMARY_MAXLEN = 1000 - def _assert_not_bytes(X): if isinstance(X, pd.Series): @@ -59,7 +56,7 @@ def setUpClass(cls): test=cls.test, dependency="test_load_dataset", ) - except AssertionError as e: + except AssertionError as e: # pragma: nocover if str(e) == "Intermediate file missing. 
Did test_load_dataset fail?": pytest.skip("Dataset not loaded successfully") else: @@ -121,32 +118,3 @@ def test_normalize(self, normalizer): adata = self.adata.copy() adata = normalizer(adata) utils.asserts.assert_finite(adata.X) - - -@parameterized.parameterized.expand( - [(dataset,) for task in openproblems.TASKS for dataset in task.DATASETS], - name_func=utils.name.name_test, -) -def test_dataset_metadata(dataset): - """Test for existence of dataset metadata.""" - assert hasattr(dataset, "metadata") - for attr in [ - "dataset_name", - "data_url", - "data_reference", - "dataset_summary", - "image", - ]: - assert attr in dataset.metadata - assert dataset.metadata[attr] is not None - - assert isinstance(dataset.metadata["dataset_name"], str) - assert isinstance(dataset.metadata["image"], str) - assert dataset.metadata["image"].startswith("openproblems") - assert isinstance(dataset.metadata["dataset_summary"], str) - assert len(dataset.metadata["dataset_summary"]) > DATASET_SUMMARY_MINLEN - assert len(dataset.metadata["dataset_summary"]) < DATASET_SUMMARY_MAXLEN - assert isinstance(dataset.metadata["data_url"], str) - assert utils.asserts.assert_url_accessible(dataset.metadata["data_url"]) - assert isinstance(dataset.metadata["data_reference"], str) - assert utils.asserts.assert_valid_reference(dataset.metadata["data_reference"]) diff --git a/test/test_task_cell_cell_communication.py b/test/test_task_cell_cell_communication_source_target.py similarity index 92% rename from test/test_task_cell_cell_communication.py rename to test/test_task_cell_cell_communication_source_target.py index ab22f2a6de..40b07b047e 100644 --- a/test/test_task_cell_cell_communication.py +++ b/test/test_task_cell_cell_communication_source_target.py @@ -11,11 +11,7 @@ import utils.docker import utils.git -# global skip -SUBTASKS = [ - openproblems.tasks.cell_cell_communication_source_target, - openproblems.tasks.cell_cell_communication_ligand_target, -] +TASK = openproblems.tasks.cell_cell_communication_source_target class TestApi(unittest.TestCase): @@ -54,7 +50,7 @@ def test_assert_is_subset(self): ) def test_map_gene_symbols(self): - adata = common.api.sample_dataset(SUBTASKS[0].api.MERGE_KEYS) + adata = common.api.sample_dataset(TASK.api.MERGE_KEYS) index = adata.var.index.to_numpy() index[0] = "many_to_one_1" index[1] = "many_to_one_2" @@ -106,8 +102,8 @@ def test_map_gene_symbols(self): self.assertNotIn("one_from_none", adata_mapped.var.index) -@utils.docker.docker_test(image=SUBTASKS[0].metrics.odds_ratio.metadata["image"]) -def test_odds_ratio_no_match(): +@utils.docker.docker_test(image=TASK.metrics.odds_ratio.metadata["image"]) +def test_odds_ratio_no_match(): # pragma: nocover import numpy as np task = openproblems.tasks.cell_cell_communication_source_target diff --git a/test/test_task_dimensionality_reduction.py b/test/test_task_dimensionality_reduction.py index 290e9cda1b..5516e74bbe 100644 --- a/test/test_task_dimensionality_reduction.py +++ b/test/test_task_dimensionality_reduction.py @@ -8,7 +8,7 @@ @utils.docker.docker_test(image=TASK.metrics.trustworthiness.metadata["image"]) -def test_trustworthiness_sparse(): +def test_trustworthiness_sparse(): # pragma: nocover from scipy.sparse import csr_matrix task = openproblems.tasks.dimensionality_reduction @@ -27,7 +27,7 @@ def test_trustworthiness_sparse(): @utils.docker.docker_test(image=TASK.metrics.density_preservation.metadata["image"]) -def test_density_preservation_matches_densmap(): +def test_density_preservation_matches_densmap(): # pragma: 
nocover from openproblems.tasks.dimensionality_reduction.metrics.density import _K from openproblems.tasks.dimensionality_reduction.metrics.density import _SEED from scipy.stats import pearsonr diff --git a/test/test_task_methods.py b/test/test_task_methods.py index f92ce7d57d..5e5ffd844a 100644 --- a/test/test_task_methods.py +++ b/test/test_task_methods.py @@ -25,7 +25,7 @@ skip_on_empty=True, ) @utils.docker.docker_test(timeout=600, retries=RETRIES) -def test_method(task_name, method_name, image): +def test_method(task_name, method_name, image): # pragma: nocover """Test application of a method.""" import anndata import openproblems.utils @@ -49,33 +49,3 @@ def test_method(task_name, method_name, image): assert method.metadata["code_version"] is not None else: assert adata.uns["method_code_version"] != "ModuleNotFound" - - -@parameterized.parameterized.expand( - [(method,) for task in openproblems.TASKS for method in task.METHODS], - name_func=utils.name.name_test, -) -def test_method_metadata(method): - """Test for existence of method metadata.""" - assert hasattr(method, "metadata") - for attr in [ - "method_name", - "paper_name", - "paper_reference", - "paper_year", - "code_url", - "image", - "is_baseline", - ]: - assert attr in method.metadata - - assert isinstance(method.metadata["image"], str) - assert method.metadata["image"].startswith("openproblems") - assert isinstance(method.metadata["method_name"], str) - assert isinstance(method.metadata["paper_name"], str) - assert isinstance(method.metadata["paper_year"], int) - assert isinstance(method.metadata["paper_reference"], str) - assert utils.asserts.assert_valid_reference(method.metadata["paper_reference"]) - assert isinstance(method.metadata["code_url"], str) - assert utils.asserts.assert_url_accessible(method.metadata["code_url"]) - assert isinstance(method.metadata["is_baseline"], bool) diff --git a/test/test_task_metrics.py b/test/test_task_metrics.py index 8232bb958b..96c57ceb01 100644 --- a/test/test_task_metrics.py +++ b/test/test_task_metrics.py @@ -5,23 +5,6 @@ import utils.name -@parameterized.parameterized.expand( - [(metric,) for task in openproblems.TASKS for metric in task.METRICS], - name_func=utils.name.name_test, -) -def test_metric_metadata(metric): - """Test for existence of metric metadata.""" - assert hasattr(metric, "metadata") - for attr in ["metric_name", "maximize", "image"]: - assert attr in metric.metadata - assert isinstance(metric.metadata["maximize"], bool) - assert isinstance(metric.metadata["metric_name"], str) - assert isinstance(metric.metadata["image"], str) - assert metric.metadata["image"].startswith("openproblems") - assert isinstance(metric.metadata["paper_reference"], str) - assert utils.asserts.assert_valid_reference(metric.metadata["paper_reference"]) - - @parameterized.parameterized.expand( [ ( @@ -36,7 +19,7 @@ def test_metric_metadata(metric): skip_on_empty=True, ) @utils.docker.docker_test -def test_metric(task_name, metric_name, image): +def test_metric(task_name, metric_name, image): # pragma: nocover """Test computation of a metric.""" import numbers diff --git a/test/utils/asserts.py b/test/utils/asserts.py index 06f85b69ab..211374f9d5 100644 --- a/test/utils/asserts.py +++ b/test/utils/asserts.py @@ -62,7 +62,7 @@ def assert_valid_reference(ref): bib = _load_bibliography() assert ref in bib.entries_dict bibentry = bib.entries_dict[ref] - if not bibentry["ENTRYTYPE"] == "misc" or ref in _MISSING_DOIS: + if not (bibentry["ENTRYTYPE"] == "misc" or ref in _MISSING_DOIS): assert 
"doi" in bibentry assert assert_url_accessible(f"https://doi.org/{bibentry['doi']}") return True diff --git a/test/utils/name.py b/test/utils/name.py index 446bcb9d58..1ea94355c6 100644 --- a/test/utils/name.py +++ b/test/utils/name.py @@ -17,9 +17,10 @@ def name_test(testcase_func, param_num, param): """Get a human readable name for a parameterized test.""" args = param.values() if isinstance(param, dict) else param.args - return "%s_%s" % ( + name_params = [ testcase_func.__name__, parameterized.parameterized.to_safe_name( "_".join(object_name(x) for x in args if x != TEMPDIR.name) ), - ) + ] + return "_".join(name_params) From 2da81a9802abd6ad225be654b605ba2a17a3829f Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 3 Feb 2023 10:41:46 -0500 Subject: [PATCH 238/266] Update nf-openproblems to v1.10 (#815) * use v1.10 benchmark * just pull the images * update full benchmark to v1.10 --- .github/workflows/run_tests.yml | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 67e70ac3e7..37347ffc5b 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -69,6 +69,17 @@ jobs: pip install --editable .[evaluate] python -c "import openproblems" + - name: Pull Docker images + if: | + ( + startsWith(github.ref, 'refs/heads/test_benchmark') || + startsWith(github.ref, 'refs/heads/test_full_benchmark') + ) + run: | + cd workflow + snakemake -j $(nproc) docker_pull + cd .. + - name: Update Docker images if: | !( @@ -505,13 +516,15 @@ jobs: env: TOWER_WATCH_URL: https://tower.nf/orgs/openproblems-bio/workspaces/openproblems-bio/watch TOWER_WORKSPACE_ID: 53907369739130 + TOWER_TEST_ACTION_ID: "6yMzmbRXXDZMoVqVkEozQo" + TOWER_FULL_ACTION_ID: "6znCmebL2EBgWJTQz0H7pz" BRANCH: ${{ needs.setup_benchmark.outputs.branch }} run: | if [[ ${{ startsWith(github.ref, 'refs/tags') || startsWith(github.ref, 'refs/heads/test_full_benchmark') }} == true ]]; then - TOWER_ACTION_ID="bVQhVSNah1JmJfnKkfyjg" + TOWER_ACTION_ID="${TOWER_FULL_ACTION_ID}" WORKDIR="s3://openproblems-nextflow/work_main" else - TOWER_ACTION_ID="5BQc88ZvjuXCYbc55Hot27" + TOWER_ACTION_ID="${TOWER_TEST_ACTION_ID}" WORKDIR="s3://openproblems-nextflow/work/${BRANCH}" fi generate_parameters() From c50ae881efe28e01ae6d9ab423bedfd315774e7f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 5 Feb 2023 16:42:07 -0500 Subject: [PATCH 239/266] repro repo structure broke update script --- .github/workflows/process_results.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/process_results.yml b/.github/workflows/process_results.yml index d271ee4ad9..e539377b98 100644 --- a/.github/workflows/process_results.yml +++ b/.github/workflows/process_results.yml @@ -83,7 +83,6 @@ jobs: S3_URI="s3://openproblems-nextflow/cwd_example" fi aws s3 cp --quiet --recursive "${S3_URI}" /tmp/results/ - rm -r nbt2022-reproducibility/results/*/*.json rm -r website/data/results/*/ python openproblems/workflow/parse_nextflow.py /tmp website/data/results From 54906870d4a1df72959d191934886ec30801ae21 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 7 Feb 2023 12:25:05 -0500 Subject: [PATCH 240/266] rename ALRA sqrt --- openproblems/tasks/denoising/methods/alra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/tasks/denoising/methods/alra.py b/openproblems/tasks/denoising/methods/alra.py index 9a2f54849d..f5216e283e 100644 --- 
a/openproblems/tasks/denoising/methods/alra.py +++ b/openproblems/tasks/denoising/methods/alra.py @@ -9,7 +9,7 @@ @method( - method_name="ALRA (sqrt norm)", + method_name="ALRA (sqrt norm, reversed normalization)", paper_name="Zero-preserving imputation of scRNA-seq data using " "low-rank approximation", paper_reference="linderman2018zero", From 80b37e7a6aa27df4436f400397564c01276817e0 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Wed, 8 Feb 2023 10:38:22 -0500 Subject: [PATCH 241/266] Fix DR baselines (#816) * fix DR baselines * add density test * fix dataset prep * bugfix * can't have >500 comps * ignore missing parametricumap * typo * account for arpack convergence --- .../datasets/__init__.py | 1 + .../datasets/zebrafish.py | 18 ++++++ .../methods/__init__.py | 4 +- .../methods/baseline.py | 33 ++++------ .../methods/diffusion_map.py | 61 +++++++++++++++++++ .../metrics/density.py | 12 +--- .../metrics/distance_correlation.py | 19 ++---- pytest.ini | 1 + test/test_task_dimensionality_reduction.py | 38 +++++++++++- 9 files changed, 137 insertions(+), 50 deletions(-) create mode 100644 openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py create mode 100644 openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py diff --git a/openproblems/tasks/dimensionality_reduction/datasets/__init__.py b/openproblems/tasks/dimensionality_reduction/datasets/__init__.py index e9ea18289c..14d605081e 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/__init__.py @@ -1,3 +1,4 @@ from .mouse_blood_olsson_labelled import olsson_2016_mouse_blood from .mouse_hspc_nestorowa2016 import mouse_hspc_nestorowa2016 from .tenx_5k_pbmc import tenx_5k_pbmc +from .zebrafish import zebrafish_labs diff --git a/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py b/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py new file mode 100644 index 0000000000..1a3212a012 --- /dev/null +++ b/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py @@ -0,0 +1,18 @@ +from ....data.zebrafish import load_zebrafish +from ....tools.decorators import dataset +from ....tools.normalize import log_cp10k + + +@dataset( + "Zebrafish", + data_url=load_zebrafish.metadata["data_url"], + data_reference=load_zebrafish.metadata["data_reference"], + dataset_summary="90k cells from zebrafish embryos throughout the first day of " + "development, with and without a knockout of chordin, an important developmental " + "gene. Dimensions: 26022 cells, 25258 genes. 24 cell types " + "(avg. 
1084±1156 cells per cell type).", +) +def zebrafish_labs(test=False): + adata = load_zebrafish(test=test) + adata.uns["n_genes"] = adata.shape[1] + return log_cp10k(adata) diff --git a/openproblems/tasks/dimensionality_reduction/methods/__init__.py b/openproblems/tasks/dimensionality_reduction/methods/__init__.py index 746452572b..acc9e64a78 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/methods/__init__.py @@ -1,7 +1,7 @@ from .baseline import random_features +from .baseline import spectral_features from .baseline import true_features -from .baseline import true_features_log_cp10k -from .baseline import true_features_log_cp10k_hvg +from .diffusion_map import diffusion_map from .neuralee import neuralee_default from .neuralee import neuralee_logCP10k_1kHVG from .pca import pca_logCP10k diff --git a/openproblems/tasks/dimensionality_reduction/methods/baseline.py b/openproblems/tasks/dimensionality_reduction/methods/baseline.py index 33621c76aa..6403ea2981 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/baseline.py +++ b/openproblems/tasks/dimensionality_reduction/methods/baseline.py @@ -1,7 +1,8 @@ from ....tools.decorators import method from ....tools.normalize import log_cp10k -from ....tools.normalize import log_cp10k_hvg from ....tools.utils import check_version +from .diffusion_map import diffusion_map +from typing import Optional import functools import numpy as np @@ -29,19 +30,6 @@ def random_features(adata, test=False): method_name="True Features", ) def true_features(adata, test=False): - adata.obsm["X_emb"] = adata.X - if test: - adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100] - - adata.obsm["X_emb"] = adata.obsm["X_emb"].toarray() - adata.uns["method_code_version"] = check_version("openproblems") - return adata - - -@_baseline_method( - method_name="True Features (logCP10k)", -) -def true_features_log_cp10k(adata, test=False): adata = log_cp10k(adata) adata.obsm["X_emb"] = adata.X if test: @@ -53,14 +41,15 @@ def true_features_log_cp10k(adata, test=False): @_baseline_method( - method_name="True Features (logCP10k, 1kHVG)", + method_name="Spectral Features", ) -def true_features_log_cp10k_hvg(adata, test=False): - adata = log_cp10k_hvg(adata) - adata.obsm["X_emb"] = adata[:, adata.var["highly_variable"]].copy().X +def spectral_features(adata, test=False, n_comps: Optional[int] = None): + if test: - adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100] + n_comps = n_comps or 20 + else: + n_comps = n_comps or 1000 - adata.obsm["X_emb"] = adata.obsm["X_emb"].toarray() - adata.uns["method_code_version"] = check_version("openproblems") - return adata + n_comps = min(n_comps, min(adata.shape) - 2) + + return diffusion_map(adata, n_comps=n_comps) diff --git a/openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py b/openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py new file mode 100644 index 0000000000..07cefb11af --- /dev/null +++ b/openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py @@ -0,0 +1,61 @@ +from ....tools.decorators import method +from ....tools.normalize import log_cp10k +from ....tools.utils import check_version + + +def _diffusion_map(graph, n_comps, t, n_retries=1): + import numpy as np + import scipy.sparse.linalg + + diag_data = np.asarray(graph.sum(axis=0)) + identity = scipy.sparse.identity(graph.shape[0], dtype=np.float64) + diag = scipy.sparse.spdiags( + 1.0 / np.sqrt(diag_data), 0, graph.shape[0], graph.shape[0] + ) + 
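# build the symmetric normalized graph Laplacian, L = I - D^(-1/2) A D^(-1/2)
+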
laplacian = identity - diag * graph * diag + num_lanczos_vectors = max(2 * n_comps + 1, int(np.sqrt(graph.shape[0]))) + try: + eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh( + laplacian, + n_comps, + which="SM", + ncv=num_lanczos_vectors, + tol=1e-4, + v0=np.ones(laplacian.shape[0]), + maxiter=graph.shape[0] * 5, + ) + return (eigenvalues**t) * eigenvectors + except scipy.sparse.linalg.ArpackNoConvergence: + if n_retries > 0: + # add some noise and try again + graph_rand = graph.copy().tocoo() + graph_rand.row = np.random.choice( + graph_rand.shape[0], len(graph_rand.row), replace=True + ) + graph_rand.data *= 0.01 + return _diffusion_map( + graph + graph_rand, n_comps, t, n_retries=n_retries - 1 + ) + else: + raise + + +@method( + method_name="Diffusion maps", + paper_reference="coifman2006diffusion", + paper_name="Diffusion maps", + paper_year=2006, + code_url="https://github.com/openproblems-bio/openproblems", +) +def diffusion_map( + adata, n_comps: int = 2, t: int = 1, test: bool = False, n_retries: int = 1 +): + import umap + + adata = log_cp10k(adata) + + graph = umap.UMAP(transform_mode="graph").fit_transform(adata.X) + + adata.obsm["X_emb"] = _diffusion_map(graph, n_comps, t, n_retries=n_retries) + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/dimensionality_reduction/metrics/density.py b/openproblems/tasks/dimensionality_reduction/metrics/density.py index f782ae7cda..a495d90f03 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/density.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/density.py @@ -47,7 +47,7 @@ def _calculate_radii( # directly taken from: https://github.com/lmcinnes/umap/blob/ # 317ce81dc64aec9e279aa1374ac809d9ced236f6/umap/umap_.py#L1190-L1243 - (knn_indices, knn_dists, rp_forest,) = nearest_neighbors( + knn_indices, knn_dists, _ = nearest_neighbors( X, n_neighbors, "euclidean", @@ -57,7 +57,7 @@ def _calculate_radii( verbose=False, ) - emb_graph, emb_sigmas, emb_rhos, emb_dists = fuzzy_simplicial_set( + emb_graph, _, _, emb_dists = fuzzy_simplicial_set( X, n_neighbors, random_state, @@ -100,21 +100,15 @@ def _calculate_radii( "density preservation", paper_reference="narayan2021assessing", maximize=True, - image="openproblems-python-extras", ) def density_preservation(adata: AnnData) -> float: from scipy.sparse import issparse from scipy.stats import pearsonr - from umap import UMAP emb = adata.obsm["X_emb"] - if np.any(np.isnan(emb)): - return 0.0 high_dim = adata.X.A if issparse(adata.X) else adata.X - _, ro, _ = UMAP( - n_neighbors=_K, random_state=_SEED, densmap=True, output_dens=True - ).fit_transform(high_dim) + ro = _calculate_radii(high_dim, n_neighbors=_K, random_state=_SEED) # in principle, we could just call _calculate_radii(high_dim, ...) 
# this ensures that the tests pass (otherwise, there was a .02 difference in corr)
     re = _calculate_radii(emb, n_neighbors=_K, random_state=_SEED)
 
diff --git a/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py
index 0542c0bc59..bd97abd9ce 100644
--- a/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py
+++ b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py
@@ -1,5 +1,6 @@
 from ....tools.decorators import metric
 from ....tools.normalize import log_cp10k
+from ..methods.diffusion_map import diffusion_map
 
 
 def _distance_correlation(X, X_emb):
@@ -18,7 +19,7 @@
     maximize=True,
     paper_reference="schober2018correlation",
 )
-def distance_correlation(adata, n_svd=200):
+def distance_correlation(adata, n_svd=500):
     """Calculate the distance correlation.
 
     Computes the distance correlation between the full (or processed) data matrix and the
@@ -37,23 +38,13 @@
     maximize=True,
     paper_reference="coifman2006diffusion",
 )
-def distance_correlation_spectral(adata, n_comps=200):
+def distance_correlation_spectral(adata, n_comps=1000):
     """Calculate the spectral distance correlation.
 
     Computes the distance correlation between high-dimensional Laplacian eigenmaps on
     the full (or processed) data matrix and the dimensionally-reduced matrix,
     invariant to scalar multiplication.
     """
-    import numpy as np
-    import umap
-    import umap.spectral
-
-    adata = log_cp10k(adata)
-    n_comps = min(n_comps, min(adata.shape) - 2)
-
-    graph = umap.UMAP(transform_mode="graph").fit_transform(adata.X)
-    X = umap.spectral.spectral_layout(
-        adata.X, graph, n_comps, random_state=np.random.default_rng()
-    )
-    return _distance_correlation(X, adata.obsm["X_emb"])
+    adata_true = diffusion_map(adata.copy(), n_comps=n_comps)
+    return _distance_correlation(adata_true.obsm["X_emb"], adata.obsm["X_emb"])
diff --git a/pytest.ini b/pytest.ini
index 273a414dbc..7f621389db 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -11,4 +11,5 @@ filterwarnings =
     ignore:is_categorical is deprecated and will be removed in a future version:FutureWarning
     ignore:The use of (converter|py2rpy|rpy2py) in module rpy2.robjects.conversion is deprecated.:DeprecationWarning
     ignore:`Model\.state_updates` will be removed in a future version\.:UserWarning
+    ignore:Tensorflow not installed. ParametricUMAP will be unavailable:ImportWarning
     always:Container failed with AssertionError\. 
Retrying [0-9]* more time:RuntimeWarning diff --git a/test/test_task_dimensionality_reduction.py b/test/test_task_dimensionality_reduction.py index 5516e74bbe..442ed801fc 100644 --- a/test/test_task_dimensionality_reduction.py +++ b/test/test_task_dimensionality_reduction.py @@ -26,8 +26,7 @@ def test_trustworthiness_sparse(): # pragma: nocover assert 0 <= m <= 1 -@utils.docker.docker_test(image=TASK.metrics.density_preservation.metadata["image"]) -def test_density_preservation_matches_densmap(): # pragma: nocover +def test_density_preservation_matches_densmap(): from openproblems.tasks.dimensionality_reduction.metrics.density import _K from openproblems.tasks.dimensionality_reduction.metrics.density import _SEED from scipy.stats import pearsonr @@ -52,4 +51,37 @@ def test_density_preservation_matches_densmap(): # pragma: nocover adata.obsm["X_emb"] = emb actual = metric(adata) - np.testing.assert_allclose(expected, actual, rtol=1e-5) + np.testing.assert_allclose(expected, actual, rtol=1e-3) + + +def test_density_preservation_perfect(): + import numpy as np + + task = openproblems.tasks.dimensionality_reduction + metric = openproblems.tasks.dimensionality_reduction.metrics.density_preservation + + adata = task.api.sample_dataset() + adata = task.api.sample_method(adata) + + adata.obsm["X_emb"] = adata.X.toarray() + actual = metric(adata) + + np.testing.assert_allclose(1, actual) + + +def test_diffusion_map_no_convergence(): + import numpy as np + import scipy.sparse.linalg + + adata = ( + openproblems.tasks.dimensionality_reduction.datasets.olsson_2016_mouse_blood() + ) + # no exception with retries + adata = openproblems.tasks.dimensionality_reduction.methods.diffusion_map(adata) + # exception with no retries + np.testing.assert_raises( + scipy.sparse.linalg.ArpackNoConvergence, + openproblems.tasks.dimensionality_reduction.methods.diffusion_map, + adata, + n_retries=0, + ) From 52d1ecc3d947b0a1dce8f0ad8aeb3019ea556a24 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Wed, 8 Feb 2023 12:23:00 -0500 Subject: [PATCH 242/266] set adata.uns['is_baseline'] (#820) --- openproblems/tools/decorators.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/openproblems/tools/decorators.py b/openproblems/tools/decorators.py index b32f2f3a0e..eb4f749ef5 100644 --- a/openproblems/tools/decorators.py +++ b/openproblems/tools/decorators.py @@ -83,9 +83,10 @@ def method( def decorator(func): @functools.wraps(func) - def apply_method(*args, **kwargs): + def apply_method(adata: anndata.AnnData, *args, **kwargs): log.debug("Running {} method".format(func.__name__)) - return func(*args, **kwargs) + adata.uns["is_baseline"] = is_baseline + return func(adata, *args, **kwargs) apply_method.metadata = dict( method_name=method_name, From fe02c56b748a0dedf94d3fa6c6bf0930b7d92337 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Wed, 8 Feb 2023 13:24:32 -0500 Subject: [PATCH 243/266] copy anndata in metric decorator (#819) --- CONTRIBUTING.md | 3 +++ openproblems/tools/decorators.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b187372df5..dbc99a88d1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -178,6 +178,9 @@ Metrics should take an AnnData object and return a `float`. 
function metric(AnnData adata) -> float ``` +Note that the AnnData object is passed to the metric function as a copy, so there is no +need to copy it internally, even if you modify the object. + Task-specific APIs are described in the README for each task. * [Label Projection](openproblems/tasks/label_projection) diff --git a/openproblems/tools/decorators.py b/openproblems/tools/decorators.py index eb4f749ef5..3c477a0167 100644 --- a/openproblems/tools/decorators.py +++ b/openproblems/tools/decorators.py @@ -126,9 +126,9 @@ def metric(metric_name, maximize, paper_reference, image="openproblems"): def decorator(func): @functools.wraps(func) - def apply_metric(*args, **kwargs): + def apply_metric(adata: anndata.AnnData, *args, **kwargs): log.debug("Running {} metric".format(func.__name__)) - return func(*args, **kwargs) + return func(adata.copy(), *args, **kwargs) apply_metric.metadata = dict( metric_name=metric_name, From 97f4573d13fbc9de19f2cd9dcb11cdef6044989a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 10 Feb 2023 13:45:32 -0500 Subject: [PATCH 244/266] bugfix n_svd < adata.shape --- .../dimensionality_reduction/metrics/distance_correlation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py index bd97abd9ce..777b1e7886 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py @@ -28,8 +28,9 @@ def distance_correlation(adata, n_svd=500): import sklearn.decomposition adata = log_cp10k(adata) - - X = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) + X = adata.X + if n_svd < min(X.shape): + X = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(X) return _distance_correlation(X, adata.obsm["X_emb"]) From 91059f0407c026eaaab5456cc582965896d5fad7 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 10 Feb 2023 13:56:57 -0500 Subject: [PATCH 245/266] explicitly avoid `main` --- .github/pull_request_template.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index b59ced7833..0cffb557cc 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -12,7 +12,8 @@ ### Testing -* [ ] This submission was written on a forked copy of openproblems +* [ ] This submission was written on a branch other than `main` in a forked copy of + openproblems * [ ] Nextflow test pipeline is passing on this base branch of this pull request (include link to passed test on NF Tower found in GitHub Actions summary: ) From 07f97752498adecdd38427200c8d2d00bbf8652a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 10 Feb 2023 18:10:34 -0500 Subject: [PATCH 246/266] don't require branch other than main --- .github/pull_request_template.md | 3 +-- .github/workflows/run_tests.yml | 8 ++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 0cffb557cc..ad07e4dd5d 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -12,8 +12,7 @@ ### Testing -* [ ] This submission was written on a branch other than `main` in a forked copy of - openproblems +* [ ] This submission was written in a forked copy of openproblems * [ ] Nextflow test pipeline is passing on this base branch of 
this pull request (include link to passed test on NF Tower found in GitHub Actions summary: ) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 37347ffc5b..b898f32832 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -84,7 +84,10 @@ jobs: if: | !( startsWith(github.ref, 'refs/heads/test_docker') || - startsWith(github.ref, 'refs/heads/main') + ( + startsWith(github.ref, 'refs/heads/main') && + github.repository == 'openproblems-bio/openproblems' + ) ) run: | cd workflow @@ -101,7 +104,8 @@ jobs: - name: Build and push Docker images if: | - startsWith(github.ref, 'refs/heads/main') + startsWith(github.ref, 'refs/heads/main') && + github.repository == 'openproblems-bio/openproblems' env: DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} run: | From 8b409cd23b6855011151b1d07291a1f0caf52d1f Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 10 Feb 2023 18:28:16 -0500 Subject: [PATCH 247/266] make small data dense --- .../dimensionality_reduction/metrics/distance_correlation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py index 777b1e7886..3e23f49571 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py @@ -31,6 +31,8 @@ def distance_correlation(adata, n_svd=500): X = adata.X if n_svd < min(X.shape): X = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(X) + else: + X = X.toarray() return _distance_correlation(X, adata.obsm["X_emb"]) From ee7836251c4c6c371471e95eb7aa6a3e9f133b43 Mon Sep 17 00:00:00 2001 From: Daniel Strobl <50872326+danielStrobl@users.noreply.github.com> Date: Sat, 11 Feb 2023 00:28:46 +0100 Subject: [PATCH 248/266] Don't recompute X_emb and neighborhood graph for baseline datasets (#823) * metrics check if input is baseline * pre-commit * check if embedding or nn graph present for baselines * rest of baseline checks * silhouette baseline check * refactor * refacotr embed * set is_baseline after method to prevent overwrite * set is_baseline in sample method --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante --- .../batch_integration_embed/api.py | 1 + .../batch_integration_embed/metrics/_utils.py | 5 ----- .../batch_integration_embed/metrics/ari.py | 6 ++---- .../metrics/cc_score.py | 6 +++--- .../metrics/graph_connectivity.py | 6 ++---- .../metrics/iso_label_f1.py | 6 ++---- .../batch_integration_embed/metrics/nmi.py | 6 ++---- .../batch_integration_embed/metrics/pcr.py | 4 ++-- .../batch_integration_embed/metrics/utils.py | 16 ++++++++++++++++ .../batch_integration_feature/api.py | 1 + .../batch_integration_feature/metrics/ari.py | 8 ++------ .../metrics/cc_score.py | 8 +++----- .../metrics/graph_connectivity.py | 8 ++------ .../metrics/iso_label_f1.py | 8 ++------ .../metrics/iso_label_sil.py | 6 ++---- .../batch_integration_feature/metrics/kBET.py | 6 ++---- .../batch_integration_feature/metrics/nmi.py | 8 ++------ .../batch_integration_feature/metrics/pcr.py | 6 ++---- .../metrics/sil_batch.py | 6 ++---- .../metrics/silhouette.py | 6 ++---- .../batch_integration_feature/metrics/utils.py | 18 ++++++++++++++++++ openproblems/tools/decorators.py | 3 ++- 22 files changed, 72 insertions(+), 76 deletions(-) delete mode 100644 
openproblems/tasks/_batch_integration/batch_integration_embed/metrics/_utils.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_embed/metrics/utils.py create mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/metrics/utils.py diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py index d13a8d0e07..d534551688 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py @@ -23,4 +23,5 @@ def sample_method(adata): """Create sample method output for testing metrics in this task.""" adata.obsm["X_emb"] = adata.obsm["X_uni_pca"] + adata.uns["is_baseline"] = False return adata diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/_utils.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/_utils.py deleted file mode 100644 index 8fe3be43aa..0000000000 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/_utils.py +++ /dev/null @@ -1,5 +0,0 @@ -def _get_split(adata): - uni = adata - uni.obsm["X_pca"] = uni.obsm["X_uni_pca"] - uni.X = uni.layers["log_normalized"] - return (uni, adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/ari.py index 10822e79ba..9bfe349d12 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/ari.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/ari.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_graph import metrics as graph_metrics +from .utils import embedding_to_graph """ The Rand index compares the overlap of two clusterings; @@ -16,7 +17,4 @@ @metric(**graph_metrics.ari.metadata) def ari(adata): - from scanpy.pp import neighbors - - neighbors(adata, use_rep="X_emb") - return graph_metrics.ari(adata) + return graph_metrics.ari(embedding_to_graph(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py index 8d7ee9101f..4cf7650542 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py @@ -1,4 +1,5 @@ from .....tools.decorators import metric +from .utils import get_split """ The cell-cycle conservation score evaluates how well the cell-cycle effect can be @@ -24,13 +25,12 @@ maximize=True, image="openproblems-r-pytorch", ) -def cc_score(adata, test=False): - from ._utils import _get_split +def cc_score(adata): from scib.metrics import cell_cycle try: cc = cell_cycle( - *_get_split(adata), "batch", embed="X_emb", organism=adata.uns["organism"] + *get_split(adata), "batch", embed="X_emb", organism=adata.uns["organism"] ) except ValueError: diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/graph_connectivity.py index e1d2e03cb5..4fc69fe1a2 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/graph_connectivity.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/graph_connectivity.py @@ -1,5 +1,6 @@ from 
.....tools.decorators import metric from ...batch_integration_graph import metrics as graph_metrics +from .utils import embedding_to_graph """ The graph connectivity metric assesses whether the kNN graph representation, @@ -22,7 +23,4 @@ @metric(**graph_metrics.graph_connectivity.metadata) def graph_connectivity(adata): - from scanpy.pp import neighbors - - neighbors(adata, use_rep="X_emb") - return graph_metrics.graph_connectivity(adata) + return graph_metrics.graph_connectivity(embedding_to_graph(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_f1.py index aa2b9cdafe..578bceed26 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_f1.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_f1.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_graph import metrics as graph_metrics +from .utils import embedding_to_graph """ We developed two isolated label scores to evaluate how well the data integration methods @@ -27,7 +28,4 @@ @metric(**graph_metrics.isolated_labels_f1.metadata) def isolated_labels_f1(adata): - from scanpy.pp import neighbors - - neighbors(adata, use_rep="X_emb") - return graph_metrics.isolated_labels_f1(adata) + return graph_metrics.isolated_labels_f1(embedding_to_graph(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/nmi.py index 06fd3dbb5c..71aa9acdb7 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/nmi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/nmi.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_graph import metrics as graph_metrics +from .utils import embedding_to_graph """NMI compares the overlap of two clusterings. 
We used NMI to compare the cell-type labels with Louvain clusters computed on @@ -15,7 +16,4 @@ @metric(**graph_metrics.nmi.metadata) def nmi(adata): - from scanpy.pp import neighbors - - neighbors(adata, use_rep="X_emb") - return graph_metrics.nmi(adata) + return graph_metrics.nmi(embedding_to_graph(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py index 3e68a8ac27..d2a6c011ae 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py @@ -1,4 +1,5 @@ from .....tools.decorators import metric +from .utils import get_split """ Principal component regression, derived from PCA, has previously been used to quantify @@ -22,7 +23,6 @@ image="openproblems-r-pytorch", ) def pcr(adata): - from ._utils import _get_split from scib.metrics import pcr_comparison - return pcr_comparison(*_get_split(adata), "batch", embed="X_emb") + return pcr_comparison(*get_split(adata), "batch", embed="X_emb") diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/utils.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/utils.py new file mode 100644 index 0000000000..455e92ec76 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/utils.py @@ -0,0 +1,16 @@ +def embedding_to_graph(adata): + import scanpy as sc + + if adata.uns["is_baseline"] and "neighbors" in adata.uns: + # precomputed; do nothing + return adata + + sc.pp.neighbors(adata, use_rep="X_emb") + return adata + + +def get_split(adata): + uni = adata + uni.obsm["X_pca"] = uni.obsm["X_uni_pca"] + uni.X = uni.layers["log_normalized"] + return (uni, adata) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py index 4aabaf94ef..bbf5e2f4bb 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py @@ -29,4 +29,5 @@ def sample_dataset(): def sample_method(adata): """Create sample method output for testing metrics in this task.""" adata.X = adata.X.multiply(2) + adata.uns["is_baseline"] = False return adata diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/ari.py index cc0a9541cb..48bef53b28 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/ari.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/ari.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_graph import metrics as graph_metrics +from .utils import feature_to_graph """ The Rand index compares the overlap of two clusterings; @@ -16,9 +17,4 @@ @metric(**graph_metrics.ari.metadata) def ari(adata): - from scanpy.pp import neighbors - from scanpy.tl import pca - - adata.obsm["X_emb"] = pca(adata.X) - neighbors(adata, use_rep="X_emb") - return graph_metrics.ari(adata) + return graph_metrics.ari(feature_to_graph(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/cc_score.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/cc_score.py index d7bd5730d5..778ac40e29 100644 --- 
a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/cc_score.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/cc_score.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_embed import metrics as embed_metrics +from .utils import feature_to_embedding """ The cell-cycle conservation score evaluates how well the cell-cycle effect can be @@ -20,8 +21,5 @@ @metric(**embed_metrics.cc_score.metadata) -def cc_score(adata, test=False): - from scanpy.tl import pca - - adata.obsm["X_emb"] = pca(adata.X) - return embed_metrics.cc_score(adata) +def cc_score(adata): + return embed_metrics.cc_score(feature_to_embedding(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/graph_connectivity.py index bc7fbba9cc..4289f1174d 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/graph_connectivity.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/graph_connectivity.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_graph import metrics as graph_metrics +from .utils import feature_to_graph """ The graph connectivity metric assesses whether the kNN graph representation, @@ -22,9 +23,4 @@ @metric(**graph_metrics.graph_connectivity.metadata) def graph_connectivity(adata): - from scanpy.pp import neighbors - from scanpy.tl import pca - - adata.obsm["X_emb"] = pca(adata.X) - neighbors(adata, use_rep="X_emb") - return graph_metrics.graph_connectivity(adata) + return graph_metrics.graph_connectivity(feature_to_graph(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_f1.py index 9e5896c064..048ad0996a 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_f1.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_f1.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_graph import metrics as graph_metrics +from .utils import feature_to_graph """ We developed two isolated label scores to evaluate how well the data integration methods @@ -27,9 +28,4 @@ @metric(**graph_metrics.isolated_labels_f1.metadata) def isolated_labels_f1(adata): - from scanpy.pp import neighbors - from scanpy.tl import pca - - adata.obsm["X_emb"] = pca(adata.X) - neighbors(adata, use_rep="X_emb") - return graph_metrics.isolated_labels_f1(adata) + return graph_metrics.isolated_labels_f1(feature_to_graph(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_sil.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_sil.py index 72ece6686d..9f1e3e1115 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_sil.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/iso_label_sil.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_embed import metrics as embed_metrics +from .utils import feature_to_embedding """ Isolated cell labels are defined as the labels present in the least number @@ -15,7 +16,4 @@ @metric(**embed_metrics.isolated_labels_sil.metadata) def isolated_labels_sil(adata): - from scanpy.tl import pca 
- - adata.obsm["X_emb"] = pca(adata.X) - return embed_metrics.isolated_labels_sil(adata) + return embed_metrics.isolated_labels_sil(feature_to_embedding(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/kBET.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/kBET.py index 44820545f5..f8ed86d5a7 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/kBET.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/kBET.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_embed import metrics as embed_metrics +from .utils import feature_to_embedding """ The kBET algorithm (v.0.99.6, release 4c9dafa) determines whether the label composition @@ -27,7 +28,4 @@ @metric(**embed_metrics.kBET.metadata) def kBET(adata): - from scanpy.tl import pca - - adata.obsm["X_emb"] = pca(adata.X) - return embed_metrics.kBET(adata) + return embed_metrics.kBET(feature_to_embedding(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/nmi.py index 3e8e62bde1..21b1cc55ba 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/nmi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/nmi.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_graph import metrics as graph_metrics +from .utils import feature_to_graph """NMI compares the overlap of two clusterings. We used NMI to compare the cell-type labels with Louvain clusters computed on @@ -15,9 +16,4 @@ @metric(**graph_metrics.nmi.metadata) def nmi(adata): - from scanpy.pp import neighbors - from scanpy.tl import pca - - adata.obsm["X_emb"] = pca(adata.X) - neighbors(adata, use_rep="X_emb") - return graph_metrics.nmi(adata) + return graph_metrics.nmi(feature_to_graph(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/pcr.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/pcr.py index 8a5d641f37..3a556dbe2f 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/pcr.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/pcr.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_embed import metrics as embed_metrics +from .utils import feature_to_embedding """ Principal component regression, derived from PCA, has previously been used to quantify @@ -18,7 +19,4 @@ @metric(**embed_metrics.pcr.metadata) def pcr(adata): - from scanpy.tl import pca - - adata.obsm["X_emb"] = pca(adata.X) - return embed_metrics.pcr(adata) + return embed_metrics.pcr(feature_to_embedding(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/sil_batch.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/sil_batch.py index 35f024ca44..ac98714333 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/sil_batch.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/sil_batch.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_embed import metrics as embed_metrics +from .utils import feature_to_embedding """ We consider the absolute silhouette width, s(i), on @@ -24,7 +25,4 @@ @metric(**embed_metrics.silhouette_batch.metadata) def 
silhouette_batch(adata): - from scanpy.tl import pca - - adata.obsm["X_emb"] = pca(adata.X) - return embed_metrics.silhouette_batch(adata) + return embed_metrics.silhouette_batch(feature_to_embedding(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/silhouette.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/silhouette.py index aa94c887e0..dcd29a8f71 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/silhouette.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/silhouette.py @@ -1,5 +1,6 @@ from .....tools.decorators import metric from ...batch_integration_embed import metrics as embed_metrics +from .utils import feature_to_embedding """ For the bio-conservation score, the ASW was computed on cell identity labels and @@ -12,7 +13,4 @@ @metric(**embed_metrics.silhouette.metadata) def silhouette(adata): - from scanpy.tl import pca - - adata.obsm["X_emb"] = pca(adata.X) - return embed_metrics.silhouette(adata) + return embed_metrics.silhouette(feature_to_embedding(adata)) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/utils.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/utils.py new file mode 100644 index 0000000000..d2decfa054 --- /dev/null +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/utils.py @@ -0,0 +1,18 @@ +from ...batch_integration_embed.metrics.utils import embedding_to_graph + + +def feature_to_embedding(adata): + import scanpy as sc + + if adata.uns["is_baseline"] and "X_emb" in adata.obsm: + # precomputed; do nothing + return adata + + adata.obsm["X_emb"] = sc.pp.pca(adata.X) + return adata + + +def feature_to_graph(adata): + adata = feature_to_embedding(adata) + adata = embedding_to_graph(adata) + return adata diff --git a/openproblems/tools/decorators.py b/openproblems/tools/decorators.py index 3c477a0167..dd2af6e193 100644 --- a/openproblems/tools/decorators.py +++ b/openproblems/tools/decorators.py @@ -85,8 +85,9 @@ def decorator(func): @functools.wraps(func) def apply_method(adata: anndata.AnnData, *args, **kwargs): log.debug("Running {} method".format(func.__name__)) + adata = func(adata, *args, **kwargs) adata.uns["is_baseline"] = is_baseline - return func(adata, *args, **kwargs) + return adata apply_method.metadata = dict( method_name=method_name, From 8e26833abb8b3f9a65930812f8e00c6f759362d7 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 13 Feb 2023 09:34:40 -0500 Subject: [PATCH 249/266] test distance correlation --- test/test_task_dimensionality_reduction.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/test_task_dimensionality_reduction.py b/test/test_task_dimensionality_reduction.py index 442ed801fc..b96b4234c2 100644 --- a/test/test_task_dimensionality_reduction.py +++ b/test/test_task_dimensionality_reduction.py @@ -1,5 +1,6 @@ """Specific tests for the dimensionality_reduction task""" import openproblems +import parameterized import utils.docker import utils.git @@ -54,6 +55,26 @@ def test_density_preservation_matches_densmap(): np.testing.assert_allclose(expected, actual, rtol=1e-3) +@parameterized.parameterized.expand( + [(200,), (1000,)], + name_func=utils.name.name_test, +) +def test_distance_correlation_with_svd(n_svd): + import numpy as np + + task = openproblems.tasks.dimensionality_reduction + metric = openproblems.tasks.dimensionality_reduction.metrics.distance_correlation + + adata = 
task.api.sample_dataset() + adata = task.api.sample_method(adata) + adata.obsm["X_emb"] = adata.X.toarray() + + expected = 1 + actual = metric(adata, n_svd=n_svd) + + np.testing.assert_allclose(expected, actual, rtol=1e-3) + + def test_density_preservation_perfect(): import numpy as np From 05a4c72de330fe877e39f321c043af0cfe476658 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Mon, 13 Feb 2023 09:35:02 -0500 Subject: [PATCH 250/266] Changes in destVI code (#826) (#827) * Changes in destVI code (#826) Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> * pre-commit --------- Co-authored-by: Can Ergen Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../tasks/spatial_decomposition/methods/destvi.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/openproblems/tasks/spatial_decomposition/methods/destvi.py b/openproblems/tasks/spatial_decomposition/methods/destvi.py index 0c6ca2b7dd..5486ca238e 100644 --- a/openproblems/tasks/spatial_decomposition/methods/destvi.py +++ b/openproblems/tasks/spatial_decomposition/methods/destvi.py @@ -26,8 +26,8 @@ def destvi( max_epochs_sp = max_epochs_sp or 10 max_epochs_sc = max_epochs_sc or 10 else: # pragma: nocover - max_epochs_sc = max_epochs_sc or 300 - max_epochs_sp = max_epochs_sp or 2500 + max_epochs_sc = max_epochs_sc or 500 + max_epochs_sp = max_epochs_sp or 10000 adata_sc, adata = split_sc_and_sp(adata) @@ -36,15 +36,17 @@ def destvi( sc_model.train( max_epochs=max_epochs_sc, early_stopping=True, - early_stopping_monitor="reconstruction_loss_train", + train_size=0.9, + validation_size=0.1, + early_stopping_monitor="elbo_validation", ) DestVI.setup_anndata(adata) st_model = DestVI.from_rna_model(adata, sc_model) st_model.train( max_epochs=max_epochs_sp, - early_stopping=True, - early_stopping_monitor="reconstruction_loss_train", + batch_size=min(int(adata.n_obs / 20 + 3), 128), + plan_kwargs={"min_kl_weight": 3.0, "max_kl_weight": 3}, ) adata.obsm["proportions_pred"] = st_model.get_proportions().to_numpy() adata.uns["method_code_version"] = check_version("scvi-tools") From 2923b246a016fbeb8e9e8cecd98add9361f7a6d3 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Mon, 13 Feb 2023 14:04:14 -0500 Subject: [PATCH 251/266] set explicit token permissions (#828) --- .github/workflows/check_r_dependencies.yml | 4 ++++ .github/workflows/comment_pull_request.yml | 3 +++ .github/workflows/pre-commit.yml | 3 +++ .github/workflows/process_results.yml | 3 +++ .github/workflows/run_tests.yml | 5 +++++ .github/workflows/update_website_content.yml | 3 +++ 6 files changed, 21 insertions(+) diff --git a/.github/workflows/check_r_dependencies.yml b/.github/workflows/check_r_dependencies.yml index d95e7ddd82..bb447a2a24 100644 --- a/.github/workflows/check_r_dependencies.yml +++ b/.github/workflows/check_r_dependencies.yml @@ -16,6 +16,10 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +permissions: + contents: write + pull-requests: write + jobs: check-r-dependencies: runs-on: ubuntu-latest diff --git a/.github/workflows/comment_pull_request.yml b/.github/workflows/comment_pull_request.yml index 4b47fe19f6..7b80d44f59 100644 --- a/.github/workflows/comment_pull_request.yml +++ b/.github/workflows/comment_pull_request.yml @@ -4,6 +4,9 @@ on: pull_request_target: types: [opened, synchronize, reopened, 
ready_for_review] +permissions: + pull-requests: write + jobs: comment_pr: diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 89f579de00..501e225900 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -6,6 +6,9 @@ on: pull_request: types: [opened, synchronize, reopened, ready_for_review] +permissions: + contents: write + concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true diff --git a/.github/workflows/process_results.yml b/.github/workflows/process_results.yml index e539377b98..3a366b6d4a 100644 --- a/.github/workflows/process_results.yml +++ b/.github/workflows/process_results.yml @@ -11,6 +11,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +permissions: + contents: read + jobs: process_results: runs-on: ubuntu-latest diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index b898f32832..c4c3f9ece7 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -10,6 +10,11 @@ on: types: - 'submitted' +permissions: + contents: write + packages: write + pull-requests: write + concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true diff --git a/.github/workflows/update_website_content.yml b/.github/workflows/update_website_content.yml index cd270325b2..fdddf261dd 100644 --- a/.github/workflows/update_website_content.yml +++ b/.github/workflows/update_website_content.yml @@ -10,6 +10,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +permissions: + contents: read + jobs: update_content: runs-on: ubuntu-latest From 24efa89d494d13406ec4bf927a8d083ae17ccd44 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Wed, 15 Feb 2023 11:34:18 -0500 Subject: [PATCH 252/266] Ignore setuptools+pkg_resources warning Closes #830 --- pytest.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/pytest.ini b/pytest.ini index 7f621389db..b1733d1bda 100644 --- a/pytest.ini +++ b/pytest.ini @@ -12,4 +12,5 @@ filterwarnings = ignore:The use of (converter|py2rpy|rpy2py) in module rpy2.robjects.conversion is deprecated.:DeprecationWarning ignore:`Model\.state_updates` will be removed in a future version\.:UserWarning ignore:Tensorflow not installed. ParametricUMAP will be unavailable:ImportWarning + ignore:Implementing implicit namespace packages \(as specified in PEP 420\) is preferred to `pkg_resources\.declare_namespace`.:DeprecationWarning always:Container failed with AssertionError\. 
Retrying [0-9]* more time:RuntimeWarning From 964122eff13e20ff0e20154024521ec0c9c181bb Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Wed, 15 Feb 2023 15:34:07 -0500 Subject: [PATCH 253/266] require matplotlib <3.7 --- setup.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 05bac0a377..578a337ae6 100644 --- a/setup.py +++ b/setup.py @@ -4,21 +4,22 @@ import os install_requires = [ - "numpy>=1.21,<1.24", - "scikit-learn>=1.0,<1.2", "anndata==0.8.*", - "scprep>=1.2.2", - "scipy>=1.7,<1.10", - "scanpy>=1.6", - "louvain==0.8.*", - "python-igraph==0.10.*", + "colorama==0.4.*", "decorator<5.0", # pinned in #324 + "louvain==0.8.*", + "matplotlib<3.7.0", "memory-profiler==0.60", - "colorama==0.4.*", + "numpy>=1.21,<1.24", "packaging==21.3", + "pandas==1.3.5", + "python-igraph==0.10.*", + "scanpy>=1.6", + "scipy>=1.7,<1.10", + "scikit-learn>=1.0,<1.2", + "scprep>=1.2.2", "umap-learn==0.5.*", "requests==2.28.*", - "pandas==1.3.5", ] r_requires = [ From 151e1e58c042f4d18e7983413da9b1da4e2902c5 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Thu, 16 Feb 2023 09:02:02 -0500 Subject: [PATCH 254/266] Fix (#831) --- pytest.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index b1733d1bda..b967c2a2f2 100644 --- a/pytest.ini +++ b/pytest.ini @@ -12,5 +12,5 @@ filterwarnings = ignore:The use of (converter|py2rpy|rpy2py) in module rpy2.robjects.conversion is deprecated.:DeprecationWarning ignore:`Model\.state_updates` will be removed in a future version\.:UserWarning ignore:Tensorflow not installed. ParametricUMAP will be unavailable:ImportWarning - ignore:Implementing implicit namespace packages \(as specified in PEP 420\) is preferred to `pkg_resources\.declare_namespace`.:DeprecationWarning + ignore:Deprecated call to `pkg_resources\.declare_namespace:DeprecationWarning always:Container failed with AssertionError\. 
Retrying [0-9]* more time:RuntimeWarning From a807acb6a6baff8734551f0476c90d9c5429270b Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Mon, 20 Feb 2023 07:24:51 -0500 Subject: [PATCH 255/266] harmonize batch integration dataset APIs (#834) --- .../tasks/_batch_integration/_common/api.py | 16 +++++----------- .../batch_integration_embed/README.md | 4 +++- .../batch_integration_embed/api.py | 11 ++--------- .../batch_integration_feature/README.md | 4 +++- .../batch_integration_feature/api.py | 11 ++--------- .../batch_integration_graph/README.md | 2 +- .../batch_integration_graph/api.py | 14 ++------------ 7 files changed, 18 insertions(+), 44 deletions(-) diff --git a/openproblems/tasks/_batch_integration/_common/api.py b/openproblems/tasks/_batch_integration/_common/api.py index bc5c9aa6b1..41ba21b45c 100644 --- a/openproblems/tasks/_batch_integration/_common/api.py +++ b/openproblems/tasks/_batch_integration/_common/api.py @@ -20,8 +20,6 @@ def check_neighbors(adata, neighbors_key, connectivities_key, distances_key): def check_dataset( adata, - do_check_pca=False, - do_check_neighbors=False, do_check_hvg=False, ): """Check that dataset output fits expected API.""" @@ -43,22 +41,20 @@ def check_dataset( assert "organism" in adata.uns assert adata.uns["organism"] in ["mouse", "human"] - if do_check_pca: - assert "X_uni_pca" in adata.obsm + assert "X_uni_pca" in adata.obsm if do_check_hvg: assert "hvg_unint" in adata.uns assert len(adata.uns["hvg_unint"]) == min(N_HVG_UNINT, adata.n_vars) assert np.all(np.isin(adata.uns["hvg_unint"], adata.var.index)) - if do_check_neighbors: - check_neighbors(adata, "uni", "uni_connectivities", "uni_distances") + check_neighbors(adata, "uni", "uni_connectivities", "uni_distances") return True @dataset() -def sample_dataset(run_pca: bool = False, run_neighbors: bool = False): +def sample_dataset(): """Create a simple dataset to use for testing methods in this task.""" import scanpy as sc @@ -79,8 +75,6 @@ def sample_dataset(run_pca: bool = False, run_neighbors: bool = False): adata.uns["hvg_unint"] = precompute_hvg(adata) adata.uns["n_genes_pre"] = adata.n_vars - if run_pca: - adata.obsm["X_uni_pca"] = sc.pp.pca(adata.X) - if run_neighbors: - sc.pp.neighbors(adata, use_rep="X_uni_pca", key_added="uni") + adata.obsm["X_uni_pca"] = sc.pp.pca(adata.X) + sc.pp.neighbors(adata, use_rep="X_uni_pca", key_added="uni") return adata diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/README.md b/openproblems/tasks/_batch_integration/batch_integration_embed/README.md index 77f5845c71..ec89a66a98 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/README.md @@ -64,7 +64,9 @@ Datasets should contain the following attributes: * `adata.obs["batch"]` with the batch covariate, and * `adata.obs["label"]` with the cell identity label -* `adata.obsm['X_uni']` with a pre-integration embedding (PCA) +* `adata.obsm['X_uni_pca']` with the PCA embedding of the unintegrated representation +* `adata.obsp['uni_connectivities']` with an unintegrated connectivity matrix generated + by `scanpy.pp.neighbors()` * `adata.layers['log_normalized']` with log-normalized data * `adata.X` with log-normalized data * `adata.uns["organism"]` with either `"mouse"` or `"human"` diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py 
b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py index d534551688..d4d8bd3e44 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/api.py @@ -1,9 +1,7 @@ -from ....tools.decorators import dataset from .._common import api -import functools - -check_dataset = functools.partial(api.check_dataset, do_check_pca=True) +check_dataset = api.check_dataset +sample_dataset = api.sample_dataset def check_method(adata, is_baseline=False): @@ -14,11 +12,6 @@ def check_method(adata, is_baseline=False): return True -@dataset() -def sample_dataset(): - return api.sample_dataset(run_pca=True) - - def sample_method(adata): """Create sample method output for testing metrics in this task.""" diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md index 6e9de3fb6b..16f331355e 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md @@ -42,7 +42,9 @@ Datasets should contain the following attributes: * `adata.obs["batch"]` with the batch covariate, and * `adata.obs["label"]` with the cell identity label -* `adata.obs["X_uni_pca"]` with a PCA embedding of the uncorrected data +* `adata.obsm['X_uni_pca']` with the PCA embedding of the unintegrated representation +* `adata.obsp['uni_connectivities']` with an unintegrated connectivity matrix generated + by `scanpy.pp.neighbors()` * `adata.layers['counts']` with raw, integer UMI count data, * `adata.layers['log_normalized']` with log-normalized data and * `adata.X` with log-normalized data diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py index bbf5e2f4bb..38feaf63a7 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/api.py @@ -1,11 +1,9 @@ -from ....tools.decorators import dataset from .._common import api import functools -check_dataset = functools.partial( - api.check_dataset, do_check_hvg=True, do_check_pca=True -) +check_dataset = functools.partial(api.check_dataset, do_check_hvg=True) +sample_dataset = api.sample_dataset def check_method(adata, is_baseline=False): @@ -21,11 +19,6 @@ def check_method(adata, is_baseline=False): return True -@dataset() -def sample_dataset(): - return api.sample_dataset(run_pca=True) - - def sample_method(adata): """Create sample method output for testing metrics in this task.""" adata.X = adata.X.multiply(2) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/README.md b/openproblems/tasks/_batch_integration/batch_integration_graph/README.md index f407e8d349..6548338e81 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/README.md @@ -57,7 +57,7 @@ Datasets should contain the following attributes: * `adata.obs["batch"]` with the batch covariate, * `adata.obs["label"]` with the cell identity label, * `adata.layers['counts']` with raw, integer UMI count data, and -* `adata.obsm['X_uni']` with the PCA embedding of the unintegrated representation +* `adata.obsm['X_uni_pca']` with the PCA embedding of the unintegrated representation * `adata.obsp['uni_connectivities']` with an 
unintegrated connectivity matrix generated by `scanpy.pp.neighbors()` * `adata.X` with log-normalized data diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/api.py b/openproblems/tasks/_batch_integration/batch_integration_graph/api.py index e708faef5a..23a1fb6fe6 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/api.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/api.py @@ -1,14 +1,9 @@ -from ....tools.decorators import dataset from .._common import api -import functools - MIN_CELLS_PER_CELLTYPE = 50 - -check_dataset = functools.partial( - api.check_dataset, do_check_pca=True, do_check_neighbors=True -) +check_dataset = api.check_dataset +sample_dataset = api.sample_dataset def check_method(adata, is_baseline=False): @@ -17,11 +12,6 @@ def check_method(adata, is_baseline=False): return True -@dataset() -def sample_dataset(): - return api.sample_dataset(run_pca=True, run_neighbors=True) - - def sample_method(adata): """Create sample method output for testing metrics in this task.""" import scanpy as sc From db800b75059c187c3e37280cbc3b68970db5dff0 Mon Sep 17 00:00:00 2001 From: Daniel Strobl <50872326+danielStrobl@users.noreply.github.com> Date: Mon, 20 Feb 2023 18:10:33 +0100 Subject: [PATCH 256/266] new common baselines and cross import (#825) * new common baselines and cross import * pre-commit * addressing comments * pre-commit * fix wrong import * wrong import * pre-commit * wrong import 2 * pre-commit --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../_common/methods/__init__.py | 4 + .../_common/methods/baseline.py | 116 ++++++++++++++++++ .../methods/__init__.py | 9 +- .../methods/baseline.py | 48 +------- .../methods/__init__.py | 20 +-- .../methods/baseline.py | 58 --------- .../methods/__init__.py | 8 +- .../methods/baseline.py | 107 +--------------- 8 files changed, 142 insertions(+), 228 deletions(-) create mode 100644 openproblems/tasks/_batch_integration/_common/methods/__init__.py create mode 100644 openproblems/tasks/_batch_integration/_common/methods/baseline.py delete mode 100644 openproblems/tasks/_batch_integration/batch_integration_feature/methods/baseline.py diff --git a/openproblems/tasks/_batch_integration/_common/methods/__init__.py b/openproblems/tasks/_batch_integration/_common/methods/__init__.py new file mode 100644 index 0000000000..3fc9e1fe16 --- /dev/null +++ b/openproblems/tasks/_batch_integration/_common/methods/__init__.py @@ -0,0 +1,4 @@ +from .baseline import batch_random_integration +from .baseline import celltype_random_integration +from .baseline import no_integration +from .baseline import random_integration diff --git a/openproblems/tasks/_batch_integration/_common/methods/baseline.py b/openproblems/tasks/_batch_integration/_common/methods/baseline.py new file mode 100644 index 0000000000..139fadec5f --- /dev/null +++ b/openproblems/tasks/_batch_integration/_common/methods/baseline.py @@ -0,0 +1,116 @@ +from .....tools.decorators import method +from .....tools.utils import check_version + +import functools +import numpy as np + + +def _set_uns(adata): + adata.uns["neighbors"] = adata.uns["uni"] + adata.uns["neighbors"]["connectivities_key"] = "connectivities" + adata.uns["neighbors"]["distances_key"] = "distances" + + +def _randomize_features(X, partition=None): + X_out = X.copy() + if partition is None: + partition = np.full(X.shape[0], 0) + else: + partition = np.asarray(partition) + for partition_name in 
np.unique(partition): + partition_idx = np.argwhere(partition == partition_name).flatten() + X_out[partition_idx] = X[np.random.permutation(partition_idx)] + return X_out + + +def _randomize_graph(adata, partition=None): + distances, connectivities = ( + adata.obsp["uni_distances"], + adata.obsp["uni_connectivities"], + ) + new_idx = _randomize_features(np.arange(distances.shape[0]), partition=partition) + adata.obsp["distances"] = distances[new_idx][:, new_idx] + adata.obsp["connectivities"] = connectivities[new_idx][:, new_idx] + _set_uns(adata) + return adata + + +def _random_embedding(partition): + from sklearn.preprocessing import LabelEncoder + from sklearn.preprocessing import OneHotEncoder + + embedding = OneHotEncoder().fit_transform( + LabelEncoder().fit_transform(partition)[:, None] + ) + embedding = embedding + np.random.uniform(-0.01, 0.01, embedding.shape) + return embedding + + +_baseline_method = functools.partial( + method, + paper_name="Open Problems for Single Cell Analysis", + paper_reference="openproblems", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) + + +@_baseline_method( + method_name="No Integration", +) +def no_integration(adata, test=False): + adata.obsp["connectivities"] = adata.obsp["uni_connectivities"] + adata.obsp["distances"] = adata.obsp["uni_distances"] + _set_uns(adata) + adata.obsm["X_emb"] = adata.obsm["X_uni_pca"] + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@_baseline_method( + method_name="Random Integration", +) +def random_integration(adata, test=False): + adata.X = _randomize_features(adata.X) + adata.obsm["X_emb"] = _randomize_features(adata.obsm["X_uni_pca"]) + adata = _randomize_graph(adata) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@_baseline_method( + method_name="Random Integration by Celltype", + paper_name="Random Integration by Celltype (baseline)", + paper_reference="openproblems", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) +def celltype_random_integration(adata, test=False): + adata.obsm["X_emb"] = _randomize_features( + adata.obsm["X_uni_pca"], partition=adata.obs["labels"] + ) + adata.X = _randomize_features(adata.X, partition=adata.obs["labels"]) + adata = _randomize_graph( + adata, + partition=adata.obs["labels"].to_numpy(), + ) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + +@_baseline_method( + method_name="Random Integration by Batch", +) +def batch_random_integration(adata, test=False): + adata.obsm["X_emb"] = _randomize_features( + adata.obsm["X_uni_pca"], partition=adata.obs["batch"] + ) + adata.X = _randomize_features(adata.X, partition=adata.obs["batch"]) + adata = _randomize_graph( + adata, + partition=adata.obs["batch"].to_numpy(), + ) + adata.uns["method_code_version"] = check_version("openproblems") + return adata diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py index 0679d0a530..5360048c7c 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py @@ -1,3 +1,8 @@ +from ..._common.methods.baseline import batch_random_integration +from ..._common.methods.baseline import celltype_random_integration +from 
..._common.methods.baseline import no_integration +from ..._common.methods.baseline import random_integration +from ...batch_integration_graph.methods.baseline import celltype_random_graph from ...batch_integration_graph.methods.combat import combat_full_scaled from ...batch_integration_graph.methods.combat import combat_full_unscaled from ...batch_integration_graph.methods.combat import combat_hvg_scaled @@ -28,11 +33,7 @@ from ...batch_integration_graph.methods.scanvi import scanvi_hvg_unscaled from ...batch_integration_graph.methods.scvi import scvi_full_unscaled from ...batch_integration_graph.methods.scvi import scvi_hvg_unscaled -from .baseline import batch_random_integration from .baseline import celltype_random_embedding -from .baseline import celltype_random_integration -from .baseline import no_integration from .baseline import no_integration_batch -from .baseline import random_integration from .scalex import scalex_full from .scalex import scalex_hvg diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py index e219707e41..2005a3f82c 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py @@ -1,7 +1,6 @@ from .....tools.decorators import method from .....tools.utils import check_version -from ...batch_integration_graph.methods.baseline import _random_embedding -from ...batch_integration_graph.methods.baseline import _randomize_features +from ..._common.methods.baseline import _random_embedding import functools import numpy as np @@ -17,40 +16,6 @@ ) -@_baseline_method( - method_name="No Integration", -) -def no_integration(adata, test=False): - adata.obsm["X_emb"] = adata.obsm["X_uni_pca"] - adata.uns["method_code_version"] = check_version("openproblems") - return adata - - -@_baseline_method( - method_name="Random Integration", -) -def random_integration(adata, test=False): - adata.obsm["X_emb"] = _randomize_features(adata.obsm["X_uni_pca"]) - adata.uns["method_code_version"] = check_version("openproblems") - return adata - - -@method( - method_name="Random Integration by Celltype", - paper_name="Random Integration by Celltype (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, -) -def celltype_random_integration(adata, test=False): - adata.obsm["X_emb"] = _randomize_features( - adata.obsm["X_uni_pca"], partition=adata.obs["labels"] - ) - adata.uns["method_code_version"] = check_version("openproblems") - return adata - - @_baseline_method( method_name="Random Embedding by Celltype", ) @@ -60,17 +25,6 @@ def celltype_random_embedding(adata, test=False): return adata -@_baseline_method( - method_name="Random Integration by Batch", -) -def batch_random_integration(adata, test=False): - adata.obsm["X_emb"] = _randomize_features( - adata.obsm["X_uni_pca"], partition=adata.obs["batch"] - ) - adata.uns["method_code_version"] = check_version("openproblems") - return adata - - @_baseline_method( method_name="No Integration by Batch", ) diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/__init__.py index 6454d1a617..2db96e595d 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/methods/__init__.py +++ 
b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/__init__.py @@ -1,3 +1,14 @@ +# from ...batch_integration_graph.methods.seuratrpca import seuratrpca_full_scaled +# from ...batch_integration_graph.methods.seuratrpca import seuratrpca_full_unscaled +# from ...batch_integration_graph.methods.seuratrpca import seuratrpca_hvg_scaled +# from ...batch_integration_graph.methods.seuratrpca import seuratrpca_hvg_unscaled +from ..._common.methods.baseline import batch_random_integration +from ..._common.methods.baseline import celltype_random_integration +from ..._common.methods.baseline import no_integration +from ..._common.methods.baseline import random_integration +from ...batch_integration_embed.methods.baseline import celltype_random_embedding +from ...batch_integration_embed.methods.baseline import no_integration_batch +from ...batch_integration_graph.methods.baseline import celltype_random_graph from ...batch_integration_graph.methods.combat import combat_full_scaled from ...batch_integration_graph.methods.combat import combat_full_unscaled from ...batch_integration_graph.methods.combat import combat_hvg_scaled @@ -28,10 +39,6 @@ from ...batch_integration_graph.methods.scanorama import scanorama_feature_full_unscaled from ...batch_integration_graph.methods.scanorama import scanorama_feature_hvg_scaled from ...batch_integration_graph.methods.scanorama import scanorama_feature_hvg_unscaled -from .baseline import batch_random_integration -from .baseline import celltype_random_integration -from .baseline import no_integration -from .baseline import random_integration from .scalex import scalex_full from .scalex import scalex_hvg @@ -44,8 +51,3 @@ # from ...batch_integration_graph.methods.seurat_full import seurat_full_unscaled # from ...batch_integration_graph.methods.seurat_full import seurat_hvg_scaled # from ...batch_integration_graph.methods.seurat_full import seurat_hvg_unscaled - -# from ...batch_integration_graph.methods.seuratrpca import seuratrpca_full_scaled -# from ...batch_integration_graph.methods.seuratrpca import seuratrpca_full_unscaled -# from ...batch_integration_graph.methods.seuratrpca import seuratrpca_hvg_scaled -# from ...batch_integration_graph.methods.seuratrpca import seuratrpca_hvg_unscaled diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_feature/methods/baseline.py deleted file mode 100644 index 7f71fdb686..0000000000 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/methods/baseline.py +++ /dev/null @@ -1,58 +0,0 @@ -from .....tools.decorators import method -from .....tools.utils import check_version -from ...batch_integration_graph.methods.baseline import _randomize_features - - -@method( - method_name="No Integration", - paper_name="No Integration (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, -) -def no_integration(adata, test=False): - adata.uns["method_code_version"] = check_version("openproblems") - return adata - - -@method( - method_name="Random Integration", - paper_name="Random Integration (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, -) -def random_integration(adata, test=False): - adata.X = _randomize_features(adata.X) - adata.uns["method_code_version"] = check_version("openproblems") - return adata - - 
-@method( - method_name="Random Integration by Celltype", - paper_name="Random Integration by Celltype (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, -) -def celltype_random_integration(adata, test=False): - adata.X = _randomize_features(adata.X, partition=adata.obs["labels"]) - adata.uns["method_code_version"] = check_version("openproblems") - return adata - - -@method( - method_name="Random Integration by Batch", - paper_name="Random Integration by Batch (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, -) -def batch_random_integration(adata, test=False): - adata.X = _randomize_features(adata.X, partition=adata.obs["batch"]) - adata.uns["method_code_version"] = check_version("openproblems") - return adata diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py index f664b9139d..8fcbb6dac9 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/__init__.py @@ -1,8 +1,8 @@ -from .baseline import batch_random_integration +from ..._common.methods.baseline import batch_random_integration +from ..._common.methods.baseline import celltype_random_integration +from ..._common.methods.baseline import no_integration +from ..._common.methods.baseline import random_integration from .baseline import celltype_random_graph -from .baseline import celltype_random_integration -from .baseline import no_integration -from .baseline import random_integration from .bbknn import bbknn_full_scaled from .bbknn import bbknn_full_unscaled from .bbknn import bbknn_hvg_scaled diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py index 8a3236795f..b26c0cc760 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py @@ -1,115 +1,10 @@ from .....tools.decorators import method from .....tools.utils import check_version +from ..._common.methods.baseline import _random_embedding -import numpy as np import scanpy as sc -def _set_uns(adata): - adata.uns["neighbors"] = adata.uns["uni"] - adata.uns["neighbors"]["connectivities_key"] = "connectivities" - adata.uns["neighbors"]["distances_key"] = "distances" - - -@method( - method_name="No Integration", - paper_name="No Integration (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, -) -def no_integration(adata, test=False): - adata.obsp["connectivities"] = adata.obsp["uni_connectivities"] - adata.obsp["distances"] = adata.obsp["uni_distances"] - _set_uns(adata) - adata.uns["method_code_version"] = check_version("openproblems") - return adata - - -def _randomize_features(X, partition=None): - X_out = X.copy() - if partition is None: - partition = np.full(X.shape[0], 0) - else: - partition = np.asarray(partition) - for partition_name in np.unique(partition): - partition_idx = np.argwhere(partition == partition_name).flatten() - X_out[partition_idx] = X[np.random.permutation(partition_idx)] - return X_out - - 
-def _randomize_graph(adata, partition=None): - distances, connectivities = ( - adata.obsp["uni_distances"], - adata.obsp["uni_connectivities"], - ) - new_idx = _randomize_features(np.arange(distances.shape[0]), partition=partition) - adata.obsp["distances"] = distances[new_idx][:, new_idx] - adata.obsp["connectivities"] = connectivities[new_idx][:, new_idx] - _set_uns(adata) - return adata - - -def _random_embedding(partition): - from sklearn.preprocessing import LabelEncoder - from sklearn.preprocessing import OneHotEncoder - - embedding = OneHotEncoder().fit_transform( - LabelEncoder().fit_transform(partition)[:, None] - ) - embedding = embedding + np.random.uniform(-0.01, 0.01, embedding.shape) - return embedding - - -@method( - method_name="Random Integration", - paper_name="Random Integration (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, -) -def random_integration(adata, test=False): - adata = _randomize_graph(adata) - adata.uns["method_code_version"] = check_version("openproblems") - return adata - - -@method( - method_name="Random Integration by Celltype", - paper_name="Random Integration by Celltype (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, -) -def celltype_random_integration(adata, test=False): - adata = _randomize_graph( - adata, - partition=adata.obs["labels"].to_numpy(), - ) - adata.uns["method_code_version"] = check_version("openproblems") - return adata - - -@method( - method_name="Random Integration by Batch", - paper_name="Random Integration by Batch (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, -) -def batch_random_integration(adata, test=False): - adata = _randomize_graph( - adata, - partition=adata.obs["batch"].to_numpy(), - ) - adata.uns["method_code_version"] = check_version("openproblems") - return adata - - @method( method_name="Random Graph by Celltype", paper_name="Random Graph by Celltype (baseline)", From d4a00be70c82d6ba46b133e064ec4162f0c6c46d Mon Sep 17 00:00:00 2001 From: Daniel Strobl <50872326+danielStrobl@users.noreply.github.com> Date: Tue, 21 Feb 2023 16:17:09 +0100 Subject: [PATCH 257/266] jitter baseline patch (#838) * jitter baseline patch * celltype_random_embedding_w/o_jitter * Split jitter and no jitter * import --------- Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- .../_batch_integration/_common/methods/baseline.py | 5 +++-- .../batch_integration_embed/methods/__init__.py | 1 + .../batch_integration_embed/methods/baseline.py | 11 ++++++++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/openproblems/tasks/_batch_integration/_common/methods/baseline.py b/openproblems/tasks/_batch_integration/_common/methods/baseline.py index 139fadec5f..6e2fe07d0b 100644 --- a/openproblems/tasks/_batch_integration/_common/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/_common/methods/baseline.py @@ -35,14 +35,15 @@ def _randomize_graph(adata, partition=None): return adata -def _random_embedding(partition): +def _random_embedding(partition, jitter=0.01): from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import OneHotEncoder embedding = OneHotEncoder().fit_transform( LabelEncoder().fit_transform(partition)[:, None] ) - embedding = embedding + 
np.random.uniform(-0.01, 0.01, embedding.shape) + if jitter is not None: + embedding = embedding + np.random.uniform(-1 * jitter, jitter, embedding.shape) return embedding diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py index 5360048c7c..6bff40b9ff 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/__init__.py @@ -34,6 +34,7 @@ from ...batch_integration_graph.methods.scvi import scvi_full_unscaled from ...batch_integration_graph.methods.scvi import scvi_hvg_unscaled from .baseline import celltype_random_embedding +from .baseline import celltype_random_embedding_jitter from .baseline import no_integration_batch from .scalex import scalex_full from .scalex import scalex_hvg diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py index 2005a3f82c..18a0830f5b 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py @@ -16,11 +16,20 @@ ) +@_baseline_method( + method_name="Random Embedding by Celltype (with jitter)", +) +def celltype_random_embedding_jitter(adata, test=False): + adata.obsm["X_emb"] = _random_embedding(partition=adata.obs["labels"], jitter=0.01) + adata.uns["method_code_version"] = check_version("openproblems") + return adata + + @_baseline_method( method_name="Random Embedding by Celltype", ) def celltype_random_embedding(adata, test=False): - adata.obsm["X_emb"] = _random_embedding(partition=adata.obs["labels"]) + adata.obsm["X_emb"] = _random_embedding(partition=adata.obs["labels"], jitter=None) adata.uns["method_code_version"] = check_version("openproblems") return adata From 86375dd2be7b9318dbada7f2cb3379c853f1acd2 Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Tue, 21 Feb 2023 10:39:31 -0500 Subject: [PATCH 258/266] Add reversed norm order for ALRA in Denoising Task (#835) * add reverse order/regular order Try to match the magic code format for decorators and implementation of reverse norm * Update __init__.py * pre-commit * Update alra.py * Update alra.py * pre-commit * Fix method names * Update alra.py * function names should be lowercase * Update alra.R * X is unused * Fix bug * Revert 77302de6a76e35b7c555f84d199cb50944fa460f * pre-commit * Actually pass sqrt in sqrt norm methods --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> --- .../tasks/denoising/methods/__init__.py | 2 + openproblems/tasks/denoising/methods/alra.py | 113 +++++++++--------- 2 files changed, 58 insertions(+), 57 deletions(-) diff --git a/openproblems/tasks/denoising/methods/__init__.py b/openproblems/tasks/denoising/methods/__init__.py index 4afcf70031..b0a3994a61 100644 --- a/openproblems/tasks/denoising/methods/__init__.py +++ b/openproblems/tasks/denoising/methods/__init__.py @@ -1,5 +1,7 @@ from .alra import alra_log +from .alra import alra_log_reversenorm from .alra import alra_sqrt +from .alra import alra_sqrt_reversenorm from .baseline import no_denoising from .baseline import perfect_denoising from .dca import dca 
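As context for the next diff: the refactor collapses the two near-duplicate ALRA wrappers into a single `_alra` helper, parameterized by the elementwise transform (log1p/expm1 or sqrt/square) and by whether that transform runs before or after library-size normalization; the denoised output is then un-transformed in reverse order. Below is a minimal round-trip sketch of that norm/denorm pairing, using the same `scprep` calls as the patch — an illustrative toy example with hypothetical data, not part of the patch itself:

import numpy as np
import scipy.sparse
import scprep

# Hypothetical toy counts matrix (cells x genes), sparse as in the task API.
counts = np.random.poisson(1.0, (50, 20)).astype(float)
counts[:, 0] += 1  # avoid all-zero cells so library sizes are positive
X = scipy.sparse.csr_matrix(counts)

norm_fn, denorm_fn = np.log1p, np.expm1  # sqrt variant: np.sqrt, np.square

# Standard order: library-size normalize first, then transform elementwise.
X_norm, libsize = scprep.normalize.library_size_normalize(
    X, rescale=1, return_library_size=True
)
X_norm = scprep.utils.matrix_transform(X_norm, norm_fn)

# Undo both steps in reverse order to recover the original counts.
X_back = scprep.utils.matrix_transform(X_norm, denorm_fn)
X_back = scprep.utils.matrix_vector_elementwise_multiply(X_back, libsize, axis=0)
assert np.allclose(scprep.utils.toarray(X_back), counts)

The "reversed normalization" variants simply swap the order of the two forward steps; as the patch comment notes, this inexplicably sometimes performs better.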
diff --git a/openproblems/tasks/denoising/methods/alra.py b/openproblems/tasks/denoising/methods/alra.py
index f5216e283e..fe4ed2ce9e 100644
--- a/openproblems/tasks/denoising/methods/alra.py
+++ b/openproblems/tasks/denoising/methods/alra.py
@@ -1,15 +1,16 @@
 from ....tools.conversion import r_function
 from ....tools.decorators import method
 
+import functools
 import logging
 
-_alra = r_function("alra.R")
+_r_alra = r_function("alra.R")
 
 log = logging.getLogger("openproblems")
 
 
-@method(
-    method_name="ALRA (sqrt norm, reversed normalization)",
+_alra_method = functools.partial(
+    method,
     paper_name="Zero-preserving imputation of scRNA-seq data using "
     "low-rank approximation",
     paper_reference="linderman2018zero",
@@ -17,26 +18,43 @@
     code_url="https://github.com/KlugerLab/ALRA",
     image="openproblems-r-extras",
 )
-def alra_sqrt(adata, test=False):
+
+
+def _alra(adata, normtype="log", reverse_norm_order=False, test=False):
     import numpy as np
     import rpy2.rinterface_lib.embedded
     import scprep
 
-    # libsize and sqrt norm
-    adata.obsm["train_norm"] = scprep.utils.matrix_transform(
-        adata.obsm["train"], np.sqrt
-    )
-    adata.obsm["train_norm"], libsize = scprep.normalize.library_size_normalize(
-        adata.obsm["train_norm"], rescale=1, return_library_size=True
-    )
-    adata.obsm["train_norm"] = adata.obsm["train_norm"].tocsr()
+    if normtype == "sqrt":
+        norm_fn = np.sqrt
+        denorm_fn = np.square
+    elif normtype == "log":
+        norm_fn = np.log1p
+        denorm_fn = np.expm1
+    else:
+        raise NotImplementedError
+
+    X = adata.obsm["train"].copy()
+    if reverse_norm_order:
+        # inexplicably, this sometimes performs better
+        X = scprep.utils.matrix_transform(X, norm_fn)
+        X, libsize = scprep.normalize.library_size_normalize(
+            X, rescale=1, return_library_size=True
+        )
+    else:
+        X, libsize = scprep.normalize.library_size_normalize(
+            X, rescale=1, return_library_size=True
+        )
+        X = scprep.utils.matrix_transform(X, norm_fn)
+
+    adata.obsm["train_norm"] = X.tocsr()
     # run alra
-    # _alra takes sparse array, returns dense array
+    # _r_alra takes sparse array, returns dense array
     Y = None
     attempts = 0
     while Y is None:
         try:
-            Y = _alra(adata)
+            Y = _r_alra(adata)
         except rpy2.rinterface_lib.embedded.RRuntimeError:  # pragma: no cover
             if attempts < 10:
                 attempts += 1
@@ -46,7 +64,7 @@
                 log.warning(f"alra.R failed (attempt {attempts})")
             else:
                 raise
 
     # transform back into original space
     # functions are reversed!
-    Y = scprep.utils.matrix_transform(Y, np.square)
+    Y = scprep.utils.matrix_transform(Y, denorm_fn)
     Y = scprep.utils.matrix_vector_elementwise_multiply(Y, libsize, axis=0)
 
     adata.obsm["denoised"] = Y
@@ -54,49 +72,29 @@
     return adata
 
 
-@method(
-    method_name="ALRA (log norm)",
-    paper_name="Zero-preserving imputation of scRNA-seq data using "
-    "low-rank approximation",
-    paper_reference="linderman2018zero",
-    paper_year=2018,
-    code_url="https://github.com/KlugerLab/ALRA",
-    image="openproblems-r-extras",
+@_alra_method(
+    method_name="ALRA (sqrt norm, reversed normalization)",
 )
-def alra_log(adata, test=False):
-    import numpy as np
-    import rpy2.rinterface_lib.embedded
-    import scprep
+def alra_sqrt_reversenorm(adata, test=False):
+    return _alra(adata, normtype="sqrt", reverse_norm_order=True, test=test)
 
-    # libsize and log norm
-    # lib norm
-    adata.obsm["train_norm"], libsize = scprep.normalize.library_size_normalize(
-        adata.obsm["train"], rescale=1, return_library_size=True
-    )
-    # log
-    adata.obsm["train_norm"] = scprep.utils.matrix_transform(
-        adata.obsm["train_norm"], np.log1p
-    )
-    # to csr
-    adata.obsm["train_norm"] = adata.obsm["train_norm"].tocsr()
-    # run alra
-    # _alra takes sparse array, returns dense array
-    Y = None
-    attempts = 0
-    while Y is None:
-        try:
-            Y = _alra(adata)
-        except rpy2.rinterface_lib.embedded.RRuntimeError:  # pragma: no cover
-            if attempts < 10:
-                attempts += 1
-                log.warning(f"alra.R failed (attempt {attempts})")
-            else:
-                raise
-    # transform back into original space
-    Y = scprep.utils.matrix_transform(Y, np.expm1)
-    Y = scprep.utils.matrix_vector_elementwise_multiply(Y, libsize, axis=0)
-    adata.obsm["denoised"] = Y
+@_alra_method(
+    method_name="ALRA (log norm, reversed normalization)",
+)
+def alra_log_reversenorm(adata, test=False):
+    return _alra(adata, normtype="log", reverse_norm_order=True, test=test)
 
-    adata.uns["method_code_version"] = "1.0.0"
-    return adata
+
+@_alra_method(
+    method_name="ALRA (sqrt norm)",
+)
+def alra_sqrt(adata, test=False):
+    return _alra(adata, normtype="sqrt", reverse_norm_order=False, test=test)
+
+
+@_alra_method(
+    method_name="ALRA (log norm)",
+)
+def alra_log(adata, test=False):
+    return _alra(adata, normtype="log", reverse_norm_order=False, test=test)

From 9d1665076cf6215a31f89ed2be8be20a02502887 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Mon, 27 Feb 2023 11:16:09 -0500
Subject: [PATCH 259/266] normalize strings

---
 openproblems/api/hash.py | 10 +++---
 openproblems/data/tabula_muris_senis.py | 8 ++---
 openproblems/patch.py | 2 +-
 .../_common/datasets/immune.py | 7 ++--
 .../_common/datasets/lung.py | 6 ++--
 .../_common/datasets/pancreas.py | 7 ++--
 .../datasets/tnbc_wu2021.py | 11 ++++---
 .../datasets/allen_brain_atlas.py | 11 ++++---
 .../tasks/denoising/datasets/pancreas.py | 9 ++---
 openproblems/tasks/denoising/datasets/pbmc.py | 4 +--
 .../denoising/datasets/tabula_muris_senis.py | 8 +++--
 .../datasets/mouse_blood_olsson_labelled.py | 7 ++--
 .../datasets/mouse_hspc_nestorowa2016.py | 7 ++--
 .../datasets/tenx_5k_pbmc.py | 6 ++--
 .../datasets/zebrafish.py | 10 +++---
 .../tasks/label_projection/datasets/cengen.py | 18 +++++-----
 .../label_projection/datasets/pancreas.py | 33 ++++++++++---------
 .../datasets/tabula_muris_senis.py | 10 +++---
 .../label_projection/datasets/zebrafish.py | 22 +++++++------
 .../matching_modalities/datasets/citeseq.py | 8 +++--
 .../matching_modalities/datasets/scicar.py | 16 +++++----
 .../datasets/scicar.py | 8 +++--
.../datasets/tabula_muris_senis.py | 11 +++---- openproblems/utils.py | 4 +-- pyproject.toml | 7 ++++ setup.cfg | 5 --- test/test_core_tools.py | 2 +- test/test_core_utils.py | 4 +-- ...k_cell_cell_communication_source_target.py | 8 ++--- test/test_task_methods.py | 4 +-- test/utils/asserts.py | 5 +-- test/utils/docker.py | 8 ++--- workflow/parse_nextflow.py | 18 ++++++---- workflow/snakemake_tools.py | 6 ++-- 34 files changed, 175 insertions(+), 135 deletions(-) create mode 100644 pyproject.toml diff --git a/openproblems/api/hash.py b/openproblems/api/hash.py index bb1af2dd7c..3c5786ad44 100644 --- a/openproblems/api/hash.py +++ b/openproblems/api/hash.py @@ -46,8 +46,10 @@ def docker_token(image_name): [ "curl", "--silent", - f"https://auth.docker.io/token?scope=repository:{image_name}:" - "pull&service=registry.docker.io", + ( + f"https://auth.docker.io/token?scope=repository:{image_name}:" + "pull&service=registry.docker.io" + ), ] ) ) @@ -88,8 +90,8 @@ def docker_hash(image_name): return docker_labels_from_api(image_name)["bio.openproblems.hash"] except Exception: # pragma: nocover warnings.warn( - "Failed to access docker or the docker API; docker image hash failed. " - f"All jobs using {image_name} will not be cached." + "Failed to access docker or the docker API; docker image hash failed. All" + f" jobs using {image_name} will not be cached." ) return str(random.getrandbits(256)) diff --git a/openproblems/data/tabula_muris_senis.py b/openproblems/data/tabula_muris_senis.py index 78afe934d8..fd871744d2 100644 --- a/openproblems/data/tabula_muris_senis.py +++ b/openproblems/data/tabula_muris_senis.py @@ -29,8 +29,8 @@ def check_unknown_organs(datasets, organ_list): unknown_organs = set(organ_list) - known_organs if unknown_organs: raise ValueError( - f"Unknown organs provided in `organ_list': {', '.join(unknown_organs)}. " - f"Known organs are {', '.join(known_organs)}" + f"Unknown organs provided in `organ_list': {', '.join(unknown_organs)}." + f" Known organs are {', '.join(known_organs)}" ) @@ -107,8 +107,8 @@ def load_tabula_muris_senis(test=False, method_list=None, organ_list=None): unknown_methods = set(method_list) - set(["facs", "droplet"]) if unknown_methods: raise ValueError( - f"Unknown methods provided in `method_list': {','.join(unknown_methods)}. " - "Known methods are `facs' and `droplet'" + f"Unknown methods provided in `method_list': {','.join(unknown_methods)}." 
+ " Known methods are `facs' and `droplet'" ) datasets_path = f"/curation/v1/collections/{COLLECTION_ID}" diff --git a/openproblems/patch.py b/openproblems/patch.py index 326bc38f29..809e187c0d 100644 --- a/openproblems/patch.py +++ b/openproblems/patch.py @@ -52,7 +52,7 @@ def _download_aftp( if timeout: wget_command_list += ["-T", str(timeout)] - log.debug("Running: %s" % (" ".join(wget_command_list))) + log.debug("Running: %s" % " ".join(wget_command_list)) subprocess.call(wget_command_list) return tmp_path diff --git a/openproblems/tasks/_batch_integration/_common/datasets/immune.py b/openproblems/tasks/_batch_integration/_common/datasets/immune.py index efea9dee09..4732e9c3a3 100644 --- a/openproblems/tasks/_batch_integration/_common/datasets/immune.py +++ b/openproblems/tasks/_batch_integration/_common/datasets/immune.py @@ -9,9 +9,10 @@ dataset_name="Immune (by batch)", data_url=load_immune.metadata["data_url"], data_reference=load_immune.metadata["data_reference"], - dataset_summary="Human immune cells from peripheral blood and bone marrow " - "taken from 5 datasets comprising 10 batches across technologies (10X, " - "Smart-seq2).", + dataset_summary=( + "Human immune cells from peripheral blood and bone marrow taken from 5 datasets" + " comprising 10 batches across technologies (10X, Smart-seq2)." + ), image="openproblems", ) def immune_batch( diff --git a/openproblems/tasks/_batch_integration/_common/datasets/lung.py b/openproblems/tasks/_batch_integration/_common/datasets/lung.py index f5610e1f1d..0cdea36d46 100644 --- a/openproblems/tasks/_batch_integration/_common/datasets/lung.py +++ b/openproblems/tasks/_batch_integration/_common/datasets/lung.py @@ -9,8 +9,10 @@ dataset_name="Lung (Viera Braga et al.)", data_url=load_lung.metadata["data_url"], data_reference=load_lung.metadata["data_reference"], - dataset_summary="Human lung scRNA-seq data from 3 datasets with 32,472 cells. " - "From Vieira Braga et al. Technologies: 10X and Drop-seq.", + dataset_summary=( + "Human lung scRNA-seq data from 3 datasets with 32,472 cells. From Vieira Braga" + " et al. Technologies: 10X and Drop-seq." + ), image="openproblems", ) def lung_batch( diff --git a/openproblems/tasks/_batch_integration/_common/datasets/pancreas.py b/openproblems/tasks/_batch_integration/_common/datasets/pancreas.py index 7e9aa890cd..ff611ea1cc 100644 --- a/openproblems/tasks/_batch_integration/_common/datasets/pancreas.py +++ b/openproblems/tasks/_batch_integration/_common/datasets/pancreas.py @@ -9,9 +9,10 @@ dataset_name="Pancreas (by batch)", data_url=load_pancreas.metadata["data_url"], data_reference=load_pancreas.metadata["data_reference"], - dataset_summary="Human pancreatic islet scRNA-seq data from 6 datasets " - "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, " - "and SMARTER-seq).", + dataset_summary=( + "Human pancreatic islet scRNA-seq data from 6 datasets across technologies" + " (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, and SMARTER-seq)." 
+ ), image="openproblems", ) def pancreas_batch( diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/datasets/tnbc_wu2021.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/datasets/tnbc_wu2021.py index 76e5282886..e58d6241c0 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/datasets/tnbc_wu2021.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/datasets/tnbc_wu2021.py @@ -10,11 +10,12 @@ "Triple negative breast cancer atlas", data_url=load_tnbc_data.metadata["data_url"], data_reference=load_tnbc_data.metadata["data_reference"], - dataset_summary="Human breast cancer atlas (Wu et al., 2021), " - "with cytokine activities, inferred using a multivariate " - "linear model with cytokine-focused signatures, as assumed true " - "cell-cell communication (Dimitrov et al., 2022). " - "42512 cells x 28078 features with 29 cell types from 10 patients", + dataset_summary=( + "Human breast cancer atlas (Wu et al., 2021), with cytokine activities," + " inferred using a multivariate linear model with cytokine-focused signatures," + " as assumed true cell-cell communication (Dimitrov et al., 2022). 42512 cells" + " x 28078 features with 29 cell types from 10 patients" + ), image="openproblems-r-extras", ) def tnbc_data(test=False): diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py index 009f2bdc90..05c90cc6be 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/datasets/allen_brain_atlas.py @@ -7,11 +7,12 @@ "Mouse brain atlas", data_url=load_mouse_brain_atlas.metadata["data_url"], data_reference=load_mouse_brain_atlas.metadata["data_reference"], - dataset_summary="A murine brain atlas with adjacent cell types as assumed " - "benchmark truth, inferred from deconvolution proportion " - "correlations using matching 10x Visium slides " - "(see Dimitrov et al., 2022). " - "14249 cells x 34617 features with 23 cell type labels.", + dataset_summary=( + "A murine brain atlas with adjacent cell types as assumed benchmark truth," + " inferred from deconvolution proportion correlations using matching 10x Visium" + " slides (see Dimitrov et al., 2022). 14249 cells x 34617 features with 23 cell" + " type labels." + ), image="openproblems-r-extras", ) def mouse_brain_atlas(test=False): diff --git a/openproblems/tasks/denoising/datasets/pancreas.py b/openproblems/tasks/denoising/datasets/pancreas.py index bd039ad88e..187b2e4106 100644 --- a/openproblems/tasks/denoising/datasets/pancreas.py +++ b/openproblems/tasks/denoising/datasets/pancreas.py @@ -7,10 +7,11 @@ dataset_name="Pancreas (inDrop)", data_url=load_pancreas.metadata["data_url"], data_reference=load_pancreas.metadata["data_reference"], - dataset_summary="Human pancreatic islet scRNA-seq data from 6 datasets " - "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, " - "and SMARTER-seq). 
Here we just use the inDrop1 batch, which includes"
-    "1937 cells × 15502 genes.",
+    dataset_summary=(
+        "Human pancreatic islet scRNA-seq data from 6 datasets across technologies"
+        " (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, and SMARTER-seq). Here"
+        " we just use the inDrop1 batch, which includes 1937 cells × 15502 genes."
+    ),
     image="openproblems-python-pytorch",
 )
 def pancreas(test=False):
diff --git a/openproblems/tasks/denoising/datasets/pbmc.py b/openproblems/tasks/denoising/datasets/pbmc.py
index 440ebe8a3b..92ed2c94de 100644
--- a/openproblems/tasks/denoising/datasets/pbmc.py
+++ b/openproblems/tasks/denoising/datasets/pbmc.py
@@ -8,8 +8,8 @@
     data_url=load_tenx_1k_pbmc.metadata["data_url"],
     data_reference=load_tenx_1k_pbmc.metadata["data_reference"],
     dataset_summary=(
-        "1k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. "
-        "Sequenced on 10X v3 chemistry in November 2018 by 10X Genomics."
+        "1k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. Sequenced"
+        " on 10X v3 chemistry in November 2018 by 10X Genomics."
     ),
     image="openproblems-python-pytorch",
 )
diff --git a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py
index 9524cc4e95..8aca9661a2 100644
--- a/openproblems/tasks/denoising/datasets/tabula_muris_senis.py
+++ b/openproblems/tasks/denoising/datasets/tabula_muris_senis.py
@@ -7,9 +7,11 @@
     "Tabula Muris Senis Lung",
     data_url=load_tabula_muris_senis.metadata["data_url"],
     data_reference=load_tabula_muris_senis.metadata["data_reference"],
-    dataset_summary="All lung cells from Tabula Muris Senis, a 500k cell-atlas from 18 "
-    "organs and tissues across the mouse lifespan. Here we use just 10x data from lung."
-    " 24540 cells × 16160 genes across 3 time points.",
+    dataset_summary=(
+        "All lung cells from Tabula Muris Senis, a 500k cell-atlas from 18 organs and"
+        " tissues across the mouse lifespan. Here we use just 10x data from lung. 24540"
+        " cells × 16160 genes across 3 time points."
+    ),
     image="openproblems-python-pytorch",
 )
 def tabula_muris_senis_lung_random(test=False):
diff --git a/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py b/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py
index 9cbf278db0..23f6e31463 100644
--- a/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py
+++ b/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py
@@ -7,9 +7,10 @@
     "Mouse myeloid lineage differentiation",
     data_url=load_olsson_2016_mouse_blood.metadata["data_url"],
     data_reference=load_olsson_2016_mouse_blood.metadata["data_reference"],
-    dataset_summary="Myeloid lineage differentiation from mouse blood. "
-    "Sequenced by SMARTseq in 2016 by Olsson et al. "
-    "660 cells x 112815 features with 4 cell type labels",
+    dataset_summary=(
+        "Myeloid lineage differentiation from mouse blood. Sequenced by SMARTseq in"
+        " 2016 by Olsson et al.
660 cells x 112815 features with 4 cell type labels" + ), ) def olsson_2016_mouse_blood(test=False): adata = load_olsson_2016_mouse_blood(test=test) diff --git a/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py b/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py index 10f4d428cb..864783a568 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py @@ -7,9 +7,10 @@ "Mouse hematopoietic stem cell differentiation", data_url=load_mouse_hspc_nestorowa2016.metadata["data_url"], data_reference=load_mouse_hspc_nestorowa2016.metadata["data_reference"], - dataset_summary="1.6k hematopoietic stem and progenitor cells from mouse bone " - "marrow. Sequenced by Smart-seq2. " - "1920 cells x 43258 features with 3 cell type labels", + dataset_summary=( + "1.6k hematopoietic stem and progenitor cells from mouse bone marrow. Sequenced" + " by Smart-seq2. 1920 cells x 43258 features with 3 cell type labels" + ), ) def mouse_hspc_nestorowa2016(test=False): adata = load_mouse_hspc_nestorowa2016(test=test) diff --git a/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py b/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py index 2811fb58db..69dad9a9d3 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py @@ -8,9 +8,9 @@ data_url=load_tenx_5k_pbmc.metadata["data_url"], data_reference=load_tenx_5k_pbmc.metadata["data_reference"], dataset_summary=( - "5k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. " - "Sequenced on 10X v3 chemistry in July 2019 by 10X Genomics. " - "5247 cells x 20822 features with no cell type labels" + "5k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. Sequenced" + " on 10X v3 chemistry in July 2019 by 10X Genomics. 5247 cells x 20822 features" + " with no cell type labels" ), ) def tenx_5k_pbmc(test=False): diff --git a/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py b/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py index 1a3212a012..8de2a0db91 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py @@ -7,10 +7,12 @@ "Zebrafish", data_url=load_zebrafish.metadata["data_url"], data_reference=load_zebrafish.metadata["data_reference"], - dataset_summary="90k cells from zebrafish embryos throughout the first day of " - "development, with and without a knockout of chordin, an important developmental " - "gene. Dimensions: 26022 cells, 25258 genes. 24 cell types " - "(avg. 1084±1156 cells per cell type).", + dataset_summary=( + "90k cells from zebrafish embryos throughout the first day of development, with" + " and without a knockout of chordin, an important developmental gene." + " Dimensions: 26022 cells, 25258 genes. 24 cell types (avg. 1084±1156 cells per" + " cell type)." 
+ ), ) def zebrafish_labs(test=False): adata = load_zebrafish(test=test) diff --git a/openproblems/tasks/label_projection/datasets/cengen.py b/openproblems/tasks/label_projection/datasets/cengen.py index 6575688b3e..e53acd7b13 100644 --- a/openproblems/tasks/label_projection/datasets/cengen.py +++ b/openproblems/tasks/label_projection/datasets/cengen.py @@ -8,10 +8,11 @@ "CeNGEN (split by batch)", data_url=load_cengen.metadata["data_url"], data_reference=load_cengen.metadata["data_reference"], - dataset_summary="100k FACS-isolated C. elegans neurons from 17 experiments " - "sequenced on 10x Genomics. Split into train/test by experimental batch. " - "Dimensions: 100955 cells, 22469 genes. 169 cell types " - "(avg. 597±800 cells per cell type).", + dataset_summary=( + "100k FACS-isolated C. elegans neurons from 17 experiments sequenced on 10x" + " Genomics. Split into train/test by experimental batch. Dimensions: 100955" + " cells, 22469 genes. 169 cell types (avg. 597±800 cells per cell type)." + ), ) def cengen_batch(test=False): adata = load_cengen(test=test) @@ -32,10 +33,11 @@ def cengen_batch(test=False): "CeNGEN (random split)", data_url=load_cengen.metadata["data_url"], data_reference=load_cengen.metadata["data_reference"], - dataset_summary="100k FACS-isolated C. elegans neurons from 17 experiments " - "sequenced on 10x Genomics. Split into train/test randomly. " - "Dimensions: 100955 cells, 22469 genes. 169 cell types " - "avg. 597±800 cells per cell type).", + dataset_summary=( + "100k FACS-isolated C. elegans neurons from 17 experiments sequenced on 10x" + " Genomics. Split into train/test randomly. Dimensions: 100955 cells, 22469" + " genes. 169 cell types avg. 597±800 cells per cell type)." + ), ) def cengen_random(test=False): adata = load_cengen(test=test) diff --git a/openproblems/tasks/label_projection/datasets/pancreas.py b/openproblems/tasks/label_projection/datasets/pancreas.py index e303f9ac22..a84f8e8e4e 100644 --- a/openproblems/tasks/label_projection/datasets/pancreas.py +++ b/openproblems/tasks/label_projection/datasets/pancreas.py @@ -9,11 +9,12 @@ "Pancreas (by batch)", data_url=load_pancreas.metadata["data_url"], data_reference=load_pancreas.metadata["data_reference"], - dataset_summary="Human pancreatic islet scRNA-seq data from 6 datasets " - "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, " - "and SMARTER-seq). Split into train/test by experimental batch. " - "Dimensions: 16382 cells, 18771 genes. 14 cell types " - "(avg. 1170±1703 cells per cell type).", + dataset_summary=( + "Human pancreatic islet scRNA-seq data from 6 datasets across technologies" + " (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, and SMARTER-seq). Split" + " into train/test by experimental batch. Dimensions: 16382 cells, 18771 genes." + " 14 cell types (avg. 1170±1703 cells per cell type)." + ), ) def pancreas_batch(test=False): adata = load_pancreas(test=test) @@ -34,11 +35,12 @@ def pancreas_batch(test=False): "Pancreas (random split)", data_url=load_pancreas.metadata["data_url"], data_reference=load_pancreas.metadata["data_reference"], - dataset_summary="Human pancreatic islet scRNA-seq data from 6 datasets " - "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, " - "and SMARTER-seq). Split into train/test randomly. " - "Dimensions: 16382 cells, 18771 genes. 14 cell types " - "(avg. 
1170±1703 cells per cell type).", + dataset_summary=( + "Human pancreatic islet scRNA-seq data from 6 datasets across technologies" + " (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, and SMARTER-seq). Split" + " into train/test randomly. Dimensions: 16382 cells, 18771 genes. 14 cell types" + " (avg. 1170±1703 cells per cell type)." + ), ) def pancreas_random(test=False): adata = load_pancreas(test=test) @@ -57,11 +59,12 @@ def pancreas_random(test=False): "Pancreas (random split with label noise)", data_url=load_pancreas.metadata["data_url"], data_reference=load_pancreas.metadata["data_reference"], - dataset_summary="Human pancreatic islet scRNA-seq data from 6 datasets " - "across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, " - "and SMARTER-seq). Split into train/test randomly with 20% label noise. " - "Dimensions: 16382 cells, 18771 genes. 14 cell types " - "(avg. 1170±1703 cells per cell type).", + dataset_summary=( + "Human pancreatic islet scRNA-seq data from 6 datasets across technologies" + " (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, and SMARTER-seq). Split" + " into train/test randomly with 20% label noise. Dimensions: 16382 cells, 18771" + " genes. 14 cell types (avg. 1170±1703 cells per cell type)." + ), ) def pancreas_random_label_noise(test=False): adata = load_pancreas(test=test) diff --git a/openproblems/tasks/label_projection/datasets/tabula_muris_senis.py b/openproblems/tasks/label_projection/datasets/tabula_muris_senis.py index 3f927a83a8..62be5b43c1 100644 --- a/openproblems/tasks/label_projection/datasets/tabula_muris_senis.py +++ b/openproblems/tasks/label_projection/datasets/tabula_muris_senis.py @@ -8,10 +8,12 @@ "Tabula Muris Senis Lung (random split)", data_url=load_tabula_muris_senis.metadata["data_url"], data_reference=load_tabula_muris_senis.metadata["data_reference"], - dataset_summary="All lung cells from Tabula Muris Senis, a 500k cell-atlas from 18 " - "organs and tissues across the mouse lifespan. Split into train/test randomly. " - "Dimensions: 24540 cells, 17985 genes. 39 cell types " - "(avg. 629±999 cells per cell type).", + dataset_summary=( + "All lung cells from Tabula Muris Senis, a 500k cell-atlas from 18 organs and" + " tissues across the mouse lifespan. Split into train/test randomly." + " Dimensions: 24540 cells, 17985 genes. 39 cell types (avg. 629±999 cells per" + " cell type)." + ), ) def tabula_muris_senis_lung_random(test=False): adata = load_tabula_muris_senis( diff --git a/openproblems/tasks/label_projection/datasets/zebrafish.py b/openproblems/tasks/label_projection/datasets/zebrafish.py index ca1935b23a..b09409d223 100644 --- a/openproblems/tasks/label_projection/datasets/zebrafish.py +++ b/openproblems/tasks/label_projection/datasets/zebrafish.py @@ -8,11 +8,12 @@ "Zebrafish (by laboratory)", data_url=load_zebrafish.metadata["data_url"], data_reference=load_zebrafish.metadata["data_reference"], - dataset_summary="90k cells from zebrafish embryos throughout the first day of " - "development, with and without a knockout of chordin, an important developmental " - "gene. Split into train/test by laboratory. " - "Dimensions: 26022 cells, 25258 genes. 24 cell types " - "(avg. 1084±1156 cells per cell type).", + dataset_summary=( + "90k cells from zebrafish embryos throughout the first day of development, with" + " and without a knockout of chordin, an important developmental gene. Split" + " into train/test by laboratory. Dimensions: 26022 cells, 25258 genes. 24 cell" + " types (avg. 
1084±1156 cells per cell type)." + ), ) def zebrafish_labs(test=False): adata = load_zebrafish(test=test) @@ -26,11 +27,12 @@ def zebrafish_labs(test=False): "Zebrafish (random split)", data_url=load_zebrafish.metadata["data_url"], data_reference=load_zebrafish.metadata["data_reference"], - dataset_summary="90k cells from zebrafish embryos throughout the first day of " - "development, with and without a knockout of chordin, an important developmental " - "gene. Split into train/test randomly. " - "Dimensions: 26022 cells, 25258 genes. 24 cell types " - "(avg. 1084±1156 cells per cell type).", + dataset_summary=( + "90k cells from zebrafish embryos throughout the first day of development, with" + " and without a knockout of chordin, an important developmental gene. Split" + " into train/test randomly. Dimensions: 26022 cells, 25258 genes. 24 cell types" + " (avg. 1084±1156 cells per cell type)." + ), ) def zebrafish_random(test=False): adata = load_zebrafish(test=test) diff --git a/openproblems/tasks/matching_modalities/datasets/citeseq.py b/openproblems/tasks/matching_modalities/datasets/citeseq.py index 338423ef9e..5923b5eabb 100644 --- a/openproblems/tasks/matching_modalities/datasets/citeseq.py +++ b/openproblems/tasks/matching_modalities/datasets/citeseq.py @@ -6,9 +6,11 @@ "CITE-seq Cord Blood Mononuclear Cells", data_url=load_citeseq_cbmc.metadata["data_url"], data_reference=load_citeseq_cbmc.metadata["data_reference"], - dataset_summary="8k cord blood mononuclear cells sequenced by CITEseq, a multimodal" - " addition to the 10x scRNA-seq platform that allows simultaneous measurement of " - "RNA and protein.", + dataset_summary=( + "8k cord blood mononuclear cells sequenced by CITEseq, a multimodal addition to" + " the 10x scRNA-seq platform that allows simultaneous measurement of RNA and" + " protein." + ), ) def citeseq_cbmc(test=False): return load_citeseq_cbmc(test=test) diff --git a/openproblems/tasks/matching_modalities/datasets/scicar.py b/openproblems/tasks/matching_modalities/datasets/scicar.py index c3891fa12b..dddb373b29 100644 --- a/openproblems/tasks/matching_modalities/datasets/scicar.py +++ b/openproblems/tasks/matching_modalities/datasets/scicar.py @@ -7,9 +7,11 @@ "sciCAR Cell Lines", data_url=load_scicar_cell_lines.metadata["data_url"], data_reference=load_scicar_cell_lines.metadata["data_reference"], - dataset_summary="5k cells from a time-series of dexamethasone treatment sequenced " - "by sci-CAR, a combinatorial indexing-based co-assay that jointly profiles " - "chromatin accessibility and mRNA.", + dataset_summary=( + "5k cells from a time-series of dexamethasone treatment sequenced by sci-CAR, a" + " combinatorial indexing-based co-assay that jointly profiles chromatin" + " accessibility and mRNA." + ), ) def scicar_cell_lines(test=False): return load_scicar_cell_lines(test=test) @@ -19,9 +21,11 @@ def scicar_cell_lines(test=False): "sciCAR Mouse Kidney", data_url=load_scicar_mouse_kidney.metadata["data_url"], data_reference=load_scicar_cell_lines.metadata["data_reference"], - dataset_summary="11k cells from adult mouse kidney sequenced " - "by sci-CAR, a combinatorial indexing-based co-assay that jointly profiles " - "chromatin accessibility and mRNA.", + dataset_summary=( + "11k cells from adult mouse kidney sequenced by sci-CAR, a combinatorial" + " indexing-based co-assay that jointly profiles chromatin accessibility and" + " mRNA." 
+ ), ) def scicar_mouse_kidney(test=False): return load_scicar_mouse_kidney(test=test) diff --git a/openproblems/tasks/regulatory_effect_prediction/datasets/scicar.py b/openproblems/tasks/regulatory_effect_prediction/datasets/scicar.py index e5dff69ea6..32f12fd5cf 100644 --- a/openproblems/tasks/regulatory_effect_prediction/datasets/scicar.py +++ b/openproblems/tasks/regulatory_effect_prediction/datasets/scicar.py @@ -6,9 +6,11 @@ "sciCAR Mouse Kidney with cell clusters", data_url=scicar.load_scicar_mouse_kidney.metadata["data_url"], data_reference=scicar.load_scicar_mouse_kidney.metadata["data_reference"], - dataset_summary="11k cells from adult mouse kidney sequenced " - "by sci-CAR, a combinatorial indexing-based co-assay that jointly profiles " - "chromatin accessibility and mRNA.", + dataset_summary=( + "11k cells from adult mouse kidney sequenced by sci-CAR, a combinatorial" + " indexing-based co-assay that jointly profiles chromatin accessibility and" + " mRNA." + ), ) def scicar_mouse_kidney(test=False): adata = scicar.load_scicar_mouse_kidney(test=test) diff --git a/openproblems/tasks/spatial_decomposition/datasets/tabula_muris_senis.py b/openproblems/tasks/spatial_decomposition/datasets/tabula_muris_senis.py index 47b7f0eeda..79fda7200b 100644 --- a/openproblems/tasks/spatial_decomposition/datasets/tabula_muris_senis.py +++ b/openproblems/tasks/spatial_decomposition/datasets/tabula_muris_senis.py @@ -29,8 +29,7 @@ def _tabula_muris_senis(alpha, test, n_obs): @_tabula_muris_senis_dataset( "Tabula muris senis (alpha=1)", - dataset_summary="Mouse lung cells aggregated from single-cell" - " (Dirichlet alpha=1)", + dataset_summary="Mouse lung cells aggregated from single-cell (Dirichlet alpha=1)", ) def tabula_muris_senis_alpha_1(test=False, n_obs=100): return _tabula_muris_senis(alpha=1, test=test, n_obs=n_obs) @@ -38,8 +37,7 @@ def tabula_muris_senis_alpha_1(test=False, n_obs=100): @_tabula_muris_senis_dataset( "Tabula muris senis (alpha=5)", - dataset_summary="Mouse lung cells aggregated from single-cell" - " (Dirichlet alpha=5)", + dataset_summary="Mouse lung cells aggregated from single-cell (Dirichlet alpha=5)", ) def tabula_muris_senis_alpha_5(test=False, n_obs=100): return _tabula_muris_senis(alpha=5, test=test, n_obs=n_obs) @@ -47,8 +45,9 @@ def tabula_muris_senis_alpha_5(test=False, n_obs=100): @_tabula_muris_senis_dataset( "Tabula muris senis (alpha=0.5)", - dataset_summary="Mouse lung cells aggregated from single-cell" - " (Dirichlet alpha=0.5)", + dataset_summary=( + "Mouse lung cells aggregated from single-cell (Dirichlet alpha=0.5)" + ), ) def tabula_muris_senis_alpha_0_5(test=False, n_obs=100): return _tabula_muris_senis(alpha=0.5, test=test, n_obs=n_obs) diff --git a/openproblems/utils.py b/openproblems/utils.py index 3cbc56d356..4a9f35c5c0 100644 --- a/openproblems/utils.py +++ b/openproblems/utils.py @@ -18,8 +18,8 @@ def temporary(func, version=None, *args, **kwargs): raise TypeError("temporary() missing 1 required keyword argument: 'version'") if packaging.version.parse(__version__) >= packaging.version.parse(version): raise RuntimeError( - "Temporary function {}.{} is temporary and should not be used " - "after version {} (current version: {})".format( + "Temporary function {}.{} is temporary and should not be used after version" + " {} (current version: {})".format( func.__module__, func.__name__, version, __version__ ) ) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..e396c1f8ad --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,7 
@@ +[tool.isort] +profile = "black" +force_single_line = true +force_alphabetical_sort = true + +[tool.black] +experimental_string_processing = true diff --git a/setup.cfg b/setup.cfg index b80460ab9b..81cbb08f72 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,8 +19,3 @@ exclude = build, dist, Snakefile - -[isort] -profile = black -force_single_line = true -force_alphabetical_sort = true diff --git a/test/test_core_tools.py b/test/test_core_tools.py index cd45106482..560bcd8352 100644 --- a/test/test_core_tools.py +++ b/test/test_core_tools.py @@ -81,7 +81,7 @@ def test_cache(self): @parameterized.parameterized_class( - ("normalizer"), + "normalizer", [ (staticmethod(normalizer),) for normalizer in openproblems.utils.get_callable_members( diff --git a/test/test_core_utils.py b/test/test_core_utils.py index 0c3ec8b5ee..89fb9cfb24 100644 --- a/test/test_core_utils.py +++ b/test/test_core_utils.py @@ -32,8 +32,8 @@ def test_fn(): # pragma: nocover np.testing.assert_raises_regex( RuntimeError, - "Temporary function {}.{} is temporary and should not be used " - "after version {}".format(test_fn.__module__, test_fn.__name__, temp_version), + "Temporary function {}.{} is temporary and should not be used after version {}" + .format(test_fn.__module__, test_fn.__name__, temp_version), test_fn, ) diff --git a/test/test_task_cell_cell_communication_source_target.py b/test/test_task_cell_cell_communication_source_target.py index 40b07b047e..453ce9b370 100644 --- a/test/test_task_cell_cell_communication_source_target.py +++ b/test/test_task_cell_cell_communication_source_target.py @@ -28,8 +28,8 @@ def test_assert_is_subset(self): ) self.assertRaisesRegex( AssertionError, - r"test_subset is not a subset of test_superset\. " - "d missing from test_superset", + r"test_subset is not a subset of test_superset\. d missing from" + r" test_superset", common.api.assert_is_subset, ["a", "b", "c", "d"], ["a", "b", "c"], @@ -39,8 +39,8 @@ def test_assert_is_subset(self): ) self.assertRaisesRegex( AssertionError, - r"Allowed proportion \(0.24\) of missing test_subset elements exceeded " - r"\(0\.25\)\. d missing from test_superset", + r"Allowed proportion \(0.24\) of missing test_subset elements exceeded" + r" \(0\.25\)\. d missing from test_superset", common.api.assert_is_subset, ["a", "b", "c", "d"], ["a", "b", "c"], diff --git a/test/test_task_methods.py b/test/test_task_methods.py index 5e5ffd844a..78d2e69ce3 100644 --- a/test/test_task_methods.py +++ b/test/test_task_methods.py @@ -41,8 +41,8 @@ def test_method(task_name, method_name, image): # pragma: nocover assert task.api.check_method(adata, is_baseline=method.metadata["is_baseline"]) if "method_code_version" not in adata.uns: openproblems.utils.future_warning( - "Setting code_version in the method decorator is deprecated. " - "Store code version in `adata.uns['method_code_version']` instead.", + "Setting code_version in the method decorator is deprecated. 
Store code" + " version in `adata.uns['method_code_version']` instead.", error_version="1.0", error_category=TypeError, ) diff --git a/test/utils/asserts.py b/test/utils/asserts.py index 211374f9d5..bc4c11408b 100644 --- a/test/utils/asserts.py +++ b/test/utils/asserts.py @@ -6,8 +6,9 @@ import scipy.sparse _REQUEST_HEADERS = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) " - "Gecko/20100101 Firefox/71.0" + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0" + ) } FILEPATH = pathlib.Path(__file__) diff --git a/test/utils/docker.py b/test/utils/docker.py index 1c6476191f..974409923e 100644 --- a/test/utils/docker.py +++ b/test/utils/docker.py @@ -105,8 +105,8 @@ def image_requires_docker(image): else: if not docker_available(): raise RuntimeError( - "The Dockerfile for image {} is newer than the " - "latest push, but Docker is not available.".format(image) + "The Dockerfile for image {} is newer than the latest push, but Docker" + " is not available.".format(image) ) if docker_image_age(image) < git_file_age: import sys @@ -225,8 +225,8 @@ def run_image(image, script, *args, timeout=None, retries=0): if retries > 0 and not isinstance(e, exceptions.TimeoutError): time = "time" if retries == 1 else "times" warnings.warn( - f"Container failed with {type(e).__name__}. " - f"Retrying {retries} more {time}", + f"Container failed with {type(e).__name__}. Retrying {retries} more" + f" {time}", RuntimeWarning, ) retries -= 1 diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py index defab2fff1..e23217967b 100644 --- a/workflow/parse_nextflow.py +++ b/workflow/parse_nextflow.py @@ -337,8 +337,10 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): output = dict( name=dataset.metadata["dataset_name"], data_url=dataset.metadata["data_url"], - data_reference="https://openproblems.bio/" - f"bibliography#{dataset.metadata['data_reference']}", + data_reference=( + "https://openproblems.bio/" + f"bibliography#{dataset.metadata['data_reference']}" + ), headers=dict( names=["Rank", "Name", "Mean score"], fixed=["Name", "Paper", "Library"] ), @@ -359,12 +361,16 @@ def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): result = { "Name": method.metadata["method_name"], "Paper": method.metadata["paper_name"], - "Paper URL": "https://openproblems.bio/" - f"bibliography#{method.metadata['paper_reference']}", + "Paper URL": ( + "https://openproblems.bio/" + f"bibliography#{method.metadata['paper_reference']}" + ), "Year": method.metadata["paper_year"], "Library": method.metadata["code_url"], - "Implementation": "https://github.com/openproblems-bio/openproblems/" - f"blob/main/{method.__module__.replace('.', '/')}.py", + "Implementation": ( + "https://github.com/openproblems-bio/openproblems/" + f"blob/main/{method.__module__.replace('.', '/')}.py" + ), "Version": method_results["code_version"], "Runtime (min)": parse_time_to_min(method_results["realtime"]), "CPU (%)": float(method_results["%cpu"].replace("%", "")), diff --git a/workflow/snakemake_tools.py b/workflow/snakemake_tools.py index 02d327dc55..81b5036ee6 100644 --- a/workflow/snakemake_tools.py +++ b/workflow/snakemake_tools.py @@ -173,9 +173,9 @@ def docker_image_age(image, pull_on_error=True): return docker_image_age(image, pull_on_error=False) elif date_string == "": warnings.warn( - "Docker image singlecellopenproblems/{} not found; " - "assuming needs rebuild. 
If you think this message is in error, " - "you can fix this by running `snakemake -j 1 docker_pull`".format(image) + "Docker image singlecellopenproblems/{} not found; assuming needs" + " rebuild. If you think this message is in error, you can fix this by" + " running `snakemake -j 1 docker_pull`".format(image) ) return -1 else: From 0f3736bda88cad0c5c609a98dc2114f0e72f84a1 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 28 Feb 2023 08:54:31 -0500 Subject: [PATCH 260/266] Precompute nn ranking for DR datasets (#841) * precompute nn ranking on full data * bugfix * bugfix2 * fix sample_dataset * just subsample it * Remove nan check --- .../tasks/dimensionality_reduction/README.md | 4 +- .../tasks/dimensionality_reduction/_utils.py | 27 +++++ .../tasks/dimensionality_reduction/api.py | 2 + .../datasets/mouse_blood_olsson_labelled.py | 5 +- .../datasets/mouse_hspc_nestorowa2016.py | 5 +- .../datasets/tenx_5k_pbmc.py | 5 +- .../datasets/zebrafish.py | 10 +- .../metrics/nn_ranking.py | 104 +++++------------- 8 files changed, 81 insertions(+), 81 deletions(-) create mode 100644 openproblems/tasks/dimensionality_reduction/_utils.py diff --git a/openproblems/tasks/dimensionality_reduction/README.md b/openproblems/tasks/dimensionality_reduction/README.md index df07de392c..cc998a0255 100644 --- a/openproblems/tasks/dimensionality_reduction/README.md +++ b/openproblems/tasks/dimensionality_reduction/README.md @@ -48,7 +48,9 @@ WARNING: other than most tasks, `adata.X` should contain log CP10k-normalized da highly on these metrics. **Datasets** should provide *log CP10k normalized counts* in `adata.X` and store the -original number of genes (i.e., `adata.shape[1]`) in `adata.uns["n_genes"]`. +original number of genes (i.e., `adata.shape[1]`) in `adata.uns["n_genes"]`. Datasets +should also contain the nearest-neighbor ranking matrix, required for the `nn_ranking` +metrics, as computed by `_utils.ranking_matrix(adata.X)` on normalized counts. **Methods** should assign dimensionally-reduced 2D embedding coordinates to `adata.obsm['X_emb']`. They *should not* modify the dimensionality of `adata.X` (e.g. diff --git a/openproblems/tasks/dimensionality_reduction/_utils.py b/openproblems/tasks/dimensionality_reduction/_utils.py new file mode 100644 index 0000000000..701621b1fb --- /dev/null +++ b/openproblems/tasks/dimensionality_reduction/_utils.py @@ -0,0 +1,27 @@ +from numba import njit + +import numpy as np + + +@njit(cache=True, fastmath=True) +def _ranking_matrix(D: np.ndarray) -> np.ndarray: # pragma: no cover + assert D.shape[0] == D.shape[1] + R = np.zeros(D.shape) + m = len(R) + ks = np.arange(m) + + for i in range(m): + for j in range(m): + R[i, j] = np.sum( + (D[i, :] < D[i, j]) | ((ks < j) & (np.abs(D[i, :] - D[i, j]) <= 1e-12)) + ) + + return R + + +def ranking_matrix(X): + from sklearn.metrics import pairwise_distances + + D = pairwise_distances(X) + R = _ranking_matrix(D) + return R diff --git a/openproblems/tasks/dimensionality_reduction/api.py b/openproblems/tasks/dimensionality_reduction/api.py index 7ec44c1676..4fff9852a8 100644 --- a/openproblems/tasks/dimensionality_reduction/api.py +++ b/openproblems/tasks/dimensionality_reduction/api.py @@ -1,6 +1,7 @@ from ...data.sample import load_sample_data from ...tools.decorators import dataset from ...tools.normalize import log_cp10k +from . 
import _utils import numpy as np @@ -31,6 +32,7 @@ def sample_dataset(): adata = load_sample_data() adata = log_cp10k(adata) adata.uns["n_genes"] = adata.shape[1] + adata.obsm["X_ranking"] = _utils.ranking_matrix(adata.X) return adata diff --git a/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py b/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py index 23f6e31463..2d916fdd96 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/mouse_blood_olsson_labelled.py @@ -1,6 +1,7 @@ from ....data.mouse_blood_olsson_labelled import load_olsson_2016_mouse_blood from ....tools.decorators import dataset from ....tools.normalize import log_cp10k +from .._utils import ranking_matrix @dataset( @@ -15,4 +16,6 @@ def olsson_2016_mouse_blood(test=False): adata = load_olsson_2016_mouse_blood(test=test) adata.uns["n_genes"] = adata.shape[1] - return log_cp10k(adata) + adata = log_cp10k(adata) + adata.obsm["X_ranking"] = ranking_matrix(adata.X) + return adata diff --git a/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py b/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py index 864783a568..8e1bc3c15b 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/mouse_hspc_nestorowa2016.py @@ -1,6 +1,7 @@ from ....data.mouse_hspc_nestorowa2016 import load_mouse_hspc_nestorowa2016 from ....tools.decorators import dataset from ....tools.normalize import log_cp10k +from .._utils import ranking_matrix @dataset( @@ -15,4 +16,6 @@ def mouse_hspc_nestorowa2016(test=False): adata = load_mouse_hspc_nestorowa2016(test=test) adata.uns["n_genes"] = adata.shape[1] - return log_cp10k(adata) + adata = log_cp10k(adata) + adata.obsm["X_ranking"] = ranking_matrix(adata.X) + return adata diff --git a/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py b/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py index 69dad9a9d3..a18e67a719 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/tenx_5k_pbmc.py @@ -1,6 +1,7 @@ from ....data.tenx import load_tenx_5k_pbmc from ....tools.decorators import dataset from ....tools.normalize import log_cp10k +from .._utils import ranking_matrix @dataset( @@ -16,4 +17,6 @@ def tenx_5k_pbmc(test=False): adata = load_tenx_5k_pbmc(test=test) adata.uns["n_genes"] = adata.shape[1] - return log_cp10k(adata) + adata = log_cp10k(adata) + adata.obsm["X_ranking"] = ranking_matrix(adata.X) + return adata diff --git a/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py b/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py index 8de2a0db91..369a589c53 100644 --- a/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py +++ b/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py @@ -1,6 +1,7 @@ from ....data.zebrafish import load_zebrafish from ....tools.decorators import dataset from ....tools.normalize import log_cp10k +from .._utils import ranking_matrix @dataset( @@ -15,6 +16,13 @@ ), ) def zebrafish_labs(test=False): + import scanpy as sc + adata = load_zebrafish(test=test) + if not test: + # this dataset is too big + sc.pp.subsample(adata, n_obs=25000) adata.uns["n_genes"] = adata.shape[1] - return log_cp10k(adata) + 
adata = log_cp10k(adata)
+    adata.obsm["X_ranking"] = ranking_matrix(adata.X)
+    return adata
diff --git a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py
index 684eb6f1a0..b13fe93ac6 100644
--- a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py
+++ b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py
@@ -15,6 +15,7 @@
 """

 from ....tools.decorators import metric
+from .._utils import ranking_matrix
 from anndata import AnnData
 from numba import njit
 from typing import Tuple
@@ -33,22 +34,6 @@
 _K = 30


-@njit(cache=True, fastmath=True)
-def _ranking_matrix(D: np.ndarray) -> np.ndarray:  # pragma: no cover
-    assert D.shape[0] == D.shape[1]
-    R = np.zeros(D.shape)
-    m = len(R)
-    ks = np.arange(m)
-
-    for i in range(m):
-        for j in range(m):
-            R[i, j] = np.sum(
-                (D[i, :] < D[i, j]) | ((ks < j) & (np.abs(D[i, :] - D[i, j]) <= 1e-12))
-            )
-
-    return R
-
-
 @njit(cache=True, fastmath=True)
 def _coranking_matrix(R1: np.ndarray, R2: np.ndarray) -> np.ndarray:  # pragma: no cover
     assert R1.shape == R2.shape
@@ -63,22 +48,6 @@ def _coranking_matrix(R1: np.ndarray, R2: np.ndarray) -> np.ndarray:  # pragma:
     return Q


-@njit(cache=True, fastmath=True)
-def _trustworthiness(Q: np.ndarray, m: int) -> np.ndarray:  # pragma: no cover
-
-    T = np.zeros(m - 1)  # trustworthiness
-
-    for k in range(m - 1):
-        Qs = Q[k:, :k]
-        # a column vector of weights. weight = rank error = actual_rank - k
-        W = np.arange(Qs.shape[0]).reshape(-1, 1)
-        # 1 - normalized hard-k-intrusions. lower-left region.
-        # weighted by rank error (rank - k)
-        T[k] = 1 - np.sum(Qs * W) / ((k + 1) * m * (m - 1 - k))
-
-    return T
-
-
 @njit(cache=True, fastmath=True)
 def _continuity(Q: np.ndarray, m: int) -> np.ndarray:  # pragma: no cover

@@ -133,65 +102,38 @@ def _qnn_auc(QNN: np.ndarray) -> float:
     return AUC  # type: ignore


-def _metrics(
-    Q: np.ndarray,
-) -> Tuple[np.ndarray, np.ndarray, np.ndarray, float, np.ndarray, int, float, float]:
-    Q = Q[1:, 1:]
-    m = len(Q)
-
-    T = _trustworthiness(Q, m)
-    C = _continuity(Q, m)
-    QNN = _qnn(Q, m)
-    LCMC = _lcmc(QNN, m)
-    kmax = _kmax(LCMC)
-    Qlocal = _q_local(QNN, kmax)
-    Qglobal = _q_global(QNN, kmax, m)
-    AUC = _qnn_auc(QNN)
-
-    return T, C, QNN, AUC, LCMC, kmax, Qlocal, Qglobal
-
-
-def _high_dim(adata: AnnData) -> np.ndarray:
-    from scipy.sparse import issparse
-
-    high_dim = adata.X
-    return high_dim.A if issparse(high_dim) else high_dim
-
+def _fit(adata: AnnData) -> Tuple[np.ndarray, int]:
+    Rx = adata.obsm["X_ranking"]
+    E = adata.obsm["X_emb"]

-def _fit(
-    X: np.ndarray, E: np.ndarray
-) -> Tuple[float, float, float, float, float, float, float]:
-    from sklearn.metrics import pairwise_distances
-
-    if np.any(np.isnan(E)):
-        return 0.0, 0.0, 0.0, 0.5, -np.inf, -np.inf, -np.inf
-
-    Dx = pairwise_distances(X)
-    De = pairwise_distances(E)
-    Rx, Re = _ranking_matrix(Dx), _ranking_matrix(De)
+    Re = ranking_matrix(E)
     Q = _coranking_matrix(Rx, Re)
+    Q = Q[1:, 1:]
+    m = len(Q)

-    T, C, QNN, AUC, LCMC, _kmax, Qlocal, Qglobal = _metrics(Q)
-
-    return T[_K], C[_K], QNN[_K], AUC, LCMC[_K], Qlocal, Qglobal
+    return Q, m


 @metric("continuity", paper_reference="zhang2021pydrmetrics", maximize=True)
 def continuity(adata: AnnData) -> float:
-    _, C, _, *_ = _fit(_high_dim(adata), adata.obsm["X_emb"])
+    Q, m = _fit(adata)
+    C = _continuity(Q, m)[_K]
     return float(np.clip(C, 0.0, 1.0))  # in [0, 1]


 @metric("co-KNN size", paper_reference="zhang2021pydrmetrics", maximize=True)
 def 
qnn(adata: AnnData) -> float: - _, _, QNN, *_ = _fit(_high_dim(adata), adata.obsm["X_emb"]) + Q, m = _fit(adata) + QNN = _qnn(Q, m)[_K] # normalized in the code to [0, 1] return float(np.clip(QNN, 0.0, 1.0)) @metric("co-KNN AUC", paper_reference="zhang2021pydrmetrics", maximize=True) def qnn_auc(adata: AnnData) -> float: - _, _, _, AUC, *_ = _fit(_high_dim(adata), adata.obsm["X_emb"]) + Q, m = _fit(adata) + QNN = _qnn(Q, m) + AUC = _qnn_auc(QNN) return float(np.clip(AUC, 0.5, 1.0)) # in [0.5, 1] @@ -201,7 +143,9 @@ def qnn_auc(adata: AnnData) -> float: maximize=True, ) def lcmc(adata: AnnData) -> float: - *_, LCMC, _, _ = _fit(_high_dim(adata), adata.obsm["X_emb"]) + Q, m = _fit(adata) + QNN = _qnn(Q, m) + LCMC = _lcmc(QNN, m)[_K] return LCMC @@ -209,11 +153,19 @@ def lcmc(adata: AnnData) -> float: def qlocal(adata: AnnData) -> float: # according to authors, this is usually preferred to # qglobal, because human are more sensitive to nearer neighbors - *_, Qlocal, _ = _fit(_high_dim(adata), adata.obsm["X_emb"]) + Q, m = _fit(adata) + QNN = _qnn(Q, m) + LCMC = _lcmc(QNN, m) + kmax = _kmax(LCMC) + Qlocal = _q_local(QNN, kmax) return Qlocal @metric("global property", paper_reference="zhang2021pydrmetrics", maximize=True) def qglobal(adata: AnnData) -> float: - *_, Qglobal = _fit(_high_dim(adata), adata.obsm["X_emb"]) + Q, m = _fit(adata) + QNN = _qnn(Q, m) + LCMC = _lcmc(QNN, m) + kmax = _kmax(LCMC) + Qglobal = _q_global(QNN, kmax, m) return Qglobal From b3456fd73c04c28516f6df34c57e6e3e8b0dab32 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 28 Feb 2023 09:19:56 -0500 Subject: [PATCH 261/266] Add method and metric descriptions (#810) * add method and metric summaries * Update auprc.py --- main.bib | 13 +++ .../_common/methods/baseline.py | 42 +++++----- .../batch_integration_embed/README.md | 31 ------- .../methods/baseline.py | 24 +++--- .../metrics/cc_score.py | 4 + .../metrics/iso_label_sil.py | 5 ++ .../batch_integration_embed/metrics/kBET.py | 6 ++ .../batch_integration_embed/metrics/pcr.py | 6 ++ .../metrics/sil_batch.py | 5 ++ .../metrics/silhouette.py | 4 + .../batch_integration_feature/README.md | 11 --- .../metrics/hvg_conservation.py | 4 + .../batch_integration_graph/README.md | 24 ------ .../methods/baseline.py | 13 ++- .../batch_integration_graph/methods/bbknn.py | 6 ++ .../batch_integration_graph/methods/combat.py | 14 +++- .../methods/fastmnn.py | 7 ++ .../methods/harmony.py | 12 ++- .../batch_integration_graph/methods/liger.py | 12 ++- .../batch_integration_graph/methods/mnn.py | 11 ++- .../batch_integration_graph/methods/scalex.py | 12 ++- .../methods/scanorama.py | 11 ++- .../batch_integration_graph/methods/scanvi.py | 10 ++- .../batch_integration_graph/methods/scvi.py | 3 + .../batch_integration_graph/metrics/ari.py | 5 ++ .../metrics/graph_connectivity.py | 5 ++ .../metrics/iso_label_f1.py | 5 ++ .../batch_integration_graph/metrics/nmi.py | 4 + .../tasks/_cell_cell_communication/README.md | 15 ---- .../_cell_cell_communication/_common/api.py | 4 +- .../_common/methods/baseline.py | 23 +++-- .../_common/methods/liana.py | 83 +++++++++++++------ .../_common/metrics/auprc.py | 5 +- .../_common/metrics/odds_ratio.py | 13 ++- .../README.md | 15 ---- .../README.md | 15 ---- openproblems/tasks/denoising/README.md | 10 --- openproblems/tasks/denoising/methods/alra.py | 16 +++- .../tasks/denoising/methods/baseline.py | 18 ++-- openproblems/tasks/denoising/methods/dca.py | 8 ++ 
.../tasks/denoising/methods/knn_smoothing.py | 18 +++- openproblems/tasks/denoising/methods/magic.py | 33 ++++++-- openproblems/tasks/denoising/metrics/mse.py | 5 ++ .../tasks/denoising/metrics/poisson.py | 4 + .../tasks/dimensionality_reduction/README.md | 19 ----- .../methods/baseline.py | 37 +++++---- .../methods/diffusion_map.py | 7 ++ .../methods/neuralee.py | 21 +++-- .../dimensionality_reduction/methods/pca.py | 14 +++- .../dimensionality_reduction/methods/phate.py | 15 +++- .../dimensionality_reduction/methods/pymde.py | 7 ++ .../dimensionality_reduction/methods/tsne.py | 14 +++- .../dimensionality_reduction/methods/umap.py | 23 ++++- .../metrics/density.py | 6 +- .../metrics/distance_correlation.py | 22 +++-- .../metrics/nn_ranking.py | 45 ++++++++-- .../metrics/trustworthiness.py | 4 + openproblems/tasks/label_projection/README.md | 14 ---- .../label_projection/methods/baseline.py | 32 ++++--- .../methods/knn_classifier.py | 15 +++- .../methods/logistic_regression.py | 12 ++- .../tasks/label_projection/methods/mlp.py | 15 +++- .../label_projection/methods/scvi_tools.py | 22 ++++- .../tasks/label_projection/methods/seurat.py | 9 ++ .../tasks/label_projection/methods/xgboost.py | 6 ++ .../label_projection/metrics/accuracy.py | 7 +- .../tasks/label_projection/metrics/f1.py | 20 ++++- .../tasks/matching_modalities/README.md | 12 --- .../matching_modalities/methods/baseline.py | 26 +++--- .../methods/harmonic_alignment.py | 8 ++ .../tasks/matching_modalities/methods/mnn.py | 14 +++- .../matching_modalities/methods/procrustes.py | 13 ++- .../matching_modalities/metrics/knn_auc.py | 6 ++ .../tasks/matching_modalities/metrics/mse.py | 4 + .../methods/baseline.py | 21 ++--- .../methods/beta.py | 14 +++- .../metrics/correlation.py | 8 ++ .../tasks/spatial_decomposition/README.md | 15 ---- .../datasets/destvi/generate.py | 16 ++-- .../spatial_decomposition/methods/baseline.py | 23 +++-- .../methods/cell2location.py | 5 ++ .../spatial_decomposition/methods/destvi.py | 11 ++- .../spatial_decomposition/methods/nmfreg.py | 12 ++- .../spatial_decomposition/methods/nnls.py | 18 ++-- .../spatial_decomposition/methods/rctd.py | 7 ++ .../spatial_decomposition/methods/seuratv3.py | 4 + .../methods/stereoscope.py | 12 ++- .../spatial_decomposition/methods/tangram.py | 12 ++- .../methods/vanillanmf.py | 20 ++++- .../tasks/spatial_decomposition/metrics/r2.py | 13 ++- openproblems/tools/decorators.py | 23 ++++- test/test_core_metadata.py | 14 +++- 92 files changed, 862 insertions(+), 449 deletions(-) diff --git a/main.bib b/main.bib index 02356c12c2..5bce346a17 100644 --- a/main.bib +++ b/main.bib @@ -22,6 +22,19 @@ @article{agrawal2021mde doi = {10.1561/2200000090}, url = {https://doi.org/10.1561/2200000090}, } +@article{aliee2021autogenes, + title = {{AutoGeneS}: Automatic gene selection using multi-objective optimization for {RNA}-seq deconvolution}, + author = {Hananeh Aliee and Fabian J. 
Theis}, + year = {2021}, + month = jul, + journal = {Cell Systems}, + publisher = {Elsevier {BV}}, + volume = {12}, + number = {7}, + pages = {706--715.e4}, + doi = {10.1016/j.cels.2021.05.006}, + url = {https://doi.org/10.1016/j.cels.2021.05.006}, +} @article{andersson2020single, title = {Single-cell and spatial transcriptomics enables probabilistic inference of cell type topography}, author = {Alma Andersson and Joseph Bergenstr{\aa}hle and Michaela Asp and Ludvig Bergenstr{\aa}hle and Aleksandra Jurek and Jos{\'{e}} Fern{\'{a}}ndez Navarro and Joakim Lundeberg}, diff --git a/openproblems/tasks/_batch_integration/_common/methods/baseline.py b/openproblems/tasks/_batch_integration/_common/methods/baseline.py index 6e2fe07d0b..e46a6dda05 100644 --- a/openproblems/tasks/_batch_integration/_common/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/_common/methods/baseline.py @@ -1,7 +1,6 @@ -from .....tools.decorators import method +from .....tools.decorators import baseline_method from .....tools.utils import check_version -import functools import numpy as np @@ -47,18 +46,12 @@ def _random_embedding(partition, jitter=0.01): return embedding -_baseline_method = functools.partial( - method, - paper_name="Open Problems for Single Cell Analysis", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, -) - - -@_baseline_method( +@baseline_method( method_name="No Integration", + method_summary=( + "Cells are embedded by PCA on the unintegrated data. A graph is built on this" + " PCA embedding." + ), ) def no_integration(adata, test=False): adata.obsp["connectivities"] = adata.obsp["uni_connectivities"] @@ -69,8 +62,12 @@ def no_integration(adata, test=False): return adata -@_baseline_method( +@baseline_method( method_name="Random Integration", + method_summary=( + "Feature values, embedding coordinates, and graph connectivity are all randomly" + " permuted" + ), ) def random_integration(adata, test=False): adata.X = _randomize_features(adata.X) @@ -80,13 +77,12 @@ def random_integration(adata, test=False): return adata -@_baseline_method( +@baseline_method( method_name="Random Integration by Celltype", - paper_name="Random Integration by Celltype (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, + method_summary=( + "Feature values, embedding coordinates, and graph connectivity are all randomly" + " permuted within each celltype label" + ), ) def celltype_random_integration(adata, test=False): adata.obsm["X_emb"] = _randomize_features( @@ -101,8 +97,12 @@ def celltype_random_integration(adata, test=False): return adata -@_baseline_method( +@baseline_method( method_name="Random Integration by Batch", + method_summary=( + "Feature values, embedding coordinates, and graph connectivity are all randomly" + " permuted within each batch label" + ), ) def batch_random_integration(adata, test=False): adata.obsm["X_emb"] = _randomize_features( diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/README.md b/openproblems/tasks/_batch_integration/batch_integration_embed/README.md index ec89a66a98..0e3d1bcc93 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/README.md @@ -20,37 +20,6 @@ This sub-task was taken from a [benchmarking study of data integration 
methods](https://openproblems.bio/bibliography#luecken2022benchmarking). -## The metrics - -Metrics for batch integration (embed) measure how well batches are mixed while -biological signals are preserved. They are divided into batch correction and biological -variance conservation metrics. - -### Batch correction - -* **kBET**: kBET determines whether the label composition of a k nearest neighborhood of -a cell is similar to the expected (global) label composition -([Buettner et al., Nat Meth 2019](https://openproblems.bio/bibliography#bttner2018test)). - The test is repeated for a random subset of cells, -and the results are summarized as a rejection rate over all tested neighborhoods. -* **Silhouette batch score**: The absolute silhouette width is computed over batch -labels per cell. As 0 then indicates that batches are well mixed and any deviation from -0 indicates a batch effect, we use the 1-abs(ASW) to map the score to the scale [0;1]. -* **Principal component regression (PC regression)**: This compare the explained -variance by batch before and after integration. It returns a score between 0 and 1 -(scaled=True) with 0 if the variance contribution hasn’t changed. The larger the score, -the more different the variance contributions are before and after integration. - -### Biological variance conservation - -* **Cell cycle score**: The cell-cycle conservation score evaluates how well the -cell-cycle effect can be captured before and after integration. -* **Isolated label silhouette**: This score evaluates the compactness for the label(s) -that is(are) shared by fewest batches. It indicates how well rare cell types can be -preserved after integration. -* **Cell type ASW**: The absolute silhouette with is computed on cell identity labels, -measuring their compactness. - ## API WARNING: other than most tasks, `adata.X` should contain log-normalized data. 
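For context before the baseline diffs that follow: these methods embed cells as a one-hot encoding of their celltype labels, optionally with a small amount of jitter. A minimal sketch of that idea, assuming a pandas Series of labels (`one_hot_embedding` is a hypothetical helper for illustration only; the repository's `_random_embedding` may differ in details such as scaling or column order):

```python
import numpy as np
import pandas as pd


def one_hot_embedding(labels, jitter=0.01, seed=0):
    """Embed cells as a (jittered) one-hot encoding of their labels."""
    rng = np.random.default_rng(seed)
    # one column per unique label; 1.0 where the cell carries that label
    embedding = pd.get_dummies(labels).to_numpy(dtype=float)
    if jitter is not None:
        # small uniform noise spreads out identically-labelled cells
        embedding += rng.uniform(-jitter, jitter, size=embedding.shape)
    return embedding


# usage sketch: adata.obsm["X_emb"] = one_hot_embedding(adata.obs["labels"])
```

The jitter keeps cells of the same label from collapsing onto a single point, which would otherwise make neighborhood-based metrics degenerate.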
diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py index 18a0830f5b..50148d94fd 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py @@ -1,23 +1,17 @@ -from .....tools.decorators import method +from .....tools.decorators import baseline_method from .....tools.utils import check_version from ..._common.methods.baseline import _random_embedding -import functools import numpy as np import scanpy as sc -_baseline_method = functools.partial( - method, - paper_name="Open Problems for Single Cell Analysis", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, -) - -@_baseline_method( +@baseline_method( method_name="Random Embedding by Celltype (with jitter)", + method_summary=( + "Cells are embedded as a one-hot encoding of celltype labels, with a small" + " amount of random noise added to the embedding" + ), ) def celltype_random_embedding_jitter(adata, test=False): adata.obsm["X_emb"] = _random_embedding(partition=adata.obs["labels"], jitter=0.01) @@ -25,8 +19,9 @@ def celltype_random_embedding_jitter(adata, test=False): return adata -@_baseline_method( +@baseline_method( method_name="Random Embedding by Celltype", + method_summary="Cells are embedded as a one-hot encoding of celltype labels", ) def celltype_random_embedding(adata, test=False): adata.obsm["X_emb"] = _random_embedding(partition=adata.obs["labels"], jitter=None) @@ -34,8 +29,9 @@ def celltype_random_embedding(adata, test=False): return adata -@_baseline_method( +@baseline_method( method_name="No Integration by Batch", + method_summary="Cells are embedded by computing PCA independently on each batch", ) def no_integration_batch(adata, test=False): """Compute PCA independently on each batch diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py index 4cf7650542..616be47861 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py @@ -21,6 +21,10 @@ @metric( metric_name="Cell Cycle Score", + metric_summary=( + "The cell-cycle conservation score evaluates how well the cell-cycle effect can" + " be captured before and after integration." + ), paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py index 746e5851fe..617e2db6fa 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py @@ -14,6 +14,11 @@ @metric( metric_name="Isolated label Silhouette", + metric_summary=( + "This score evaluates the compactness for the label(s) that is(are) shared by" + " fewest batches. It indicates how well rare cell types can be preserved after" + " integration." 
+    ),
     paper_reference="luecken2022benchmarking",
     maximize=True,
     image="openproblems-r-pytorch",
diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/kBET.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/kBET.py
index 1655a43c9a..9da1f03e20 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/kBET.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/kBET.py
@@ -26,6 +26,12 @@
 @metric(
     metric_name="kBET",
+    metric_summary=(
+        "kBET determines whether the label composition of a k nearest neighborhood of a"
+        " cell is similar to the expected (global) label composition. The test is"
+        " repeated for a random subset of cells, and the results are summarized as a"
+        " rejection rate over all tested neighborhoods."
+    ),
     paper_reference="bttner2018test",
     maximize=True,
     image="openproblems-r-extras",
diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py
index d2a6c011ae..5553754372 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py
@@ -18,6 +18,12 @@
 @metric(
     metric_name="PC Regression",
+    metric_summary=(
+        "This compares the explained variance by batch before and after integration. It"
+        " returns a score between 0 and 1 (scaled=True) with 0 if the variance"
+        " contribution hasn’t changed. The larger the score, the more different the"
+        " variance contributions are before and after integration."
+    ),
     paper_reference="luecken2022benchmarking",
     maximize=True,
     image="openproblems-r-pytorch",
diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py
index efe8775252..45cf2d2f9e 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py
@@ -23,6 +23,11 @@
 @metric(
     metric_name="Batch ASW",
+    metric_summary=(
+        "The absolute silhouette width is computed over batch labels per cell. As 0"
+        " indicates that batches are well mixed and any deviation from 0 indicates a"
+        " batch effect, we use 1-abs(ASW) to map the score to the scale [0;1]."
+    ),
     paper_reference="luecken2022benchmarking",
     maximize=True,
     image="openproblems-r-pytorch",
diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py
index 6275a08927..3b2afb4b0a 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py
@@ -11,6 +11,10 @@
 @metric(
     metric_name="Silhouette",
+    metric_summary=(
+        "The absolute silhouette width is computed on cell identity labels, measuring"
+        " their compactness."
+ ), paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md index 16f331355e..bad7e1499e 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md @@ -18,17 +18,6 @@ for: This sub-task was taken from a [benchmarking study of data integration methods](https://openproblems.bio/bibliography#luecken2022benchmarking). -## The metrics - -Metrics for batch integration (feature) measure how well feature-level information is -batch corrected. This is only done on by capturing biological variance conservation. -Further metrics for batch correction and biological variance conservation that are -calculated on lower dimensional feature spaces extrapolated from corrected feature -outputs can be found in the batch integration embed and graph tasks. - -* **HVG conservation**: This metric computes the average percentage of overlapping -highly variable genes per batch before and after integration. - ## API WARNING: other than most tasks, `adata.X` should contain log-normalized data. diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py index 5f1160d2b9..df23837d91 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py @@ -20,6 +20,10 @@ @metric( metric_name="HVG conservation", + metric_summary=( + "This metric computes the average percentage of overlapping highly variable" + " genes per batch before and after integration." + ), paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/README.md b/openproblems/tasks/_batch_integration/batch_integration_graph/README.md index 6548338e81..704302fc24 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/README.md @@ -19,30 +19,6 @@ sub-tasks for batch integration can be found for: This sub-task was taken from a [benchmarking study of data integration methods](https://openproblems.bio/bibliography#luecken2022benchmarking). -## The metrics - -Metrics for batch integration (graph) measure how well batches are mixed while -biological signals are preserved. They are divided into batch correction and biological -variance conservation metrics. - -### Batch correction - -* **Graph connectivity**: The graph connectivity metric assesses whether the kNN graph -representation, G, of the integrated data connects all cells with the same cell identity -label. - -### Biological variance removal - -* **Adjusted rand index (ARI)**: The Rand index compares the overlap of two clusterings; -it considers both correct clustering overlaps while also counting correct disagreements -between two clusterings. -* **Iso label F1 score**: Isolated cell labels are identified as the labels present in -the least number of batches in the integration task. The score evaluates how well these -isolated labels separate from other cell identities based on clustering. 
-* **Normalized mutual information (NMI)**: NMI compares the overlap of two clusterings. -We used NMI to compare the cell-type labels with Louvain clusters computed on the -integrated dataset. - ## API WARNING: other than most tasks, `adata.X` should contain log-normalized data. diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py index b26c0cc760..2c876251a1 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/baseline.py @@ -1,17 +1,16 @@ -from .....tools.decorators import method +from .....tools.decorators import baseline_method from .....tools.utils import check_version from ..._common.methods.baseline import _random_embedding import scanpy as sc -@method( +@baseline_method( method_name="Random Graph by Celltype", - paper_name="Random Graph by Celltype (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, + method_summary=( + "Cells are embedded as a one-hot encoding of celltype labels. A graph is then" + " built on this embedding" + ), ) def celltype_random_graph(adata, test=False): adata.obsm["X_emb"] = _random_embedding(partition=adata.obs["labels"]) diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py index 0190b60b4a..d495e18788 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py @@ -6,6 +6,12 @@ _bbknn_method = functools.partial( method, + method_summary=( + "BBKNN or batch balanced k nearest neighbours graph is built for each cell by" + " identifying its k nearest neighbours within each defined batch separately," + " creating independent neighbour sets for each cell in each batch. These sets" + " are then combined and processed with the UMAP algorithm for visualisation." + ), paper_name="BBKNN: fast batch alignment of single cell transcriptomes", paper_reference="polanski2020bbknn", paper_year=2020, diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py index 1cce2908b4..d8a67d8421 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py @@ -5,8 +5,18 @@ _combat_method = functools.partial( method, - paper_name="Adjusting batch effects in microarray expression data using " - "empirical Bayes methods", + method_summary=( + "ComBat uses an Empirical Bayes (EB) approach to correct for batch effects. It" + " estimates batch-specific parameters by pooling information across genes in" + " each batch and shrinks the estimates towards the overall mean of the batch" + " effect estimates across all genes. These parameters are then used to adjust" + " the data for batch effects, leading to more accurate and reproducible" + " results." 
+    ),
+    paper_name=(
+        "Adjusting batch effects in microarray expression data using empirical Bayes"
+        " methods"
+    ),
     paper_reference="hansen2012removing",
     paper_year=2007,
     code_url="https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html",
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/fastmnn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/fastmnn.py
index cd617d975f..03680f03dd 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/fastmnn.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/fastmnn.py
@@ -7,6 +7,13 @@
 _fastmnn_method = functools.partial(
     method,
+    method_summary=(
+        "fastMNN performs a multi-sample PCA to reduce dimensionality, identifying MNN"
+        " pairs in the low-dimensional space, and then correcting the target batch"
+        " towards the reference using locally weighted correction vectors. The"
+        " corrected target batch is then merged with the reference. The process is"
+        " repeated with the next target batch, except for the PCA step."
+    ),
     paper_name="A description of the theory behind the fastMNN algorithm",
     paper_reference="lun2019fastmnn",
     paper_year=2019,
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/harmony.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/harmony.py
index 981e46e739..188b7d0781 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/harmony.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/harmony.py
@@ -6,8 +6,16 @@
 _harmony_method = functools.partial(
     method,
-    paper_name="Fast, sensitive and accurate integration "
-    "of single-cell data with Harmony",
+    method_summary=(
+        "Harmony is a method that uses PCA to group the cells into multi-dataset"
+        " clusters, and then computes cluster-specific linear correction factors. Each"
+        " cell is then corrected by its cell-specific linear factor using the"
+        " cluster-weighted average. The method keeps iterating these four steps until"
+        " cell clusters are stable."
+    ),
+    paper_name=(
+        "Fast, sensitive and accurate integration of single-cell data with Harmony"
+    ),
     paper_reference="korsunsky2019fast",
     paper_year=2019,
     code_url="https://github.com/lilab-bcb/harmony-pytorch",
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/liger.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/liger.py
index 5077e9f34e..9e3c2b012e 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/liger.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/liger.py
@@ -7,8 +7,16 @@
 _liger_method = functools.partial(
     method,
-    paper_name="Single-Cell Multi-omic Integration Compares and "
-    "Contrasts Features of Brain Cell Identity",
+    method_summary=(
+        "LIGER or linked inference of genomic experimental relationships uses iNMF,"
+        " deriving and implementing a novel coordinate descent algorithm to efficiently"
+        " do the factorization. Joint clustering is performed and factor loadings are"
+        " normalised."
+    ),
+    paper_name=(
+        "Single-Cell Multi-omic Integration Compares and Contrasts Features of Brain"
+        " Cell Identity"
+    ),
     paper_reference="welch2019single",
     paper_year=2019,
     code_url="https://github.com/welch-lab/liger",
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py
index 48919504f2..a8147ec3ae 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py
@@ -5,8 +5,15 @@
 _mnn_method = functools.partial(
     method,
-    paper_name="Batch effects in single-cell RNA-sequencing "
-    "data are corrected by matching mutual nearest neighbors",
+    method_summary=(
+        "MNN first detects mutual nearest neighbours in two of the batches and infers a"
+        " projection of the second onto the first batch. After that, additional batches"
+        " are added iteratively."
+    ),
+    paper_name=(
+        "Batch effects in single-cell RNA-sequencing data are corrected by matching"
+        " mutual nearest neighbors"
+    ),
     paper_reference="haghverdi2018batch",
     paper_year=2018,
     code_url="https://github.com/chriscainx/mnnpy",
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py
index 796a786375..28eb3ac7e1 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py
@@ -6,8 +6,16 @@
 _scalex_method = functools.partial(
     method,
-    paper_name="Online single-cell data integration through projecting heterogeneous "
-    "datasets into a common cell-embedding space",
+    method_summary=(
+        "SCALEX is a method for integrating heterogeneous single-cell data online using"
+        " a VAE framework. Its generalised encoder disentangles batch-related"
+        " components from batch-invariant biological components, which are then"
+        " projected into a common cell-embedding space."
+    ),
+    paper_name=(
+        "Online single-cell data integration through projecting heterogeneous datasets"
+        " into a common cell-embedding space"
+    ),
     paper_reference="xiong2021online",
     paper_year=2022,
     code_url="https://github.com/jsxlei/SCALEX",
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py
index e04c305cd7..add9288ef6 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py
@@ -5,8 +5,15 @@
 _scanorama_method = functools.partial(
     method,
-    paper_name="Efficient integration of heterogeneous single-cell "
-    "transcriptomes using Scanorama",
+    method_summary=(
+        "Scanorama is an extension of the MNN method. Unlike MNN, it finds mutual"
+        " nearest neighbours over all batches and embeds observations into a joint"
+        " hyperplane."
+    ),
+    paper_name=(
+        "Efficient integration of heterogeneous single-cell transcriptomes using"
+        " Scanorama"
+    ),
     paper_reference="hie2019efficient",
     paper_year=2019,
     code_url="https://github.com/brianhie/scanorama",
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py
index 91e3c92722..3202ac38d5 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py
@@ -6,8 +6,14 @@
 _scanvi_method = functools.partial(
     method,
-    paper_name="Probabilistic harmonization and annotation of single‐cell "
-    "transcriptomics data with deep generative models",
+    method_summary=(
+        "ScanVI is an extension of scVI that instead uses a Bayesian semi-supervised"
+        " approach for more principled cell annotation."
+    ),
+    paper_name=(
+        "Probabilistic harmonization and annotation of single‐cell transcriptomics data"
+        " with deep generative models"
+    ),
     paper_reference="xu2021probabilistic",
     paper_year=2021,
     code_url="https://github.com/YosefLab/scvi-tools",
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py
index 26d5f1d0d0..89263b40bb 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py
@@ -6,6 +6,9 @@
 _scvi_method = functools.partial(
     method,
+    method_summary=(
+        "scVI combines a variational autoencoder with a hierarchical Bayesian model."
+    ),
     paper_name="Deep generative modeling for single-cell transcriptomics",
     paper_reference="lopez2018deep",
     paper_year=2018,
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py
index e6b69a228c..c3e54d89ce 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py
@@ -15,6 +15,11 @@
 @metric(
     metric_name="ARI",
+    metric_summary=(
+        "ARI (Adjusted Rand Index) compares the overlap of two clusterings. It"
+        " considers both correct clustering overlaps while also counting correct"
+        " disagreements between two clusterings."
+    ),
     maximize=True,
     paper_reference="luecken2022benchmarking",
     image="openproblems-r-pytorch",
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py
index 1d7a7780c1..d9e5b3901a 100644
--- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py
+++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py
@@ -21,6 +21,11 @@
 @metric(
     metric_name="Graph connectivity",
+    metric_summary=(
+        "The graph connectivity metric assesses whether the kNN graph representation,"
+        " G, of the integrated data connects all cells with the same cell identity"
+        " label."
+ ), paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py index f4195e3c4d..ba08ffafd3 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py @@ -26,6 +26,11 @@ @metric( metric_name="Isolated label F1", + metric_summary=( + "Isolated cell labels are identified as the labels present in the least number" + " of batches in the integration task. The score evaluates how well these" + " isolated labels separate from other cell identities based on clustering." + ), paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py index 0bce4a5eaf..4b9b110809 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py @@ -14,6 +14,10 @@ @metric( metric_name="NMI", + metric_summary=( + "NMI compares the overlap of two clusterings. We used NMI to compare the" + " cell-type labels with Louvain clusters computed on the integrated dataset." + ), paper_reference="luecken2022benchmarking", maximize=True, image="openproblems-r-pytorch", diff --git a/openproblems/tasks/_cell_cell_communication/README.md b/openproblems/tasks/_cell_cell_communication/README.md index 34a89cf32c..637c03c4bd 100644 --- a/openproblems/tasks/_cell_cell_communication/README.md +++ b/openproblems/tasks/_cell_cell_communication/README.md @@ -40,21 +40,6 @@ More subtasks may be defined that infer communication events on any of the `sour cell type, the `target` cell type, the `ligand` molecule, and the receptor. More aspects of the communication may also be added in the future. -## The metrics - -Metrics for cell-cell communication aim to characterize how good are -the different scoring methods at prioritizing assumed truth predictions. - -* **Odds ratio**: The odds ratio represents the ratio of true and false -positives within a set of prioritized interactions (top ranked hits) versus -the same ratio for the remainder of the interactions. Thus, in this -scenario odds ratios quantify the strength of association between the -ability of methods to prioritize interactions and those interactions -assigned to the positive class. - -* **AUPRC**: a single number _[0-1]_ that summarizes the area under the curve where -x is the recall and y is the precision. - ## API ### Datasets diff --git a/openproblems/tasks/_cell_cell_communication/_common/api.py b/openproblems/tasks/_cell_cell_communication/_common/api.py index 2b2f4a0be1..ac38c8e064 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/api.py +++ b/openproblems/tasks/_cell_cell_communication/_common/api.py @@ -35,8 +35,8 @@ def assert_is_subset( msg = f"{subset_name} is not a subset of {superset_name}. " else: msg = ( - f"Allowed proportion ({prop_missing_allowed}) of missing " - f"{subset_name} elements exceeded ({prop_missing:.2f}). " + f"Allowed proportion ({prop_missing_allowed}) of missing" + f" {subset_name} elements exceeded ({prop_missing:.2f}). 
" ) x_missing = ",".join([x for x in subset[is_missing]]) raise AssertionError(msg + f"{x_missing} missing from {superset_name}") diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py b/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py index 27d950c233..e6704ca460 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py +++ b/openproblems/tasks/_cell_cell_communication/_common/methods/baseline.py @@ -1,17 +1,16 @@ -from .....tools.decorators import method +from .....tools.decorators import baseline_method from .....tools.utils import check_version import numpy as np import pandas as pd -@method( +@baseline_method( method_name="Random Events", - paper_name="Random Events (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, + method_summary=( + "Random generation of cell-cell communication events by random selection of" + " ligand, receptor, source, target, and score" + ), ) def random_events(adata, test=False, n_events=1000): rng = np.random.default_rng(seed=1) @@ -36,13 +35,11 @@ def random_events(adata, test=False, n_events=1000): return adata -@method( +@baseline_method( method_name="True Events", - paper_name="True Events (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, + method_summary=( + "Perfect prediction of cell-cell communication events from target data" + ), ) def true_events(adata, test=False): adata.uns["ccc_pred"] = adata.uns["ccc_target"].rename( diff --git a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py index c297163bbd..7c0f254b65 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py +++ b/openproblems/tasks/_cell_cell_communication/_common/methods/liana.py @@ -23,8 +23,14 @@ def _p_filt(x, y): _liana_method = functools.partial( method, - paper_name="Comparison of methods and resources for cell-cell " - "communication inference from single-cell RNA-Seq data", + method_summary=( + "RobustRankAggregate generates a consensus rank of all methods implemented in" + " LIANA providing either specificity or magnitude scores." + ), + paper_name=( + "Comparison of methods and resources for cell-cell communication inference from" + " single-cell RNA-Seq data" + ), paper_reference="dimitrov2022comparison", paper_year=2022, code_url="https://github.com/saezlab/liana", @@ -106,13 +112,19 @@ def magnitude_sum(adata, test=False): _cellphonedb_method = functools.partial( - method, - paper_name="CellPhoneDB: inferring cell–cell communication from " - "combined expression of multi-subunit ligand–receptor complexes", + _liana_method, + method_summary=( + "CellPhoneDBv2 calculates a mean of ligand-receptor expression as a measure of" + " interaction magnitude, along with a permutation-based p-value as a measure of" + " specificity. Here, we use the former to prioritize interactions, subsequent" + " to filtering according to p-value less than 0.05." 
+ ), + paper_name=( + "CellPhoneDB: inferring cell–cell communication from combined expression of" + " multi-subunit ligand–receptor complexes" + ), paper_reference="efremova2020cellphonedb", paper_year=2020, - code_url="https://github.com/saezlab/liana", - image="openproblems-r-extras", ) @@ -153,13 +165,18 @@ def cellphonedb_sum(adata, test=False): _connectome_method = functools.partial( - method, - paper_name="Computation and visualization of cell–cell signaling " - "topologies in single-cell systems data using Connectome", + _liana_method, + method_summary=( + "Connectome uses the product of ligand-receptor expression as a measure of" + " magnitude, and the average of the z-transformed expression of ligand and" + " receptor as a measure of specificity." + ), + paper_name=( + "Computation and visualization of cell–cell signaling topologies in single-cell" + " systems data using Connectome" + ), paper_reference="raredon2022computation", paper_year=2022, - code_url="https://github.com/saezlab/liana", - image="openproblems-r-extras", ) @@ -188,13 +205,12 @@ def connectome_sum(adata, test=False): _logfc_method = functools.partial( - method, - paper_name="Comparison of methods and resources for cell-cell " - "communication inference from single-cell RNA-Seq data", - paper_reference="dimitrov2022comparison", - paper_year=2022, - code_url="https://github.com/saezlab/liana", - image="openproblems-r-extras", + _liana_method, + method_summary=( + "logFC (implemented in LIANA and inspired by iTALK) combines both expression" + " and magnitude, and represents the average of one-versus-the-rest log2-fold" + " change of ligand and receptor expression per cell type." + ), ) @@ -223,12 +239,19 @@ def logfc_sum(adata, test=False): _natmi_method = functools.partial( - method, + _liana_method, + method_summary=( + "NATMI uses the product of ligand-receptor expression as a measure of" + " magnitude. As a measure of specificity, NATMI proposes $specificity.edge =" + r" \frac{l}{l_s} \cdot \frac{r}{r_s}$; where $l$ and $r$ represent the average" + " expression of ligand and receptor per cell type, and $l_s$ and $r_s$" + " represent the sums of the average ligand and receptor expression across all" + " cell types. We use its specificity measure, as recommended by the authors for" + " single-context predictions." + ), paper_name="Predicting cell-to-cell communication networks using NATMI", paper_reference="hou2020predicting", paper_year=2021, - code_url="https://github.com/saezlab/liana", - image="openproblems-r-extras", ) @@ -257,13 +280,19 @@ def natmi_sum(adata, test=False): _sca_method = functools.partial( - method, - paper_name="SingleCellSignalR: inference of intercellular networks " - "from single-cell transcriptomics", + _liana_method, + method_summary=( + "SingleCellSignalR provides a magnitude score as $LRscore =" + r" \frac{\sqrt{lr}}{\mu+\sqrt{lr}}$; where $l$ and $r$ are the average ligand" + r" and receptor expression per cell type, and $\mu$ is the mean of the" + " expression matrix." 
+ ), + paper_name=( + "SingleCellSignalR: inference of intercellular networks from single-cell" + " transcriptomics" + ), paper_reference="cabello2020singlecellsignalr", paper_year=2021, - code_url="https://github.com/saezlab/liana", - image="openproblems-r-extras", ) diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py index 593f537041..ff7a12a902 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py +++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py @@ -3,7 +3,10 @@ @metric( - metric_name="Precision-recall AUC", paper_reference="davis2006prauc", maximize=True + metric_name="Precision-recall AUC", + metric_summary="Area under the precision-recall curve.", + paper_reference="davis2006prauc", + maximize=True, ) def auprc(adata): from sklearn.metrics import auc diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py index 5cce8bb1fb..47bcc63ae9 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py +++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/odds_ratio.py @@ -8,7 +8,18 @@ def _sigmoid_transform(x): return 1 - 1 / (1 + x / 2) -@metric(metric_name="Odds Ratio", paper_reference="bland2000odds", maximize=True) +@metric( + metric_name="Odds Ratio", + metric_summary=( + "The odds ratio represents the ratio of true and false positives within a set" + " of prioritized interactions (top ranked hits) versus the same ratio for the" + " remainder of the interactions. Thus, in this scenario odds ratios quantify" + " the strength of association between the ability of methods to prioritize" + " interactions and those interactions assigned to the positive class." + ), + paper_reference="bland2000odds", + maximize=True, +) def odds_ratio(adata, top_prop=0.05): # Join benchmark (assumed truth) and ccc results # Get /w ccc_target and a response [0, 1] column diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md index 98634fb384..51546e8ab3 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md @@ -31,21 +31,6 @@ the target cell types. This subtask focuses on the prediction of interactions from steady-state, or single-context, single-cell data.** -## The metrics - -Metrics for cell-cell communication aim to characterize how good are -the different scoring methods at prioritizing assumed truth predictions. - -* **Odds ratio**: The odds ratio represents the ratio of true and false -positives within a set of prioritized interactions (top ranked hits) versus -the same ratio for the remainder of the interactions. Thus, in this -scenario odds ratios quantify the strength of association between the -ability of methods to prioritize interactions and those interactions -assigned to the positive class. - -* **AUPRC**: a single number _[0-1]_ that summarizes the area under the curve where -x is the recall and y is the precision. 
- 
 ## API
 
 ### Datasets
 
diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md
index 7f8fb76cae..941d2c93fa 100644
--- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md
+++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md
@@ -30,21 +30,6 @@ spatially-adjacent source cell types and target cell types.
 This subtask focuses on the prediction of interactions from steady-state,
 or single-context, single-cell data.**
 
-## The metrics
-
-Metrics for cell-cell communication aim to characterize how good are
-the different scoring methods at prioritizing assumed truth predictions.
-
-* **Odds ratio**: The odds ratio represents the ratio of true and false
-positives within a set of prioritized interactions (top ranked hits) versus
-the same ratio for the remainder of the interactions. Thus, in this
-scenario odds ratios quantify the strength of association between the
-ability of methods to prioritize interactions and those interactions
-assigned to the positive class.
-
-* **AUPRC**: a single number _[0-1]_ that summarizes the area under the curve where
-x is the recall and y is the precision.
-
 ## API
 
 ### Datasets
 
diff --git a/openproblems/tasks/denoising/README.md b/openproblems/tasks/denoising/README.md
index 6a5eb5e1b3..7498d6e723 100644
--- a/openproblems/tasks/denoising/README.md
+++ b/openproblems/tasks/denoising/README.md
@@ -30,16 +30,6 @@ denoising accuracy is measured by comparing the result to the test dataset. The
 show that both in theory and in practice, the measured denoising accuracy is
 representative of the accuracy that would be obtained on a ground truth dataset.
 
-## The metrics
-
-Metrics for data denoising aim to assess denoising accuracy by comparing the denoised
-*training* set to the randomly sampled *test* set.
-
-* **MSE**: The mean squared error between the denoised counts of the training dataset
-  and the true counts of the test dataset after reweighting by the train/test ratio.
-* **Poisson**: The Poisson log likelihood of observing the true counts of the test
-  dataset given the distribution given in the denoised dataset.
-
 ## API
 
 Datasets should contain the raw UMI counts in `adata.X`, subsampled to training
diff --git a/openproblems/tasks/denoising/methods/alra.py b/openproblems/tasks/denoising/methods/alra.py
index fe4ed2ce9e..214dae5381 100644
--- a/openproblems/tasks/denoising/methods/alra.py
+++ b/openproblems/tasks/denoising/methods/alra.py
@@ -9,11 +9,21 @@
 log = logging.getLogger("openproblems")
 
-method_name = ("ALRA (sqrt norm, reversed normalization)",)
 
 _alra_method = functools.partial(
     method,
-    paper_name="Zero-preserving imputation of scRNA-seq data using "
-    "low-rank approximation",
+    method_summary=(
+        "ALRA (Adaptively-thresholded Low Rank Approximation) is a method for"
+        " imputation of missing values in single cell RNA-sequencing data. Given a"
+        " normalised scRNA-seq expression matrix, it first imputes values using a"
+        " rank-k approximation computed by singular value decomposition. Next, a"
+        " symmetric distribution is fitted to the near-zero imputed values for each"
+        " gene (row) of the matrix. The right “tail” of this distribution is then used"
+        " to threshold the accepted nonzero entries. This same threshold is then used"
+        " to rescale the matrix, once the “biological zeros” have been removed."
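[Editor's note] A loose sketch of the ALRA idea summarised above (rank-k SVD completion followed by per-gene thresholding). This is not the ALRA package; in particular, the threshold below is a crude stand-in for ALRA's symmetric-distribution fit:

```python
import numpy as np

def alra_sketch(X, k=10):
    # Rank-k approximation via singular value decomposition.
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    X_hat = (U[:, :k] * s[:k]) @ Vt[:k]
    # Threshold each gene by the magnitude of its most negative imputed value
    # (a stand-in for fitting a symmetric distribution to near-zero values),
    # removing "biological zeros".
    thresh = np.abs(X_hat.min(axis=0))
    X_hat[X_hat < thresh] = 0
    return X_hat

X = np.abs(np.random.default_rng(0).normal(size=(200, 50)))
denoised = alra_sketch(X)
```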
+    ),
+    paper_name=(
+        "Zero-preserving imputation of scRNA-seq data using low-rank approximation"
+    ),
     paper_reference="linderman2018zero",
     paper_year=2018,
     code_url="https://github.com/KlugerLab/ALRA",
diff --git a/openproblems/tasks/denoising/methods/baseline.py b/openproblems/tasks/denoising/methods/baseline.py
index 79a03fa859..c9003b525d 100644
--- a/openproblems/tasks/denoising/methods/baseline.py
+++ b/openproblems/tasks/denoising/methods/baseline.py
@@ -1,14 +1,10 @@
-from ....tools.decorators import method
+from ....tools.decorators import baseline_method
 from ....tools.utils import check_version
 
 
-@method(
+@baseline_method(
     method_name="No denoising",
-    paper_name="Molecular Cross-Validation for Single-Cell RNA-seq",
-    paper_reference="batson2019molecular",
-    paper_year=2019,
-    code_url="https://github.com/czbiohub/molecular-cross-validation",
-    is_baseline=True,
+    method_summary="Denoised outputs are defined from the unmodified input data.",
 )
 def no_denoising(adata, test=False):
     """Do nothing."""
@@ -17,13 +13,9 @@ def no_denoising(adata, test=False):
     return adata
 
 
-@method(
+@baseline_method(
     method_name="Perfect denoising",
-    paper_name="Molecular Cross-Validation for Single-Cell RNA-seq",
-    paper_reference="batson2019molecular",
-    paper_year=2019,
-    code_url="https://github.com/czbiohub/molecular-cross-validation",
-    is_baseline=True,
+    method_summary="Denoised outputs are defined from the target data.",
 )
 def perfect_denoising(adata, test=False):
     """Cheat."""
diff --git a/openproblems/tasks/denoising/methods/dca.py b/openproblems/tasks/denoising/methods/dca.py
index 8b47d4fcc6..b72474baf7 100644
--- a/openproblems/tasks/denoising/methods/dca.py
+++ b/openproblems/tasks/denoising/methods/dca.py
@@ -26,6 +26,14 @@ def _dca(adata, test=False, epochs=None):
 
 @method(
     method_name="DCA",
+    method_summary=(
+        "DCA (Deep Count Autoencoder) is a method to remove the effect of dropout in"
+        " scRNA-seq data. DCA takes into account the count structure, overdispersed"
+        " nature and sparsity of scRNA-seq data using a deep autoencoder with a"
+        " zero-inflated negative binomial (ZINB) loss. The autoencoder is then applied"
+        " to the dataset, where the mean of the fitted negative binomial distributions"
+        " is used to fill each entry of the imputed matrix."
+    ),
     paper_name="Single-cell RNA-seq denoising using a deep count autoencoder",
     paper_reference="eraslan2019single",
     paper_year=2019,
diff --git a/openproblems/tasks/denoising/methods/knn_smoothing.py b/openproblems/tasks/denoising/methods/knn_smoothing.py
index 31cd4d1d7a..997d9fc496 100644
--- a/openproblems/tasks/denoising/methods/knn_smoothing.py
+++ b/openproblems/tasks/denoising/methods/knn_smoothing.py
@@ -4,8 +4,22 @@
 
 @method(
     method_name="Iterative KNN smoothing",
-    paper_name="K-nearest neighbor smoothing for high-throughput "
-    "single-cell RNA-Seq data",
+    method_summary=(
+        "Iterative kNN-smoothing is a method to repair or denoise noisy scRNA-seq"
+        " expression matrices. Given a scRNA-seq expression matrix, KNN-smoothing first"
+        " applies initial normalisation and smoothing. Then, a chosen number of"
+        " principal components is used to calculate Euclidean distances between cells."
+        " Minimally sized neighbourhoods are initially determined from these Euclidean"
+        " distances, and expression profiles are shared between neighbouring cells."
+        " Then, the resultant smoothed matrix is used as input to the next step of"
+        " smoothing, where the size (k) of the considered neighbourhoods is increased,"
+        " leading to greater smoothing. This process continues until a chosen maximum k"
+        " value has been reached, at which point the iteratively smoothed object is"
+        " then optionally scaled to yield a final result."
+    ),
+    paper_name=(
+        "K-nearest neighbor smoothing for high-throughput single-cell RNA-Seq data"
+    ),
     paper_reference="wagner2018knearest",
     paper_year=2018,
     code_url="https://github.com/yanailab/knn-smoothing",
diff --git a/openproblems/tasks/denoising/methods/magic.py b/openproblems/tasks/denoising/methods/magic.py
index a7a1374864..5b99118b4b 100644
--- a/openproblems/tasks/denoising/methods/magic.py
+++ b/openproblems/tasks/denoising/methods/magic.py
@@ -1,3 +1,4 @@
+from ....tools.decorators import baseline_method
 from ....tools.decorators import method
 from ....tools.utils import check_version
 
@@ -7,8 +8,22 @@
 
 _magic_method = functools.partial(
     method,
-    paper_name="Recovering Gene Interactions from Single-Cell Data "
-    "Using Data Diffusion",
+    method_summary=(
+        "MAGIC (Markov Affinity-based Graph Imputation of Cells) is a method for"
+        " imputation and denoising of noisy or dropout-prone single cell RNA-sequencing"
+        " data. Given a normalised scRNA-seq expression matrix, it first calculates"
+        " Euclidean distances between each pair of cells in the dataset, which is then"
+        " augmented using a Gaussian kernel (function) and row-normalised to give a"
+        " normalised affinity matrix. A t-step Markov process is then calculated by"
+        " powering this affinity matrix t times. Finally, the powered affinity matrix"
+        " is right-multiplied by the normalised data, causing the final imputed values"
+        " to take the value of a per-gene average weighted by the affinities of cells."
+        " The resultant imputed matrix is then rescaled to more closely match the"
+        " magnitude of measurements in the normalised (input) matrix."
+    ),
+    paper_name=(
+        "Recovering Gene Interactions from Single-Cell Data Using Data Diffusion"
+    ),
     paper_reference="van2018recovering",
     paper_year=2018,
     code_url="https://github.com/KrishnaswamyLab/MAGIC",
@@ -81,12 +96,16 @@ def magic_approx_reverse_norm(adata, test=False):
     return _magic(adata, solver="approximate", normtype="sqrt", reverse_norm_order=True)
 
 
-@method(
+@baseline_method(
     method_name="KNN smoothing",
-    paper_name="KNN Smoothing (baseline)",
-    paper_reference="openproblems",
-    paper_year=2022,
-    code_url="https://github.com/openproblems-bio/openproblems",
+    method_summary=(
+        "KNN-smoothing is a method for denoising data based on the k-nearest"
+        " neighbours. Given a normalised scRNA-seq matrix, KNN-smoothing calculates a"
+        " k-nearest neighbour matrix using Euclidean distances between cell pairs. Each"
+        " cell’s denoised expression is then defined as the average expression of its"
+        " neighbours."
+    ),
+    is_baseline=False,
    image="openproblems-python-extras",
 )
 def knn_naive(adata, test=False):
diff --git a/openproblems/tasks/denoising/metrics/mse.py b/openproblems/tasks/denoising/metrics/mse.py
index 104aec9fc8..eb4bf6c85b 100644
--- a/openproblems/tasks/denoising/metrics/mse.py
+++ b/openproblems/tasks/denoising/metrics/mse.py
@@ -3,6 +3,11 @@
 
 @metric(
     metric_name="Mean-squared error",
+    metric_summary=(
+        "The mean squared error between the denoised counts of the training dataset and"
+        " the true counts of the test dataset after reweighting by the train/test"
+        " ratio."
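[Editor's note] A bare-bones sketch of the MAGIC-style diffusion summarised above (Gaussian kernel, row-normalised Markov matrix, t-step power, smoothing); the bandwidth `sigma` and `t` are illustrative, not MAGIC's adaptive-kernel defaults:

```python
import numpy as np
from scipy.spatial.distance import cdist

def magic_sketch(X, t=3, sigma=1.0):
    D = cdist(X, X)                                # cell-cell distances
    A = np.exp(-(D ** 2) / (2 * sigma ** 2))       # Gaussian kernel
    P = A / A.sum(axis=1, keepdims=True)           # row-normalised Markov matrix
    return np.linalg.matrix_power(P, t) @ X        # diffuse expression values

X_imputed = magic_sketch(np.random.default_rng(0).random((100, 20)))
```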
+    ),
     paper_reference="batson2019molecular",
     maximize=False,
 )
diff --git a/openproblems/tasks/denoising/metrics/poisson.py b/openproblems/tasks/denoising/metrics/poisson.py
index f1460627a3..b6f8aa6b95 100644
--- a/openproblems/tasks/denoising/metrics/poisson.py
+++ b/openproblems/tasks/denoising/metrics/poisson.py
@@ -3,6 +3,10 @@
 
 @metric(
     metric_name="Poisson loss",
+    metric_summary=(
+        "The Poisson log likelihood of observing the true counts of the test dataset"
+        " given the distribution defined by the denoised dataset."
+    ),
     paper_reference="batson2019molecular",
     maximize=False,
     image="openproblems-python-pytorch",
diff --git a/openproblems/tasks/dimensionality_reduction/README.md b/openproblems/tasks/dimensionality_reduction/README.md
index cc998a0255..db7ea1a9ea 100644
--- a/openproblems/tasks/dimensionality_reduction/README.md
+++ b/openproblems/tasks/dimensionality_reduction/README.md
@@ -21,25 +21,6 @@ high dimensional data don’t distinguish data points well). Thus, we need to fi
 to [dimensionally reduce](https://en.wikipedia.org/wiki/Dimensionality_reduction) the
 data for visualization and interpretation.
 
-## The metrics
-
-* **Distance correlation**: the Spearman correlation between
-  ground truth distances in the high-dimensional data and Euclidean distances in the
-  dimension-reduced data, invariant to scalar multiplication. *Distance correlation*
-  computes high-dimensional distances in Euclidean space, while *Distance correlation
-  (spectral)* computes [diffusion distances](http://dx.doi.org/10.1016/j.acha.2006.04.006)
-  (i.e. Euclidean distances on the [Laplacian Eigenmap](http://dx.doi.org/10.1162/089976603321780317)).
-* **Trustworthiness**: a measurement of similarity between the rank of each point's
-  nearest neighbors in the high-dimensional data and the reduced data ([Venna & Kaski,
-  2001](https://openproblems.bio/bibliography#venna2001neighborhood)).
-* **Density preservation**: similarity between local densities in the high-dimensional
-  data and the reduced data ([Narayan, Berger & Cho,
-  2020](https://openproblems.bio/bibliography#narayan2021assessing))
-* **NN Ranking**: a set of metrics from
-  [pyDRMetrics](https://openproblems.bio/bibliography#zhang2021pydrmetrics) relating to
-  the preservation of nearest neighbors in the high-dimensional data and the reduced
-  data.
-
 ## API
 
 WARNING: other than most tasks, `adata.X` should contain log CP10k-normalized data,
diff --git a/openproblems/tasks/dimensionality_reduction/methods/baseline.py b/openproblems/tasks/dimensionality_reduction/methods/baseline.py
index 6403ea2981..849efd853d 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/baseline.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/baseline.py
@@ -1,24 +1,17 @@
-from ....tools.decorators import method
+from ....tools.decorators import baseline_method
 from ....tools.normalize import log_cp10k
 from ....tools.utils import check_version
 from .diffusion_map import diffusion_map
 from typing import Optional
 
-import functools
 import numpy as np
 
-_baseline_method = functools.partial(
-    method,
-    paper_name="Open Problems for Single Cell Analysis",
-    paper_reference="openproblems",
-    paper_year=2022,
-    code_url="https://github.com/openproblems-bio/openproblems",
-    is_baseline=True,
-)
-
 
-@_baseline_method(
+@baseline_method(
     method_name="Random Features",
+    method_summary=(
+        "Randomly generated two-dimensional coordinates from a normal distribution."
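[Editor's note] The Poisson loss summarised above reduces to a one-liner once the constant log(y!) term is dropped; a minimal sketch (the train/test reweighting step is omitted here):

```python
import numpy as np

def poisson_loss_sketch(counts, denoised, eps=1e-6):
    # Negative Poisson log likelihood of the test counts under the denoised
    # means, up to the data-only log(counts!) constant.
    mu = denoised + eps
    return float(np.mean(mu - counts * np.log(mu)))

rng = np.random.default_rng(0)
print(poisson_loss_sketch(rng.poisson(2.0, size=(100, 50)), np.full((100, 50), 2.0)))
```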
+    ),
 )
 def random_features(adata, test=False):
     adata.obsm["X_emb"] = np.random.normal(0, 1, (adata.shape[0], 2))
@@ -26,10 +19,25 @@ def random_features(adata, test=False):
     return adata
 
 
-@_baseline_method(
+@baseline_method(
     method_name="True Features",
+    method_summary="Use of the original feature inputs as the 'embedding'.",
 )
 def true_features(adata, test=False):
+    adata.obsm["X_emb"] = adata.X
+    if test:
+        adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100]
+
+    adata.obsm["X_emb"] = adata.obsm["X_emb"].toarray()
+    adata.uns["method_code_version"] = check_version("openproblems")
+    return adata
+
+
+@baseline_method(
+    method_name="True Features (logCP10k)",
+    method_summary="Use of the original feature inputs as the 'embedding'.",
+)
+def true_features_log_cp10k(adata, test=False):
     adata = log_cp10k(adata)
     adata.obsm["X_emb"] = adata.X
     if test:
@@ -40,8 +48,9 @@ def true_features(adata, test=False):
     return adata
 
 
-@_baseline_method(
+@baseline_method(
     method_name="Spectral Features",
+    method_summary="Use 1000-dimensional diffusion maps as an embedding.",
 )
 def spectral_features(adata, test=False, n_comps: Optional[int] = None):
 
diff --git a/openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py b/openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py
index 07cefb11af..429c047b88 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py
@@ -42,6 +42,13 @@ def _diffusion_map(graph, n_comps, t, n_retries=1):
 
 @method(
     method_name="Diffusion maps",
+    method_summary=(
+        "Diffusion maps uses an affinity matrix to describe the similarity between data"
+        " points, which is then transformed into a graph Laplacian. The"
+        " eigenvalue-weighted eigenvectors of the graph Laplacian are then used to"
+        " create the embedding. Diffusion maps is calculated on the logCPM expression"
+        " matrix."
+    ),
     paper_reference="coifman2006diffusion",
     paper_name="Diffusion maps",
     paper_year=2006,
diff --git a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py
index 2ba2942a59..750537aa5c 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/neuralee.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/neuralee.py
@@ -12,9 +12,20 @@
 
 _neuralee_method = functools.partial(
     method,
-    paper_name="NeuralEE: A GPU-Accelerated Elastic Embedding "
-    "Dimensionality Reduction Method for "
-    "Visualizing Large-Scale scRNA-Seq Data",
+    method_summary=(
+        "NeuralEE is a neural network implementation of elastic embedding. It is a"
+        " non-linear method that preserves pairwise distances between data points."
+        " NeuralEE uses a neural network to optimize an objective function that"
+        " measures the difference between pairwise distances in the original"
+        " high-dimensional space and the two-dimensional space. It is computed on both"
+        " the recommended input from the package authors of 500 HVGs selected from a"
+        " logged expression matrix (without sequencing depth scaling) and the default"
+        " logCPM matrix with 1000 HVGs."
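[Editor's note] An illustrative sketch of the diffusion-map construction described in the summary above (affinity matrix, symmetrically normalised kernel, eigenvalue-weighted eigenvectors); the real implementation lives in `_diffusion_map`, and the Gaussian bandwidth here is an assumption:

```python
import numpy as np
from scipy.spatial.distance import cdist

def diffusion_map_sketch(X, n_comps=2, sigma=1.0):
    A = np.exp(-cdist(X, X) ** 2 / (2 * sigma ** 2))  # affinity matrix
    d = A.sum(axis=1)
    M = A / np.sqrt(np.outer(d, d))                   # normalised kernel
    evals, evecs = np.linalg.eigh(M)
    idx = np.argsort(evals)[::-1][1 : n_comps + 1]    # drop trivial component
    return evecs[:, idx] * evals[idx]                 # eigenvalue-weighted

emb = diffusion_map_sketch(np.random.default_rng(0).random((100, 20)))
```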
+    ),
+    paper_name=(
+        "NeuralEE: A GPU-Accelerated Elastic Embedding Dimensionality Reduction Method"
+        " for Visualizing Large-Scale scRNA-Seq Data"
+    ),
     paper_reference="xiong2020neuralee",
     paper_year=2020,
     code_url="https://github.com/HiBearME/NeuralEE",
@@ -76,8 +87,8 @@ def _neuralee(
     if subsample_genes is not None and subsample_genes < adata_input.n_vars:
         subsample_genes = min(adata_input.n_vars, int(subsample_genes * 1.2))
         log.warning(
-            "ValueError in neuralee_default. "
-            f"Increased subsample_genes to {subsample_genes}"
+            "ValueError in neuralee_default. Increased subsample_genes to"
+            f" {subsample_genes}"
         )
     else:
         raise
diff --git a/openproblems/tasks/dimensionality_reduction/methods/pca.py b/openproblems/tasks/dimensionality_reduction/methods/pca.py
index 488f684dea..7505787968 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/pca.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/pca.py
@@ -7,11 +7,21 @@
 
 _pca_method = functools.partial(
     method,
+    method_summary=(
+        'PCA or "Principal Component Analysis" is a linear method that finds orthogonal'
+        " directions in the data that capture the most variance. We select only the"
+        " first two principal components as the two-dimensional embedding. PCA is"
+        " calculated on the logCPM expression matrix with and without selecting 1000"
+        " HVGs."
+    ),
     paper_name="On lines and planes of closest fit to systems of points in space",
     paper_reference="pearson1901pca",
     paper_year=1901,
-    code_url="https://scikit-learn.org/stable/modules/generated/"
-    "sklearn.decomposition.PCA.html",
+    code_url=(
+        "https://scikit-learn.org/stable/modules/generated/"
+        "sklearn.decomposition.PCA.html"
+    ),
 )
 
diff --git a/openproblems/tasks/dimensionality_reduction/methods/phate.py b/openproblems/tasks/dimensionality_reduction/methods/phate.py
index 1706a5caf8..7c40f316bc 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/phate.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/phate.py
@@ -9,8 +9,19 @@
 
 _phate_method = functools.partial(
     method,
-    paper_name="Visualizing Structure and Transitions in High-Dimensional Biological"
-    " Data",
+    method_summary=(
+        "PHATE or “Potential of Heat-diffusion for Affinity-based Transition Embedding”"
+        " uses the potential of heat diffusion to preserve trajectories in a dataset"
+        " via a diffusion process. It is an affinity-based method that creates an"
+        " embedding by finding the dominant eigenvalues of a Markov transition matrix."
+        " We evaluate several variants including using the recommended square-root"
+        " transformed CPM matrix as input, this input with the gamma parameter set to"
+        " zero and the normal logCPM transformed matrix with and without HVG"
+        " selection."
+    ),
+    paper_name=(
+        "Visualizing Structure and Transitions in High-Dimensional Biological Data"
+    ),
     paper_reference="moon2019visualizing",
     paper_year=2019,
     code_url="https://github.com/KrishnaswamyLab/PHATE/",
diff --git a/openproblems/tasks/dimensionality_reduction/methods/pymde.py b/openproblems/tasks/dimensionality_reduction/methods/pymde.py
index 1ddc69947e..8e8ce79aa4 100644
--- a/openproblems/tasks/dimensionality_reduction/methods/pymde.py
+++ b/openproblems/tasks/dimensionality_reduction/methods/pymde.py
@@ -9,6 +9,13 @@
 
 _pymde_method = functools.partial(
     method,
+    method_summary=(
+        "PyMDE is a Python implementation of minimum-distortion embedding. 
It is a" + " non-linear method that preserves distances between cells or neighborhoods in" + " the high-dimensional space. It is computed with options to preserve distances" + " between cells or neighbourhoods and with the logCPM matrix with and without" + " HVG selection as input." + ), paper_name="Minimum-Distortion Embedding", paper_reference="agrawal2021mde", paper_year=2021, diff --git a/openproblems/tasks/dimensionality_reduction/methods/tsne.py b/openproblems/tasks/dimensionality_reduction/methods/tsne.py index 49e14d3ee2..c074df2b4f 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/tsne.py +++ b/openproblems/tasks/dimensionality_reduction/methods/tsne.py @@ -7,11 +7,21 @@ _tsne_method = functools.partial( method, + method_summary=( + "t-SNE or t-distributed Stochastic Neighbor Embedding converts similarities" + " between data points to joint probabilities and tries to minimize the" + " Kullback-Leibler divergence between the joint probabilities of the" + " low-dimensional embedding and the high-dimensional data. We use the" + " implementation in the scanpy package with the result of PCA on the logCPM" + " expression matrix (with and without HVG selection)." + ), paper_name="Visualizing Data using t-SNE", paper_reference="vandermaaten2008visualizing", paper_year=2008, - code_url="https://scikit-learn.org/stable/modules/generated/" - "sklearn.manifold.TSNE.html#sklearn.manifold.TSNE", + code_url=( + "https://scikit-learn.org/stable/modules/generated/" + "sklearn.manifold.TSNE.html#sklearn.manifold.TSNE" + ), image="openproblems-python-extras", ) diff --git a/openproblems/tasks/dimensionality_reduction/methods/umap.py b/openproblems/tasks/dimensionality_reduction/methods/umap.py index cea0257323..31509caeea 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/umap.py +++ b/openproblems/tasks/dimensionality_reduction/methods/umap.py @@ -7,16 +7,31 @@ _umap_method = functools.partial( method, - paper_name="UMAP: Uniform Manifold Approximation and Projection for " - "Dimension Reduction", + method_summary=( + "UMAP or Uniform Manifold Approximation and Projection is an algorithm for" + " dimension reduction based on manifold learning techniques and ideas from" + " topological data analysis. We perform UMAP on the logCPM expression matrix" + " before and after HVG selection and with and without PCA as a pre-processing" + " step." + ), + paper_name=( + "UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction" + ), paper_reference="mcinnes2018umap", paper_year=2018, code_url="https://github.com/lmcinnes/umap", ) _densmap_method = functools.partial( method, - paper_name="Assessing single-cell transcriptomic variability through" - " density-preserving data visualization", + method_summary=( + "densMAP is a modification of UMAP that adds an extra cost term in order to" + " preserve information about the relative local density of the data. It is" + " performed on the same inputs as UMAP." 
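[Editor's note] For reference, the UMAP variants described above are typically computed through scanpy along these lines; this is a sketch assuming `adata.X` already holds the logCPM matrix, not the task's exact wrapper:

```python
import scanpy as sc

def umap_sketch(adata, use_pca=True):
    if use_pca:
        sc.pp.pca(adata, n_comps=50)               # optional PCA pre-processing
    sc.pp.neighbors(adata, use_rep="X_pca" if use_pca else "X")
    sc.tl.umap(adata)                              # two-dimensional embedding
    adata.obsm["X_emb"] = adata.obsm["X_umap"]
    return adata
```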
+ ), + paper_name=( + "Assessing single-cell transcriptomic variability through density-preserving" + " data visualization" + ), paper_reference="narayan2021assessing", paper_year=2021, code_url="https://github.com/lmcinnes/umap", diff --git a/openproblems/tasks/dimensionality_reduction/metrics/density.py b/openproblems/tasks/dimensionality_reduction/metrics/density.py index a495d90f03..6225c8ee6d 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/density.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/density.py @@ -97,7 +97,11 @@ def _calculate_radii( @metric( - "density preservation", + "Density preservation", + metric_summary=( + "Similarity between local densities in the high-dimensional data and the" + " reduced data." + ), paper_reference="narayan2021assessing", maximize=True, ) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py index 3e23f49571..52dd1075ad 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py @@ -16,14 +16,18 @@ def _distance_correlation(X, X_emb): @metric( metric_name="Distance correlation", + metric_summary=( + "Spearman correlation between all pairwise Euclidean distances in the original" + " and dimension-reduced data" + ), maximize=True, paper_reference="schober2018correlation", ) def distance_correlation(adata, n_svd=500): - """Calculate the root mean squared error. + """Calculate the distance correlation - Computes (RMSE) between the full (or processed) data matrix and the - dimensionally-reduced matrix, invariant to scalar multiplication + Computes Spearman correlations between distances on the full (or processed) data + matrix and the dimensionally-reduced matrix """ import sklearn.decomposition @@ -38,15 +42,19 @@ def distance_correlation(adata, n_svd=500): @metric( metric_name="Distance correlation (spectral)", + metric_summary=( + "Spearman correlation between all pairwise diffusion distances in the original" + " and dimension-reduced data" + ), maximize=True, paper_reference="coifman2006diffusion", ) def distance_correlation_spectral(adata, n_comps=1000): - """Calculate the spectral root mean squared error + """Calculate the spectral distance correlation - Computes (RMSE) between high-dimensional Laplacian eigenmaps on the full (or - processed) data matrix and the dimensionally-reduced matrix, invariant to scalar - multiplication + Computes Spearman correlations between distances on high-dimensional Laplacian + eigenmaps on the full (or processed) data matrix and the dimensionally-reduced + matrix """ n_comps = min(n_comps, min(adata.shape) - 2) adata_true = diffusion_map(adata.copy(), n_comps=n_comps) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py index b13fe93ac6..2c3cde36ff 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py @@ -114,14 +114,30 @@ def _fit(adata: AnnData) -> Tuple[float, float, float, float, float, float, floa return Q, m -@metric("continuity", paper_reference="zhang2021pydrmetrics", maximize=True) +@metric( + "continuity", + metric_summary=( + "Continuity measures error of hard extrusions based on nearest neighbor" + " coranking" + ), + paper_reference="zhang2021pydrmetrics", + 
maximize=True, +) def continuity(adata: AnnData) -> float: Q, m = _fit(adata) C = _continuity(Q, m)[_K] return float(np.clip(C, 0.0, 1.0)) # in [0, 1] -@metric("co-KNN size", paper_reference="zhang2021pydrmetrics", maximize=True) +@metric( + "co-KNN size", + metric_summary=( + "co-KNN size counts how many points are in both k-nearest neighbors before and" + " after the dimensionality reduction" + ), + paper_reference="zhang2021pydrmetrics", + maximize=True, +) def qnn(adata: AnnData) -> float: Q, m = _fit(adata) QNN = _qnn(Q, m)[_K] @@ -129,7 +145,12 @@ def qnn(adata: AnnData) -> float: return float(np.clip(QNN, 0.0, 1.0)) -@metric("co-KNN AUC", paper_reference="zhang2021pydrmetrics", maximize=True) +@metric( + "co-KNN AUC", + metric_summary="co-KNN AUC is area under the co-KNN curve", + paper_reference="zhang2021pydrmetrics", + maximize=True, +) def qnn_auc(adata: AnnData) -> float: Q, m = _fit(adata) QNN = _qnn(Q, m) @@ -139,6 +160,10 @@ def qnn_auc(adata: AnnData) -> float: @metric( "local continuity meta criterion", + metric_summary=( + "The local continuity meta criterion is the co-KNN size with baseline removal" + " which favors locality" + ), paper_reference="zhang2021pydrmetrics", maximize=True, ) @@ -149,7 +174,12 @@ def lcmc(adata: AnnData) -> float: return LCMC -@metric("local property", paper_reference="zhang2021pydrmetrics", maximize=True) +@metric( + "local property", + metric_summary="The local property metric is a summary of the local co-KNN", + paper_reference="zhang2021pydrmetrics", + maximize=True, +) def qlocal(adata: AnnData) -> float: # according to authors, this is usually preferred to # qglobal, because human are more sensitive to nearer neighbors @@ -161,7 +191,12 @@ def qlocal(adata: AnnData) -> float: return Qlocal -@metric("global property", paper_reference="zhang2021pydrmetrics", maximize=True) +@metric( + "global property", + metric_summary="The global property metric is a summary of the global co-KNN", + paper_reference="zhang2021pydrmetrics", + maximize=True, +) def qglobal(adata: AnnData) -> float: Q, m = _fit(adata) QNN = _qnn(Q, m) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py b/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py index 157e71fb55..6f2387747c 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py @@ -6,6 +6,10 @@ @metric( metric_name="trustworthiness", + metric_summary=( + "a measurement of similarity between the rank of each point's nearest neighbors" + " in the high-dimensional data and the reduced data." + ), paper_reference="venna2001neighborhood", maximize=True, ) diff --git a/openproblems/tasks/label_projection/README.md b/openproblems/tasks/label_projection/README.md index 0fb854c852..2ad3ba44da 100644 --- a/openproblems/tasks/label_projection/README.md +++ b/openproblems/tasks/label_projection/README.md @@ -24,20 +24,6 @@ with matching labels. These datasets are then split into training and test batch the task of each method is to train a cell type classifer on the training set and project those labels onto the test set. -## The metrics - -Metrics for label projection aim to characterize how well each classifer correctly -assigns cell type labels to cells in the test set. - -* **Accuracy**: Average number of correctly applied labels. 
-* **F1 score**: The [F1 - score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html) - is a weighted average of the precision and recall over all class labels, where an F1 - score reaches its best value at 1 and worst score at 0, where each class contributes - to the score relative to its frequency in the dataset. -* **Macro F1 score**: The macro F1 score is an unweighted F1 score, where each class - contributes equally, regardless of its frequency. - ## API Datasets should contain the following attributes: diff --git a/openproblems/tasks/label_projection/methods/baseline.py b/openproblems/tasks/label_projection/methods/baseline.py index 513abc24b4..38b98f0c75 100644 --- a/openproblems/tasks/label_projection/methods/baseline.py +++ b/openproblems/tasks/label_projection/methods/baseline.py @@ -1,15 +1,16 @@ -from ....tools.decorators import method +from ....tools.decorators import baseline_method from ....tools.utils import check_version import numpy as np -@method( +@baseline_method( method_name="Majority Vote", - paper_name="Majority Vote (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", + method_summary=( + "Assignment of all predicted labels as the most common label in the training" + " data" + ), + is_baseline=False, ) def majority_vote(adata, test=False): majority = adata.obs.labels[adata.obs.is_train].value_counts().index[0] @@ -20,13 +21,12 @@ def majority_vote(adata, test=False): return adata -@method( +@baseline_method( method_name="Random Labels", - paper_name="Random Labels (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, + method_summary=( + "Random assignment of predicted labels proportionate to label abundance in" + " training data" + ), ) def random_labels(adata, test=False): label_distribution = adata.obs.labels[adata.obs.is_train].value_counts() @@ -43,13 +43,9 @@ def random_labels(adata, test=False): return adata -@method( +@baseline_method( method_name="True Labels", - paper_name="True Labels (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, + method_summary="Perfect assignment of the predicted labels from the test labels", ) def true_labels(adata, test=False): adata.obs["labels_pred"] = adata.obs["labels"] diff --git a/openproblems/tasks/label_projection/methods/knn_classifier.py b/openproblems/tasks/label_projection/methods/knn_classifier.py index b32f9d29a4..c7c1d2a73d 100644 --- a/openproblems/tasks/label_projection/methods/knn_classifier.py +++ b/openproblems/tasks/label_projection/methods/knn_classifier.py @@ -7,11 +7,22 @@ _knn_classifier_method = functools.partial( method, + method_summary=( + 'K-neighbors classifier uses the "k-nearest neighbours" approach, which is a' + " popular machine learning algorithm for classification and regression tasks." + " The assumption underlying KNN in this context is that cells with similar gene" + " expression profiles tend to belong to the same cell type. For each unlabelled" + " cell, this method computes the $k$ labelled cells (in this case, 5) with the" + " smallest distance in PCA space, and assigns that cell the most common cell" + " type among its $k$ nearest neighbors." 
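[Editor's note] A condensed sketch of the k-neighbors classifier described above, assuming PCA coordinates and a boolean training mask as in this task's API (names are illustrative):

```python
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def knn_predict(X_pca, labels, is_train, k=5):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_pca[is_train], labels[is_train])     # fit on labelled cells
    return clf.predict(X_pca[~is_train])           # majority vote of k neighbours

rng = np.random.default_rng(0)
preds = knn_predict(
    rng.random((200, 100)),
    rng.integers(0, 3, 200).astype(str),
    rng.random(200) < 0.8,
)
```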
+    ),
     paper_name="Nearest neighbor pattern classification",
     paper_reference="cover1967nearest",
     paper_year=1967,
-    code_url="https://scikit-learn.org/stable/modules/generated/"
-    "sklearn.neighbors.KNeighborsClassifier.html",
+    code_url=(
+        "https://scikit-learn.org/stable/modules/generated/"
+        "sklearn.neighbors.KNeighborsClassifier.html"
+    ),
 )
 
diff --git a/openproblems/tasks/label_projection/methods/logistic_regression.py b/openproblems/tasks/label_projection/methods/logistic_regression.py
index bbd51bf442..4d7c2cf854 100644
--- a/openproblems/tasks/label_projection/methods/logistic_regression.py
+++ b/openproblems/tasks/label_projection/methods/logistic_regression.py
@@ -7,11 +7,19 @@
 
 _logistic_regression_method = functools.partial(
     method,
+    method_summary=(
+        "Logistic Regression estimates parameters of a logistic function for"
+        " multivariate classification tasks. Here, we use 100-dimensional whitened PCA"
+        " coordinates as independent variables, and the model minimises the cross"
+        " entropy loss over all cell type classes."
+    ),
     paper_name="Applied Logistic Regression",
     paper_reference="hosmer2013applied",
     paper_year=2013,
-    code_url="https://scikit-learn.org/stable/modules/generated/"
-    "sklearn.linear_model.LogisticRegression.html",
+    code_url=(
+        "https://scikit-learn.org/stable/modules/generated/"
+        "sklearn.linear_model.LogisticRegression.html"
+    ),
 )
 
diff --git a/openproblems/tasks/label_projection/methods/mlp.py b/openproblems/tasks/label_projection/methods/mlp.py
index e421b9b3c4..0e92978105 100644
--- a/openproblems/tasks/label_projection/methods/mlp.py
+++ b/openproblems/tasks/label_projection/methods/mlp.py
@@ -7,11 +7,22 @@
 
 _mlp_method = functools.partial(
     method,
+    method_summary=(
+        'MLP or "Multi-Layer Perceptron" is a type of artificial neural network that'
+        " consists of multiple layers of interconnected neurons. Each neuron computes a"
+        " weighted sum of all neurons in the previous layer and transforms it with a"
+        " nonlinear activation function. The output layer provides the final"
+        " prediction, and network weights are updated by gradient descent to minimize"
+        " the cross entropy loss. Here, the input data is 100-dimensional whitened PCA"
+        " coordinates for each cell, and we use two hidden layers of 100 neurons each."
+    ),
     paper_name="Connectionist learning procedures",
     paper_reference="hinton1989connectionist",
     paper_year=1990,
-    code_url="https://scikit-learn.org/stable/modules/generated/"
-    "sklearn.neural_network.MLPClassifier.html",
+    code_url=(
+        "https://scikit-learn.org/stable/modules/generated/"
+        "sklearn.neural_network.MLPClassifier.html"
+    ),
 )
 
diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py
index b220b8bea8..7b68324439 100644
--- a/openproblems/tasks/label_projection/methods/scvi_tools.py
+++ b/openproblems/tasks/label_projection/methods/scvi_tools.py
@@ -5,8 +5,19 @@
 
 _scanvi_method = functools.partial(
     method,
-    paper_name="Probabilistic harmonization and annotation of single-cell"
-    " transcriptomics data with deep generative models",
+    method_summary=(
+        'scANVI or "single-cell ANnotation using Variational Inference" is a'
+        " semi-supervised variant of the scVI (Lopez et al., 2018) algorithm. Like"
+        " scVI, scANVI uses deep neural networks and stochastic optimization to model"
+        " uncertainty caused by technical noise and bias in single-cell"
+        " transcriptomics measurements. However, scANVI also leverages cell type labels"
+        " in the generative modelling. 
In this approach, scANVI is used to predict the"
+        " cell type labels of the unlabelled test data."
+    ),
+    paper_name=(
+        "Probabilistic harmonization and annotation of single-cell transcriptomics data"
+        " with deep generative models"
+    ),
     paper_reference="xu2021probabilistic",
     paper_year=2021,
     code_url="https://github.com/YosefLab/scvi-tools",
@@ -15,6 +26,13 @@
 
 _scanvi_scarches_method = functools.partial(
     method,
+    method_summary=(
+        'scArches+scANVI or "Single-cell architecture surgery" is a deep learning'
+        " method for mapping new datasets onto a pre-existing reference model, using"
+        " transfer learning and parameter optimization. It first uses scANVI to build a"
+        " reference model from the training data, and then applies scArches to map the"
+        " test data onto the reference model and make predictions."
+    ),
     paper_name="Query to reference single-cell integration with transfer learning",
     paper_reference="lotfollahi2020query",
     paper_year=2021,
diff --git a/openproblems/tasks/label_projection/methods/seurat.py b/openproblems/tasks/label_projection/methods/seurat.py
index 2871c06acf..f234bf17d6 100644
--- a/openproblems/tasks/label_projection/methods/seurat.py
+++ b/openproblems/tasks/label_projection/methods/seurat.py
@@ -12,6 +12,15 @@
 
 @method(
     method_name="Seurat reference mapping (SCTransform)",
+    method_summary=(
+        "Seurat reference mapping is a cell type label transfer method provided by the"
+        " Seurat package. Gene expression counts are first normalised by SCTransform"
+        " before computing PCA. Then it finds mutual nearest neighbours, known as"
+        " transfer anchors, between the labelled and unlabelled part of the data in PCA"
+        " space, and computes each cell’s distance to each of the anchor pairs."
+        " Finally, it uses the labelled anchors to predict cell types for unlabelled"
+        " cells based on these distances."
+    ),
     paper_name="Integrated analysis of multimodal single-cell data",
     paper_reference="hao2021integrated",
     paper_year=2021,
diff --git a/openproblems/tasks/label_projection/methods/xgboost.py b/openproblems/tasks/label_projection/methods/xgboost.py
index 7eef8bcbea..6efb895063 100644
--- a/openproblems/tasks/label_projection/methods/xgboost.py
+++ b/openproblems/tasks/label_projection/methods/xgboost.py
@@ -9,6 +9,12 @@
 
 _xgboost_method = functools.partial(
     method,
+    method_summary=(
+        "XGBoost is a gradient boosting decision tree model that learns multiple tree"
+        " structures in the form of a series of input features and their values,"
+        " leading to a prediction decision, and averages predictions from all its"
+        " trees. Here, input features are normalised gene expression values."
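[Editor's note] A minimal sketch of the XGBoost usage summarised above; hyperparameters are illustrative rather than the task's settings:

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.random((300, 50))             # normalised gene expression values
y = rng.integers(0, 4, 300)           # integer-encoded cell type labels

clf = xgb.XGBClassifier(n_estimators=10, objective="multi:softprob")
clf.fit(X[:200], y[:200])             # train on the labelled split
preds = clf.predict(X[200:])          # predict labels for held-out cells
```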
+    ),
     paper_name="XGBoost: A Scalable Tree Boosting System",
     paper_reference="chen2016xgboost",
     paper_year=2016,
diff --git a/openproblems/tasks/label_projection/metrics/accuracy.py b/openproblems/tasks/label_projection/metrics/accuracy.py
index bed7b3e4cf..37a67a7526 100644
--- a/openproblems/tasks/label_projection/metrics/accuracy.py
+++ b/openproblems/tasks/label_projection/metrics/accuracy.py
@@ -3,7 +3,12 @@
 import numpy as np
 
 
-@metric(metric_name="Accuracy", paper_reference="grandini2020metrics", maximize=True)
+@metric(
+    metric_name="Accuracy",
+    metric_summary="Average number of correctly applied labels.",
+    paper_reference="grandini2020metrics",
+    maximize=True,
+)
 def accuracy(adata):
     import sklearn.preprocessing
 
diff --git a/openproblems/tasks/label_projection/metrics/f1.py b/openproblems/tasks/label_projection/metrics/f1.py
index e893e588d0..47ce546c82 100644
--- a/openproblems/tasks/label_projection/metrics/f1.py
+++ b/openproblems/tasks/label_projection/metrics/f1.py
@@ -16,13 +16,29 @@ def _f1(adata, average="weighted"):
     )
 
 
-@metric(metric_name="F1 score", paper_reference="grandini2020metrics", maximize=True)
+@metric(
+    metric_name="F1 score",
+    metric_summary=(
+        "The [F1 score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)"  # noqa: E501
+        " is a weighted average of the precision and recall over all class labels,"
+        " where an F1 score reaches its best value at 1 and worst score at 0, and"
+        " each class contributes to the score relative to its frequency in the dataset."
+    ),
+    paper_reference="grandini2020metrics",
+    maximize=True,
+)
 def f1(adata):
     return _f1(adata, average="weighted")
 
 
 @metric(
-    metric_name="Macro F1 score", paper_reference="grandini2020metrics", maximize=True
+    metric_name="Macro F1 score",
+    metric_summary=(
+        "The macro F1 score is an unweighted F1 score, where each class contributes"
+        " equally, regardless of its frequency."
+    ),
+    paper_reference="grandini2020metrics",
+    maximize=True,
 )
 def f1_macro(adata):
     return _f1(adata, average="macro")
diff --git a/openproblems/tasks/matching_modalities/README.md b/openproblems/tasks/matching_modalities/README.md
index dd69a7803d..54bca2e6ea 100644
--- a/openproblems/tasks/matching_modalities/README.md
+++ b/openproblems/tasks/matching_modalities/README.md
@@ -23,18 +23,6 @@ data as ground truth so that we can evaluate when the observations from the sam
 acquired using different modalities are similar. A perfect result has each of the
 paired observations sharing the same coordinates in the latent space.
 
-## The metrics
-
-Metrics for matching modalities aim to characterize how well the aligned
-datasets correspond to the ground truth.
-
-* **kNN AUC**: Let $f(i) ∈ F$ be the modality 1 (e.g., scRNA-seq) measurement of cell $i$,
-  and $g(i) ∈ G$ be the modality 2 (e.g., scATAC-seq) measurement of cell $i$. kNN-AUC
-  calculates the average percentage overlap of neighborhoods of $f(i)$ in $F$ with
-  neighborhoods of $g(i)$ in $G$. Higher is better.
-* **MSE**: Mean squared error (MSE) is the average distance between each pair of matched
-  observations of the same cell in the learned latent space. Lower is better.
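[Editor's note] The difference between the two F1 variants described above is just the averaging mode passed to scikit-learn, e.g.:

```python
from sklearn.metrics import f1_score

y_true = ["B", "B", "B", "T", "NK"]
y_pred = ["B", "B", "T", "T", "NK"]

f1_weighted = f1_score(y_true, y_pred, average="weighted")  # frequency-weighted
f1_macro = f1_score(y_true, y_pred, average="macro")        # equal class weights
```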
- 
 ## API
 
 Datasets should include matched measurements from two modalities, which are contained in
diff --git a/openproblems/tasks/matching_modalities/methods/baseline.py b/openproblems/tasks/matching_modalities/methods/baseline.py
index 042b0ac233..0593fbabc3 100644
--- a/openproblems/tasks/matching_modalities/methods/baseline.py
+++ b/openproblems/tasks/matching_modalities/methods/baseline.py
@@ -1,17 +1,17 @@
-from ....tools.decorators import method
+from ....tools.decorators import baseline_method
 from ....tools.normalize import log_cp10k
 from ....tools.utils import check_version
 
 import numpy as np
 
 
-@method(
+@baseline_method(
     method_name="Random Features",
-    paper_name="Random Features (baseline)",
-    paper_reference="openproblems",
-    paper_year=2022,
-    code_url="https://github.com/openproblems-bio/openproblems",
-    is_baseline=True,
+    method_summary=(
+        "20-dimensional SVD is computed on the first modality, and is then randomly"
+        " permuted twice, once for use as the output for each modality, producing"
+        " random features with no correlation between modalities."
+    ),
 )
 def random_features(adata, test=False, n_svd=20):
     import sklearn.decomposition
@@ -27,13 +27,13 @@ def random_features(adata, test=False, n_svd=20):
     return adata
 
 
-@method(
+@baseline_method(
     method_name="True Features",
-    paper_name="True Features (baseline)",
-    paper_reference="openproblems",
-    paper_year=2022,
-    code_url="https://github.com/openproblems-bio/openproblems",
-    is_baseline=True,
+    method_summary=(
+        "20-dimensional SVD is computed on the first modality, and this same embedding"
+        " is used as output for both modalities, producing perfectly aligned features"
+        " from each modality."
+    ),
 )
 def true_features(adata, test=False, n_svd=20):
     import sklearn.decomposition
diff --git a/openproblems/tasks/matching_modalities/methods/harmonic_alignment.py b/openproblems/tasks/matching_modalities/methods/harmonic_alignment.py
index 6e28be7445..46565c9535 100644
--- a/openproblems/tasks/matching_modalities/methods/harmonic_alignment.py
+++ b/openproblems/tasks/matching_modalities/methods/harmonic_alignment.py
@@ -8,6 +8,14 @@
 
 _harmonic_alignment_method = functools.partial(
     method,
+    method_summary=(
+        "Harmonic alignment embeds cellular data from each modality into a common space"
+        " by computing a mapping between the 100-dimensional diffusion maps of each"
+        " modality. This mapping is obtained by computing an isometric transformation"
+        " of the eigenmaps, and concatenating the resulting diffusion maps together"
+        " into a joint 200-dimensional space. This joint diffusion map space is used as"
+        " output for the task."
+    ),
     paper_name="Harmonic Alignment",
     paper_reference="stanley2020harmonic",
     paper_year=2020,
diff --git a/openproblems/tasks/matching_modalities/methods/mnn.py b/openproblems/tasks/matching_modalities/methods/mnn.py
index 6f94b3695c..039129bfe0 100644
--- a/openproblems/tasks/matching_modalities/methods/mnn.py
+++ b/openproblems/tasks/matching_modalities/methods/mnn.py
@@ -10,8 +10,18 @@
 
 _mnn_method = functools.partial(
     method,
-    paper_name="Batch effects in single-cell RNA-sequencing data are corrected by "
-    "matching mutual nearest neighbors",
+    method_summary=(
+        "Mutual nearest neighbors (MNN) embeds cellular data from each modality into a"
+        " common space by computing a mapping between modality-specific 100-dimensional"
+        " SVD embeddings. 
The embeddings are integrated using the FastMNN version of"
+        " the MNN algorithm, which generates an embedding of the second modality mapped"
+        " to the SVD space of the first. This corrected joint SVD space is used as"
+        " output for the task."
+    ),
+    paper_name=(
+        "Batch effects in single-cell RNA-sequencing data are corrected by matching"
+        " mutual nearest neighbors"
+    ),
     paper_reference="haghverdi2018batch",
     paper_year=2018,
     code_url="https://github.com/LTLA/batchelor",
diff --git a/openproblems/tasks/matching_modalities/methods/procrustes.py b/openproblems/tasks/matching_modalities/methods/procrustes.py
index e144813055..a7ccd337d6 100644
--- a/openproblems/tasks/matching_modalities/methods/procrustes.py
+++ b/openproblems/tasks/matching_modalities/methods/procrustes.py
@@ -5,11 +5,20 @@
 
 @method(
     method_name="Procrustes superimposition",
+    method_summary=(
+        "Procrustes superimposition embeds cellular data from each modality into a"
+        " common space by aligning the 100-dimensional SVD embeddings to one another by"
+        " using an isometric transformation that minimizes the root mean squared"
+        " distance between points. The unmodified SVD embedding and the transformed"
+        " second modality are used as output for the task."
+    ),
     paper_name="Generalized Procrustes analysis",
     paper_reference="gower1975generalized",
     paper_year=1975,
-    code_url="https://docs.scipy.org/doc/scipy/reference/generated/"
-    "scipy.spatial.procrustes.html",
+    code_url=(
+        "https://docs.scipy.org/doc/scipy/reference/generated/"
+        "scipy.spatial.procrustes.html"
+    ),
 )
 def procrustes(adata, test=False, n_svd=None):
     import scipy.spatial
diff --git a/openproblems/tasks/matching_modalities/metrics/knn_auc.py b/openproblems/tasks/matching_modalities/metrics/knn_auc.py
index 3a50fbb178..8e783e17fd 100644
--- a/openproblems/tasks/matching_modalities/metrics/knn_auc.py
+++ b/openproblems/tasks/matching_modalities/metrics/knn_auc.py
@@ -5,6 +5,12 @@
 
 @metric(
     metric_name="kNN Area Under the Curve",
+    metric_summary=(
+        "Let $f(i) ∈ F$ be the scRNA-seq measurement of cell $i$, and $g(i) ∈ G$ be the"
+        " scATAC-seq measurement of cell $i$. kNN-AUC calculates the average"
+        " percentage overlap of neighborhoods of $f(i)$ in $F$ with neighborhoods of"
+        " $g(i)$ in $G$. Higher is better."
+    ),
     paper_reference="stanley2020harmonic",
     maximize=True,
 )
diff --git a/openproblems/tasks/matching_modalities/metrics/mse.py b/openproblems/tasks/matching_modalities/metrics/mse.py
index ffdfd1dfee..49dbf462c2 100644
--- a/openproblems/tasks/matching_modalities/metrics/mse.py
+++ b/openproblems/tasks/matching_modalities/metrics/mse.py
@@ -15,6 +15,10 @@ def _square(X):
 
 @metric(
     metric_name="Mean squared error",
+    metric_summary=(
+        "Mean squared error (MSE) is the average distance between each pair of matched"
+        " observations of the same cell in the learned latent space. Lower is better."
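[Editor's note] The Procrustes step described above maps directly onto scipy; a small sketch on two hypothetical SVD embeddings:

```python
import numpy as np
import scipy.spatial

rng = np.random.default_rng(0)
X1 = rng.normal(size=(100, 100))      # 100-dimensional SVD embedding, modality 1
X2 = rng.normal(size=(100, 100))      # 100-dimensional SVD embedding, modality 2

# Returns the standardised first input, the transformed second input, and the
# residual disparity after translation, scaling and rotation.
X1_std, X2_aligned, disparity = scipy.spatial.procrustes(X1, X2)
```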
+ ), paper_reference="lance2022multimodal", maximize=False, ) diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/baseline.py b/openproblems/tasks/regulatory_effect_prediction/methods/baseline.py index a91bdc60d4..f8305c8b60 100644 --- a/openproblems/tasks/regulatory_effect_prediction/methods/baseline.py +++ b/openproblems/tasks/regulatory_effect_prediction/methods/baseline.py @@ -1,16 +1,15 @@ -from ....tools.decorators import method +from ....tools.decorators import baseline_method from ....tools.utils import check_version import numpy as np -@method( +@baseline_method( method_name="Random Scores", - paper_name="Random Scores (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, + method_summary=( + "Random generation of gene scores by random permutation of gene expression" + " values" + ), ) def random_scores(adata, test=False): adata.obsm["gene_score"] = adata.X[ @@ -20,13 +19,9 @@ def random_scores(adata, test=False): return adata -@method( +@baseline_method( method_name="True Scores", - paper_name="True Scores (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, + method_summary="Perfect prediction of gene scores from gene expression values", ) def true_scores(adata, test=False): adata.obsm["gene_score"] = adata.X diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py index 5689c5a5a2..30f7a70bcb 100644 --- a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py +++ b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py @@ -229,8 +229,18 @@ def _beta(adata, test=False, top_genes=None, threshold=1): @method( method_name="BETA", - paper_name="Target analysis by integration of transcriptome " - "and ChIP-seq data with BETA", + method_summary=( + "Binding and expression target analysis (BETA) is a software package that" + " integrates ChIP-seq of TFs or chromatin regulators with differential gene" + " expression data to infer direct target genes. BETA has three functions: (i)" + " to predict whether the factor has activating or repressive function; (ii) to" + " infer the factor's target genes; and (iii) to identify the motif of the" + " factor and its collaborators, which might modulate the factor's activating or" + " repressive function." + ), + paper_name=( + "Target analysis by integration of transcriptome and ChIP-seq data with BETA" + ), paper_reference="wang2013target", paper_year=2013, code_version="1.0", diff --git a/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py b/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py index 4993d58cff..ac62c59376 100644 --- a/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py +++ b/openproblems/tasks/regulatory_effect_prediction/metrics/correlation.py @@ -28,6 +28,10 @@ def _correlation(adata, method="pearson"): @metric( metric_name="Median Pearson correlation", + metric_summary=( + "Median Pearson correlation between predicted and true gene expression over all" + " genes." + ), paper_reference="schober2018correlation", maximize=True, ) @@ -37,6 +41,10 @@ def pearson_correlation(adata): @metric( metric_name="Median Spearman correlation", + metric_summary=( + "Median Spearman correlation between predicted and true gene expression over" + " all genes." 
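[Editor's note] A sketch of the per-gene correlation metrics summarised above, with the median over genes taken as the reported score; the toy matrices are placeholders:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
pred = rng.random((100, 30))                       # predicted scores (cells x genes)
true = pred + rng.normal(scale=0.1, size=pred.shape)

median_pearson = np.median(
    [stats.pearsonr(pred[:, g], true[:, g])[0] for g in range(pred.shape[1])]
)
median_spearman = np.median(
    [stats.spearmanr(pred[:, g], true[:, g])[0] for g in range(pred.shape[1])]
)
```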
+    ),
     paper_reference="schober2018correlation",
     maximize=True,
 )
diff --git a/openproblems/tasks/spatial_decomposition/README.md b/openproblems/tasks/spatial_decomposition/README.md
index 79b3db1d8e..ff20f325b2 100644
--- a/openproblems/tasks/spatial_decomposition/README.md
+++ b/openproblems/tasks/spatial_decomposition/README.md
@@ -18,21 +18,6 @@ scNuc-seq) to guide the inference process, while the latter only work with the
 spatial data. We require that all datasets have an associated reference single cell
 data set, but methods are free to ignore this information.
 
-## Metrics
-
-### R2
-
-R2 pronounced as "R squared", also known as the "coefficient of determination". R2
-reports the fraction of the true proportion values' (`adata.obsm["proportions_true"]`)
-variance that can be explained by the predicted proportion values
-(`adata.obsm["proportion_pred"]`). The **best score**, and upper bound, is 1.0. There is
-no fixed lower bound for the metric. The _uniform/non-weighted average_ across all cell
-types/states is used to summarize performance. See the
-[sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html)
-documentation for details on the implementation and the
-[wikipedia](https://en.wikipedia.org/wiki/Coefficient_of_determination) site for more
-general information regarding the metric.
-
 ## API
 
 Datasets consists of 2 `anndata.AnnData` objects, concatenated by key
diff --git a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py
index 3bdfb77031..089a6a618c 100644
--- a/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py
+++ b/openproblems/tasks/spatial_decomposition/datasets/destvi/generate.py
@@ -4,13 +4,17 @@
 
 @dataset(
     "DestVI",
-    data_url="https://github.com/romain-lopez/DestVI-reproducibility/"
-    "blob/master/simulations/make_dataset.py",
+    data_url=(
+        "https://github.com/romain-lopez/DestVI-reproducibility/"
+        "blob/master/simulations/make_dataset.py"
+    ),
     data_reference="lopez2022destvi",
-    dataset_summary="scRNA-seq is generated based on learn NB parameters "
-    "from the destVI manuscripts leveraging sparsePCA. Number of cells and "
-    "cell types present in each spatial spot is computed via combination of "
-    "kernel-based parametrization of a categorical distribution and the NB model.",
+    dataset_summary=(
+        "scRNA-seq is generated based on NB parameters learned from the destVI"
+        " manuscript, leveraging sparsePCA. The number of cells and cell types present"
+        " in each spatial spot is computed via a combination of kernel-based"
+        " parametrization of a categorical distribution and the NB model."
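[Editor's note] The R2 scoring described in the removed README text above is scikit-learn's `r2_score` with a uniform average across cell types; a brief sketch with simulated proportions:

```python
import numpy as np
from sklearn.metrics import r2_score

rng = np.random.default_rng(0)
proportions_true = rng.dirichlet(np.ones(5), size=100)   # spots x cell types
proportions_pred = proportions_true + rng.normal(scale=0.05, size=(100, 5))

score = r2_score(proportions_true, proportions_pred, multioutput="uniform_average")
```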
+ ), image="openproblems-python-pytorch", ) def destvi(test=False): diff --git a/openproblems/tasks/spatial_decomposition/methods/baseline.py b/openproblems/tasks/spatial_decomposition/methods/baseline.py index 727ec1bf3d..f8d1eee962 100644 --- a/openproblems/tasks/spatial_decomposition/methods/baseline.py +++ b/openproblems/tasks/spatial_decomposition/methods/baseline.py @@ -1,17 +1,16 @@ -from ....tools.decorators import method +from ....tools.decorators import baseline_method from ....tools.utils import check_version from ..utils import split_sc_and_sp import numpy as np -@method( +@baseline_method( method_name="Random Proportions", - paper_name="Random Proportions (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, + method_summary=( + "Random assignment of predicted celltype proportions from a Dirichlet" + " distribution." + ), ) def random_proportions(adata, test=False): adata_sc, adata = split_sc_and_sp(adata) @@ -25,13 +24,11 @@ def random_proportions(adata, test=False): return adata -@method( +@baseline_method( method_name="True Proportions", - paper_name="True Proportions (baseline)", - paper_reference="openproblems", - paper_year=2022, - code_url="https://github.com/openproblems-bio/openproblems", - is_baseline=True, + method_summary=( + "Perfect assignment of predicted celltype proportions from the ground truth." + ), ) def true_proportions(adata, test=False): _, adata = split_sc_and_sp(adata) diff --git a/openproblems/tasks/spatial_decomposition/methods/cell2location.py b/openproblems/tasks/spatial_decomposition/methods/cell2location.py index 3c55a44eeb..6967c8a8f3 100644 --- a/openproblems/tasks/spatial_decomposition/methods/cell2location.py +++ b/openproblems/tasks/spatial_decomposition/methods/cell2location.py @@ -8,6 +8,11 @@ _cell2location_method = functools.partial( method, + method_summary=( + "Cell2location is a decomposition method based on Negative Binomial regression" + " that is able to account for batch effects in estimating the single-cell gene" + " expression signature used for the spatial decomposition step." + ), paper_name="Cell2location maps fine-grained cell types in spatial transcriptomics", paper_reference="kleshchevnikov2022cell2location", paper_year=2022, diff --git a/openproblems/tasks/spatial_decomposition/methods/destvi.py b/openproblems/tasks/spatial_decomposition/methods/destvi.py index 5486ca238e..3b9f7134fe 100644 --- a/openproblems/tasks/spatial_decomposition/methods/destvi.py +++ b/openproblems/tasks/spatial_decomposition/methods/destvi.py @@ -6,8 +6,15 @@ @method( method_name="DestVI", - paper_name="DestVI identifies continuums of cell types in spatial " - "transcriptomics data", + method_summary=( + "destVI is a decomposition method that leverages a conditional generative model" + " of spatial transcriptomics down to the sub-cell-type variation level, which" + " is then used to decompose the cell-type proportions determining the spatial" + " organization of a tissue." 
+    ),
+    paper_name=(
+        "DestVI identifies continuums of cell types in spatial transcriptomics data"
+    ),
     paper_reference="lopez2022destvi",
     paper_year=2022,
     code_url="https://github.com/YosefLab/scvi-tools",
diff --git a/openproblems/tasks/spatial_decomposition/methods/nmfreg.py b/openproblems/tasks/spatial_decomposition/methods/nmfreg.py
index dd5d0dfe91..3ba6fddb85 100644
--- a/openproblems/tasks/spatial_decomposition/methods/nmfreg.py
+++ b/openproblems/tasks/spatial_decomposition/methods/nmfreg.py
@@ -7,8 +7,16 @@
 
 @method(
     method_name="NMF-reg",
-    paper_name="Slide-seq: A scalable technology for measuring genome-wide"
-    " expression at high spatial resolution",
+    method_summary=(
+        "NMFreg is a decomposition method based on Non-negative Matrix Factorization"
+        " Regression (NMFreg) that reconstructs expression of each spatial location as"
+        " a weighted combination of cell-type signatures defined by scRNA-seq. It was"
+        " originally developed for Slide-seq data."
+    ),
+    paper_name=(
+        "Slide-seq: A scalable technology for measuring genome-wide expression at high"
+        " spatial resolution"
+    ),
     paper_reference="rodriques2019slide",
     paper_year=2019,
     code_url="https://github.com/tudaga/NMFreg_tutorial",
diff --git a/openproblems/tasks/spatial_decomposition/methods/nnls.py b/openproblems/tasks/spatial_decomposition/methods/nnls.py
index 23cfd7b5e1..4996217c90 100644
--- a/openproblems/tasks/spatial_decomposition/methods/nnls.py
+++ b/openproblems/tasks/spatial_decomposition/methods/nnls.py
@@ -9,11 +9,19 @@
 
 @method(
     method_name="Non-Negative Least Squares",
-    paper_name="Solving Least Squares Problems",
-    paper_reference="lawson1995solving",
-    paper_year=1987,
-    code_url="https://docs.scipy.org/doc/scipy/"
-    "reference/generated/scipy.optimize.nnls.html",
+    method_summary=(
+        "NNLS is a decomposition method based on Non-Negative Least Squares Regression"
+        " (NNLS). It was originally introduced in the AutoGeneS method."
+    ),
+    paper_name=(
+        "AutoGeneS: Automatic gene selection using multi-objective optimization for"
+        " RNA-seq deconvolution"
+    ),
+    paper_reference="aliee2021autogenes",
+    paper_year=2021,
+    code_url=(
+        "https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.nnls.html"
+    ),
 )
 def nnls_scipy(adata, test=False):
     from scipy.optimize import nnls
diff --git a/openproblems/tasks/spatial_decomposition/methods/rctd.py b/openproblems/tasks/spatial_decomposition/methods/rctd.py
index 50b6d8414c..26ffedf5cc 100644
--- a/openproblems/tasks/spatial_decomposition/methods/rctd.py
+++ b/openproblems/tasks/spatial_decomposition/methods/rctd.py
@@ -12,6 +12,13 @@
 
 @method(
     method_name="RCTD",
+    method_summary=(
+        "RCTD (Robust Cell Type Decomposition) is a decomposition method that uses"
+        " signatures learnt from single-cell data to decompose spatial expression of"
+        " tissues. It is able to perform a platform effect normalization step, which"
+        " normalizes the scRNA-seq cell type profiles to match the platform effects of"
+        " the spatial transcriptomics dataset."
+    ),
     paper_name="Robust decomposition of cell type mixtures in spatial transcriptomics",
     paper_reference="cable2021robust",
     paper_year=2020,
diff --git a/openproblems/tasks/spatial_decomposition/methods/seuratv3.py b/openproblems/tasks/spatial_decomposition/methods/seuratv3.py
index 0aaa92f4de..46c1f7e560 100644
--- a/openproblems/tasks/spatial_decomposition/methods/seuratv3.py
+++ b/openproblems/tasks/spatial_decomposition/methods/seuratv3.py
@@ -14,6 +14,10 @@
 
 @method(
     method_name="SeuratV3",
+    method_summary=(
+        "SeuratV3 is a decomposition method that is based on Canonical Correlation"
+        " Analysis (CCA)."
+    ),
     paper_name="Comprehensive Integration of Single-Cell Data",
     paper_reference="stuart2019comprehensive",
     paper_year=2019,
diff --git a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py
index d046376daa..83b23d1ea1 100644
--- a/openproblems/tasks/spatial_decomposition/methods/stereoscope.py
+++ b/openproblems/tasks/spatial_decomposition/methods/stereoscope.py
@@ -5,8 +5,16 @@
 
 @method(
     method_name="Stereoscope",
-    paper_name="Single-cell and spatial transcriptomics enables probabilistic "
-    "inference of cell type topography",
+    method_summary=(
+        "Stereoscope is a decomposition method based on Negative Binomial regression."
+        " It is similar in scope and implementation to cell2location but less flexible"
+        " in incorporating additional covariates such as batch effects and other types"
+        " of experimental design annotations."
+    ),
+    paper_name=(
+        "Single-cell and spatial transcriptomics enables probabilistic inference of"
+        " cell type topography"
+    ),
     paper_reference="andersson2020single",
     paper_year=2020,
     code_url="https://github.com/scverse/scvi-tools",
diff --git a/openproblems/tasks/spatial_decomposition/methods/tangram.py b/openproblems/tasks/spatial_decomposition/methods/tangram.py
index a1a9e92f82..8aabb486e6 100644
--- a/openproblems/tasks/spatial_decomposition/methods/tangram.py
+++ b/openproblems/tasks/spatial_decomposition/methods/tangram.py
@@ -5,8 +5,16 @@
 
 @method(
     method_name="Tangram",
-    paper_name="Deep learning and alignment of spatially resolved single-cell "
-    "transcriptomes with Tangram",
+    method_summary=(
+        "Tangram is a method to map gene expression signatures from scRNA-seq data to"
+        " spatial data. It performs the cell type mapping by learning a similarity"
+        " matrix between single-cell and spatial locations based on gene expression"
+        " profiles."
+    ),
+    paper_name=(
+        "Deep learning and alignment of spatially resolved single-cell transcriptomes"
+        " with Tangram"
+    ),
     paper_reference="biancalani2021deep",
     paper_year=2021,
     code_url="https://github.com/broadinstitute/Tangram",
diff --git a/openproblems/tasks/spatial_decomposition/methods/vanillanmf.py b/openproblems/tasks/spatial_decomposition/methods/vanillanmf.py
index d561ff14cf..81438d68d3 100644
--- a/openproblems/tasks/spatial_decomposition/methods/vanillanmf.py
+++ b/openproblems/tasks/spatial_decomposition/methods/vanillanmf.py
@@ -6,12 +6,24 @@
 
 @method(
     method_name="Non-Negative Matrix Factorization (NMF)",
-    paper_name="Fast local algorithms for large scale nonnegative "
-    "matrix and tensor factorizations",
+    method_summary=(
+        "NMF is a decomposition method based on Non-negative Matrix Factorization (NMF)"
+        " that reconstructs expression of each spatial location as a weighted"
+        " combination of cell-type signatures defined by scRNA-seq. It is a simpler"
It is a simpler" + " baseline than NMFreg as it only performs the NMF step based on mean" + " expression signatures of cell types, returning the weights loading of the NMF" + " as (normalized) cell type proportions, without the regression step." + ), + paper_name=( + "Fast local algorithms for large scale nonnegative matrix and tensor" + " factorizations" + ), paper_reference="cichocki2009fast", paper_year=2009, - code_url="https://scikit-learn.org/stable/modules/generated/" - "sklearn.decomposition.NMF.html", + code_url=( + "https://scikit-learn.org/stable/modules/generated/" + "sklearn.decomposition.NMF.html" + ), ) def nmf(adata, test=False, max_iter=None, random_state=17): """NMF for spatial deconvolution.""" diff --git a/openproblems/tasks/spatial_decomposition/metrics/r2.py b/openproblems/tasks/spatial_decomposition/metrics/r2.py index 13d7564ae5..29e1d4492b 100644 --- a/openproblems/tasks/spatial_decomposition/metrics/r2.py +++ b/openproblems/tasks/spatial_decomposition/metrics/r2.py @@ -1,7 +1,18 @@ from ....tools.decorators import metric -@metric(metric_name="r2", maximize=True, paper_reference="miles2005rsquared") +@metric( + metric_name="r2", + metric_summary=( + "R2, or the “coefficient of determination”, reports the fraction of the true" + " proportion values’ variance that can be explained by the predicted proportion" + " values. The best score, and upper bound, is 1.0. There is no fixed lower" + " bound for the metric. The uniform/non-weighted average across all cell" + " types/states is used to summarise performance." + ), + maximize=True, + paper_reference="miles2005rsquared", +) def r2(adata): import sklearn.metrics diff --git a/openproblems/tools/decorators.py b/openproblems/tools/decorators.py index dd2af6e193..ed5c9bec58 100644 --- a/openproblems/tools/decorators.py +++ b/openproblems/tools/decorators.py @@ -53,6 +53,7 @@ def _backport_code_version(apply_method, code_version): def method( method_name, + method_summary, paper_name, paper_reference, paper_year, @@ -67,6 +68,8 @@ def method( ---------- method_name : str Unique human readable name of the method + method_summary : str + Short summary of the method paper_name : str Title of the seminal paper describing the method paper_reference : str @@ -91,6 +94,7 @@ def apply_method(adata: anndata.AnnData, *args, **kwargs): apply_method.metadata = dict( method_name=method_name, + method_summary=method_summary, paper_name=paper_name, paper_reference=paper_reference, paper_year=paper_year, @@ -104,7 +108,19 @@ def apply_method(adata: anndata.AnnData, *args, **kwargs): return decorator -def metric(metric_name, maximize, paper_reference, image="openproblems"): +baseline_method = functools.partial( + method, + paper_name="Open Problems for Single Cell Analysis", + paper_reference="openproblems", + paper_year=2022, + code_url="https://github.com/openproblems-bio/openproblems", + is_baseline=True, +) + + +def metric( + metric_name, maximize, metric_summary, paper_reference, image="openproblems" +): """Decorate a metric function. 
Parameters @@ -116,6 +132,8 @@ def metric(metric_name, maximize, paper_reference, image="openproblems"): ---------- metric_name : str Unique human readable name of the metric + metric_summary : str + Short summary of the metric paper_reference : str BibTex key from `main.bib` referring to the seminal paper in which the metric was defined @@ -133,6 +151,7 @@ def apply_metric(adata: anndata.AnnData, *args, **kwargs): apply_metric.metadata = dict( metric_name=metric_name, + metric_summary=metric_summary, paper_reference=paper_reference, maximize=maximize, image=image, @@ -161,7 +180,7 @@ def dataset( BibTex key from `main.bib` referring to the paper describing how the dataset was generated dataset_summary : str - Short (<80 character) summary of the dataset + Short summary of the dataset image : str, optional (default: "openproblems") Name of the Docker image to be used for this dataset """ diff --git a/test/test_core_metadata.py b/test/test_core_metadata.py index 57139e0083..bc0fc8a1f0 100644 --- a/test/test_core_metadata.py +++ b/test/test_core_metadata.py @@ -8,7 +8,13 @@ import utils.name DATASET_SUMMARY_MINLEN = 40 -DATASET_SUMMARY_MAXLEN = 1000 +DATASET_SUMMARY_MAXLEN = 400 + +METHOD_SUMMARY_MINLEN = 40 +METHOD_SUMMARY_MAXLEN = 1000 + +METRIC_SUMMARY_MINLEN = 40 +METRIC_SUMMARY_MAXLEN = 400 @parameterized.parameterized.expand( @@ -61,6 +67,9 @@ def test_method_metadata(method): assert isinstance(method.metadata["image"], str) assert method.metadata["image"].startswith("openproblems") assert isinstance(method.metadata["method_name"], str) + assert isinstance(method.metadata["method_summary"], str) + assert len(method.metadata["method_summary"]) > METHOD_SUMMARY_MINLEN + assert len(method.metadata["method_summary"]) < METHOD_SUMMARY_MAXLEN assert isinstance(method.metadata["paper_name"], str) assert isinstance(method.metadata["paper_year"], int) assert isinstance(method.metadata["paper_reference"], str) @@ -81,6 +90,9 @@ def test_metric_metadata(metric): assert attr in metric.metadata assert isinstance(metric.metadata["maximize"], bool) assert isinstance(metric.metadata["metric_name"], str) + assert isinstance(metric.metadata["metric_summary"], str) + assert len(metric.metadata["metric_summary"]) > METRIC_SUMMARY_MINLEN + assert len(metric.metadata["metric_summary"]) < METRIC_SUMMARY_MAXLEN assert isinstance(metric.metadata["image"], str) assert metric.metadata["image"].startswith("openproblems") assert isinstance(metric.metadata["paper_reference"], str) From 154ccb9fd99113f3d28d9c3f139194539a0290f9 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Tue, 7 Mar 2023 14:03:47 -0500 Subject: [PATCH 262/266] Shorten method descriptions (#842) * shorten method descriptions * Update auprc.py * pre-commit --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../_common/metrics/auprc.py | 5 +++- .../dimensionality_reduction/methods/pca.py | 4 +-- .../dimensionality_reduction/methods/tsne.py | 8 ++---- .../methods/cell2location.py | 25 +++++++------------ test/test_core_metadata.py | 2 ++ 5 files changed, 19 insertions(+), 25 deletions(-) diff --git a/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py b/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py index ff7a12a902..8a2ac9d3b9 100644 --- a/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py +++ b/openproblems/tasks/_cell_cell_communication/_common/metrics/auprc.py @@ -4,7 +4,10 @@ 
@metric( metric_name="Precision-recall AUC", - metric_summary="Area under the precision-recall curve.", + metric_summary=( + "Area under the precision-recall curve for the binary classification task" + " predicting interactions." + ), paper_reference="davis2006prauc", maximize=True, ) diff --git a/openproblems/tasks/dimensionality_reduction/methods/pca.py b/openproblems/tasks/dimensionality_reduction/methods/pca.py index 7505787968..939a9babec 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/pca.py +++ b/openproblems/tasks/dimensionality_reduction/methods/pca.py @@ -38,13 +38,13 @@ def _pca(adata, genes=None): return adata -@_pca_method(method_name="Principle Component Analysis (PCA) (logCP10k)") +@_pca_method(method_name="PCA (logCP10k)") def pca_logCP10k(adata, test: bool = False): adata = log_cp10k(adata) return _pca(adata) -@_pca_method(method_name="Principle Component Analysis (PCA) (logCP10k, 1kHVG)") +@_pca_method(method_name="PCA (logCP10k, 1kHVG)") def pca_logCP10k_1kHVG(adata, test: bool = False): adata = log_cp10k_hvg(adata) return _pca(adata, genes=adata.var["highly_variable"]) diff --git a/openproblems/tasks/dimensionality_reduction/methods/tsne.py b/openproblems/tasks/dimensionality_reduction/methods/tsne.py index c074df2b4f..095f30f5b8 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/tsne.py +++ b/openproblems/tasks/dimensionality_reduction/methods/tsne.py @@ -41,17 +41,13 @@ def _tsne(adata, genes=None, test=False, n_pca=50): return adata -@_tsne_method( - method_name="t-Distributed Stochastic Neighbor Embedding (t-SNE) (logCP10k, 1kHVG)" -) +@_tsne_method(method_name="t-SNE (logCP10k, 1kHVG)") def tsne_logCP10k_1kHVG(adata, test: bool = False, n_pca=50): adata = log_cp10k_hvg(adata) return _tsne(adata, genes=adata.var["highly_variable"], test=test, n_pca=n_pca) -@_tsne_method( - method_name="t-Distributed Stochastic Neighbor Embedding (t-SNE) (logCP10k)" -) +@_tsne_method(method_name="t-SNE (logCP10k)") def tsne_logCP10k(adata, test: bool = False, n_pca=50): adata = log_cp10k(adata) return _tsne(adata, test=test, n_pca=n_pca) diff --git a/openproblems/tasks/spatial_decomposition/methods/cell2location.py b/openproblems/tasks/spatial_decomposition/methods/cell2location.py index 6967c8a8f3..caf304f205 100644 --- a/openproblems/tasks/spatial_decomposition/methods/cell2location.py +++ b/openproblems/tasks/spatial_decomposition/methods/cell2location.py @@ -11,7 +11,10 @@ method_summary=( "Cell2location is a decomposition method based on Negative Binomial regression" " that is able to account for batch effects in estimating the single-cell gene" - " expression signature used for the spatial decomposition step." + " expression signature used for the spatial decomposition step. Note that since" + " batch information is unavailable in this task, here we use either a" + " hard-coded reference, or a negative-binomial learned reference without batch" + " labels. The parameter alpha refers to the detection efficiency prior." 
), paper_name="Cell2location maps fine-grained cell types in spatial transcriptomics", paper_reference="kleshchevnikov2022cell2location", @@ -159,9 +162,7 @@ def _cell2location( return adata -@_cell2location_method( - method_name="Cell2location (detection_alpha=20, reference hard-coded)" -) +@_cell2location_method(method_name="Cell2location (alpha=20, reference hard-coded)") def cell2location_detection_alpha_20( adata, detection_alpha=20, @@ -190,9 +191,7 @@ def cell2location_detection_alpha_20( ) -@_cell2location_method( - method_name="Cell2location (detection_alpha=1, reference hard-coded)" -) +@_cell2location_method(method_name="Cell2location (alpha=1, reference hard-coded)") def cell2location_detection_alpha_1( adata, detection_alpha=1, @@ -221,9 +220,7 @@ def cell2location_detection_alpha_1( ) -@_cell2location_method( - method_name="Cell2location (detection_alpha=20, reference NB without batch info)" -) +@_cell2location_method(method_name="Cell2location (alpha=20, NB reference)") def cell2location_detection_alpha_20_nb( adata, detection_alpha=20, @@ -252,9 +249,7 @@ def cell2location_detection_alpha_20_nb( ) -@_cell2location_method( - method_name="Cell2location (detection_alpha=200, reference hard-coded)" -) +@_cell2location_method(method_name="Cell2location (alpha=200, reference hard-coded)") def cell2location_detection_alpha_200( adata, detection_alpha=200, @@ -283,9 +278,7 @@ def cell2location_detection_alpha_200( ) -@_cell2location_method( - method_name="Cell2location, amortised (detection_alpha=20, reference hard-coded)" -) +@_cell2location_method(method_name="Cell2location (alpha=20, amortised, hard-coded)") def cell2location_amortised_detection_alpha_20( adata, detection_alpha=20, diff --git a/test/test_core_metadata.py b/test/test_core_metadata.py index bc0fc8a1f0..6d8876c8ca 100644 --- a/test/test_core_metadata.py +++ b/test/test_core_metadata.py @@ -10,6 +10,7 @@ DATASET_SUMMARY_MINLEN = 40 DATASET_SUMMARY_MAXLEN = 400 +METHOD_NAME_MAXLEN = 50 METHOD_SUMMARY_MINLEN = 40 METHOD_SUMMARY_MAXLEN = 1000 @@ -67,6 +68,7 @@ def test_method_metadata(method): assert isinstance(method.metadata["image"], str) assert method.metadata["image"].startswith("openproblems") assert isinstance(method.metadata["method_name"], str) + assert len(method.metadata["method_name"]) < METHOD_NAME_MAXLEN assert isinstance(method.metadata["method_summary"], str) assert len(method.metadata["method_summary"]) > METHOD_SUMMARY_MINLEN assert len(method.metadata["method_summary"]) < METHOD_SUMMARY_MAXLEN From 637163fba7d74ab5393c2adbee5354dcf4d46f85 Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 10 Mar 2023 10:25:04 -0500 Subject: [PATCH 263/266] Update content processing for new website (#848) * parse metadata in new format * typo # ci skip * ignore workdir * remove selectively * temp * bugfix # ci skip * fix ref # ci skip * more temp # ci skip * don't remove tasks # ci skip # publish * pretty print # ci skip # publish * move get_sha to utils * add get_sha to workflow_utils * push meta to experimental website * monitor separate;y * don't build images for website * id -> task_id * add description if missing * only publish explicitly * test website # publish * fix ref # publish * untemp * remove patch # publish * clean up readmes # publish * also publish the bib * remove experimental website repo * remove unnecessary diff * add code url to function metadata # publish * set base to main # publish * code_url was already taken # publish --- 
.github/workflows/run_tests.yml | 1 + .github/workflows/update_website_content.yml | 17 ++-- main.bib | 2 +- .../batch_integration_embed/README.md | 4 - .../batch_integration_feature/README.md | 4 - .../batch_integration_graph/README.md | 4 - .../tasks/_cell_cell_communication/README.md | 2 - .../README.md | 4 +- .../README.md | 4 +- openproblems/tasks/denoising/README.md | 2 - .../tasks/dimensionality_reduction/README.md | 2 - openproblems/tasks/label_projection/README.md | 2 - .../tasks/matching_modalities/README.md | 2 - .../tasks/spatial_decomposition/README.md | 2 - openproblems/utils.py | 5 ++ scripts/generate_test_matrix.py | 2 +- setup.py | 2 +- test/test_core_cli.py | 18 ++-- test/test_task_1_load_data.py | 2 +- test/test_task_methods.py | 2 +- test/test_task_metrics.py | 2 +- test/utils/cache.py | 3 +- workflow/generate_website_markdown.py | 72 ---------------- workflow/parse_metadata.py | 86 +++++++++++++++++++ workflow/workflow_utils.py | 12 +++ 25 files changed, 133 insertions(+), 125 deletions(-) delete mode 100644 workflow/generate_website_markdown.py create mode 100644 workflow/parse_metadata.py diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index c4c3f9ece7..227b547475 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -26,6 +26,7 @@ jobs: if: | !endsWith(github.event.head_commit.message, '# ci skip') && !startsWith(github.ref, 'refs/heads/test_process') && + !startsWith(github.ref, 'refs/heads/test_website') && ( (github.event_name != 'pull_request_review') || ( diff --git a/.github/workflows/update_website_content.yml b/.github/workflows/update_website_content.yml index fdddf261dd..bf45b2516d 100644 --- a/.github/workflows/update_website_content.yml +++ b/.github/workflows/update_website_content.yml @@ -31,6 +31,7 @@ jobs: with: fetch-depth: 1 repository: openproblems-bio/website + ref: main path: website token: ${{ secrets.GH_ACTIONS_WEBSITE_PAT }} @@ -51,18 +52,18 @@ jobs: pip install -U --editable ./openproblems[process] python -c "import openproblems" - - name: Parse results + - name: Parse metadata run: | - rm -r website/content/benchmarks/*/ - python openproblems/workflow/generate_website_markdown.py website/content/benchmarks - cp openproblems/main.bib website/static/bibliography + python openproblems/workflow/parse_metadata.py website/results + cp openproblems/main.bib website/bibliography/library.bib cd website git diff --exit-code --quiet || echo "CHANGED=true" >> $GITHUB_ENV - - name: Upload markdown + + - name: Upload json uses: actions/upload-artifact@main with: - name: markdown + name: json path: website/content/benchmarks - name: Push to openproblems-bio/website @@ -88,11 +89,11 @@ jobs: ) uses: peter-evans/create-pull-request@v4 with: + base: main branch: ${{ env.UPDATE_BRANCH_NAME }} delete-branch: true - base: main title: '[auto] Update benchmark content' - reviewers: scottgigante-immunai,rcannood,dburkhardt + reviewers: scottgigante-immunai,rcannood path: './website' token: ${{ secrets.GH_ACTIONS_WEBSITE_PAT }} author: "openproblems-bio " diff --git a/main.bib b/main.bib index 5bce346a17..88eb6dc1a3 100644 --- a/main.bib +++ b/main.bib @@ -773,7 +773,7 @@ @article{schober2018correlation author = {Patrick Schober and Christa Boer and Lothar A. 
Schwarte}, year = {2018}, month = may, - journal = {Anesthesia {\&}amp$\mathsemicolon$ Analgesia}, + journal = {Anesthesia {\&} Analgesia}, publisher = {Ovid Technologies (Wolters Kluwer Health)}, volume = {126}, number = {5}, diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/README.md b/openproblems/tasks/_batch_integration/batch_integration_embed/README.md index 0e3d1bcc93..88d609bc9b 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_embed/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_embed/README.md @@ -1,9 +1,5 @@ - - # Batch integration embedding -## The task - This is a sub-task of the overall batch integration task. Batch (or data) integration integrates datasets across batches that arise from various biological and technical sources. Methods that integrate batches typically have three different types of output: diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md index bad7e1499e..2b3ba2a5dc 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_feature/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_feature/README.md @@ -1,9 +1,5 @@ - - # Batch integration feature -## The task - This is a sub-task of the overall batch integration task. Batch (or data) integration integrates datasets across batches that arise from various biological and technical sources. Methods that integrate batches typically have three different types of output: diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/README.md b/openproblems/tasks/_batch_integration/batch_integration_graph/README.md index 704302fc24..ba39bc2d58 100644 --- a/openproblems/tasks/_batch_integration/batch_integration_graph/README.md +++ b/openproblems/tasks/_batch_integration/batch_integration_graph/README.md @@ -1,9 +1,5 @@ - - # Batch integration (graph) -## The task - This is a sub-task of the overall batch integration task. Batch (or data) integration methods integrate datasets across batches that arise from various biological and technical sources. Methods that integrate batches typically have three different types diff --git a/openproblems/tasks/_cell_cell_communication/README.md b/openproblems/tasks/_cell_cell_communication/README.md index 637c03c4bd..5494d02aae 100644 --- a/openproblems/tasks/_cell_cell_communication/README.md +++ b/openproblems/tasks/_cell_cell_communication/README.md @@ -1,7 +1,5 @@ # Cell-cell Communication -## The task - The growing availability of single-cell data has sparked an increased interest in the inference of cell-cell communication (CCC), with an ever-growing number of computational tools developed for this purpose. 
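These "## The task" heading removals pair with the README parsing added later in this patch: when website content is generated, top-level headings and everything from the API section onward are stripped, so a separate task heading would never be rendered. A minimal sketch of that filtering, reusing the two regexes from `parse_metadata.py` below (the sample README text is shortened for illustration):

    import re

    API_PATTERN = re.compile(r"^#.*API$")
    HEADING_PATTERN = re.compile(r"^# ")

    readme = """# Denoising

    Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules.

    ## API

    Datasets should contain ...
    """

    description = ""
    for line in readme.splitlines(keepends=True):
        if HEADING_PATTERN.match(line.lstrip()):
            continue  # drop top-level headings such as "# Denoising"
        if API_PATTERN.match(line.lstrip()):
            break  # ignore the "## API" section and everything after it
        description += line
    print(description)
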
diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md index 51546e8ab3..3b005b61d9 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_ligand_target/README.md @@ -1,6 +1,4 @@ -# Cell-cell Communication - -## The task +# Cell-cell Communication (ligand-target) The growing availability of single-cell data has sparked an increased interest in the inference of cell-cell communication (CCC), diff --git a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md index 941d2c93fa..d996555e64 100644 --- a/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md +++ b/openproblems/tasks/_cell_cell_communication/cell_cell_communication_source_target/README.md @@ -1,6 +1,4 @@ -# Cell-cell Communication - -## The task +# Cell-cell Communication (source-target) The growing availability of single-cell data has sparked an increased interest in the inference of cell-cell communication (CCC), diff --git a/openproblems/tasks/denoising/README.md b/openproblems/tasks/denoising/README.md index 7498d6e723..9e76488d4f 100644 --- a/openproblems/tasks/denoising/README.md +++ b/openproblems/tasks/denoising/README.md @@ -1,7 +1,5 @@ # Denoising -## The task - Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present in each cell. As a result, the measurements (UMI counts) observed for each gene and each cell are associated with generally high levels of technical noise ([Grün et al., diff --git a/openproblems/tasks/dimensionality_reduction/README.md b/openproblems/tasks/dimensionality_reduction/README.md index db7ea1a9ea..4a82604d30 100644 --- a/openproblems/tasks/dimensionality_reduction/README.md +++ b/openproblems/tasks/dimensionality_reduction/README.md @@ -1,7 +1,5 @@ # Dimensionality reduction for visualisation -## The task - Dimensionality reduction is one of the key challenges in single-cell data representation. Routine single-cell RNA sequencing (scRNA-seq) experiments measure cells in roughly 20,000-30,000 dimensions (i.e., features - mostly gene transcripts but also diff --git a/openproblems/tasks/label_projection/README.md b/openproblems/tasks/label_projection/README.md index 2ad3ba44da..4b17408a74 100644 --- a/openproblems/tasks/label_projection/README.md +++ b/openproblems/tasks/label_projection/README.md @@ -1,7 +1,5 @@ # Label Projection -## The task - A major challenge for integrating single cell datasets is creating matching cell type annotations for each cell. One of the most common strategies for annotating cell types is referred to as diff --git a/openproblems/tasks/matching_modalities/README.md b/openproblems/tasks/matching_modalities/README.md index 54bca2e6ea..cbc07db2ff 100644 --- a/openproblems/tasks/matching_modalities/README.md +++ b/openproblems/tasks/matching_modalities/README.md @@ -1,7 +1,5 @@ # Matching modalities -## The task - Cellular function is regulated by the complex interplay of different types of biological molecules (DNA, RNA, proteins, etc.), which determine the state of a cell. 
Several recently described technologies allow for simultaneous measurement of different aspects diff --git a/openproblems/tasks/spatial_decomposition/README.md b/openproblems/tasks/spatial_decomposition/README.md index ff20f325b2..5994fedfd3 100644 --- a/openproblems/tasks/spatial_decomposition/README.md +++ b/openproblems/tasks/spatial_decomposition/README.md @@ -1,7 +1,5 @@ # Spatial Decomposition/Deconvolution -## The task - Spatial decomposition (also often referred to as Spatial deconvolution) is applicable to spatial transcriptomics data where the transcription profile of each capture location (spot, voxel, bead, etc.) do not share a bijective diff --git a/openproblems/utils.py b/openproblems/utils.py index 4a9f35c5c0..6a9e23fda4 100644 --- a/openproblems/utils.py +++ b/openproblems/utils.py @@ -48,3 +48,8 @@ def get_members(module): def get_callable_members(module): """Get all callable public members from a module.""" return [member for member in get_members(module) if callable(member)] + + +def get_member_id(member): + """Get the submodule or function name for a task, dataset, method or metric""" + return member.__name__.split(".")[-1] diff --git a/scripts/generate_test_matrix.py b/scripts/generate_test_matrix.py index 455f772a26..06395af4a9 100644 --- a/scripts/generate_test_matrix.py +++ b/scripts/generate_test_matrix.py @@ -8,7 +8,7 @@ def generate_matrix(): suites = _CORE_TEST_SUITES.copy() for task in openproblems.TASKS: - task_name = task.__name__.split(".")[-1] + task_name = openproblems.utils.get_member_id(task) suites.extend([f"{suite} and {task_name}" for suite in _TASK_TEST_SUITES]) return suites diff --git a/setup.py b/setup.py index 578a337ae6..6ea808be24 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ evaluate_requires = ["snakemake>=7.8,<7.17", "tabulate<0.9"] -process_requires = ["numpyencoder==0.3.*"] +process_requires = ["numpyencoder==0.3.*", "gitpython==3.1.*"] test_requires = [ "pytest==7.1.*", diff --git a/test/test_core_cli.py b/test/test_core_cli.py index f0c416c328..e52cd7c4ca 100644 --- a/test/test_core_cli.py +++ b/test/test_core_cli.py @@ -24,11 +24,13 @@ def test_print(capsys): def test_tasks(capsys): """Test task listing.""" result = np.array(main(["tasks"], do_print=False)) - expected = np.array([task.__name__.split(".")[-1] for task in openproblems.TASKS]) + expected = np.array( + [openproblems.utils.get_member_id(task) for task in openproblems.TASKS] + ) assert np.all(result == expected) result = np.array(main(["tasks"], do_print=True)) - expected = ( - "\n".join([task.__name__.split(".")[-1] for task in openproblems.TASKS]) + "\n" + expected = "\n".join( + [openproblems.utils.get_member_id(task) for task in openproblems.TASKS] + [""] ) captured = capsys.readouterr() assert captured.out == expected @@ -42,7 +44,7 @@ def test_list(task): """Test function listing.""" result = np.array( main( - ["list", "--task", task.__name__.split(".")[-1], "--datasets"], + ["list", "--task", openproblems.utils.get_member_id(task), "--datasets"], do_print=False, ) ) @@ -51,7 +53,7 @@ def test_list(task): result = np.array( main( - ["list", "--task", task.__name__.split(".")[-1], "--methods"], + ["list", "--task", openproblems.utils.get_member_id(task), "--methods"], do_print=False, ) ) @@ -60,7 +62,7 @@ def test_list(task): result = np.array( main( - ["list", "--task", task.__name__.split(".")[-1], "--metrics"], + ["list", "--task", openproblems.utils.get_member_id(task), "--metrics"], do_print=False, ) ) @@ -73,7 +75,7 @@ def _test_image(task, function_type, 
function): [ "image", "--task", - task.__name__.split(".")[-1], + openproblems.utils.get_member_id(task), function_type, function.__name__, ], @@ -204,7 +206,7 @@ def __zero_metric(*args): [ "evaluate", "--task", - task.__name__.split(".")[-1], + openproblems.utils.get_member_id(task), "--input", dataset_file, metric_name, diff --git a/test/test_task_1_load_data.py b/test/test_task_1_load_data.py index 125a2f3fc0..9dfd3e6099 100644 --- a/test/test_task_1_load_data.py +++ b/test/test_task_1_load_data.py @@ -7,7 +7,7 @@ @parameterized.parameterized.expand( [ ( - task.__name__.split(".")[-1], + openproblems.utils.get_member_id(task), dataset.__name__, test, utils.TEMPDIR.name, diff --git a/test/test_task_methods.py b/test/test_task_methods.py index 78d2e69ce3..bdd18283e2 100644 --- a/test/test_task_methods.py +++ b/test/test_task_methods.py @@ -14,7 +14,7 @@ @parameterized.parameterized.expand( [ ( - task.__name__.split(".")[-1], + openproblems.utils.get_member_id(task), method.__name__, method.metadata["image"], ) diff --git a/test/test_task_metrics.py b/test/test_task_metrics.py index 96c57ceb01..d149ed7dfe 100644 --- a/test/test_task_metrics.py +++ b/test/test_task_metrics.py @@ -8,7 +8,7 @@ @parameterized.parameterized.expand( [ ( - task.__name__.split(".")[-1], + openproblems.utils.get_member_id(task), metric.__name__, metric.metadata["image"], ) diff --git a/test/utils/cache.py b/test/utils/cache.py index 93b31098a6..fcdfaa15d3 100644 --- a/test/utils/cache.py +++ b/test/utils/cache.py @@ -1,10 +1,11 @@ import anndata +import openproblems import os def _cache_name(tempdir, task, dataset, test=None, method=None): if not isinstance(task, str): - task = task.__name__.split(".")[-1] + task = openproblems.utils.get_member_id(task) if not isinstance(dataset, str): dataset = dataset.__name__ if method is not None: diff --git a/workflow/generate_website_markdown.py b/workflow/generate_website_markdown.py deleted file mode 100644 index 9bb3bbf643..0000000000 --- a/workflow/generate_website_markdown.py +++ /dev/null @@ -1,72 +0,0 @@ -import openproblems -import os -import pathlib -import re -import sys -import workflow_utils - -INDEX_TOML_TEMPLATE = """+++ -title = "{task_name}" -summary = "{task_summary}" -headless = false -theme = "op" -+++ -""" - -DATASET_TOML_TEMPLATE = """+++ -title = "{dataset_name}" -summary = "{dataset_summary}" -+++ -""" - -API_PATTERN = re.compile(r"^#.*API$") -HEADING_PATTERN = re.compile(r"^# ") - - -def write_index_md(task, outdir): - output_md = INDEX_TOML_TEMPLATE.format( - task_name=task._task_name, task_summary=task._task_summary - ) - readme_file = task.__file__.replace("__init__.py", "README.md") - with open(readme_file, "r") as readme_handle: - for line in readme_handle: - if HEADING_PATTERN.match(line): - # exclude top-level headings - continue - if API_PATTERN.match(line): - # exclude everything after ## API - break - output_md += line - - output_file = os.path.join(outdir, "_index.md") - with open(output_file, "w") as output_handle: - output_handle.write(output_md) - - -def write_dataset_md(dataset, outdir): - output_md = DATASET_TOML_TEMPLATE.format( - dataset_name=dataset.metadata["dataset_name"], - dataset_summary=dataset.metadata["dataset_summary"], - ) - - dataset_name = dataset.__name__.split(".")[-1] - output_file = os.path.join(outdir, f"{dataset_name}.md") - with open(output_file, "w") as output_handle: - output_handle.write(output_md) - - -def main(outdir): - for task in openproblems.TASKS: - if workflow_utils.task_is_incomplete(task): - # don't 
write md for incomplete tasks - continue - task_outdir = os.path.join(outdir, task.__name__.split(".")[-1]) - if not os.path.isdir(task_outdir): - pathlib.Path(task_outdir).mkdir(parents=True, exist_ok=True) - write_index_md(task, task_outdir) - for dataset in task.DATASETS: - write_dataset_md(dataset, task_outdir) - - -if __name__ == "__main__": - main(sys.argv[1]) diff --git a/workflow/parse_metadata.py b/workflow/parse_metadata.py new file mode 100644 index 0000000000..570a2f94b6 --- /dev/null +++ b/workflow/parse_metadata.py @@ -0,0 +1,86 @@ +import json +import openproblems +import pathlib +import re +import sys +import workflow_utils + +API_PATTERN = re.compile(r"^#.*API$") +HEADING_PATTERN = re.compile(r"^# ") + + +def get_task_description(task): + description = "" + readme_file = task.__file__.replace("__init__.py", "README.md") + with open(readme_file, "r") as readme_handle: + for line in readme_handle: + if HEADING_PATTERN.match(line): + # exclude top-level headings + continue + if API_PATTERN.match(line): + # exclude everything after ## API + break + description += line + return description + + +def write_task_json(task, outdir: pathlib.Path): + data = { + "task_id": openproblems.utils.get_member_id(task), + "commit_sha": workflow_utils.get_sha(), + "task_name": task._task_name, + "task_summary": task._task_summary, + "task_description": get_task_description(task), + "repo": "openproblems-bio/openproblems", + } + with open(outdir.joinpath("task_info.json"), "w") as handle: + json.dump(data, handle, indent=4) + + +def _write_function_json(task, outdir: pathlib.Path, functions, function_type: str): + data = [] + for function in functions: + function.metadata.update( + { + "task_id": openproblems.utils.get_member_id(task), + "commit_sha": workflow_utils.get_sha(), + f"{function_type}_id": openproblems.utils.get_member_id(function), + "implementation_url": ( + "https://github.com/openproblems-bio/openproblems/" + f"blob/main/{function.__module__.replace('.', '/')}.py" + ), + } + ) + data.append(function.metadata) + + with open(outdir.joinpath(f"{function_type}_info.json"), "w") as handle: + json.dump(data, handle, indent=4) + + +def write_dataset_json(task, outdir: pathlib.Path): + _write_function_json(task, outdir, task.DATASETS, "dataset") + + +def write_method_json(task, outdir: pathlib.Path): + _write_function_json(task, outdir, task.METHODS, "method") + + +def write_metric_json(task, outdir: pathlib.Path): + _write_function_json(task, outdir, task.METRICS, "metric") + + +def main(outdir: pathlib.Path): + for task in openproblems.TASKS: + if workflow_utils.task_is_incomplete(task): + # don't write json for incomplete tasks + continue + task_outdir = outdir.joinpath(openproblems.utils.get_member_id(task), "data") + task_outdir.mkdir(parents=True, exist_ok=True) + write_task_json(task, task_outdir) + write_dataset_json(task, task_outdir) + write_method_json(task, task_outdir) + write_metric_json(task, task_outdir) + + +if __name__ == "__main__": + main(pathlib.Path(sys.argv[1])) diff --git a/workflow/workflow_utils.py b/workflow/workflow_utils.py index 2cdfdb0e5c..dbaca0e332 100644 --- a/workflow/workflow_utils.py +++ b/workflow/workflow_utils.py @@ -1,8 +1,20 @@ +import functools +import git +import openproblems +import pathlib + TASK_MIN_DATASETS = 1 TASK_MIN_METHODS = 3 TASK_MIN_METRICS = 1 +@functools.lru_cache() +def get_sha(): + repo = git.Repo(pathlib.Path(openproblems.__path__[0]).parent) + assert not repo.bare + return repo.head.commit.hexsha + + def 
task_is_incomplete(task): if len(task.DATASETS) < TASK_MIN_DATASETS: return True From 00c280aacf1ca1b27dbdb2a00dac6aadc1ed1ede Mon Sep 17 00:00:00 2001 From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Date: Fri, 10 Mar 2023 10:25:13 -0500 Subject: [PATCH 264/266] Update results processing for new website (#847) * temp * add get_sha to workflow_utils * refactor results # ci skip * fixes # ci skip * install gitpython * upload new results path * # publish * test process * test process # publish * test process # publish * temp * test process # publish * Revert "temp" This reverts commit 5ccbdfade2f4790a64707648e787cc05f1d6a383. * temp # publish * pre-commit * remove experimental website repo --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .github/workflows/process_results.yml | 54 +---- workflow/parse_nextflow.py | 331 ++++++++------------------ 2 files changed, 101 insertions(+), 284 deletions(-) diff --git a/.github/workflows/process_results.yml b/.github/workflows/process_results.yml index 3a366b6d4a..a8a62d8038 100644 --- a/.github/workflows/process_results.yml +++ b/.github/workflows/process_results.yml @@ -32,27 +32,15 @@ jobs: with: fetch-depth: 1 repository: openproblems-bio/website + ref: main path: website token: ${{ secrets.GH_ACTIONS_WEBSITE_PAT }} - - name: Checkout NBT reproduciblity repo - uses: actions/checkout@v3 - with: - fetch-depth: 1 - repository: openproblems-bio/website-experimental - path: nbt2022-reproducibility - token: ${{ secrets.GH_ACTIONS_NBT_REPRODUCIBILITY_PAT }} - - name: Set up website Git branch working-directory: website run: | git checkout -b $UPDATE_BRANCH_NAME - - name: Set up nbt2022-reproducibility Git branch - working-directory: nbt2022-reproducibility - run: | - git checkout -b $UPDATE_BRANCH_NAME - - name: Set up Python uses: actions/setup-python@v4 with: @@ -86,19 +74,13 @@ jobs: S3_URI="s3://openproblems-nextflow/cwd_example" fi aws s3 cp --quiet --recursive "${S3_URI}" /tmp/results/ - rm -r website/data/results/*/ - python openproblems/workflow/parse_nextflow.py /tmp website/data/results + python openproblems/workflow/parse_nextflow.py /tmp website/results - name: Upload results uses: actions/upload-artifact@main with: name: results - path: website/data/results - - - name: Move raw output - run: | - rsync -v -r --include "*.raw.json" --include "*/" --exclude "*" website/data/results/ nbt2022-reproducibility/results - rm website/data/results/*/*.raw.json + path: website/results - name: Push to openproblems-bio/website if: | @@ -119,41 +101,13 @@ jobs: with: branch: ${{ env.UPDATE_BRANCH_NAME }} delete-branch: true - base: main title: '[auto] Update benchmark results' - reviewers: scottgigante-immunai,rcannood,dburkhardt + reviewers: scottgigante-immunai,rcannood path: './website' token: ${{ secrets.GH_ACTIONS_WEBSITE_PAT }} author: "openproblems-bio " commit-message: "Update benchmark results" - - name: Push to openproblems-bio/nbt2022-reproducibility - if: | - github.event_name == 'repository_dispatch' || - endsWith(github.event.head_commit.message, '# publish') - shell: bash - working-directory: './nbt2022-reproducibility' - env: - GITHUB_TOKEN: ${{ secrets.GH_ACTIONS_NBT_REPRODUCIBILITY_PAT }} - run: | - git push origin "${UPDATE_BRANCH_NAME}" - - - name: Create nbt2022-reproducibility Pull Request - if: | - github.event_name == 'repository_dispatch' || - endsWith(github.event.head_commit.message, '# publish') - uses: peter-evans/create-pull-request@v4 - with: 
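As a quick reference for the unit changes in this patch (resources are now reported in MB and seconds rather than GB and minutes), a standalone sketch of the two converters as they appear in `workflow/parse_nextflow.py` below; the conversion tables are copied from the diff, and the assertions are editorial sanity checks rather than part of the patch:

    size_units = {"B": 1, "KB": 10**3, "MB": 10**6, "GB": 10**9, "TB": 10**12}
    time_units = {"s": 1, "m": 60, "h": 3600, "d": 3600 * 24}

    def parse_size_to_mb(size: str) -> float:
        # e.g. "1 GB" -> 1000.0
        number, unit = [string.strip() for string in size.split()]
        return int(float(number) * size_units[unit]) / size_units["MB"]

    def parse_time_to_sec(time: str) -> float:
        # e.g. "2m 30s" -> 150.0; multi-part durations are summed recursively
        if " " in time:
            return sum(parse_time_to_sec(t) for t in time.split(" "))
        time = time.strip()
        for unit, value in time_units.items():
            if time.endswith(unit):
                return float(time.replace(unit, "")) * value / time_units["s"]

    assert parse_size_to_mb("1 GB") == 1000.0
    assert parse_time_to_sec("2m 30s") == 150.0
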
- branch: ${{ env.UPDATE_BRANCH_NAME }} - delete-branch: true - base: main - title: '[auto] Update benchmark results' - reviewers: scottgigante-immunai,rcannood - path: './nbt2022-reproducibility' - token: ${{ secrets.GH_ACTIONS_NBT_REPRODUCIBILITY_PAT }} - author: "openproblems-bio " - commit-message: "Update benchmark results" - - name: AWS S3 cleanup if: "github.event_name == 'repository_dispatch'" env: diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py index e23217967b..9b9cb31ec6 100644 --- a/workflow/parse_nextflow.py +++ b/workflow/parse_nextflow.py @@ -1,53 +1,34 @@ """ Schema: -# results/{task.__name__}/{dataset.__name__}.json -{ - "name": dataset.metadata["dataset_name"], - "data_url": dataset.metadata["data_url"], - "data_reference": dataset.metadata["data_reference"], - "headers": { - "names": [ - "Rank", - "Name", - "Metric1 Raw", - "Metric2 Raw", - ..., - "Mean score Scaled", - "Metric1 Scaled", - ..., - "Memory (GB)", - "Runtime (min)", - "CPU (%)", - "Paper", - "Year", - "Library" - ], - "fixed": ["Name", "Paper", "Library"] - }, - "results": [ - { - "Name": method.metadata["method_name"], - "Paper": method.metadata["paper_name"], - "Paper URL": method.metadata["paper_url"], - "Year": method.metadata["year"], - "Library": method.metadata["code_url"], - "Implementation": "https://github.com/.../path/to/method.py", - "Version": method.metadata["method_version"], - "Runtime (min)": runtime, - "CPU (%)": cpu, - "Memory (GB)": memory, - "Rank": rank, - "Metric1 Raw": metric1_raw, - "Metric2 Raw": metric2_raw, . - .., - "Mean score Scaled": mean_score, - "Metric1 Scaled": metric1, +# content/benchmarks/{task.__name__}/data/results.json +[ + { + "task_id": task.__name__, + "commit_sha": "abc123", + "method_id": method.__name__, + "dataset_id": dataset.__name__, + "submission_time": "1970-01-01 00:00:00.000", + "code_version": openproblems.__version__, + "resources": { + "duration_sec": 100.0, + "cpu_pct": 100.0, + "peak_memory_mb": 1000.0, + "disk_read_mb": 1000.0, + "disk_write_mb": 1000.0, + } + "metric_values": { + metric.__name__: 1.0, + ... + } + "scaled_scores": { + metric.__name__: 1.0, ... }, - ... - ] -} + "mean_score": 1.0 + }, + ... +] """ import collections import copy @@ -57,6 +38,7 @@ import openproblems.api.utils import os import pandas as pd +import pathlib import sys import warnings import workflow_utils @@ -78,36 +60,36 @@ def dump_json(obj, fp): size_units = {"B": 1, "KB": 10**3, "MB": 10**6, "GB": 10**9, "TB": 10**12} -def parse_size_to_gb(size): - """Convert a file size to an integer in GB. +def parse_size_to_mb(size): + """Convert a file size to an integer in MB. Example ------- - >>> parse_size_to_gb("1000 MB") - 1 + >>> parse_size_to_gb("1 GB") + 1000 """ number, unit = [string.strip() for string in size.split()] - return int(float(number) * size_units[unit]) / size_units["GB"] + return int(float(number) * size_units[unit]) / size_units["MB"] time_units = {"s": 1, "m": 60, "h": 3600, "d": 3600 * 24} -def parse_time_to_min(time): - """Convert a duration to an integer in minutes. +def parse_time_to_sec(time): + """Convert a duration to an integer in seconds. 
Example ------- >>> parse_time_to_min("2m 30s") - 2.5 + 150 """ if " " in time: - return sum([parse_time_to_min(t) for t in time.split(" ")]) + return sum([parse_time_to_sec(t) for t in time.split(" ")]) time = time.strip() for unit, value in time_units.items(): if time.endswith(unit): number = float(time.replace(unit, "")) - return number * value / time_units["m"] + return number * value / time_units["s"] def read_trace(filename): @@ -152,15 +134,13 @@ def parse_trace_to_dict(df): return results -def parse_metric_results(results_path, results): +def parse_metric_results(results_path: pathlib.Path, results): """Add metric results to the trace output.""" missing_traces = [] - metric_filenames = os.listdir(os.path.join(results_path, "results/metrics")) + metric_filenames = os.listdir(results_path.joinpath("results", "metrics")) print(f"Loading {len(metric_filenames)} metric results") for filename in sorted(metric_filenames): - with open( - os.path.join(results_path, "results/metrics", filename), "r" - ) as handle: + with open(results_path.joinpath("results", "metrics", filename), "r") as handle: result = float(handle.read().strip()) task_name, dataset_name, method_name, metric_name = filename.replace( ".metric.txt", "" @@ -181,12 +161,12 @@ def parse_metric_results(results_path, results): return results -def parse_method_versions(results_path, results): +def parse_method_versions(results_path: pathlib.Path, results): """Add method versions to the trace output.""" missing_traces = [] - for filename in os.listdir(os.path.join(results_path, "results/method_versions")): + for filename in os.listdir(results_path.joinpath("results", "method_versions")): with open( - os.path.join(results_path, "results/method_versions", filename), "r" + results_path.joinpath("results", "method_versions", filename), "r" ) as handle: code_version = handle.read().strip() task_name, dataset_name, method_name = filename.replace( @@ -264,188 +244,71 @@ def normalize_scores(task_name, dataset_results): return dataset_results -def drop_baselines(task_name, dataset_results): - """Remove baseline methods from dataset results.""" - dataset_results = copy.copy(dataset_results) - method_names = list(dataset_results.keys()) - n_removed = 0 - for method_name in method_names: - method = openproblems.api.utils.get_function( - task_name, - "methods", - method_name, - ) - if method.metadata["is_baseline"]: - n_removed += 1 - del dataset_results[method_name] - - print(f"Dropped {n_removed} baseline methods") - return dataset_results - - -def drop_nan_metrics(dataset_results): - n_removed = 0 - metric_names = list(list(dataset_results.values())[0]["metrics"].keys()) - for metric_name in metric_names: - metric_scores = np.array( - [ - dataset_results[method_name]["metrics"][metric_name] - for method_name in dataset_results - ] - ) - if np.all(np.isnan(metric_scores)): - n_removed += 1 - for method_name in dataset_results: - del dataset_results[method_name]["metrics"][metric_name] - del dataset_results[method_name]["metrics_raw"][metric_name] - if n_removed > 0: - print(f"[WARN] Removed {n_removed} all-NaN metrics") - return dataset_results - - -def compute_ranking(dataset_results): - """Rank all methods on a specific dataset.""" - metric_sums = np.zeros(len(dataset_results)) - metric_names = list(dataset_results.values())[0]["metrics"].keys() - method_names = list(dataset_results.keys()) - for metric_name in metric_names: - metric_scores = np.array( - [ - dataset_results[method_name]["metrics"][metric_name] - for method_name in 
method_names - ] - ) - metric_scores[np.isnan(metric_scores) | np.isneginf(metric_scores)] = 0 - metric_scores[np.isinf(metric_scores)] = 1 - metric_sums += metric_scores - - final_ranking = { - method_names[method_idx]: rank + 1 - for rank, method_idx in enumerate(np.argsort(metric_sums)[::-1]) - } - for method_name, metrics_sum in zip(method_names, metric_sums): - dataset_results[method_name]["mean_score"] = metrics_sum / len(metric_names) - return dataset_results, final_ranking - - -def dataset_results_to_json(task_name, dataset_name, dataset_results_raw): - """Convert the raw dataset results to pretty JSON for web.""" - print( - f"Formatting {len(dataset_results_raw)} methods for {task_name}.{dataset_name}" - ) - dataset = openproblems.api.utils.get_function(task_name, "datasets", dataset_name) - output = dict( - name=dataset.metadata["dataset_name"], - data_url=dataset.metadata["data_url"], - data_reference=( - "https://openproblems.bio/" - f"bibliography#{dataset.metadata['data_reference']}" - ), - headers=dict( - names=["Rank", "Name", "Mean score"], fixed=["Name", "Paper", "Library"] - ), - results=list(), - ) - dataset_results_raw = normalize_scores(task_name, dataset_results_raw) - dataset_results = drop_baselines(task_name, dataset_results_raw) - dataset_results = drop_nan_metrics(dataset_results) - dataset_results, ranking = compute_ranking(dataset_results) - metric_names = set() - for method_name, rank in ranking.items(): - method_results = dataset_results[method_name] - method = openproblems.api.utils.get_function( - task_name, - "methods", - method_name, - ) +def fix_values(metric_result): + if np.isnan(metric_result): + return "NaN" + if np.isneginf(metric_result): + return "-Inf" + if np.isinf(metric_result): + return "Inf" + return metric_result + + +def fix_values_scaled(metric_result): + if np.isnan(metric_result) or np.isinf(metric_result): + return 0 + return metric_result + + +def dataset_results_to_json(task_name, dataset_name, dataset_results): + dataset_results = normalize_scores(task_name, dataset_results) + out = [] + for method_name, method_results in dataset_results.items(): + raw = {k: fix_values(v) for k, v in method_results["metrics_raw"].items()} + scaled = {k: fix_values_scaled(v) for k, v in method_results["metrics"].items()} + resources = { + "duration_sec": parse_time_to_sec(method_results["duration"]), + "cpu_pct": float(method_results["%cpu"].replace("%", "")), + "peak_memory_mb": parse_size_to_mb(method_results["peak_rss"]), + "disk_read_mb": parse_size_to_mb(method_results["rchar"]), + "disk_write_mb": parse_size_to_mb(method_results["wchar"]), + } result = { - "Name": method.metadata["method_name"], - "Paper": method.metadata["paper_name"], - "Paper URL": ( - "https://openproblems.bio/" - f"bibliography#{method.metadata['paper_reference']}" - ), - "Year": method.metadata["paper_year"], - "Library": method.metadata["code_url"], - "Implementation": ( - "https://github.com/openproblems-bio/openproblems/" - f"blob/main/{method.__module__.replace('.', '/')}.py" - ), - "Version": method_results["code_version"], - "Runtime (min)": parse_time_to_min(method_results["realtime"]), - "CPU (%)": float(method_results["%cpu"].replace("%", "")), - "Memory (GB)": parse_size_to_gb(method_results["peak_rss"]), - "Rank": rank, - "Mean score": method_results["mean_score"], + "task_id": task_name, + "commit_sha": workflow_utils.get_sha(), + "method_id": method_name, + "dataset_id": dataset_name, + "submission_time": method_results["submit"], + "code_version": 
method_results["code_version"], + "resources": resources, + "metric_values": raw, + "scaled_scores": scaled, + "mean_score": np.array(list(scaled.values())).mean(), } - result_metrics = {} - for metric_type in ["metrics_raw", "metrics"]: - metric_type_name = "Raw" if metric_type == "metrics_raw" else "Scaled" - for metric_name, metric_result in method_results[metric_type].items(): - metric = openproblems.api.utils.get_function( - task_name, "metrics", metric_name - ) - if np.isnan(metric_result): - metric_result = "NaN" - elif np.isneginf(metric_result): - metric_result = "-Inf" - elif np.isinf(metric_result): - metric_result = "Inf" - metric_name_fmt = f"{metric.metadata['metric_name']} {metric_type_name}" - result_metrics[metric_name_fmt] = metric_result - metric_names.add(metric_name_fmt) - result.update(sorted(result_metrics.items())) - output["results"].append(result) - output["headers"]["names"].extend(sorted(list(metric_names))) - output["headers"]["names"].extend( - [ - "Memory (GB)", - "Runtime (min)", - "CPU (%)", - "Paper", - "Year", - "Library", - ] - ) - return output, dataset_results_raw + out.append(result) + return out -def results_to_json(results, outdir): +def results_to_json(results, outdir: pathlib.Path): """Convert the full results to pretty JSON for web.""" - if not os.path.isdir(outdir): - os.mkdir(outdir) for task_name, task_results in results.items(): + task_results_out = [] + task_dir = outdir.joinpath(task_name, "data") + task_dir.mkdir(parents=True, exist_ok=True) for dataset_name, dataset_results in task_results.items(): results_dir = os.path.join(outdir, task_name) if not os.path.isdir(results_dir): os.mkdir(results_dir) - filename = os.path.join(results_dir, "{}.json".format(dataset_name)) - filename_raw = os.path.join(results_dir, "{}.raw.json".format(dataset_name)) - dataset_results_json, dataset_results_raw = dataset_results_to_json( - task_name, dataset_name, dataset_results + task_results_out.extend( + dataset_results_to_json(task_name, dataset_name, dataset_results) ) - with open(filename_raw, "w") as handle: - dump_json( - dataset_results_raw, - handle, - ) - if workflow_utils.task_is_incomplete( - openproblems.api.utils.str_to_task(task_name) - ): - print("Skipping stub task") - else: - with open(filename, "w") as handle: - dump_json( - dataset_results_json, - handle, - ) - - -def main(results_path, outdir): + with open(task_dir.joinpath("results.json"), "w") as handle: + dump_json(task_results_out, handle) + + +def main(results_path: pathlib.Path, outdir: pathlib.Path): """Parse the nextflow output.""" - df = read_trace( - os.path.join(results_path, "results/pipeline_info/execution_trace.txt") - ) + df = read_trace(results_path.joinpath("results/pipeline_info/execution_trace.txt")) results = parse_trace_to_dict(df) results = parse_metric_results(results_path, results) results = parse_method_versions(results_path, results) @@ -454,4 +317,4 @@ def main(results_path, outdir): if __name__ == "__main__": - main(sys.argv[1], sys.argv[2]) + main(pathlib.Path(sys.argv[1]), pathlib.Path(sys.argv[2])) From c37d953773a84516f841d455a9132a7b1a744c29 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Fri, 10 Mar 2023 10:56:56 -0500 Subject: [PATCH 265/266] document sub-stub task behaviour --- CONTRIBUTING.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dbc99a88d1..9f08016476 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -358,6 +358,10 @@ Notes: ### Adding a new task +To add a new task, you must provide a 
From c37d953773a84516f841d455a9132a7b1a744c29 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Fri, 10 Mar 2023 10:56:56 -0500
Subject: [PATCH 265/266] document sub-stub task behaviour

---
 CONTRIBUTING.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index dbc99a88d1..9f08016476 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -358,6 +358,10 @@ Notes:
 
 ### Adding a new task
 
+To add a new task, you must provide a task description, a dataset and method API, and
+at least one dataset, one method, and one metric. In order to appear on the website, a
+task must have at least three methods.
+
 The task directory structure is as follows
 
 ```text
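
To make those requirements concrete, a new task's first method could be as small as the sketch below. It assumes the `method` decorator and the metadata fields (`method_name`, `paper_name`, `paper_reference`, `paper_year`, `code_url`) read from `method.metadata` elsewhere in this series; the task, the paper details, and the `obs` column names (`labels`, `is_train`, `labels_pred`) are illustrative placeholders, not a prescribed API:

```python
from openproblems.tools.decorators import method


@method(
    method_name="Majority vote",
    paper_name="Hypothetical baseline paper",  # placeholder metadata
    paper_reference="hypothetical2023baseline",  # placeholder bibliography key
    paper_year=2023,
    code_url="https://github.com/openproblems-bio/openproblems",
)
def majority_vote(adata, test=False):
    # Predict the most frequent training label for every cell.
    train_labels = adata.obs["labels"][adata.obs["is_train"]]
    adata.obs["labels_pred"] = train_labels.value_counts().index[0]
    return adata
```

A first dataset and metric follow the same pattern, presumably via the matching `dataset` and `metric` decorators, and the sub-stub rule above keeps the task off the website until two more methods join it.
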
From 4571a779a01b3d50c3153da41a2b2ec59848b0da Mon Sep 17 00:00:00 2001
From: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Date: Mon, 13 Mar 2023 16:31:13 -0400
Subject: [PATCH 266/266] only update sha on changes (#850)

* only update sha on changes

* handle git hash of functions

* comment git_hash function

* pre-commit

* ignore another warning

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 .github/workflows/update_website_content.yml |  2 +-
 openproblems/api/hash.py                     | 20 ++++++++++++++------
 pytest.ini                                   |  1 +
 test/test_core_cli.py                        | 16 ++++++++++++++++
 workflow/parse_metadata.py                   |  5 +++--
 5 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/update_website_content.yml b/.github/workflows/update_website_content.yml
index bf45b2516d..77e01b4c36 100644
--- a/.github/workflows/update_website_content.yml
+++ b/.github/workflows/update_website_content.yml
@@ -23,7 +23,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
         with:
-          fetch-depth: 1
+          fetch-depth: 0
           path: openproblems
 
       - name: Checkout website repo
diff --git a/openproblems/api/hash.py b/openproblems/api/hash.py
index 3c5786ad44..6b524dd6d9 100644
--- a/openproblems/api/hash.py
+++ b/openproblems/api/hash.py
@@ -32,12 +32,20 @@ def get_module(fun):
     return fun.__module__
 
 
-def git_hash(file):
-    """Get the git commit hash associated with a file."""
-    return _run(
-        ["git", "log", "-n", "1", "--pretty=format:%H", "--", file],
-        cwd=os.path.dirname(__file__),
-    )
+def git_hash(obj):
+    """Get the git commit hash associated with the latest change to a file."""
+    if isinstance(obj, str) and os.path.isfile(obj):
+        # if it's a file, run git log to get the hash
+        return _run(
+            ["git", "log", "-n", "1", "--pretty=format:%H", "--", obj],
+            cwd=os.path.dirname(__file__),
+        )
+    elif hasattr(obj, "__file__"):
+        # if it's a module, get the associated file
+        return git_hash(obj.__file__)
+    elif callable(obj):
+        # if it's a function, get the associated module
+        return git_hash(importlib.import_module(get_module(obj)))
 
 
 def docker_token(image_name):
diff --git a/pytest.ini b/pytest.ini
index b967c2a2f2..0947bd0a12 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -13,4 +13,5 @@ filterwarnings =
     ignore:`Model\.state_updates` will be removed in a future version\.:UserWarning
     ignore:Tensorflow not installed. ParametricUMAP will be unavailable:ImportWarning
    ignore:Deprecated call to `pkg_resources\.declare_namespace:DeprecationWarning
+    ignore:pkg_resources is deprecated as an API:DeprecationWarning
     always:Container failed with AssertionError\. Retrying [0-9]* more time:RuntimeWarning
diff --git a/test/test_core_cli.py b/test/test_core_cli.py
index e52cd7c4ca..71e15c726b 100644
--- a/test/test_core_cli.py
+++ b/test/test_core_cli.py
@@ -1,7 +1,9 @@
 from openproblems.api.hash import docker_labels_from_api
+from openproblems.api.hash import git_hash
 from openproblems.api.main import main
 from openproblems.api.utils import print_output
 
+import importlib
 import numpy as np
 import openproblems
 import os
@@ -159,6 +161,20 @@ def test_hash_docker_api():
     assert labels["bio.openproblems.build"] in ["github_actions", "local"]
 
 
+@parameterized.parameterized.expand(
+    [
+        (openproblems.tasks.label_projection.datasets.zebrafish_labs,),
+        (openproblems.tasks.label_projection.methods.knn_classifier_log_cp10k,),
+    ],
+    name_func=utils.name.name_test,
+)
+def test_git_hash(func):
+    h1 = git_hash(func)
+    module = importlib.import_module(func.__wrapped__.__module__)
+    assert git_hash(module) == h1
+    assert git_hash(module.__file__) == h1
+
+
 @parameterized.parameterized.expand(
     [
         (dataset, method, metric)
diff --git a/workflow/parse_metadata.py b/workflow/parse_metadata.py
index 570a2f94b6..ebfda7a60f 100644
--- a/workflow/parse_metadata.py
+++ b/workflow/parse_metadata.py
@@ -1,5 +1,6 @@
 import json
 import openproblems
+import openproblems.api.hash
 import pathlib
 import re
 import sys
@@ -27,7 +28,7 @@ def get_task_description(task):
 def write_task_json(task, outdir: pathlib.Path):
     data = {
         "task_id": openproblems.utils.get_member_id(task),
-        "commit_sha": workflow_utils.get_sha(),
+        "commit_sha": openproblems.api.hash.git_hash(task),
         "task_name": task._task_name,
         "task_summary": task._task_summary,
         "task_description": get_task_description(task),
@@ -43,7 +44,7 @@ def _write_function_json(task, outdir: pathlib.Path, functions, function_type: s
     function.metadata.update(
         {
             "task_id": openproblems.utils.get_member_id(task),
-            "commit_sha": workflow_utils.get_sha(),
+            "commit_sha": openproblems.api.hash.git_hash(function),
             f"{function_type}_id": openproblems.utils.get_member_id(function),
             "implementation_url": (
                 "https://github.com/openproblems-bio/openproblems/"