From a49a3325ca10a2fd5f9df1ad9eb1d6b98706d3b1 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Thu, 27 Feb 2025 12:48:51 +0100
Subject: [PATCH 01/14] add results

---
 .../data/dataset_info.json                    |  12 +
 .../data/method_info.json                     | 114 ++++
 .../data/metric_execution_info.json           | 100 +++
 .../data/metric_info.json                     |  32 +
 .../data/quality_control.json                 | 632 ++++++++++++++++++
 .../cyto_batch_integration/data/results.json  | 254 +++++++
 .../cyto_batch_integration/data/state.yaml    |   9 +
 .../data/task_info.json                       |  24 +
 results/cyto_batch_integration/index.qmd      |  22 +
 9 files changed, 1199 insertions(+)
 create mode 100644 results/cyto_batch_integration/data/dataset_info.json
 create mode 100644 results/cyto_batch_integration/data/method_info.json
 create mode 100644 results/cyto_batch_integration/data/metric_execution_info.json
 create mode 100644 results/cyto_batch_integration/data/metric_info.json
 create mode 100644 results/cyto_batch_integration/data/quality_control.json
 create mode 100644 results/cyto_batch_integration/data/results.json
 create mode 100644 results/cyto_batch_integration/data/state.yaml
 create mode 100644 results/cyto_batch_integration/data/task_info.json
 create mode 100644 results/cyto_batch_integration/index.qmd

diff --git a/results/cyto_batch_integration/data/dataset_info.json b/results/cyto_batch_integration/data/dataset_info.json
new file mode 100644
index 00000000..fee9418a
--- /dev/null
+++ b/results/cyto_batch_integration/data/dataset_info.json
@@ -0,0 +1,12 @@
+[
+  {
+    "dataset_id": "XXXXX",
+    "dataset_name": "Leomazzi_data_subset",
+    "dataset_summary": "Flow cytometry data of spleens of 9 mice, subsampled to 1000 cells per sample.",
+    "dataset_description": "Flow cytometry data of spleens from 4 WT (IKK2 fl/fl CD11c-cre +/+) and 5 KO (IKK2 fl/fl CD11c-cre Tg/+) B6 mice, measured with a 22-color panel on 2 different instrument settings. Subsampled to 1000 cells per sample. Data has been preprocessed (compensated with a batch-specific compensation matrix, logicle transformed, cleaned with PeacoQC and pregated on live single CD45+ cells).",
+    "data_reference": "",
+    "data_url": "",
+    "date_created": "27-02-2025",
+    "file_size": 1444801
+  }
+]
diff --git a/results/cyto_batch_integration/data/method_info.json b/results/cyto_batch_integration/data/method_info.json
new file mode 100644
index 00000000..aa88110b
--- /dev/null
+++ b/results/cyto_batch_integration/data/method_info.json
@@ -0,0 +1,114 @@
+[
+  {
+    "task_id": "control_methods",
+    "method_id": "shuffle_integration",
+    "method_name": "Shuffle integration",
+    "method_summary": "Integrations are randomly permuted",
+    "method_description": "Integrations are randomly permuted",
+    "is_baseline": true,
+    "references_doi": null,
+    "references_bibtex": null,
+    "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
+    "documentation_url": null,
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/control_methods/shuffle_integration",
+    "code_version": "build_main",
+    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+  },
+  {
+    "task_id": "control_methods",
+    "method_id": "shuffle_integration_by_batch",
+    "method_name": "Shuffle integration by batch",
+    "method_summary": "Integrations are randomly permuted within each batch",
+    "method_description": "Integrations are randomly permuted within each batch",
+    "is_baseline": true,
+    "references_doi": null,
+    "references_bibtex": null,
+    "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
+    "documentation_url": null,
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_batch:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/control_methods/shuffle_integration_by_batch",
+    "code_version": "build_main",
+    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+  },
+  {
+    "task_id": "control_methods",
+    "method_id": "shuffle_integration_by_cell_type",
+    "method_name": "Shuffle integration by cell type",
+    "method_summary": "Integrations are randomly permuted within each cell type",
+    "method_description": "Integrations are randomly permuted within each cell type",
+    "is_baseline": true,
+    "references_doi": null,
+    "references_bibtex": null,
+    "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
+    "documentation_url": null,
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_cell_type:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/control_methods/shuffle_integration_by_cell_type",
+    "code_version": "build_main",
+    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+  },
+  {
+    "task_id": "methods",
+    "method_id": "harmonypy",
+    "method_name": "Harmonypy",
+    "method_summary": "Harmonypy is a port of the harmony R package",
+    "method_description": "Harmony is a general-purpose R package with an efficient algorithm for integrating multiple data sets. \nIt is especially useful for large single-cell datasets such as single-cell RNA-seq.\n",
+    "is_baseline": false,
+    "references_doi": "10.1038/s41592-019-0619-0",
+    "references_bibtex": null,
+    "code_url": "https://github.com/slowkow/harmonypy",
+    "documentation_url": "https://portals.broadinstitute.org/harmony",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/harmonypy:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/methods/harmonypy",
+    "code_version": "build_main",
+    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+  },
+  {
+    "task_id": "methods",
+    "method_id": "limma_remove_batch_effect",
+    "method_name": "Limma removeBatchEffect",
+    "method_summary": "Uses a linear model and matrix decomposition to remove batch effects from a dataset",
+    "method_description": "Limma removeBatchEffect is a method that uses a linear model and matrix\ndecomposition to remove batch effects from a dataset. It first fits a linear\nmodel to the data, then decomposes the model matrix into a set of orthogonal\ncomponents. The batch effect is then removed by subtracting the component\ncorresponding to the batch effect from the data.\n",
+    "is_baseline": false,
+    "references_doi": "10.1093/nar/gkv007",
+    "references_bibtex": null,
+    "code_url": "https://github.com/bioc/limma",
+    "documentation_url": "https://bioinf.wehi.edu.au/limma",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/limma_remove_batch_effect:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/methods/limma_remove_batch_effect",
+    "code_version": "build_main",
+    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+  },
+  {
+    "task_id": "control_methods",
+    "method_id": "no_integration",
+    "method_name": "No Integration",
+    "method_summary": "Control method returning the unintegrated data without performing batch correction.",
+    "method_description": "The component works by reading and writing back the 'unintegrated' data without performing any operation. \n",
+    "is_baseline": true,
+    "references_doi": null,
+    "references_bibtex": null,
+    "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
+    "documentation_url": null,
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/no_integration:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/control_methods/no_integration",
+    "code_version": "build_main",
+    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+  },
+  {
+    "task_id": "methods",
+    "method_id": "combat",
+    "method_name": "Combat",
+    "method_summary": "ComBat batch correction for single-cell data, implemented in the scanpy package",
+    "method_description": "Corrects for batch effects by fitting linear models, gains statistical power via an EB framework where information is borrowed across genes. \nThis uses the implementation combat.py\n",
+    "is_baseline": false,
+    "references_doi": "10.1093/biostatistics/kxj037",
+    "references_bibtex": null,
+    "code_url": "https://github.com/brentp/combat.py",
+    "documentation_url": "https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/combat:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/methods/combat",
+    "code_version": "build_main",
+    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+  }
+]
diff --git a/results/cyto_batch_integration/data/metric_execution_info.json b/results/cyto_batch_integration/data/metric_execution_info.json
new file mode 100644
index 00000000..b4ab3c90
--- /dev/null
+++ b/results/cyto_batch_integration/data/metric_execution_info.json
@@ -0,0 +1,100 @@
+[
+  {
+    "dataset_id": null,
+    "method_id": "combat",
+    "metric_component_name": "emd",
+    "resources": {
+      "submit": "2025-02-27 11:29:43",
+      "exit_code": 0,
+      "duration_sec": 58.2,
+      "cpu_pct": 117.9,
+      "peak_memory_mb": 5940,
+      "disk_read_mb": 124,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "harmonypy",
+    "metric_component_name": "emd",
+    "resources": {
+      "submit": "2025-02-27 11:29:23",
+      "exit_code": 0,
+      "duration_sec": 51,
+      "cpu_pct": 105.7,
+      "peak_memory_mb": 1844,
+      "disk_read_mb": 122,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "limma_remove_batch_effect",
+    "metric_component_name": "emd",
+    "resources": {
+      "submit": "2025-02-27 11:28:43",
+      "exit_code": 0,
+      "duration_sec": 58.2,
+      "cpu_pct": 124.9,
+      "peak_memory_mb": 5940,
+      "disk_read_mb": 124,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "no_integration",
+    "metric_component_name": "emd",
+    "resources": {
+      "submit": "2025-02-27 11:30:13",
+      "exit_code": 0,
+      "duration_sec": 50.6,
+      "cpu_pct": 106.4,
+      "peak_memory_mb": 1844,
+      "disk_read_mb": 122,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "shuffle_integration",
+    "metric_component_name": "emd",
+    "resources": {
+      "submit": "2025-02-27 11:27:44",
+      "exit_code": 0,
+      "duration_sec": 58.6,
+      "cpu_pct": 117.4,
+      "peak_memory_mb": 5837,
+      "disk_read_mb": 122,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "shuffle_integration_by_batch",
+    "metric_component_name": "emd",
+    "resources": {
+      "submit": "2025-02-27 11:31:03",
+      "exit_code": 0,
+      "duration_sec": 51.2,
+      "cpu_pct": 105.6,
+      "peak_memory_mb": 1844,
+      "disk_read_mb": 122,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "shuffle_integration_by_cell_type",
+    "metric_component_name": "emd",
+    "resources": {
+      "submit": "2025-02-27 11:31:03",
+      "exit_code": 0,
+      "duration_sec": 52.8,
+      "cpu_pct": 129.8,
+      "peak_memory_mb": 5837,
+      "disk_read_mb": 122,
+      "disk_write_mb": 2
+    }
+  }
+]
diff --git a/results/cyto_batch_integration/data/metric_info.json b/results/cyto_batch_integration/data/metric_info.json
new file mode 100644
index 00000000..2c09f198
--- /dev/null
+++ b/results/cyto_batch_integration/data/metric_info.json
@@ -0,0 +1,32 @@
+[
+  {
+    "task_id": "metrics",
+    "component_name": "emd",
+    "metric_id": "emd_mean",
+    "metric_name": "EMD Mean",
+    "metric_summary": "Mean Earth Mover Distance to compute differences in distribution of marker expressions.",
+    "metric_description": "Earth Mover Distance (EMD) is a metric designed for comparing two distributions.\nIt is also known as the Wasserstein metric.\n",
+    "references_doi": "10.1023/A:1026543900054",
+    "references_bibtex": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/metrics/emd",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
+    "code_version": "build_main",
+    "commit_sha": "0423799b338abe490764d3403608758eb58041ec",
+    "maximize": false
+  },
+  {
+    "task_id": "metrics",
+    "component_name": "emd",
+    "metric_id": "emd_max",
+    "metric_name": "EMD Max",
+    "metric_summary": "Max Earth Mover Distance to compute differences in distribution of marker expressions.",
+    "metric_description": "Earth Mover Distance (EMD) is a metric designed for comparing two distributions.\nIt is also known as the Wasserstein metric.\n",
+    "references_doi": "10.1023/A:1026543900054",
+    "references_bibtex": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/metrics/emd",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
+    "code_version": "build_main",
+    "commit_sha": "0423799b338abe490764d3403608758eb58041ec",
+    "maximize": false
+  }
+]
diff --git a/results/cyto_batch_integration/data/quality_control.json b/results/cyto_batch_integration/data/quality_control.json
new file mode 100644
index 00000000..ded06fe3
--- /dev/null
+++ b/results/cyto_batch_integration/data/quality_control.json
@@ -0,0 +1,632 @@
+[
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Task info", 
+        "name": "Pct 'task_id' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing([task_info], field)", 
+        "message": "Task metadata field 'task_id' should be defined\n  Task id: task_cyto_batch_integration\n  Field: task_id\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Task info", 
+        "name": "Pct 'task_name' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing([task_info], field)", 
+        "message": "Task metadata field 'task_name' should be defined\n  Task id: task_cyto_batch_integration\n  Field: task_name\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Task info", 
+        "name": "Pct 'task_summary' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing([task_info], field)", 
+        "message": "Task metadata field 'task_summary' should be defined\n  Task id: task_cyto_batch_integration\n  Field: task_summary\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Task info", 
+        "name": "Pct 'task_description' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing([task_info], field)", 
+        "message": "Task metadata field 'task_description' should be defined\n  Task id: task_cyto_batch_integration\n  Field: task_description\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Method info", 
+        "name": "Pct 'task_id' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(method_info, field)", 
+        "message": "Method metadata field 'task_id' should be defined\n  Task id: task_cyto_batch_integration\n  Field: task_id\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Method info", 
+        "name": "Pct 'commit_sha' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(method_info, field)", 
+        "message": "Method metadata field 'commit_sha' should be defined\n  Task id: task_cyto_batch_integration\n  Field: commit_sha\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Method info", 
+        "name": "Pct 'method_id' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(method_info, field)", 
+        "message": "Method metadata field 'method_id' should be defined\n  Task id: task_cyto_batch_integration\n  Field: method_id\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Method info", 
+        "name": "Pct 'method_name' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(method_info, field)", 
+        "message": "Method metadata field 'method_name' should be defined\n  Task id: task_cyto_batch_integration\n  Field: method_name\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Method info", 
+        "name": "Pct 'method_summary' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(method_info, field)", 
+        "message": "Method metadata field 'method_summary' should be defined\n  Task id: task_cyto_batch_integration\n  Field: method_summary\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Method info", 
+        "name": "Pct 'paper_reference' missing", 
+        "value": 0.42857142857142855, 
+        "severity": 2, 
+        "severity_value": 3.0, 
+        "code": "percent_missing(method_info, field)", 
+        "message": "Method metadata field 'paper_reference' should be defined\n  Task id: task_cyto_batch_integration\n  Field: paper_reference\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Method info", 
+        "name": "Pct 'is_baseline' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(method_info, field)", 
+        "message": "Method metadata field 'is_baseline' should be defined\n  Task id: task_cyto_batch_integration\n  Field: is_baseline\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Metric info", 
+        "name": "Pct 'task_id' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(metric_info, field)", 
+        "message": "Metric metadata field 'task_id' should be defined\n  Task id: task_cyto_batch_integration\n  Field: task_id\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Metric info", 
+        "name": "Pct 'commit_sha' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(metric_info, field)", 
+        "message": "Metric metadata field 'commit_sha' should be defined\n  Task id: task_cyto_batch_integration\n  Field: commit_sha\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Metric info", 
+        "name": "Pct 'metric_id' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(metric_info, field)", 
+        "message": "Metric metadata field 'metric_id' should be defined\n  Task id: task_cyto_batch_integration\n  Field: metric_id\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Metric info", 
+        "name": "Pct 'metric_name' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(metric_info, field)", 
+        "message": "Metric metadata field 'metric_name' should be defined\n  Task id: task_cyto_batch_integration\n  Field: metric_name\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Metric info", 
+        "name": "Pct 'metric_summary' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(metric_info, field)", 
+        "message": "Metric metadata field 'metric_summary' should be defined\n  Task id: task_cyto_batch_integration\n  Field: metric_summary\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Metric info", 
+        "name": "Pct 'paper_reference' missing", 
+        "value": 1.0, 
+        "severity": 2, 
+        "severity_value": 3.0, 
+        "code": "percent_missing(metric_info, field)", 
+        "message": "Metric metadata field 'paper_reference' should be defined\n  Task id: task_cyto_batch_integration\n  Field: paper_reference\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Metric info", 
+        "name": "Pct 'maximize' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(metric_info, field)", 
+        "message": "Metric metadata field 'maximize' should be defined\n  Task id: task_cyto_batch_integration\n  Field: maximize\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Dataset info", 
+        "name": "Pct 'task_id' missing", 
+        "value": 1.0, 
+        "severity": 2, 
+        "severity_value": 3.0, 
+        "code": "percent_missing(dataset_info, field)", 
+        "message": "Dataset metadata field 'task_id' should be defined\n  Task id: task_cyto_batch_integration\n  Field: task_id\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Dataset info", 
+        "name": "Pct 'dataset_id' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(dataset_info, field)", 
+        "message": "Dataset metadata field 'dataset_id' should be defined\n  Task id: task_cyto_batch_integration\n  Field: dataset_id\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Dataset info", 
+        "name": "Pct 'dataset_name' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(dataset_info, field)", 
+        "message": "Dataset metadata field 'dataset_name' should be defined\n  Task id: task_cyto_batch_integration\n  Field: dataset_name\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Dataset info", 
+        "name": "Pct 'dataset_summary' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(dataset_info, field)", 
+        "message": "Dataset metadata field 'dataset_summary' should be defined\n  Task id: task_cyto_batch_integration\n  Field: dataset_summary\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Dataset info", 
+        "name": "Pct 'data_reference' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(dataset_info, field)", 
+        "message": "Dataset metadata field 'data_reference' should be defined\n  Task id: task_cyto_batch_integration\n  Field: data_reference\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Dataset info", 
+        "name": "Pct 'data_url' missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "percent_missing(dataset_info, field)", 
+        "message": "Dataset metadata field 'data_url' should be defined\n  Task id: task_cyto_batch_integration\n  Field: data_url\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw data", 
+        "name": "Number of results", 
+        "value": 14, 
+        "severity": 0, 
+        "severity_value": -10.0, 
+        "code": "len(results) == len(method_info) * len(metric_info) * len(dataset_info)", 
+        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: task_cyto_batch_integration\n  Number of results: 14\n  Number of methods: 7\n  Number of metrics: 2\n  Number of datasets: 1\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Metric 'emd_mean' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_mean\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Metric 'emd_max' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_max\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Method 'shuffle_integration' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Method 'shuffle_integration_by_batch' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration_by_batch\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Method 'shuffle_integration_by_cell_type' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration_by_cell_type\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Method 'harmonypy' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: harmonypy\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Method 'limma_remove_batch_effect' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: limma_remove_batch_effect\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Method 'no_integration' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: no_integration\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Method 'combat' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: combat\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Dataset 'XXXXX' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  dataset id: XXXXX\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration emd_mean", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration emd_mean", 
+        "value": 0.1028, 
+        "severity": 0, 
+        "severity_value": 0.0514, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean\n  Best score: 0.1028%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_batch emd_mean", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_batch emd_mean", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_cell_type emd_mean", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_cell_type emd_mean", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score harmonypy emd_mean", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score harmonypy emd_mean", 
+        "value": 0.9692, 
+        "severity": 0, 
+        "severity_value": 0.4846, 
+        "code": "best_score <= 2", 
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean\n  Best score: 0.9692%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect emd_mean", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score limma_remove_batch_effect emd_mean", 
+        "value": 0.9297, 
+        "severity": 0, 
+        "severity_value": 0.46485, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean\n  Best score: 0.9297%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration emd_mean", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration emd_mean", 
+        "value": 0.8418, 
+        "severity": 0, 
+        "severity_value": 0.4209, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean\n  Best score: 0.8418%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat emd_mean", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat emd_mean", 
+        "value": 0.9366, 
+        "severity": 0, 
+        "severity_value": 0.4683, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean\n  Best score: 0.9366%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration emd_max", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration emd_max", 
+        "value": 0.5533, 
+        "severity": 0, 
+        "severity_value": 0.27665, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max\n  Best score: 0.5533%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_batch emd_max", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_batch emd_max", 
+        "value": 0.2183, 
+        "severity": 0, 
+        "severity_value": 0.10915, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max\n  Best score: 0.2183%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_cell_type emd_max", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_cell_type emd_max", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score harmonypy emd_max", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score harmonypy emd_max", 
+        "value": 0.9046, 
+        "severity": 0, 
+        "severity_value": 0.4523, 
+        "code": "best_score <= 2", 
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max\n  Best score: 0.9046%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect emd_max", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score limma_remove_batch_effect emd_max", 
+        "value": 0.1228, 
+        "severity": 0, 
+        "severity_value": 0.0614, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max\n  Best score: 0.1228%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration emd_max", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration emd_max", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat emd_max", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat emd_max", 
+        "value": 0.2904, 
+        "severity": 0, 
+        "severity_value": 0.1452, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max\n  Best score: 0.2904%\n"
+    }
+]
\ No newline at end of file
diff --git a/results/cyto_batch_integration/data/results.json b/results/cyto_batch_integration/data/results.json
new file mode 100644
index 00000000..5edfcebc
--- /dev/null
+++ b/results/cyto_batch_integration/data/results.json
@@ -0,0 +1,254 @@
+[
+  {
+    "dataset_id": "XXXXX",
+    "method_id": "combat",
+    "metric_values": {
+      "emd_max": 32.3545,
+      "emd_mean": 3.1871
+    },
+    "scaled_scores": {
+      "emd_max": 0.2904,
+      "emd_mean": 0.9366
+    },
+    "mean_score": 0.6135,
+    "resources": {}
+  },
+  {
+    "dataset_id": "XXXXX",
+    "method_id": "harmonypy",
+    "metric_values": {
+      "emd_max": 26.8545,
+      "emd_mean": 3.0487
+    },
+    "scaled_scores": {
+      "emd_max": 0.9046,
+      "emd_mean": 0.9692
+    },
+    "mean_score": 0.9369,
+    "resources": {}
+  },
+  {
+    "dataset_id": "XXXXX",
+    "method_id": "limma_remove_batch_effect",
+    "metric_values": {
+      "emd_max": 33.8545,
+      "emd_mean": 3.2163
+    },
+    "scaled_scores": {
+      "emd_max": 0.1228,
+      "emd_mean": 0.9297
+    },
+    "mean_score": 0.5263,
+    "resources": {}
+  },
+  {
+    "dataset_id": "XXXXX",
+    "method_id": "no_integration",
+    "metric_values": {
+      "emd_max": 34.9545,
+      "emd_mean": 3.5885
+    },
+    "scaled_scores": {
+      "emd_max": 0,
+      "emd_mean": 0.8418
+    },
+    "mean_score": 0.4209,
+    "resources": {}
+  },
+  {
+    "dataset_id": "XXXXX",
+    "method_id": "shuffle_integration",
+    "metric_values": {
+      "emd_max": 30,
+      "emd_mean": 6.7201
+    },
+    "scaled_scores": {
+      "emd_max": 0.5533,
+      "emd_mean": 0.1028
+    },
+    "mean_score": 0.3281,
+    "resources": {}
+  },
+  {
+    "dataset_id": "XXXXX",
+    "method_id": "shuffle_integration_by_batch",
+    "metric_values": {
+      "emd_max": 33,
+      "emd_mean": 7.1559
+    },
+    "scaled_scores": {
+      "emd_max": 0.2183,
+      "emd_mean": 0
+    },
+    "mean_score": 0.1091,
+    "resources": {}
+  },
+  {
+    "dataset_id": "XXXXX",
+    "method_id": "shuffle_integration_by_cell_type",
+    "metric_values": {
+      "emd_max": 26,
+      "emd_mean": 2.9183
+    },
+    "scaled_scores": {
+      "emd_max": 1,
+      "emd_mean": 1
+    },
+    "mean_score": 1,
+    "resources": {}
+  },
+  {
+    "dataset_id": null,
+    "method_id": "combat",
+    "metric_values": {
+      "emd_mean": "NA",
+      "emd_max": "NA"
+    },
+    "scaled_scores": {
+      "emd_mean": 0,
+      "emd_max": 0
+    },
+    "mean_score": 0,
+    "resources": {
+      "submit": "2025-02-27 11:26:45",
+      "exit_code": 0,
+      "duration_sec": 4.8,
+      "cpu_pct": 138.6,
+      "peak_memory_mb": 1844,
+      "disk_read_mb": 48,
+      "disk_write_mb": 3
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "harmonypy",
+    "metric_values": {
+      "emd_mean": "NA",
+      "emd_max": "NA"
+    },
+    "scaled_scores": {
+      "emd_mean": 0,
+      "emd_max": 0
+    },
+    "mean_score": 0,
+    "resources": {
+      "submit": "2025-02-27 11:26:45",
+      "exit_code": 0,
+      "duration_sec": 9.2,
+      "cpu_pct": 744.3,
+      "peak_memory_mb": 2560,
+      "disk_read_mb": 34,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "limma_remove_batch_effect",
+    "metric_values": {
+      "emd_mean": "NA",
+      "emd_max": "NA"
+    },
+    "scaled_scores": {
+      "emd_mean": 0,
+      "emd_max": 0
+    },
+    "mean_score": 0,
+    "resources": {
+      "submit": "2025-02-27 11:26:44",
+      "exit_code": 0,
+      "duration_sec": 4.3,
+      "cpu_pct": 203.4,
+      "peak_memory_mb": 1844,
+      "disk_read_mb": 30,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "no_integration",
+    "metric_values": {
+      "emd_mean": "NA",
+      "emd_max": "NA"
+    },
+    "scaled_scores": {
+      "emd_mean": 0,
+      "emd_max": 0
+    },
+    "mean_score": 0,
+    "resources": {
+      "submit": "2025-02-27 11:26:45",
+      "exit_code": 0,
+      "duration_sec": 1.8,
+      "cpu_pct": 253.7,
+      "peak_memory_mb": 764,
+      "disk_read_mb": 20,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "shuffle_integration",
+    "metric_values": {
+      "emd_mean": "NA",
+      "emd_max": "NA"
+    },
+    "scaled_scores": {
+      "emd_mean": 0,
+      "emd_max": 0
+    },
+    "mean_score": 0,
+    "resources": {
+      "submit": "2025-02-27 11:26:44",
+      "exit_code": 0,
+      "duration_sec": 1.9,
+      "cpu_pct": 242.3,
+      "peak_memory_mb": 763,
+      "disk_read_mb": 20,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "shuffle_integration_by_batch",
+    "metric_values": {
+      "emd_mean": "NA",
+      "emd_max": "NA"
+    },
+    "scaled_scores": {
+      "emd_mean": 0,
+      "emd_max": 0
+    },
+    "mean_score": 0,
+    "resources": {
+      "submit": "2025-02-27 11:26:45",
+      "exit_code": 0,
+      "duration_sec": 3.8,
+      "cpu_pct": 340.5,
+      "peak_memory_mb": 5530,
+      "disk_read_mb": 20,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "shuffle_integration_by_cell_type",
+    "metric_values": {
+      "emd_mean": "NA",
+      "emd_max": "NA"
+    },
+    "scaled_scores": {
+      "emd_mean": 0,
+      "emd_max": 0
+    },
+    "mean_score": 0,
+    "resources": {
+      "submit": "2025-02-27 11:26:46",
+      "exit_code": 0,
+      "duration_sec": 4,
+      "cpu_pct": 354.1,
+      "peak_memory_mb": 2765,
+      "disk_read_mb": 20,
+      "disk_write_mb": 2
+    }
+  }
+]
diff --git a/results/cyto_batch_integration/data/state.yaml b/results/cyto_batch_integration/data/state.yaml
new file mode 100644
index 00000000..abbb0fc1
--- /dev/null
+++ b/results/cyto_batch_integration/data/state.yaml
@@ -0,0 +1,9 @@
+id: process
+output_scores: !file results.json
+output_method_info: !file method_info.json
+output_metric_info: !file metric_info.json
+output_dataset_info: !file dataset_info.json
+output_task_info: !file task_info.json
+output_qc: !file quality_control.json
+output_metric_execution_info: !file metric_execution_info.json
+
diff --git a/results/cyto_batch_integration/data/task_info.json b/results/cyto_batch_integration/data/task_info.json
new file mode 100644
index 00000000..57837d24
--- /dev/null
+++ b/results/cyto_batch_integration/data/task_info.json
@@ -0,0 +1,24 @@
+{
+  "task_id": "task_cyto_batch_integration",
+  "commit_sha": null,
+  "task_name": "Cyto Batch Integration",
+  "task_summary": "A one sentence summary of purpose and methodology. Used for creating an overview tables.",
+  "task_description": "Provide a clear and concise description of your task, detailing the specific problem it aims\nto solve. Outline the input data types, the expected output, and any assumptions or constraints.\nBe sure to explain any terminology or concepts that are essential for understanding the task.\n\nExplain the motivation behind your proposed task. Describe the biological or computational \nproblem you aim to address and why it's important. Discuss the current state of research in\nthis area and any gaps or challenges that your task could help address. This section \nshould convince readers of the significance and relevance of your task.\n",
+  "repo": "https://github.com/openproblems-bio/task_cyto_batch_integration",
+  "issue_tracker": "https://github.com/openproblems-bio/task_cyto_batch_integration/issues",
+  "authors": [
+    {
+      "name": "John Doe",
+      "roles": ["author", "maintainer"],
+      "info": {
+        "github": "johndoe",
+        "orcid": "0000-0000-0000-0000",
+        "email": "john@doe.me",
+        "twitter": "johndoe",
+        "linkedin": "johndoe"
+      }
+    }
+  ],
+  "version": "build_main",
+  "license": "MIT"
+}
diff --git a/results/cyto_batch_integration/index.qmd b/results/cyto_batch_integration/index.qmd
new file mode 100644
index 00000000..8103f8a9
--- /dev/null
+++ b/results/cyto_batch_integration/index.qmd
@@ -0,0 +1,22 @@
+---
+title: "Cyto Batch Integration"
+subtitle: "A one sentence summary of purpose and methodology. Used for creating an overview tables."
+image: thumbnail.svg
+page-layout: full
+css: ../_include/task_template.css
+engine: knitr
+fig-cap-location: bottom
+citation-location: document
+bibliography: 
+  - library.bib
+  - ../../library.bib
+toc: false
+---
+
+```{r}
+#| include: false
+params <- list(data_dir = "results/cyto_batch_integration/data")
+params <- list(data_dir = "./data")
+```
+
+{{< include ../_include/_task_template.qmd >}}

From 2f9deaf8b0ace1a07bd5c98539bfe9339fa7f026 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Wed, 12 Mar 2025 11:56:05 +0100
Subject: [PATCH 02/14] update results

---
 .../data/dataset_info.json                    |   2 +-
 .../data/method_info.json                     |  60 +++-
 .../data/metric_execution_info.json           | 204 ++++++++++++--
 .../data/metric_info.json                     |  23 +-
 .../data/quality_control.json                 | 260 +++++++++++++-----
 .../cyto_batch_integration/data/results.json  | 174 ++++++++----
 6 files changed, 553 insertions(+), 170 deletions(-)

diff --git a/results/cyto_batch_integration/data/dataset_info.json b/results/cyto_batch_integration/data/dataset_info.json
index fee9418a..81cfebea 100644
--- a/results/cyto_batch_integration/data/dataset_info.json
+++ b/results/cyto_batch_integration/data/dataset_info.json
@@ -6,7 +6,7 @@
     "dataset_description": "Flow cytometry data of spleens from 4 WT (IKK2 fl/fl CD11c-cre +/+) and 5 KO (IKK2 fl/fl CD11c-cre Tg/+) B6 mice,  measured with a 22-color panel on 2 different instrument settings. Subsampled to 1000 cells per sample. Data has been preprocessed (compensated witha batch-specific compensation matrix, logicle transformed, cleaned with PeacoQC and pregated on live single CD45+ cells).",
     "data_reference": "",
     "data_url": "",
-    "date_created": "27-02-2025",
+    "date_created": "12-03-2025",
     "file_size": 1444801
   }
 ]
diff --git a/results/cyto_batch_integration/data/method_info.json b/results/cyto_batch_integration/data/method_info.json
index aa88110b..1cd4f5e8 100644
--- a/results/cyto_batch_integration/data/method_info.json
+++ b/results/cyto_batch_integration/data/method_info.json
@@ -11,9 +11,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/control_methods/shuffle_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/control_methods/shuffle_integration",
     "code_version": "build_main",
-    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
   },
   {
     "task_id": "control_methods",
@@ -27,9 +27,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_batch:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/control_methods/shuffle_integration_by_batch",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/control_methods/shuffle_integration_by_batch",
     "code_version": "build_main",
-    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
   },
   {
     "task_id": "control_methods",
@@ -43,9 +43,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_cell_type:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/control_methods/shuffle_integration_by_cell_type",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/control_methods/shuffle_integration_by_cell_type",
     "code_version": "build_main",
-    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
   },
   {
     "task_id": "methods",
@@ -59,9 +59,9 @@
     "code_url": "https://github.com/slowkow/harmonypy",
     "documentation_url": "https://portals.broadinstitute.org/harmony",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/harmonypy:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/methods/harmonypy",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/methods/harmonypy",
     "code_version": "build_main",
-    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
   },
   {
     "task_id": "methods",
@@ -75,9 +75,9 @@
     "code_url": "https://github.com/bioc/limma",
     "documentation_url": "https://bioinf.wehi.edu.au/limma",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/limma_remove_batch_effect:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/methods/limma_remove_batch_effect",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/methods/limma_remove_batch_effect",
     "code_version": "build_main",
-    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
   },
   {
     "task_id": "control_methods",
@@ -91,9 +91,25 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/no_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/control_methods/no_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/control_methods/no_integration",
     "code_version": "build_main",
-    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
+  },
+  {
+    "task_id": "control_methods",
+    "method_id": "perfect_integration",
+    "method_name": "Perfect Integration",
+    "method_summary": "Positive control method which imitates perfect batch integration.",
+    "method_description": "The method simply returns the validation data, changing only the batch\nand sample IDs to those that are in the unintegrated_censored.\nBecause the marker expression is exactly the same as in the validation data, there won't\nbe any batch effect present.\n",
+    "is_baseline": true,
+    "references_doi": null,
+    "references_bibtex": null,
+    "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
+    "documentation_url": null,
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/perfect_integration:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/control_methods/perfect_integration",
+    "code_version": "build_main",
+    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
   },
   {
     "task_id": "methods",
@@ -107,8 +123,24 @@
     "code_url": "https://github.com/brentp/combat.py",
     "documentation_url": "https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/combat:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/methods/combat",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/methods/combat",
+    "code_version": "build_main",
+    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
+  },
+  {
+    "task_id": "methods",
+    "method_id": "cycombine_nocontrols",
+    "method_name": "cyCombine (no-controls)",
+    "method_summary": "cyCombine performs batch correction by using self-organizing maps and ComBat.",
+    "method_description": "cyCombine performs batch integration by first using self-organizing maps (SOM) to \ngroup similar cells, then applies a ComBat-based method to correct batch effects within \neach group of similar cells. \n\nHere, we run cyCombine without control samples (replicates in cyCombine terminology).\n",
+    "is_baseline": false,
+    "references_doi": "10.1038/s41467-022-29383-5",
+    "references_bibtex": null,
+    "code_url": "https://github.com/biosurf/cyCombine",
+    "documentation_url": "https://biosurf.org/cyCombine.html",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/cycombine_nocontrols:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/methods/cycombine_nocontrols",
     "code_version": "build_main",
-    "commit_sha": "0423799b338abe490764d3403608758eb58041ec"
+    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
   }
 ]
diff --git a/results/cyto_batch_integration/data/metric_execution_info.json b/results/cyto_batch_integration/data/metric_execution_info.json
index b4ab3c90..4e213e5e 100644
--- a/results/cyto_batch_integration/data/metric_execution_info.json
+++ b/results/cyto_batch_integration/data/metric_execution_info.json
@@ -4,97 +4,251 @@
     "method_id": "combat",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-02-27 11:29:43",
+      "submit": "2025-03-12 10:34:36",
       "exit_code": 0,
-      "duration_sec": 58.2,
-      "cpu_pct": 117.9,
-      "peak_memory_mb": 5940,
+      "duration_sec": 55,
+      "cpu_pct": 99.3,
+      "peak_memory_mb": 3175,
       "disk_read_mb": 124,
       "disk_write_mb": 2
     }
   },
+  {
+    "dataset_id": null,
+    "method_id": "combat",
+    "metric_component_name": "n_inconsistent_peaks",
+    "resources": {
+      "submit": "2025-03-12 10:34:36",
+      "exit_code": 0,
+      "duration_sec": 17.9,
+      "cpu_pct": 2483.5,
+      "peak_memory_mb": 2970,
+      "disk_read_mb": 32,
+      "disk_write_mb": 1
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "cycombine_nocontrols",
+    "metric_component_name": "emd",
+    "resources": {
+      "submit": "2025-03-12 10:35:55",
+      "exit_code": 0,
+      "duration_sec": 54.8,
+      "cpu_pct": 98.4,
+      "peak_memory_mb": 3175,
+      "disk_read_mb": 124,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "cycombine_nocontrols",
+    "metric_component_name": "n_inconsistent_peaks",
+    "resources": {
+      "submit": "2025-03-12 10:35:55",
+      "exit_code": 0,
+      "duration_sec": 17.6,
+      "cpu_pct": 2480.9,
+      "peak_memory_mb": 2970,
+      "disk_read_mb": 32,
+      "disk_write_mb": 1
+    }
+  },
   {
     "dataset_id": null,
     "method_id": "harmonypy",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-02-27 11:29:23",
+      "submit": "2025-03-12 10:37:55",
       "exit_code": 0,
-      "duration_sec": 51,
-      "cpu_pct": 105.7,
+      "duration_sec": 51.4,
+      "cpu_pct": 105.6,
       "peak_memory_mb": 1844,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
   },
+  {
+    "dataset_id": null,
+    "method_id": "harmonypy",
+    "metric_component_name": "n_inconsistent_peaks",
+    "resources": {
+      "submit": "2025-03-12 10:37:55",
+      "exit_code": 0,
+      "duration_sec": 7.3,
+      "cpu_pct": 1062.9,
+      "peak_memory_mb": 1536,
+      "disk_read_mb": 31,
+      "disk_write_mb": 1
+    }
+  },
   {
     "dataset_id": null,
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-02-27 11:28:43",
+      "submit": "2025-03-12 10:37:36",
       "exit_code": 0,
-      "duration_sec": 58.2,
-      "cpu_pct": 124.9,
-      "peak_memory_mb": 5940,
+      "duration_sec": 55.8,
+      "cpu_pct": 106.6,
+      "peak_memory_mb": 3175,
       "disk_read_mb": 124,
       "disk_write_mb": 2
     }
   },
+  {
+    "dataset_id": null,
+    "method_id": "limma_remove_batch_effect",
+    "metric_component_name": "n_inconsistent_peaks",
+    "resources": {
+      "submit": "2025-03-12 10:37:36",
+      "exit_code": 0,
+      "duration_sec": 21.1,
+      "cpu_pct": 2665.8,
+      "peak_memory_mb": 2970,
+      "disk_read_mb": 31,
+      "disk_write_mb": 1
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "no_integration",
+    "metric_component_name": "emd",
+    "resources": {
+      "submit": "2025-03-12 10:37:45",
+      "exit_code": 0,
+      "duration_sec": 62.2,
+      "cpu_pct": 100.2,
+      "peak_memory_mb": 3175,
+      "disk_read_mb": 122,
+      "disk_write_mb": 2
+    }
+  },
   {
     "dataset_id": null,
     "method_id": "no_integration",
+    "metric_component_name": "n_inconsistent_peaks",
+    "resources": {
+      "submit": "2025-03-12 10:37:45",
+      "exit_code": 0,
+      "duration_sec": 7.2,
+      "cpu_pct": 1075.3,
+      "peak_memory_mb": 1536,
+      "disk_read_mb": 31,
+      "disk_write_mb": 1
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "perfect_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-02-27 11:30:13",
+      "submit": "2025-03-12 10:37:05",
       "exit_code": 0,
-      "duration_sec": 50.6,
-      "cpu_pct": 106.4,
+      "duration_sec": 51.8,
+      "cpu_pct": 104.7,
       "peak_memory_mb": 1844,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
   },
+  {
+    "dataset_id": null,
+    "method_id": "perfect_integration",
+    "metric_component_name": "n_inconsistent_peaks",
+    "resources": {
+      "submit": "2025-03-12 10:37:05",
+      "exit_code": 0,
+      "duration_sec": 17.4,
+      "cpu_pct": 2397.9,
+      "peak_memory_mb": 2970,
+      "disk_read_mb": 30,
+      "disk_write_mb": 1
+    }
+  },
   {
     "dataset_id": null,
     "method_id": "shuffle_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-02-27 11:27:44",
+      "submit": "2025-03-12 10:36:26",
       "exit_code": 0,
-      "duration_sec": 58.6,
-      "cpu_pct": 117.4,
-      "peak_memory_mb": 5837,
+      "duration_sec": 55.2,
+      "cpu_pct": 98.4,
+      "peak_memory_mb": 3175,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
   },
+  {
+    "dataset_id": null,
+    "method_id": "shuffle_integration",
+    "metric_component_name": "n_inconsistent_peaks",
+    "resources": {
+      "submit": "2025-03-12 10:36:25",
+      "exit_code": 0,
+      "duration_sec": 7.2,
+      "cpu_pct": 1091,
+      "peak_memory_mb": 1536,
+      "disk_read_mb": 31,
+      "disk_write_mb": 1
+    }
+  },
   {
     "dataset_id": null,
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-02-27 11:31:03",
+      "submit": "2025-03-12 10:36:26",
       "exit_code": 0,
-      "duration_sec": 51.2,
-      "cpu_pct": 105.6,
+      "duration_sec": 52.2,
+      "cpu_pct": 104.2,
       "peak_memory_mb": 1844,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
   },
+  {
+    "dataset_id": null,
+    "method_id": "shuffle_integration_by_batch",
+    "metric_component_name": "n_inconsistent_peaks",
+    "resources": {
+      "submit": "2025-03-12 10:36:26",
+      "exit_code": 0,
+      "duration_sec": 7.1,
+      "cpu_pct": 1087,
+      "peak_memory_mb": 1536,
+      "disk_read_mb": 31,
+      "disk_write_mb": 1
+    }
+  },
   {
     "dataset_id": null,
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-02-27 11:31:03",
+      "submit": "2025-03-12 10:37:35",
       "exit_code": 0,
-      "duration_sec": 52.8,
-      "cpu_pct": 129.8,
-      "peak_memory_mb": 5837,
+      "duration_sec": 50.8,
+      "cpu_pct": 105.5,
+      "peak_memory_mb": 1844,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "shuffle_integration_by_cell_type",
+    "metric_component_name": "n_inconsistent_peaks",
+    "resources": {
+      "submit": "2025-03-12 10:37:36",
+      "exit_code": 0,
+      "duration_sec": 13.3,
+      "cpu_pct": 2478.6,
+      "peak_memory_mb": 2970,
+      "disk_read_mb": 31,
+      "disk_write_mb": 1
+    }
   }
 ]
diff --git a/results/cyto_batch_integration/data/metric_info.json b/results/cyto_batch_integration/data/metric_info.json
index 2c09f198..7fabea83 100644
--- a/results/cyto_batch_integration/data/metric_info.json
+++ b/results/cyto_batch_integration/data/metric_info.json
@@ -8,10 +8,10 @@
     "metric_description": "Earth Mover Distance (EMD) is a metric designed for comparing two distributions.\nIt is also known as the Wasserstein metric.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "0423799b338abe490764d3403608758eb58041ec",
+    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd",
     "maximize": false
   },
   {
@@ -23,10 +23,25 @@
     "metric_description": "Earth Mover Distance (EMD) is a metric designed for comparing two distributions.\nIt is also known as the Wasserstein metric.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/0423799b338abe490764d3403608758eb58041ec/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "0423799b338abe490764d3403608758eb58041ec",
+    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd",
+    "maximize": false
+  },
+  {
+    "task_id": "metrics",
+    "component_name": "n_inconsistent_peaks",
+    "metric_id": "n_inconsistent_peaks",
+    "metric_name": "Number of inconsistent peaks",
+    "metric_summary": "Compare the number of marker-expression peaks between validation and batch-normalized data.",
+    "metric_description": "The metric compares the number of marker-expression peaks between the validation and batch-normalized data. \nThe number of peaks is calculated using the `scipy.signal.find_peaks` function. \nThe metric is calculated as the absolute difference between the number of peaks in the validation and batch-normalized data.\nThe marker-expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),\nand peaks are then identified using the `scipy.signal.find_peaks` function.\nFor peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density.\n",
+    "references_doi": "10.1038/s41592-019-0686-2",
+    "references_bibtex": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/metrics/n_inconsistent_peaks",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/n_inconsistent_peaks:build_main",
+    "code_version": "build_main",
+    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd",
     "maximize": false
   }
 ]
diff --git a/results/cyto_batch_integration/data/quality_control.json b/results/cyto_batch_integration/data/quality_control.json
index ded06fe3..a26b438b 100644
--- a/results/cyto_batch_integration/data/quality_control.json
+++ b/results/cyto_batch_integration/data/quality_control.json
@@ -93,7 +93,7 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Method info", 
         "name": "Pct 'paper_reference' missing", 
-        "value": 0.42857142857142855, 
+        "value": 0.4444444444444444, 
         "severity": 2, 
         "severity_value": 3.0, 
         "code": "percent_missing(method_info, field)", 
@@ -243,11 +243,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw data", 
         "name": "Number of results", 
-        "value": 14, 
+        "value": 18, 
         "severity": 0, 
         "severity_value": -10.0, 
         "code": "len(results) == len(method_info) * len(metric_info) * len(dataset_info)", 
-        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: task_cyto_batch_integration\n  Number of results: 14\n  Number of methods: 7\n  Number of metrics: 2\n  Number of datasets: 1\n"
+        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: task_cyto_batch_integration\n  Number of results: 18\n  Number of methods: 9\n  Number of metrics: 3\n  Number of datasets: 1\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -269,85 +269,115 @@
         "code": "pct_missing <= .1", 
         "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_max\n  Percentage missing: 0%\n"
     }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Metric 'n_inconsistent_peaks' %missing", 
+        "value": 1.0, 
+        "severity": 3, 
+        "severity_value": 10.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: n_inconsistent_peaks\n  Percentage missing: 100%\n"
+    }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'shuffle_integration' %missing", 
-        "value": 0.0, 
-        "severity": 0, 
-        "severity_value": 0.0, 
+        "value": 0.33333333333333337, 
+        "severity": 3, 
+        "severity_value": 3.3333333333333335, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration\n  Percentage missing: 33%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'shuffle_integration_by_batch' %missing", 
-        "value": 0.0, 
-        "severity": 0, 
-        "severity_value": 0.0, 
+        "value": 0.33333333333333337, 
+        "severity": 3, 
+        "severity_value": 3.3333333333333335, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration_by_batch\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration_by_batch\n  Percentage missing: 33%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'shuffle_integration_by_cell_type' %missing", 
-        "value": 0.0, 
-        "severity": 0, 
-        "severity_value": 0.0, 
+        "value": 0.33333333333333337, 
+        "severity": 3, 
+        "severity_value": 3.3333333333333335, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration_by_cell_type\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration_by_cell_type\n  Percentage missing: 33%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'harmonypy' %missing", 
-        "value": 0.0, 
-        "severity": 0, 
-        "severity_value": 0.0, 
+        "value": 0.33333333333333337, 
+        "severity": 3, 
+        "severity_value": 3.3333333333333335, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: harmonypy\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: harmonypy\n  Percentage missing: 33%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'limma_remove_batch_effect' %missing", 
-        "value": 0.0, 
-        "severity": 0, 
-        "severity_value": 0.0, 
+        "value": 0.33333333333333337, 
+        "severity": 3, 
+        "severity_value": 3.3333333333333335, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: limma_remove_batch_effect\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: limma_remove_batch_effect\n  Percentage missing: 33%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'no_integration' %missing", 
-        "value": 0.0, 
-        "severity": 0, 
-        "severity_value": 0.0, 
+        "value": 0.33333333333333337, 
+        "severity": 3, 
+        "severity_value": 3.3333333333333335, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: no_integration\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: no_integration\n  Percentage missing: 33%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Method 'perfect_integration' %missing", 
+        "value": 0.33333333333333337, 
+        "severity": 3, 
+        "severity_value": 3.3333333333333335, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: perfect_integration\n  Percentage missing: 33%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'combat' %missing", 
-        "value": 0.0, 
-        "severity": 0, 
-        "severity_value": 0.0, 
+        "value": 0.33333333333333337, 
+        "severity": 3, 
+        "severity_value": 3.3333333333333335, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: combat\n  Percentage missing: 33%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Method 'cycombine_nocontrols' %missing", 
+        "value": 0.33333333333333337, 
+        "severity": 3, 
+        "severity_value": 3.3333333333333335, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: combat\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: cycombine_nocontrols\n  Percentage missing: 33%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Dataset 'XXXXX' %missing", 
-        "value": 0.0, 
-        "severity": 0, 
-        "severity_value": 0.0, 
+        "value": 0.33333333333333337, 
+        "severity": 3, 
+        "severity_value": 3.3333333333333335, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  dataset id: XXXXX\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  dataset id: XXXXX\n  Percentage missing: 33%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -363,11 +393,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration emd_mean", 
-        "value": 0.1028, 
+        "value": 0.0533, 
         "severity": 0, 
-        "severity_value": 0.0514, 
+        "severity_value": 0.02665, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean\n  Best score: 0.1028%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean\n  Best score: 0.0533%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -393,21 +423,21 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type emd_mean", 
-        "value": 0, 
+        "value": 0.0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Worst score: 0%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type emd_mean", 
-        "value": 1, 
+        "value": 0.5884, 
         "severity": 0, 
-        "severity_value": 0.5, 
+        "severity_value": 0.2942, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Best score: 1%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Best score: 0.5884%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -423,11 +453,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy emd_mean", 
-        "value": 0.9692, 
+        "value": 0.5684, 
         "severity": 0, 
-        "severity_value": 0.4846, 
+        "severity_value": 0.2842, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean\n  Best score: 0.9692%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean\n  Best score: 0.5684%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -443,11 +473,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect emd_mean", 
-        "value": 0.9297, 
+        "value": 0.5446, 
         "severity": 0, 
-        "severity_value": 0.46485, 
+        "severity_value": 0.2723, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean\n  Best score: 0.9297%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean\n  Best score: 0.5446%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -463,11 +493,31 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration emd_mean", 
-        "value": 0.8418, 
+        "value": 0.4919, 
+        "severity": 0, 
+        "severity_value": 0.24595, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean\n  Best score: 0.4919%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration emd_mean", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration emd_mean", 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": 0.4209, 
+        "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean\n  Best score: 0.8418%\n"
+        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean\n  Best score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -483,11 +533,31 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat emd_mean", 
-        "value": 0.9366, 
+        "value": 0.5488, 
+        "severity": 0, 
+        "severity_value": 0.2744, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean\n  Best score: 0.5488%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cycombine_nocontrols emd_mean", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cycombine_nocontrols emd_mean", 
+        "value": 0.5599, 
         "severity": 0, 
-        "severity_value": 0.4683, 
+        "severity_value": 0.27995, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean\n  Best score: 0.9366%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean\n  Best score: 0.5599%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -503,11 +573,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration emd_max", 
-        "value": 0.5533, 
+        "value": 0.2133, 
         "severity": 0, 
-        "severity_value": 0.27665, 
+        "severity_value": 0.10665, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max\n  Best score: 0.5533%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max\n  Best score: 0.2133%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -523,31 +593,31 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_batch emd_max", 
-        "value": 0.2183, 
+        "value": 0.1651, 
         "severity": 0, 
-        "severity_value": 0.10915, 
+        "severity_value": 0.08255, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max\n  Best score: 0.2183%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max\n  Best score: 0.1651%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type emd_max", 
-        "value": 0, 
+        "value": 0.0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Worst score: 0%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Worst score: 0.0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type emd_max", 
-        "value": 1, 
+        "value": 0.1417, 
         "severity": 0, 
-        "severity_value": 0.5, 
+        "severity_value": 0.07085, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Best score: 1%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Best score: 0.1417%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -563,11 +633,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy emd_max", 
-        "value": 0.9046, 
+        "value": 0.2317, 
         "severity": 0, 
-        "severity_value": 0.4523, 
+        "severity_value": 0.11585, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max\n  Best score: 0.9046%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max\n  Best score: 0.2317%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -583,11 +653,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect emd_max", 
-        "value": 0.1228, 
+        "value": 0.0315, 
         "severity": 0, 
-        "severity_value": 0.0614, 
+        "severity_value": 0.01575, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max\n  Best score: 0.1228%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max\n  Best score: 0.0315%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -609,6 +679,26 @@
         "code": "best_score <= 2", 
         "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max\n  Best score: 0%\n"
     }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration emd_max", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration emd_max", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max\n  Best score: 1%\n"
+    }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
@@ -623,10 +713,30 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat emd_max", 
-        "value": 0.2904, 
+        "value": 0.0744, 
         "severity": 0, 
-        "severity_value": 0.1452, 
+        "severity_value": 0.0372, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max\n  Best score: 0.0744%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cycombine_nocontrols emd_max", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cycombine_nocontrols emd_max", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max\n  Best score: 0.2904%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max\n  Best score: 0%\n"
     }
 ]
\ No newline at end of file
diff --git a/results/cyto_batch_integration/data/results.json b/results/cyto_batch_integration/data/results.json
index 5edfcebc..ae258b74 100644
--- a/results/cyto_batch_integration/data/results.json
+++ b/results/cyto_batch_integration/data/results.json
@@ -7,10 +7,24 @@
       "emd_mean": 3.1871
     },
     "scaled_scores": {
-      "emd_max": 0.2904,
-      "emd_mean": 0.9366
+      "emd_max": 0.0744,
+      "emd_mean": 0.5488
     },
-    "mean_score": 0.6135,
+    "mean_score": 0.3116,
+    "resources": {}
+  },
+  {
+    "dataset_id": "XXXXX",
+    "method_id": "cycombine_nocontrols",
+    "metric_values": {
+      "emd_max": 34.9545,
+      "emd_mean": 3.1082
+    },
+    "scaled_scores": {
+      "emd_max": 0,
+      "emd_mean": 0.5599
+    },
+    "mean_score": 0.28,
     "resources": {}
   },
   {
@@ -21,10 +35,10 @@
       "emd_mean": 3.0487
     },
     "scaled_scores": {
-      "emd_max": 0.9046,
-      "emd_mean": 0.9692
+      "emd_max": 0.2317,
+      "emd_mean": 0.5684
     },
-    "mean_score": 0.9369,
+    "mean_score": 0.4,
     "resources": {}
   },
   {
@@ -35,10 +49,10 @@
       "emd_mean": 3.2163
     },
     "scaled_scores": {
-      "emd_max": 0.1228,
-      "emd_mean": 0.9297
+      "emd_max": 0.0315,
+      "emd_mean": 0.5446
     },
-    "mean_score": 0.5263,
+    "mean_score": 0.288,
     "resources": {}
   },
   {
@@ -50,51 +64,65 @@
     },
     "scaled_scores": {
       "emd_max": 0,
-      "emd_mean": 0.8418
+      "emd_mean": 0.4919
+    },
+    "mean_score": 0.246,
+    "resources": {}
+  },
+  {
+    "dataset_id": "XXXXX",
+    "method_id": "perfect_integration",
+    "metric_values": {
+      "emd_max": 0,
+      "emd_mean": 0
+    },
+    "scaled_scores": {
+      "emd_max": 1,
+      "emd_mean": 1
     },
-    "mean_score": 0.4209,
+    "mean_score": 1,
     "resources": {}
   },
   {
     "dataset_id": "XXXXX",
     "method_id": "shuffle_integration",
     "metric_values": {
-      "emd_max": 30,
-      "emd_mean": 6.7201
+      "emd_max": 27.5,
+      "emd_mean": 6.6865
     },
     "scaled_scores": {
-      "emd_max": 0.5533,
-      "emd_mean": 0.1028
+      "emd_max": 0.2133,
+      "emd_mean": 0.0533
     },
-    "mean_score": 0.3281,
+    "mean_score": 0.1333,
     "resources": {}
   },
   {
     "dataset_id": "XXXXX",
     "method_id": "shuffle_integration_by_batch",
     "metric_values": {
-      "emd_max": 33,
-      "emd_mean": 7.1559
+      "emd_max": 29.1833,
+      "emd_mean": 7.0631
     },
     "scaled_scores": {
-      "emd_max": 0.2183,
+      "emd_max": 0.1651,
       "emd_mean": 0
     },
-    "mean_score": 0.1091,
+    "mean_score": 0.0826,
     "resources": {}
   },
   {
     "dataset_id": "XXXXX",
     "method_id": "shuffle_integration_by_cell_type",
     "metric_values": {
-      "emd_max": 26,
-      "emd_mean": 2.9183
+      "emd_max": 30,
+      "emd_mean": 2.9069
     },
     "scaled_scores": {
-      "emd_max": 1,
-      "emd_mean": 1
+      "emd_max": 0.1417,
+      "emd_mean": 0.5884
     },
-    "mean_score": 1,
+    "mean_score": 0.3651,
     "resources": {}
   },
   {
@@ -110,15 +138,37 @@
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-02-27 11:26:45",
+      "submit": "2025-03-12 10:33:36",
       "exit_code": 0,
-      "duration_sec": 4.8,
-      "cpu_pct": 138.6,
+      "duration_sec": 4.5,
+      "cpu_pct": 145.6,
       "peak_memory_mb": 1844,
       "disk_read_mb": 48,
       "disk_write_mb": 3
     }
   },
+  {
+    "dataset_id": null,
+    "method_id": "cycombine_nocontrols",
+    "metric_values": {
+      "emd_mean": "NA",
+      "emd_max": "NA"
+    },
+    "scaled_scores": {
+      "emd_mean": 0,
+      "emd_max": 0
+    },
+    "mean_score": 0,
+    "resources": {
+      "submit": "2025-03-12 10:33:36",
+      "exit_code": 0,
+      "duration_sec": 14,
+      "cpu_pct": 130.5,
+      "peak_memory_mb": 2253,
+      "disk_read_mb": 55,
+      "disk_write_mb": 3
+    }
+  },
   {
     "dataset_id": null,
     "method_id": "harmonypy",
@@ -132,11 +182,11 @@
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-02-27 11:26:45",
+      "submit": "2025-03-12 10:33:36",
       "exit_code": 0,
-      "duration_sec": 9.2,
-      "cpu_pct": 744.3,
-      "peak_memory_mb": 2560,
+      "duration_sec": 10.7,
+      "cpu_pct": 1386.3,
+      "peak_memory_mb": 4608,
       "disk_read_mb": 34,
       "disk_write_mb": 2
     }
@@ -154,10 +204,10 @@
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-02-27 11:26:44",
+      "submit": "2025-03-12 10:33:36",
       "exit_code": 0,
-      "duration_sec": 4.3,
-      "cpu_pct": 203.4,
+      "duration_sec": 4.5,
+      "cpu_pct": 196.9,
       "peak_memory_mb": 1844,
       "disk_read_mb": 30,
       "disk_write_mb": 2
@@ -176,15 +226,37 @@
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-02-27 11:26:45",
+      "submit": "2025-03-12 10:33:37",
       "exit_code": 0,
-      "duration_sec": 1.8,
-      "cpu_pct": 253.7,
-      "peak_memory_mb": 764,
+      "duration_sec": 2,
+      "cpu_pct": 427,
+      "peak_memory_mb": 1434,
       "disk_read_mb": 20,
       "disk_write_mb": 2
     }
   },
+  {
+    "dataset_id": null,
+    "method_id": "perfect_integration",
+    "metric_values": {
+      "emd_mean": "NA",
+      "emd_max": "NA"
+    },
+    "scaled_scores": {
+      "emd_mean": 0,
+      "emd_max": 0
+    },
+    "mean_score": 0,
+    "resources": {
+      "submit": "2025-03-12 10:33:37",
+      "exit_code": 0,
+      "duration_sec": 1.7,
+      "cpu_pct": 271,
+      "peak_memory_mb": 768,
+      "disk_read_mb": 19,
+      "disk_write_mb": 1
+    }
+  },
   {
     "dataset_id": null,
     "method_id": "shuffle_integration",
@@ -198,11 +270,11 @@
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-02-27 11:26:44",
+      "submit": "2025-03-12 10:33:36",
       "exit_code": 0,
-      "duration_sec": 1.9,
-      "cpu_pct": 242.3,
-      "peak_memory_mb": 763,
+      "duration_sec": 1.8,
+      "cpu_pct": 226.6,
+      "peak_memory_mb": 765,
       "disk_read_mb": 20,
       "disk_write_mb": 2
     }
@@ -220,11 +292,11 @@
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-02-27 11:26:45",
+      "submit": "2025-03-12 10:33:36",
       "exit_code": 0,
-      "duration_sec": 3.8,
-      "cpu_pct": 340.5,
-      "peak_memory_mb": 5530,
+      "duration_sec": 1.9,
+      "cpu_pct": 232.2,
+      "peak_memory_mb": 762,
       "disk_read_mb": 20,
       "disk_write_mb": 2
     }
@@ -242,11 +314,11 @@
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-02-27 11:26:46",
+      "submit": "2025-03-12 10:33:36",
       "exit_code": 0,
-      "duration_sec": 4,
-      "cpu_pct": 354.1,
-      "peak_memory_mb": 2765,
+      "duration_sec": 1.9,
+      "cpu_pct": 241.5,
+      "peak_memory_mb": 762,
       "disk_read_mb": 20,
       "disk_write_mb": 2
     }

From c7f9a4b3e49fb99c325bd935dff9cdc92bee6774 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Thu, 13 Mar 2025 09:07:02 +0100
Subject: [PATCH 03/14] upload artifact on failure

---
 .github/workflows/quarto_netlify.yml | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/quarto_netlify.yml b/.github/workflows/quarto_netlify.yml
index e8bf4be2..db905e11 100644
--- a/.github/workflows/quarto_netlify.yml
+++ b/.github/workflows/quarto_netlify.yml
@@ -107,6 +107,13 @@ jobs:
           dir: '_site'
           alias: "${{ env.BRANCH_NAME }}"
           message: 'Deploy preview ${{ github.ref }}'
+        
+      - name: Upload artifact
+        if: failure()
+        uses: actions/upload-artifact@v2
+        id: upload-artifact
+        with:
+          name: _site
 
       - name: Comment on PR (success)
         uses: thollander/actions-comment-pull-request@v2
@@ -122,6 +129,8 @@ jobs:
         with:
           message: |
             [![Deploy: failure](https://img.shields.io/badge/Deploy-failure-critical)](${{ steps.deploy_preview.outputs.logs }})
+
+            Artifacts are available for download [here](${{ steps.upload-artifact.outputs.artifact_url }})
           comment_tag: deploy_status
 
       - name: Comment on PR (actions failure)
@@ -130,5 +139,6 @@ jobs:
         with:
           message: |
             [![Deploy: failure](https://img.shields.io/badge/Deploy-failure-critical)](https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}/jobs/${{github.job}})
+
+            Artifacts are available for download [here](${{ steps.upload-artifact.outputs.artifact_url }})
           comment_tag: deploy_status
-  
\ No newline at end of file

From d4b2a7e161934aa1a53b137ca573bc8c56b560fa Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Wed, 19 Mar 2025 14:46:12 +0100
Subject: [PATCH 04/14] update results

---
 .../data/dataset_info.json                    |   2 +-
 .../data/method_info.json                     |  36 +-
 .../data/metric_execution_info.json           | 288 ++++--
 .../data/metric_info.json                     |  63 +-
 .../data/quality_control.json                 | 900 ++++++++++++++++--
 .../cyto_batch_integration/data/results.json  | 276 ++++--
 .../data/task_info.json                       |  42 +-
 7 files changed, 1349 insertions(+), 258 deletions(-)

diff --git a/results/cyto_batch_integration/data/dataset_info.json b/results/cyto_batch_integration/data/dataset_info.json
index 81cfebea..57d1b2a3 100644
--- a/results/cyto_batch_integration/data/dataset_info.json
+++ b/results/cyto_batch_integration/data/dataset_info.json
@@ -6,7 +6,7 @@
     "dataset_description": "Flow cytometry data of spleens from 4 WT (IKK2 fl/fl CD11c-cre +/+) and 5 KO (IKK2 fl/fl CD11c-cre Tg/+) B6 mice,  measured with a 22-color panel on 2 different instrument settings. Subsampled to 1000 cells per sample. Data has been preprocessed (compensated witha batch-specific compensation matrix, logicle transformed, cleaned with PeacoQC and pregated on live single CD45+ cells).",
     "data_reference": "",
     "data_url": "",
-    "date_created": "12-03-2025",
+    "date_created": "19-03-2025",
     "file_size": 1444801
   }
 ]
diff --git a/results/cyto_batch_integration/data/method_info.json b/results/cyto_batch_integration/data/method_info.json
index 1cd4f5e8..0b122b50 100644
--- a/results/cyto_batch_integration/data/method_info.json
+++ b/results/cyto_batch_integration/data/method_info.json
@@ -11,9 +11,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/control_methods/shuffle_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/control_methods/shuffle_integration",
     "code_version": "build_main",
-    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
   },
   {
     "task_id": "control_methods",
@@ -27,9 +27,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_batch:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/control_methods/shuffle_integration_by_batch",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/control_methods/shuffle_integration_by_batch",
     "code_version": "build_main",
-    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
   },
   {
     "task_id": "control_methods",
@@ -43,9 +43,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_cell_type:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/control_methods/shuffle_integration_by_cell_type",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/control_methods/shuffle_integration_by_cell_type",
     "code_version": "build_main",
-    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
   },
   {
     "task_id": "methods",
@@ -59,9 +59,9 @@
     "code_url": "https://github.com/slowkow/harmonypy",
     "documentation_url": "https://portals.broadinstitute.org/harmony",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/harmonypy:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/methods/harmonypy",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/methods/harmonypy",
     "code_version": "build_main",
-    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
   },
   {
     "task_id": "methods",
@@ -75,9 +75,9 @@
     "code_url": "https://github.com/bioc/limma",
     "documentation_url": "https://bioinf.wehi.edu.au/limma",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/limma_remove_batch_effect:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/methods/limma_remove_batch_effect",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/methods/limma_remove_batch_effect",
     "code_version": "build_main",
-    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
   },
   {
     "task_id": "control_methods",
@@ -91,9 +91,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/no_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/control_methods/no_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/control_methods/no_integration",
     "code_version": "build_main",
-    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
   },
   {
     "task_id": "control_methods",
@@ -107,9 +107,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/perfect_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/control_methods/perfect_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/control_methods/perfect_integration",
     "code_version": "build_main",
-    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
   },
   {
     "task_id": "methods",
@@ -123,9 +123,9 @@
     "code_url": "https://github.com/brentp/combat.py",
     "documentation_url": "https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/combat:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/methods/combat",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/methods/combat",
     "code_version": "build_main",
-    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
   },
   {
     "task_id": "methods",
@@ -139,8 +139,8 @@
     "code_url": "https://github.com/biosurf/cyCombine",
     "documentation_url": "https://biosurf.org/cyCombine.html",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/cycombine_nocontrols:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/methods/cycombine_nocontrols",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/methods/cycombine_nocontrols",
     "code_version": "build_main",
-    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd"
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
   }
 ]
diff --git a/results/cyto_batch_integration/data/metric_execution_info.json b/results/cyto_batch_integration/data/metric_execution_info.json
index 4e213e5e..a77319e0 100644
--- a/results/cyto_batch_integration/data/metric_execution_info.json
+++ b/results/cyto_batch_integration/data/metric_execution_info.json
@@ -1,14 +1,28 @@
 [
+  {
+    "dataset_id": null,
+    "method_id": "combat",
+    "metric_component_name": "average_batch_r2",
+    "resources": {
+      "submit": "2025-03-19 13:06:00",
+      "exit_code": 0,
+      "duration_sec": 24.8,
+      "cpu_pct": 133.2,
+      "peak_memory_mb": 4301,
+      "disk_read_mb": 70,
+      "disk_write_mb": 2
+    }
+  },
   {
     "dataset_id": null,
     "method_id": "combat",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-12 10:34:36",
+      "submit": "2025-03-19 13:06:01",
       "exit_code": 0,
-      "duration_sec": 55,
-      "cpu_pct": 99.3,
-      "peak_memory_mb": 3175,
+      "duration_sec": 51,
+      "cpu_pct": 106.1,
+      "peak_memory_mb": 1844,
       "disk_read_mb": 124,
       "disk_write_mb": 2
     }
@@ -18,13 +32,27 @@
     "method_id": "combat",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-12 10:34:36",
+      "submit": "2025-03-19 13:06:00",
       "exit_code": 0,
-      "duration_sec": 17.9,
-      "cpu_pct": 2483.5,
-      "peak_memory_mb": 2970,
-      "disk_read_mb": 32,
-      "disk_write_mb": 1
+      "duration_sec": 51.6,
+      "cpu_pct": 3873.9,
+      "peak_memory_mb": 4301,
+      "disk_read_mb": 64,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "cycombine_nocontrols",
+    "metric_component_name": "average_batch_r2",
+    "resources": {
+      "submit": "2025-03-19 13:08:01",
+      "exit_code": 0,
+      "duration_sec": 20.4,
+      "cpu_pct": 124.8,
+      "peak_memory_mb": 1639,
+      "disk_read_mb": 70,
+      "disk_write_mb": 2
     }
   },
   {
@@ -32,11 +60,11 @@
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-12 10:35:55",
+      "submit": "2025-03-19 13:08:01",
       "exit_code": 0,
-      "duration_sec": 54.8,
-      "cpu_pct": 98.4,
-      "peak_memory_mb": 3175,
+      "duration_sec": 50.8,
+      "cpu_pct": 106.8,
+      "peak_memory_mb": 1844,
       "disk_read_mb": 124,
       "disk_write_mb": 2
     }
@@ -46,13 +74,27 @@
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-12 10:35:55",
+      "submit": "2025-03-19 13:08:01",
       "exit_code": 0,
-      "duration_sec": 17.6,
-      "cpu_pct": 2480.9,
+      "duration_sec": 14.4,
+      "cpu_pct": 2136.4,
       "peak_memory_mb": 2970,
-      "disk_read_mb": 32,
-      "disk_write_mb": 1
+      "disk_read_mb": 64,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "harmonypy",
+    "metric_component_name": "average_batch_r2",
+    "resources": {
+      "submit": "2025-03-19 13:07:31",
+      "exit_code": 0,
+      "duration_sec": 20.6,
+      "cpu_pct": 123.5,
+      "peak_memory_mb": 1639,
+      "disk_read_mb": 68,
+      "disk_write_mb": 2
     }
   },
   {
@@ -60,11 +102,11 @@
     "method_id": "harmonypy",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-12 10:37:55",
+      "submit": "2025-03-19 13:07:31",
       "exit_code": 0,
       "duration_sec": 51.4,
-      "cpu_pct": 105.6,
-      "peak_memory_mb": 1844,
+      "cpu_pct": 114.6,
+      "peak_memory_mb": 3175,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
@@ -74,13 +116,27 @@
     "method_id": "harmonypy",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-12 10:37:55",
+      "submit": "2025-03-19 13:07:31",
       "exit_code": 0,
-      "duration_sec": 7.3,
-      "cpu_pct": 1062.9,
+      "duration_sec": 14.4,
+      "cpu_pct": 1067.6,
       "peak_memory_mb": 1536,
-      "disk_read_mb": 31,
-      "disk_write_mb": 1
+      "disk_read_mb": 62,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "limma_remove_batch_effect",
+    "metric_component_name": "average_batch_r2",
+    "resources": {
+      "submit": "2025-03-19 13:04:51",
+      "exit_code": 0,
+      "duration_sec": 24.2,
+      "cpu_pct": 130.6,
+      "peak_memory_mb": 2970,
+      "disk_read_mb": 70,
+      "disk_write_mb": 2
     }
   },
   {
@@ -88,11 +144,11 @@
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-12 10:37:36",
+      "submit": "2025-03-19 13:04:51",
       "exit_code": 0,
-      "duration_sec": 55.8,
-      "cpu_pct": 106.6,
-      "peak_memory_mb": 3175,
+      "duration_sec": 55.6,
+      "cpu_pct": 103.2,
+      "peak_memory_mb": 4506,
       "disk_read_mb": 124,
       "disk_write_mb": 2
     }
@@ -102,13 +158,27 @@
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-12 10:37:36",
+      "submit": "2025-03-19 13:04:51",
+      "exit_code": 0,
+      "duration_sec": 14.2,
+      "cpu_pct": 1092.6,
+      "peak_memory_mb": 1536,
+      "disk_read_mb": 62,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "no_integration",
+    "metric_component_name": "average_batch_r2",
+    "resources": {
+      "submit": "2025-03-19 13:06:50",
       "exit_code": 0,
-      "duration_sec": 21.1,
-      "cpu_pct": 2665.8,
+      "duration_sec": 22.6,
+      "cpu_pct": 143.6,
       "peak_memory_mb": 2970,
-      "disk_read_mb": 31,
-      "disk_write_mb": 1
+      "disk_read_mb": 68,
+      "disk_write_mb": 2
     }
   },
   {
@@ -116,10 +186,10 @@
     "method_id": "no_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-12 10:37:45",
+      "submit": "2025-03-19 13:06:50",
       "exit_code": 0,
-      "duration_sec": 62.2,
-      "cpu_pct": 100.2,
+      "duration_sec": 51.4,
+      "cpu_pct": 115.8,
       "peak_memory_mb": 3175,
       "disk_read_mb": 122,
       "disk_write_mb": 2
@@ -130,13 +200,27 @@
     "method_id": "no_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-12 10:37:45",
+      "submit": "2025-03-19 13:06:50",
       "exit_code": 0,
-      "duration_sec": 7.2,
-      "cpu_pct": 1075.3,
+      "duration_sec": 14.6,
+      "cpu_pct": 1074.6,
       "peak_memory_mb": 1536,
-      "disk_read_mb": 31,
-      "disk_write_mb": 1
+      "disk_read_mb": 62,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "perfect_integration",
+    "metric_component_name": "average_batch_r2",
+    "resources": {
+      "submit": "2025-03-19 13:06:20",
+      "exit_code": 0,
+      "duration_sec": 20.8,
+      "cpu_pct": 124.6,
+      "peak_memory_mb": 1639,
+      "disk_read_mb": 68,
+      "disk_write_mb": 2
     }
   },
   {
@@ -144,10 +228,10 @@
     "method_id": "perfect_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-12 10:37:05",
+      "submit": "2025-03-19 13:06:20",
       "exit_code": 0,
-      "duration_sec": 51.8,
-      "cpu_pct": 104.7,
+      "duration_sec": 51.2,
+      "cpu_pct": 105.7,
       "peak_memory_mb": 1844,
       "disk_read_mb": 122,
       "disk_write_mb": 2
@@ -158,13 +242,27 @@
     "method_id": "perfect_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-12 10:37:05",
+      "submit": "2025-03-19 13:06:20",
       "exit_code": 0,
-      "duration_sec": 17.4,
-      "cpu_pct": 2397.9,
-      "peak_memory_mb": 2970,
-      "disk_read_mb": 30,
-      "disk_write_mb": 1
+      "duration_sec": 15,
+      "cpu_pct": 1121,
+      "peak_memory_mb": 1536,
+      "disk_read_mb": 60,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "shuffle_integration",
+    "metric_component_name": "average_batch_r2",
+    "resources": {
+      "submit": "2025-03-19 13:07:30",
+      "exit_code": 0,
+      "duration_sec": 20.4,
+      "cpu_pct": 123.9,
+      "peak_memory_mb": 1639,
+      "disk_read_mb": 68,
+      "disk_write_mb": 2
     }
   },
   {
@@ -172,11 +270,11 @@
     "method_id": "shuffle_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-12 10:36:26",
+      "submit": "2025-03-19 13:07:30",
       "exit_code": 0,
-      "duration_sec": 55.2,
-      "cpu_pct": 98.4,
-      "peak_memory_mb": 3175,
+      "duration_sec": 62.6,
+      "cpu_pct": 113.8,
+      "peak_memory_mb": 4506,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
@@ -186,13 +284,27 @@
     "method_id": "shuffle_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-12 10:36:25",
+      "submit": "2025-03-19 13:07:30",
       "exit_code": 0,
-      "duration_sec": 7.2,
-      "cpu_pct": 1091,
+      "duration_sec": 14.8,
+      "cpu_pct": 1097.9,
       "peak_memory_mb": 1536,
-      "disk_read_mb": 31,
-      "disk_write_mb": 1
+      "disk_read_mb": 62,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "shuffle_integration_by_batch",
+    "metric_component_name": "average_batch_r2",
+    "resources": {
+      "submit": "2025-03-19 13:07:31",
+      "exit_code": 0,
+      "duration_sec": 20.6,
+      "cpu_pct": 122.8,
+      "peak_memory_mb": 1639,
+      "disk_read_mb": 68,
+      "disk_write_mb": 2
     }
   },
   {
@@ -200,10 +312,10 @@
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-12 10:36:26",
+      "submit": "2025-03-19 13:07:31",
       "exit_code": 0,
-      "duration_sec": 52.2,
-      "cpu_pct": 104.2,
+      "duration_sec": 51.2,
+      "cpu_pct": 105.7,
       "peak_memory_mb": 1844,
       "disk_read_mb": 122,
       "disk_write_mb": 2
@@ -214,13 +326,27 @@
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-12 10:36:26",
+      "submit": "2025-03-19 13:07:31",
       "exit_code": 0,
-      "duration_sec": 7.1,
-      "cpu_pct": 1087,
-      "peak_memory_mb": 1536,
-      "disk_read_mb": 31,
-      "disk_write_mb": 1
+      "duration_sec": 54,
+      "cpu_pct": 3713.7,
+      "peak_memory_mb": 4301,
+      "disk_read_mb": 62,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": null,
+    "method_id": "shuffle_integration_by_cell_type",
+    "metric_component_name": "average_batch_r2",
+    "resources": {
+      "submit": "2025-03-19 13:07:31",
+      "exit_code": 0,
+      "duration_sec": 21,
+      "cpu_pct": 156.7,
+      "peak_memory_mb": 2970,
+      "disk_read_mb": 68,
+      "disk_write_mb": 2
     }
   },
   {
@@ -228,11 +354,11 @@
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-12 10:37:35",
+      "submit": "2025-03-19 13:07:31",
       "exit_code": 0,
-      "duration_sec": 50.8,
-      "cpu_pct": 105.5,
-      "peak_memory_mb": 1844,
+      "duration_sec": 61.8,
+      "cpu_pct": 108.5,
+      "peak_memory_mb": 4506,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
@@ -242,13 +368,13 @@
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-12 10:37:36",
+      "submit": "2025-03-19 13:07:31",
       "exit_code": 0,
-      "duration_sec": 13.3,
-      "cpu_pct": 2478.6,
-      "peak_memory_mb": 2970,
-      "disk_read_mb": 31,
-      "disk_write_mb": 1
+      "duration_sec": 14.2,
+      "cpu_pct": 1082.9,
+      "peak_memory_mb": 1536,
+      "disk_read_mb": 62,
+      "disk_write_mb": 2
     }
   }
 ]
diff --git a/results/cyto_batch_integration/data/metric_info.json b/results/cyto_batch_integration/data/metric_info.json
index 7fabea83..50b4fc9e 100644
--- a/results/cyto_batch_integration/data/metric_info.json
+++ b/results/cyto_batch_integration/data/metric_info.json
@@ -8,10 +8,10 @@
     "metric_description": "Earth Mover Distance (EMD) is a metric designed for comparing two distributions.\nIt is also known as the Wasserstein metric.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd",
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb",
     "maximize": false
   },
   {
@@ -23,25 +23,70 @@
     "metric_description": "Earth Mover Distance (EMD) is a metric designed for comparing two distributions.\nIt is also known as the Wasserstein metric.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd",
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb",
     "maximize": false
   },
   {
     "task_id": "metrics",
     "component_name": "n_inconsistent_peaks",
     "metric_id": "n_inconsistent_peaks",
-    "metric_name": "Number of inconsistent peaks",
-    "metric_summary": "Compare the number of marker-expression peaks between validation and batch-normalized data.",
-    "metric_description": "The metric compares the number of marker-expression peaks between the validation and batch-normalized data. \nThe number of peaks is calculated using the `scipy.signal.find_peaks` function. \nThe metric is calculated as the absolute difference between the number of peaks in the validation and batch-normalized data.\nThe marker-expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),\nand then peaks are then identified using the `scipy.signal.find_peaks` function.\nFor peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density.\n",
+    "metric_name": "Number of inconsistent peaks Global",
+    "metric_summary": "Comparison of the number of marker-expression peaks between validation and batch-normalized data.",
+    "metric_description": "The metric compares the number of marker expression peaks between the validation and batch-normalized data. \nThe number of peaks is calculated using the `scipy.signal.find_peaks` function. \nThe metric is calculated as the absolute difference between the number of peaks in the validation and batch-normalized data.\nThe marker expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),\nand peaks are then identified using the `scipy.signal.find_peaks` function.\nFor peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density.\n",
     "references_doi": "10.1038/s41592-019-0686-2",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/818cd97980d3e08d36a079595f02bfb75dca71bd/src/metrics/n_inconsistent_peaks",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/metrics/n_inconsistent_peaks",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/n_inconsistent_peaks:build_main",
     "code_version": "build_main",
-    "commit_sha": "818cd97980d3e08d36a079595f02bfb75dca71bd",
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb",
+    "maximize": false
+  },
+  {
+    "task_id": "metrics",
+    "component_name": "n_inconsistent_peaks",
+    "metric_id": "n_inconsistent_peaks_ct",
+    "metric_name": "Number of inconsistent peaks (Cell Type)",
+    "metric_summary": "Comparison of the number of cell-type marker-expression peaks between validation and batch-normalized data.",
+    "metric_description": "The metric compares the number of cell type specific marker expression peaks between the validation and batch-normalized data. \nThe number of peaks is calculated using the `scipy.signal.find_peaks` function. \nThe metric is calculated as the absolute difference between the number of peaks in the validation and batch-normalized data.\nThe (cell type) marker expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),\nand peaks are then identified using the `scipy.signal.find_peaks` function.\nFor peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density.\n",
+    "references_doi": "10.1038/s41592-019-0686-2",
+    "references_bibtex": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/metrics/n_inconsistent_peaks",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/n_inconsistent_peaks:build_main",
+    "code_version": "build_main",
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb",
+    "maximize": false
+  },
+  {
+    "task_id": "metrics",
+    "component_name": "average_batch_r2",
+    "metric_id": "average_batch_r2_global",
+    "metric_name": "Average Batch R-squared Global",
+    "metric_summary": "The average batch R-squared quantifies, on average, how strongly the batch variable B explains the variance in the data.",
+    "metric_description": "First, a simple linear model `sklearn.linear_model.LinearRegression` is fitted for each paired sample and marker to determine the fraction of variance ($R^2$) explained by the batch covariate $B$.\nThe average batch R-squared is then computed as the average of the $R^2$ values across all paired samples and markers.\nAs a result, $\\overline{R^2_B}_{global}$ quantifies how much of the total variability in the data is driven by batch effects. Consequently, lower values are desirable.\n\n$\\overline{R^2_B}_{global} = \\frac{1}{N*M}\\sum_{\\substack{(x_{\\mathrm{int}},\\,x_{\\mathrm{val}})\\\\ \\text{paired samples}}}^{N} \\sum_{i=1}^{M} \\,R^2\\!\\bigl(\\mathrm{marker}_i \\mid B\\bigr)$\n\nWhere:\n- $N$ is the number of paired samples, where $x_{\\mathrm{int}}$ is the replicate that has been batch-corrected and $x_{\\mathrm{val}}$ is the replicate used for validation. Paired samples belong to different batches.\n- $M$ is the number of markers\n- $B$ is the batch covariate\n\nA higher value of $\\overline{R^2_B}_{global}$ indicates that the batch variable explains more of the variance in the data, which indicates a higher level of batch effects.\n",
+    "references_doi": null,
+    "references_bibtex": "@book{draper1998applied,\ntitle={Applied regression analysis},\nauthor={Draper, Norman R and Smith, Harry},\npublisher={John Wiley \\& Sons}\n}\n",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/metrics/average_batch_r2",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/average_batch_r2:build_main",
+    "code_version": "build_main",
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb",
+    "maximize": false
+  },
+  {
+    "task_id": "metrics",
+    "component_name": "average_batch_r2",
+    "metric_id": "average_batch_r2_ct",
+    "metric_name": "Average Batch R-squared Cell Type",
+    "metric_summary": "The average batch R-squared Cell Type quantifies, on average, how strongly the batch variable B explains the variance in the data (by taking into account cell type effect).",
+    "metric_description": "First, a simple linear model `sklearn.linear_model.LinearRegression` is fitted for each paired sample, marker and cell type to determine the fraction of variance ($R^2$) explained by the batch covariate $B$.\nThe average batch R-squared is then computed as the average of the $R^2$ values across all paired samples, markers and cell types.\nAs a result, $\\overline{R^2_B}_{cell\\ type}$ quantifies how much of the total variability in the data is driven by batch effects. Consequently, lower values are desirable.\n\n$\\overline{R^2_B}_{cell\\ type} = \\frac{1}{N*C*M}\\sum_{\\substack{(x_{\\mathrm{int}},\\,x_{\\mathrm{val}})\\\\ \\text{paired samples}}}^{N} \\sum_{j=1}^{C} \\sum_{i=1}^{M}\\,R^2\\!\\bigl(\\mathrm{marker}_i \\mid B\\bigr)$\n\nWhere:\n- $N$ is the number of paired samples, where $x_{\\mathrm{int}}$ is the replicate that has been batch-corrected and $x_{\\mathrm{val}}$ is the replicate used for validation. Paired samples belong to different batches.\n- $C$ is the number of cell types\n- $M$ is the number of markers\n- $B$ is the batch covariate\n\nThe $\\overline{R^2_B}_{global}$ is a variation of the latter metric, where the average is computed across paired samples and markers only, without taking into account the cell types.\n\nA higher value of $\\overline{R^2_B}_{global}$ or $\\overline{R^2_B}_{cell\\ type}$ indicates that the batch variable explains more of the variance in the data, which indicates a higher level of batch effects.\n\nA good performance on $\\overline{R^2_B}_{global}$ but not on $\\overline{R^2_B}_{cell\\ type}$ might indicate that the batch effect correction is discarding cell type specific batch effects.\n",
+    "references_doi": null,
+    "references_bibtex": "@book{draper1998applied,\ntitle={Applied regression analysis},\nauthor={Draper, Norman R and Smith, Harry},\npublisher={John Wiley \\& Sons}\n}\n",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/metrics/average_batch_r2",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/average_batch_r2:build_main",
+    "code_version": "build_main",
+    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb",
     "maximize": false
   }
 ]
diff --git a/results/cyto_batch_integration/data/quality_control.json b/results/cyto_batch_integration/data/quality_control.json
index a26b438b..3a6fb7ec 100644
--- a/results/cyto_batch_integration/data/quality_control.json
+++ b/results/cyto_batch_integration/data/quality_control.json
@@ -247,7 +247,7 @@
         "severity": 0, 
         "severity_value": -10.0, 
         "code": "len(results) == len(method_info) * len(metric_info) * len(dataset_info)", 
-        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: task_cyto_batch_integration\n  Number of results: 18\n  Number of methods: 9\n  Number of metrics: 3\n  Number of datasets: 1\n"
+        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: task_cyto_batch_integration\n  Number of results: 18\n  Number of methods: 9\n  Number of metrics: 6\n  Number of datasets: 1\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -273,111 +273,141 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Metric 'n_inconsistent_peaks' %missing", 
-        "value": 1.0, 
-        "severity": 3, 
-        "severity_value": 10.0, 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: n_inconsistent_peaks\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Metric 'n_inconsistent_peaks_ct' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: n_inconsistent_peaks_ct\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Metric 'average_batch_r2_global' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: average_batch_r2_global\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Metric 'average_batch_r2_ct' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: n_inconsistent_peaks\n  Percentage missing: 100%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: average_batch_r2_ct\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'shuffle_integration' %missing", 
-        "value": 0.33333333333333337, 
-        "severity": 3, 
-        "severity_value": 3.3333333333333335, 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration\n  Percentage missing: 33%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'shuffle_integration_by_batch' %missing", 
-        "value": 0.33333333333333337, 
-        "severity": 3, 
-        "severity_value": 3.3333333333333335, 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration_by_batch\n  Percentage missing: 33%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration_by_batch\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'shuffle_integration_by_cell_type' %missing", 
-        "value": 0.33333333333333337, 
-        "severity": 3, 
-        "severity_value": 3.3333333333333335, 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration_by_cell_type\n  Percentage missing: 33%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: shuffle_integration_by_cell_type\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'harmonypy' %missing", 
-        "value": 0.33333333333333337, 
-        "severity": 3, 
-        "severity_value": 3.3333333333333335, 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: harmonypy\n  Percentage missing: 33%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: harmonypy\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'limma_remove_batch_effect' %missing", 
-        "value": 0.33333333333333337, 
-        "severity": 3, 
-        "severity_value": 3.3333333333333335, 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: limma_remove_batch_effect\n  Percentage missing: 33%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: limma_remove_batch_effect\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'no_integration' %missing", 
-        "value": 0.33333333333333337, 
-        "severity": 3, 
-        "severity_value": 3.3333333333333335, 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: no_integration\n  Percentage missing: 33%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: no_integration\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'perfect_integration' %missing", 
-        "value": 0.33333333333333337, 
-        "severity": 3, 
-        "severity_value": 3.3333333333333335, 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: perfect_integration\n  Percentage missing: 33%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: perfect_integration\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'combat' %missing", 
-        "value": 0.33333333333333337, 
-        "severity": 3, 
-        "severity_value": 3.3333333333333335, 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: combat\n  Percentage missing: 33%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: combat\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Method 'cycombine_nocontrols' %missing", 
-        "value": 0.33333333333333337, 
-        "severity": 3, 
-        "severity_value": 3.3333333333333335, 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: cycombine_nocontrols\n  Percentage missing: 33%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: cycombine_nocontrols\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
         "name": "Dataset 'XXXXX' %missing", 
-        "value": 0.33333333333333337, 
-        "severity": 3, 
-        "severity_value": 3.3333333333333335, 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  dataset id: XXXXX\n  Percentage missing: 33%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  dataset id: XXXXX\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -393,11 +423,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration emd_mean", 
-        "value": 0.0533, 
+        "value": 0.0627, 
         "severity": 0, 
-        "severity_value": 0.02665, 
+        "severity_value": 0.03135, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean\n  Best score: 0.0533%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean\n  Best score: 0.0627%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -433,11 +463,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type emd_mean", 
-        "value": 0.5884, 
+        "value": 0.5937, 
         "severity": 0, 
-        "severity_value": 0.2942, 
+        "severity_value": 0.29685, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Best score: 0.5884%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Best score: 0.5937%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -453,11 +483,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy emd_mean", 
-        "value": 0.5684, 
+        "value": 0.5738, 
         "severity": 0, 
-        "severity_value": 0.2842, 
+        "severity_value": 0.2869, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean\n  Best score: 0.5684%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean\n  Best score: 0.5738%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -473,11 +503,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect emd_mean", 
-        "value": 0.5446, 
+        "value": 0.5504, 
         "severity": 0, 
-        "severity_value": 0.2723, 
+        "severity_value": 0.2752, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean\n  Best score: 0.5446%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean\n  Best score: 0.5504%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -493,11 +523,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration emd_mean", 
-        "value": 0.4919, 
+        "value": 0.4983, 
         "severity": 0, 
-        "severity_value": 0.24595, 
+        "severity_value": 0.24915, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean\n  Best score: 0.4919%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean\n  Best score: 0.4983%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -533,11 +563,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat emd_mean", 
-        "value": 0.5488, 
+        "value": 0.5545, 
         "severity": 0, 
-        "severity_value": 0.2744, 
+        "severity_value": 0.27725, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean\n  Best score: 0.5488%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean\n  Best score: 0.5545%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -553,11 +583,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols emd_mean", 
-        "value": 0.5599, 
+        "value": 0.5655, 
         "severity": 0, 
-        "severity_value": 0.27995, 
+        "severity_value": 0.28275, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean\n  Best score: 0.5599%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean\n  Best score: 0.5655%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -573,11 +603,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration emd_max", 
-        "value": 0.2133, 
+        "value": 0.199, 
         "severity": 0, 
-        "severity_value": 0.10665, 
+        "severity_value": 0.0995, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max\n  Best score: 0.2133%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max\n  Best score: 0.199%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -593,11 +623,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_batch emd_max", 
-        "value": 0.1651, 
+        "value": 0.1417, 
         "severity": 0, 
-        "severity_value": 0.08255, 
+        "severity_value": 0.07085, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max\n  Best score: 0.1651%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max\n  Best score: 0.1417%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -613,11 +643,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type emd_max", 
-        "value": 0.1417, 
+        "value": 0.2848, 
         "severity": 0, 
-        "severity_value": 0.07085, 
+        "severity_value": 0.1424, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Best score: 0.1417%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Best score: 0.2848%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -738,5 +768,725 @@
         "severity_value": 0.0, 
         "code": "best_score <= 2", 
         "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration n_inconsistent_peaks", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration n_inconsistent_peaks", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_batch n_inconsistent_peaks", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_batch n_inconsistent_peaks", 
+        "value": 0.1818, 
+        "severity": 0, 
+        "severity_value": 0.0909, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks\n  Best score: 0.1818%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_cell_type n_inconsistent_peaks", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_cell_type n_inconsistent_peaks", 
+        "value": 0.1364, 
+        "severity": 0, 
+        "severity_value": 0.0682, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Best score: 0.1364%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score harmonypy n_inconsistent_peaks", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score harmonypy n_inconsistent_peaks", 
+        "value": 0.4545, 
+        "severity": 0, 
+        "severity_value": 0.22725, 
+        "code": "best_score <= 2", 
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Best score: 0.4545%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect n_inconsistent_peaks", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score limma_remove_batch_effect n_inconsistent_peaks", 
+        "value": 0.3182, 
+        "severity": 0, 
+        "severity_value": 0.1591, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Best score: 0.3182%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration n_inconsistent_peaks", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration n_inconsistent_peaks", 
+        "value": 0.3182, 
+        "severity": 0, 
+        "severity_value": 0.1591, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Best score: 0.3182%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration n_inconsistent_peaks", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration n_inconsistent_peaks", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: n_inconsistent_peaks\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat n_inconsistent_peaks", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat n_inconsistent_peaks", 
+        "value": 0.2727, 
+        "severity": 0, 
+        "severity_value": 0.13635, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Best score: 0.2727%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cycombine_nocontrols n_inconsistent_peaks", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cycombine_nocontrols n_inconsistent_peaks", 
+        "value": 0.3182, 
+        "severity": 0, 
+        "severity_value": 0.1591, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Best score: 0.3182%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration n_inconsistent_peaks_ct", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration n_inconsistent_peaks_ct", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_batch n_inconsistent_peaks_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_batch n_inconsistent_peaks_ct", 
+        "value": 0.3548, 
+        "severity": 0, 
+        "severity_value": 0.1774, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.3548%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_cell_type n_inconsistent_peaks_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_cell_type n_inconsistent_peaks_ct", 
+        "value": 0.5161, 
+        "severity": 0, 
+        "severity_value": 0.25805, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5161%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score harmonypy n_inconsistent_peaks_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score harmonypy n_inconsistent_peaks_ct", 
+        "value": 0.6129, 
+        "severity": 0, 
+        "severity_value": 0.30645, 
+        "code": "best_score <= 2", 
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.6129%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect n_inconsistent_peaks_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score limma_remove_batch_effect n_inconsistent_peaks_ct", 
+        "value": 0.5968, 
+        "severity": 0, 
+        "severity_value": 0.2984, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5968%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration n_inconsistent_peaks_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration n_inconsistent_peaks_ct", 
+        "value": 0.5968, 
+        "severity": 0, 
+        "severity_value": 0.2984, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5968%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration n_inconsistent_peaks_ct", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration n_inconsistent_peaks_ct", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat n_inconsistent_peaks_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat n_inconsistent_peaks_ct", 
+        "value": 0.5806, 
+        "severity": 0, 
+        "severity_value": 0.2903, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5806%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cycombine_nocontrols n_inconsistent_peaks_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cycombine_nocontrols n_inconsistent_peaks_ct", 
+        "value": 0.5806, 
+        "severity": 0, 
+        "severity_value": 0.2903, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5806%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration average_batch_r2_global", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration average_batch_r2_global", 
+        "value": 0.5761, 
+        "severity": 0, 
+        "severity_value": 0.28805, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.5761%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_batch average_batch_r2_global", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: average_batch_r2_global\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_batch average_batch_r2_global", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: average_batch_r2_global\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_cell_type average_batch_r2_global", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_cell_type average_batch_r2_global", 
+        "value": 0.6962, 
+        "severity": 0, 
+        "severity_value": 0.3481, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Best score: 0.6962%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score harmonypy average_batch_r2_global", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score harmonypy average_batch_r2_global", 
+        "value": 0.6218, 
+        "severity": 0, 
+        "severity_value": 0.3109, 
+        "code": "best_score <= 2", 
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Best score: 0.6218%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect average_batch_r2_global", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score limma_remove_batch_effect average_batch_r2_global", 
+        "value": 0.6575, 
+        "severity": 0, 
+        "severity_value": 0.32875, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Best score: 0.6575%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration average_batch_r2_global", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration average_batch_r2_global", 
+        "value": 0.1712, 
+        "severity": 0, 
+        "severity_value": 0.0856, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.1712%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration average_batch_r2_global", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration average_batch_r2_global", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: average_batch_r2_global\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat average_batch_r2_global", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat average_batch_r2_global", 
+        "value": 0.6432, 
+        "severity": 0, 
+        "severity_value": 0.3216, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Best score: 0.6432%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cycombine_nocontrols average_batch_r2_global", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cycombine_nocontrols average_batch_r2_global", 
+        "value": 0.4241, 
+        "severity": 0, 
+        "severity_value": 0.21205, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Best score: 0.4241%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration average_batch_r2_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration average_batch_r2_ct", 
+        "value": 0.1429, 
+        "severity": 0, 
+        "severity_value": 0.07145, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.1429%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_batch average_batch_r2_ct", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: average_batch_r2_ct\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_batch average_batch_r2_ct", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: average_batch_r2_ct\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_cell_type average_batch_r2_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_cell_type average_batch_r2_ct", 
+        "value": 0.7318, 
+        "severity": 0, 
+        "severity_value": 0.3659, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Best score: 0.7318%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score harmonypy average_batch_r2_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score harmonypy average_batch_r2_ct", 
+        "value": 0.57, 
+        "severity": 0, 
+        "severity_value": 0.285, 
+        "code": "best_score <= 2", 
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Best score: 0.57%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect average_batch_r2_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score limma_remove_batch_effect average_batch_r2_ct", 
+        "value": 0.4977, 
+        "severity": 0, 
+        "severity_value": 0.24885, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Best score: 0.4977%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration average_batch_r2_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration average_batch_r2_ct", 
+        "value": 0.4497, 
+        "severity": 0, 
+        "severity_value": 0.22485, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.4497%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration average_batch_r2_ct", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration average_batch_r2_ct", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: average_batch_r2_ct\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat average_batch_r2_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat average_batch_r2_ct", 
+        "value": 0.5202, 
+        "severity": 0, 
+        "severity_value": 0.2601, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Best score: 0.5202%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cycombine_nocontrols average_batch_r2_ct", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cycombine_nocontrols average_batch_r2_ct", 
+        "value": 0.5823, 
+        "severity": 0, 
+        "severity_value": 0.29115, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Best score: 0.5823%\n"
     }
 ]
\ No newline at end of file
diff --git a/results/cyto_batch_integration/data/results.json b/results/cyto_batch_integration/data/results.json
index ae258b74..169f5c16 100644
--- a/results/cyto_batch_integration/data/results.json
+++ b/results/cyto_batch_integration/data/results.json
@@ -3,82 +3,130 @@
     "dataset_id": "XXXXX",
     "method_id": "combat",
     "metric_values": {
+      "average_batch_r2_ct": 0.1219,
+      "average_batch_r2_global": 0.0268,
       "emd_max": 32.3545,
-      "emd_mean": 3.1871
+      "emd_mean": 3.1871,
+      "n_inconsistent_peaks": 16,
+      "n_inconsistent_peaks_ct": 26
     },
     "scaled_scores": {
+      "average_batch_r2_ct": 0.5202,
+      "average_batch_r2_global": 0.6432,
       "emd_max": 0.0744,
-      "emd_mean": 0.5488
+      "emd_mean": 0.5545,
+      "n_inconsistent_peaks": 0.2727,
+      "n_inconsistent_peaks_ct": 0.5806
     },
-    "mean_score": 0.3116,
+    "mean_score": 0.4409,
     "resources": {}
   },
   {
     "dataset_id": "XXXXX",
     "method_id": "cycombine_nocontrols",
     "metric_values": {
+      "average_batch_r2_ct": 0.1061,
+      "average_batch_r2_global": 0.0433,
       "emd_max": 34.9545,
-      "emd_mean": 3.1082
+      "emd_mean": 3.1082,
+      "n_inconsistent_peaks": 15,
+      "n_inconsistent_peaks_ct": 26
     },
     "scaled_scores": {
+      "average_batch_r2_ct": 0.5823,
+      "average_batch_r2_global": 0.4241,
       "emd_max": 0,
-      "emd_mean": 0.5599
+      "emd_mean": 0.5655,
+      "n_inconsistent_peaks": 0.3182,
+      "n_inconsistent_peaks_ct": 0.5806
     },
-    "mean_score": 0.28,
+    "mean_score": 0.4118,
     "resources": {}
   },
   {
     "dataset_id": "XXXXX",
     "method_id": "harmonypy",
     "metric_values": {
+      "average_batch_r2_ct": 0.1093,
+      "average_batch_r2_global": 0.0284,
       "emd_max": 26.8545,
-      "emd_mean": 3.0487
+      "emd_mean": 3.0487,
+      "n_inconsistent_peaks": 12,
+      "n_inconsistent_peaks_ct": 24
     },
     "scaled_scores": {
+      "average_batch_r2_ct": 0.57,
+      "average_batch_r2_global": 0.6218,
       "emd_max": 0.2317,
-      "emd_mean": 0.5684
+      "emd_mean": 0.5738,
+      "n_inconsistent_peaks": 0.4545,
+      "n_inconsistent_peaks_ct": 0.6129
     },
-    "mean_score": 0.4,
+    "mean_score": 0.5108,
     "resources": {}
   },
   {
     "dataset_id": "XXXXX",
     "method_id": "limma_remove_batch_effect",
     "metric_values": {
+      "average_batch_r2_ct": 0.1276,
+      "average_batch_r2_global": 0.0258,
       "emd_max": 33.8545,
-      "emd_mean": 3.2163
+      "emd_mean": 3.2163,
+      "n_inconsistent_peaks": 15,
+      "n_inconsistent_peaks_ct": 25
     },
     "scaled_scores": {
+      "average_batch_r2_ct": 0.4977,
+      "average_batch_r2_global": 0.6575,
       "emd_max": 0.0315,
-      "emd_mean": 0.5446
+      "emd_mean": 0.5504,
+      "n_inconsistent_peaks": 0.3182,
+      "n_inconsistent_peaks_ct": 0.5968
     },
-    "mean_score": 0.288,
+    "mean_score": 0.442,
     "resources": {}
   },
   {
     "dataset_id": "XXXXX",
     "method_id": "no_integration",
     "metric_values": {
+      "average_batch_r2_ct": 0.1398,
+      "average_batch_r2_global": 0.0623,
       "emd_max": 34.9545,
-      "emd_mean": 3.5885
+      "emd_mean": 3.5885,
+      "n_inconsistent_peaks": 15,
+      "n_inconsistent_peaks_ct": 25
     },
     "scaled_scores": {
+      "average_batch_r2_ct": 0.4497,
+      "average_batch_r2_global": 0.1712,
       "emd_max": 0,
-      "emd_mean": 0.4919
+      "emd_mean": 0.4983,
+      "n_inconsistent_peaks": 0.3182,
+      "n_inconsistent_peaks_ct": 0.5968
     },
-    "mean_score": 0.246,
+    "mean_score": 0.339,
     "resources": {}
   },
   {
     "dataset_id": "XXXXX",
     "method_id": "perfect_integration",
     "metric_values": {
+      "average_batch_r2_ct": 2.3291e-19,
+      "average_batch_r2_global": 0,
       "emd_max": 0,
-      "emd_mean": 0
+      "emd_mean": 0,
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 0
     },
     "scaled_scores": {
+      "average_batch_r2_ct": 1,
+      "average_batch_r2_global": 1,
       "emd_max": 1,
-      "emd_mean": 1
+      "emd_mean": 1,
+      "n_inconsistent_peaks": 1,
+      "n_inconsistent_peaks_ct": 1
     },
     "mean_score": 1,
     "resources": {}
@@ -87,61 +135,93 @@
     "dataset_id": "XXXXX",
     "method_id": "shuffle_integration",
     "metric_values": {
-      "emd_max": 27.5,
-      "emd_mean": 6.6865
+      "average_batch_r2_ct": 0.2178,
+      "average_batch_r2_global": 0.0319,
+      "emd_max": 28,
+      "emd_mean": 6.7046,
+      "n_inconsistent_peaks": 22,
+      "n_inconsistent_peaks_ct": 62
     },
     "scaled_scores": {
-      "emd_max": 0.2133,
-      "emd_mean": 0.0533
-    },
-    "mean_score": 0.1333,
+      "average_batch_r2_ct": 0.1429,
+      "average_batch_r2_global": 0.5761,
+      "emd_max": 0.199,
+      "emd_mean": 0.0627,
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 0
+    },
+    "mean_score": 0.1635,
     "resources": {}
   },
   {
     "dataset_id": "XXXXX",
     "method_id": "shuffle_integration_by_batch",
     "metric_values": {
-      "emd_max": 29.1833,
-      "emd_mean": 7.0631
+      "average_batch_r2_ct": 0.2541,
+      "average_batch_r2_global": 0.0752,
+      "emd_max": 30,
+      "emd_mean": 7.1534,
+      "n_inconsistent_peaks": 18,
+      "n_inconsistent_peaks_ct": 40
     },
     "scaled_scores": {
-      "emd_max": 0.1651,
-      "emd_mean": 0
+      "average_batch_r2_ct": 0,
+      "average_batch_r2_global": 0,
+      "emd_max": 0.1417,
+      "emd_mean": 0,
+      "n_inconsistent_peaks": 0.1818,
+      "n_inconsistent_peaks_ct": 0.3548
     },
-    "mean_score": 0.0826,
+    "mean_score": 0.1131,
     "resources": {}
   },
   {
     "dataset_id": "XXXXX",
     "method_id": "shuffle_integration_by_cell_type",
     "metric_values": {
-      "emd_max": 30,
-      "emd_mean": 2.9069
+      "average_batch_r2_ct": 0.0681,
+      "average_batch_r2_global": 0.0228,
+      "emd_max": 25,
+      "emd_mean": 2.9063,
+      "n_inconsistent_peaks": 19,
+      "n_inconsistent_peaks_ct": 30
     },
     "scaled_scores": {
-      "emd_max": 0.1417,
-      "emd_mean": 0.5884
-    },
-    "mean_score": 0.3651,
+      "average_batch_r2_ct": 0.7318,
+      "average_batch_r2_global": 0.6962,
+      "emd_max": 0.2848,
+      "emd_mean": 0.5937,
+      "n_inconsistent_peaks": 0.1364,
+      "n_inconsistent_peaks_ct": 0.5161
+    },
+    "mean_score": 0.4932,
     "resources": {}
   },
   {
     "dataset_id": null,
     "method_id": "combat",
     "metric_values": {
+      "average_batch_r2_global": "NA",
+      "average_batch_r2_ct": "NA",
+      "n_inconsistent_peaks": "NA",
+      "n_inconsistent_peaks_ct": "NA",
       "emd_mean": "NA",
       "emd_max": "NA"
     },
     "scaled_scores": {
+      "average_batch_r2_global": 0,
+      "average_batch_r2_ct": 0,
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 0,
       "emd_mean": 0,
       "emd_max": 0
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-03-12 10:33:36",
+      "submit": "2025-03-19 13:03:51",
       "exit_code": 0,
-      "duration_sec": 4.5,
-      "cpu_pct": 145.6,
+      "duration_sec": 4.6,
+      "cpu_pct": 144.5,
       "peak_memory_mb": 1844,
       "disk_read_mb": 48,
       "disk_write_mb": 3
@@ -151,20 +231,28 @@
     "dataset_id": null,
     "method_id": "cycombine_nocontrols",
     "metric_values": {
+      "average_batch_r2_global": "NA",
+      "average_batch_r2_ct": "NA",
+      "n_inconsistent_peaks": "NA",
+      "n_inconsistent_peaks_ct": "NA",
       "emd_mean": "NA",
       "emd_max": "NA"
     },
     "scaled_scores": {
+      "average_batch_r2_global": 0,
+      "average_batch_r2_ct": 0,
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 0,
       "emd_mean": 0,
       "emd_max": 0
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-03-12 10:33:36",
+      "submit": "2025-03-19 13:03:51",
       "exit_code": 0,
-      "duration_sec": 14,
-      "cpu_pct": 130.5,
-      "peak_memory_mb": 2253,
+      "duration_sec": 15,
+      "cpu_pct": 156.4,
+      "peak_memory_mb": 3584,
       "disk_read_mb": 55,
       "disk_write_mb": 3
     }
@@ -173,20 +261,28 @@
     "dataset_id": null,
     "method_id": "harmonypy",
     "metric_values": {
+      "average_batch_r2_global": "NA",
+      "average_batch_r2_ct": "NA",
+      "n_inconsistent_peaks": "NA",
+      "n_inconsistent_peaks_ct": "NA",
       "emd_mean": "NA",
       "emd_max": "NA"
     },
     "scaled_scores": {
+      "average_batch_r2_global": 0,
+      "average_batch_r2_ct": 0,
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 0,
       "emd_mean": 0,
       "emd_max": 0
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-03-12 10:33:36",
+      "submit": "2025-03-19 13:03:51",
       "exit_code": 0,
-      "duration_sec": 10.7,
-      "cpu_pct": 1386.3,
-      "peak_memory_mb": 4608,
+      "duration_sec": 9.1,
+      "cpu_pct": 737.3,
+      "peak_memory_mb": 2560,
       "disk_read_mb": 34,
       "disk_write_mb": 2
     }
@@ -195,19 +291,27 @@
     "dataset_id": null,
     "method_id": "limma_remove_batch_effect",
     "metric_values": {
+      "average_batch_r2_global": "NA",
+      "average_batch_r2_ct": "NA",
+      "n_inconsistent_peaks": "NA",
+      "n_inconsistent_peaks_ct": "NA",
       "emd_mean": "NA",
       "emd_max": "NA"
     },
     "scaled_scores": {
+      "average_batch_r2_global": 0,
+      "average_batch_r2_ct": 0,
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 0,
       "emd_mean": 0,
       "emd_max": 0
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-03-12 10:33:36",
+      "submit": "2025-03-19 13:03:51",
       "exit_code": 0,
-      "duration_sec": 4.5,
-      "cpu_pct": 196.9,
+      "duration_sec": 4.3,
+      "cpu_pct": 206.5,
       "peak_memory_mb": 1844,
       "disk_read_mb": 30,
       "disk_write_mb": 2
@@ -217,20 +321,28 @@
     "dataset_id": null,
     "method_id": "no_integration",
     "metric_values": {
+      "average_batch_r2_global": "NA",
+      "average_batch_r2_ct": "NA",
+      "n_inconsistent_peaks": "NA",
+      "n_inconsistent_peaks_ct": "NA",
       "emd_mean": "NA",
       "emd_max": "NA"
     },
     "scaled_scores": {
+      "average_batch_r2_global": 0,
+      "average_batch_r2_ct": 0,
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 0,
       "emd_mean": 0,
       "emd_max": 0
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-03-12 10:33:37",
+      "submit": "2025-03-19 13:03:51",
       "exit_code": 0,
-      "duration_sec": 2,
-      "cpu_pct": 427,
-      "peak_memory_mb": 1434,
+      "duration_sec": 1.7,
+      "cpu_pct": 268.7,
+      "peak_memory_mb": 768,
       "disk_read_mb": 20,
       "disk_write_mb": 2
     }
@@ -239,20 +351,28 @@
     "dataset_id": null,
     "method_id": "perfect_integration",
     "metric_values": {
+      "average_batch_r2_global": "NA",
+      "average_batch_r2_ct": "NA",
+      "n_inconsistent_peaks": "NA",
+      "n_inconsistent_peaks_ct": "NA",
       "emd_mean": "NA",
       "emd_max": "NA"
     },
     "scaled_scores": {
+      "average_batch_r2_global": 0,
+      "average_batch_r2_ct": 0,
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 0,
       "emd_mean": 0,
       "emd_max": 0
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-03-12 10:33:37",
+      "submit": "2025-03-19 13:03:51",
       "exit_code": 0,
       "duration_sec": 1.7,
-      "cpu_pct": 271,
-      "peak_memory_mb": 768,
+      "cpu_pct": 271.4,
+      "peak_memory_mb": 767,
       "disk_read_mb": 19,
       "disk_write_mb": 1
     }
@@ -261,20 +381,28 @@
     "dataset_id": null,
     "method_id": "shuffle_integration",
     "metric_values": {
+      "average_batch_r2_global": "NA",
+      "average_batch_r2_ct": "NA",
+      "n_inconsistent_peaks": "NA",
+      "n_inconsistent_peaks_ct": "NA",
       "emd_mean": "NA",
       "emd_max": "NA"
     },
     "scaled_scores": {
+      "average_batch_r2_global": 0,
+      "average_batch_r2_ct": 0,
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 0,
       "emd_mean": 0,
       "emd_max": 0
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-03-12 10:33:36",
+      "submit": "2025-03-19 13:03:51",
       "exit_code": 0,
-      "duration_sec": 1.8,
-      "cpu_pct": 226.6,
-      "peak_memory_mb": 765,
+      "duration_sec": 1.9,
+      "cpu_pct": 204.8,
+      "peak_memory_mb": 764,
       "disk_read_mb": 20,
       "disk_write_mb": 2
     }
@@ -283,20 +411,28 @@
     "dataset_id": null,
     "method_id": "shuffle_integration_by_batch",
     "metric_values": {
+      "average_batch_r2_global": "NA",
+      "average_batch_r2_ct": "NA",
+      "n_inconsistent_peaks": "NA",
+      "n_inconsistent_peaks_ct": "NA",
       "emd_mean": "NA",
       "emd_max": "NA"
     },
     "scaled_scores": {
+      "average_batch_r2_global": 0,
+      "average_batch_r2_ct": 0,
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 0,
       "emd_mean": 0,
       "emd_max": 0
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-03-12 10:33:36",
+      "submit": "2025-03-19 13:03:51",
       "exit_code": 0,
       "duration_sec": 1.9,
-      "cpu_pct": 232.2,
-      "peak_memory_mb": 762,
+      "cpu_pct": 185,
+      "peak_memory_mb": 763,
       "disk_read_mb": 20,
       "disk_write_mb": 2
     }
@@ -305,20 +441,28 @@
     "dataset_id": null,
     "method_id": "shuffle_integration_by_cell_type",
     "metric_values": {
+      "average_batch_r2_global": "NA",
+      "average_batch_r2_ct": "NA",
+      "n_inconsistent_peaks": "NA",
+      "n_inconsistent_peaks_ct": "NA",
       "emd_mean": "NA",
       "emd_max": "NA"
     },
     "scaled_scores": {
+      "average_batch_r2_global": 0,
+      "average_batch_r2_ct": 0,
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 0,
       "emd_mean": 0,
       "emd_max": 0
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-03-12 10:33:36",
+      "submit": "2025-03-19 13:03:51",
       "exit_code": 0,
-      "duration_sec": 1.9,
-      "cpu_pct": 241.5,
-      "peak_memory_mb": 762,
+      "duration_sec": 1.8,
+      "cpu_pct": 261.1,
+      "peak_memory_mb": 765,
       "disk_read_mb": 20,
       "disk_write_mb": 2
     }
diff --git a/results/cyto_batch_integration/data/task_info.json b/results/cyto_batch_integration/data/task_info.json
index 57837d24..44971ae3 100644
--- a/results/cyto_batch_integration/data/task_info.json
+++ b/results/cyto_batch_integration/data/task_info.json
@@ -2,20 +2,46 @@
   "task_id": "task_cyto_batch_integration",
   "commit_sha": null,
   "task_name": "Cyto Batch Integration",
-  "task_summary": "A one sentence summary of purpose and methodology. Used for creating an overview tables.",
-  "task_description": "Provide a clear and concise description of your task, detailing the specific problem it aims\nto solve. Outline the input data types, the expected output, and any assumptions or constraints.\nBe sure to explain any terminology or concepts that are essential for understanding the task.\n\nExplain the motivation behind your proposed task. Describe the biological or computational \nproblem you aim to address and why it's important. Discuss the current state of research in\nthis area and any gaps or challenges that your task could help address. This section \nshould convince readers of the significance and relevance of your task.\n",
+  "task_summary": "Benchmarking of batch integration algorithms for cytometry data.",
+  "task_description": "Cytometry is a non-sequencing single cell profiling technique commonly used in clinical studies. \nIt is very sensitive to batch effects, which can lead to biases in the interpretation of the result. \nBatch integration algorithms are often used to mitigate this effect.\n\nIn this project, we are building a pipeline for reproducible and continuous benchmarking \nof batch integration algorithms for cytometry data.\nAs input, methods require cleaned and normalised (using arc-sinh or logicle transformation)\ndata with multiple batches, cell type labels, and biological subjects, with paired samples\nfrom a subject profiled across multiple batches.\nThe batch integrated output must be an integrated marker by cell matrix stored in \nAnndata format.\nAll markers in the input data must be returned, regardless of whether they were integrated or not.\nThis output is then evaluated using metrics that assess how well the batch effects\nwere removed and how much biological signals were preserved. \n",
   "repo": "https://github.com/openproblems-bio/task_cyto_batch_integration",
   "issue_tracker": "https://github.com/openproblems-bio/task_cyto_batch_integration/issues",
   "authors": [
     {
-      "name": "John Doe",
+      "name": "Luca Leomazzi",
       "roles": ["author", "maintainer"],
       "info": {
-        "github": "johndoe",
-        "orcid": "0000-0000-0000-0000",
-        "email": "john@doe.me",
-        "twitter": "johndoe",
-        "linkedin": "johndoe"
+        "github": "LuLeom"
+      }
+    },
+    {
+      "name": "Givanna Putri",
+      "roles": ["author", "maintainer"],
+      "info": {
+        "github": "ghar1821",
+        "orcid": "0000-0002-7399-8014"
+      }
+    },
+    {
+      "name": "Robrecht Cannoodt",
+      "roles": "author",
+      "info": {
+        "github": "rcannood",
+        "orcid": "0000-0003-3641-729X"
+      }
+    },
+    {
+      "name": "Katrien Quintelier",
+      "roles": "contributor",
+      "info": {
+        "github": "KatrienQ"
+      }
+    },
+    {
+      "name": "Sofie Van Gassen",
+      "roles": "contributor",
+      "info": {
+        "github": "SofieVG"
       }
     }
   ],

From 4c47126fc1d14ceb0ff9efa1fd37aa928ba4465e Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Wed, 19 Mar 2025 20:19:45 +0100
Subject: [PATCH 05/14] fix action

---
 .github/workflows/quarto_netlify.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/quarto_netlify.yml b/.github/workflows/quarto_netlify.yml
index db905e11..1d95acc1 100644
--- a/.github/workflows/quarto_netlify.yml
+++ b/.github/workflows/quarto_netlify.yml
@@ -110,7 +110,7 @@ jobs:
         
       - name: Upload artifact
         if: failure()
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         id: upload-artifact
         with:
           name: _site

From 8f94cfa180796dfe707c3d665e26b637b3e3859b Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Wed, 19 Mar 2025 21:06:37 +0100
Subject: [PATCH 06/14] fix dataset info

---
 results/cyto_batch_integration/data/dataset_info.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/results/cyto_batch_integration/data/dataset_info.json b/results/cyto_batch_integration/data/dataset_info.json
index 57d1b2a3..e21c50c9 100644
--- a/results/cyto_batch_integration/data/dataset_info.json
+++ b/results/cyto_batch_integration/data/dataset_info.json
@@ -4,8 +4,8 @@
     "dataset_name": "Leomazzi_data_subset",
     "dataset_summary": "Flow cytometry data of spleens of 9 mice, subsampled to 1000 cells per sample.",
     "dataset_description": "Flow cytometry data of spleens from 4 WT (IKK2 fl/fl CD11c-cre +/+) and 5 KO (IKK2 fl/fl CD11c-cre Tg/+) B6 mice,  measured with a 22-color panel on 2 different instrument settings. Subsampled to 1000 cells per sample. Data has been preprocessed (compensated witha batch-specific compensation matrix, logicle transformed, cleaned with PeacoQC and pregated on live single CD45+ cells).",
-    "data_reference": "",
-    "data_url": "",
+    "data_reference": null,
+    "data_url": null,
     "date_created": "19-03-2025",
     "file_size": 1444801
   }

From e4964703bd8e2b8b8140dd3bf8410016e272b7b5 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Wed, 19 Mar 2025 23:12:49 +0100
Subject: [PATCH 07/14] update results

---
 .../data/dataset_info.json                    |   6 +-
 .../data/method_info.json                     |  36 +-
 .../data/metric_execution_info.json           | 244 ++++-----
 .../data/metric_info.json                     |  24 +-
 .../data/quality_control.json                 | 510 +++++++++---------
 .../cyto_batch_integration/data/results.json  | 468 +++++-----------
 6 files changed, 545 insertions(+), 743 deletions(-)

diff --git a/results/cyto_batch_integration/data/dataset_info.json b/results/cyto_batch_integration/data/dataset_info.json
index e21c50c9..965ca937 100644
--- a/results/cyto_batch_integration/data/dataset_info.json
+++ b/results/cyto_batch_integration/data/dataset_info.json
@@ -1,12 +1,12 @@
 [
   {
-    "dataset_id": "XXXXX",
-    "dataset_name": "Leomazzi_data_subset",
+    "dataset_id": "cyto_spleen_subset",
+    "dataset_name": "Cytometry Spleen Subset",
     "dataset_summary": "Flow cytometry data of spleens of 9 mice, subsampled to 1000 cells per sample.",
     "dataset_description": "Flow cytometry data of spleens from 4 WT (IKK2 fl/fl CD11c-cre +/+) and 5 KO (IKK2 fl/fl CD11c-cre Tg/+) B6 mice,  measured with a 22-color panel on 2 different instrument settings. Subsampled to 1000 cells per sample. Data has been preprocessed (compensated witha batch-specific compensation matrix, logicle transformed, cleaned with PeacoQC and pregated on live single CD45+ cells).",
     "data_reference": null,
     "data_url": null,
     "date_created": "19-03-2025",
-    "file_size": 1444801
+    "file_size": 1447383
   }
 ]
diff --git a/results/cyto_batch_integration/data/method_info.json b/results/cyto_batch_integration/data/method_info.json
index 0b122b50..eb1c93d5 100644
--- a/results/cyto_batch_integration/data/method_info.json
+++ b/results/cyto_batch_integration/data/method_info.json
@@ -11,9 +11,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/control_methods/shuffle_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/control_methods/shuffle_integration",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
   },
   {
     "task_id": "control_methods",
@@ -27,9 +27,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_batch:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/control_methods/shuffle_integration_by_batch",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/control_methods/shuffle_integration_by_batch",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
   },
   {
     "task_id": "control_methods",
@@ -43,9 +43,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_cell_type:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/control_methods/shuffle_integration_by_cell_type",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/control_methods/shuffle_integration_by_cell_type",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
   },
   {
     "task_id": "methods",
@@ -59,9 +59,9 @@
     "code_url": "https://github.com/slowkow/harmonypy",
     "documentation_url": "https://portals.broadinstitute.org/harmony",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/harmonypy:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/methods/harmonypy",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/methods/harmonypy",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
   },
   {
     "task_id": "methods",
@@ -75,9 +75,9 @@
     "code_url": "https://github.com/bioc/limma",
     "documentation_url": "https://bioinf.wehi.edu.au/limma",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/limma_remove_batch_effect:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/methods/limma_remove_batch_effect",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/methods/limma_remove_batch_effect",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
   },
   {
     "task_id": "control_methods",
@@ -91,9 +91,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/no_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/control_methods/no_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/control_methods/no_integration",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
   },
   {
     "task_id": "control_methods",
@@ -107,9 +107,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/perfect_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/control_methods/perfect_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/control_methods/perfect_integration",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
   },
   {
     "task_id": "methods",
@@ -123,9 +123,9 @@
     "code_url": "https://github.com/brentp/combat.py",
     "documentation_url": "https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/combat:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/methods/combat",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/methods/combat",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
   },
   {
     "task_id": "methods",
@@ -139,8 +139,8 @@
     "code_url": "https://github.com/biosurf/cyCombine",
     "documentation_url": "https://biosurf.org/cyCombine.html",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/cycombine_nocontrols:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/methods/cycombine_nocontrols",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/methods/cycombine_nocontrols",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb"
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
   }
 ]
diff --git a/results/cyto_batch_integration/data/metric_execution_info.json b/results/cyto_batch_integration/data/metric_execution_info.json
index a77319e0..acf703cd 100644
--- a/results/cyto_batch_integration/data/metric_execution_info.json
+++ b/results/cyto_batch_integration/data/metric_execution_info.json
@@ -1,378 +1,378 @@
 [
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "combat",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 13:06:00",
+      "submit": "2025-03-19 21:44:34",
       "exit_code": 0,
-      "duration_sec": 24.8,
-      "cpu_pct": 133.2,
-      "peak_memory_mb": 4301,
+      "duration_sec": 20.8,
+      "cpu_pct": 121.9,
+      "peak_memory_mb": 1639,
       "disk_read_mb": 70,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "combat",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 13:06:01",
+      "submit": "2025-03-19 21:44:34",
       "exit_code": 0,
-      "duration_sec": 51,
-      "cpu_pct": 106.1,
+      "duration_sec": 51.6,
+      "cpu_pct": 104.8,
       "peak_memory_mb": 1844,
       "disk_read_mb": 124,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "combat",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 13:06:00",
+      "submit": "2025-03-19 21:44:34",
       "exit_code": 0,
-      "duration_sec": 51.6,
-      "cpu_pct": 3873.9,
-      "peak_memory_mb": 4301,
+      "duration_sec": 180,
+      "cpu_pct": 1834.9,
+      "peak_memory_mb": 5632,
       "disk_read_mb": 64,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 13:08:01",
+      "submit": "2025-03-19 21:45:54",
       "exit_code": 0,
-      "duration_sec": 20.4,
-      "cpu_pct": 124.8,
+      "duration_sec": 20,
+      "cpu_pct": 126.4,
       "peak_memory_mb": 1639,
       "disk_read_mb": 70,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 13:08:01",
+      "submit": "2025-03-19 21:45:54",
       "exit_code": 0,
-      "duration_sec": 50.8,
-      "cpu_pct": 106.8,
+      "duration_sec": 50.6,
+      "cpu_pct": 107,
       "peak_memory_mb": 1844,
       "disk_read_mb": 124,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 13:08:01",
+      "submit": "2025-03-19 21:45:54",
       "exit_code": 0,
-      "duration_sec": 14.4,
-      "cpu_pct": 2136.4,
-      "peak_memory_mb": 2970,
+      "duration_sec": 14.2,
+      "cpu_pct": 1097.9,
+      "peak_memory_mb": 1536,
       "disk_read_mb": 64,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "harmonypy",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 13:07:31",
+      "submit": "2025-03-19 21:46:44",
       "exit_code": 0,
-      "duration_sec": 20.6,
-      "cpu_pct": 123.5,
+      "duration_sec": 20.2,
+      "cpu_pct": 124,
       "peak_memory_mb": 1639,
       "disk_read_mb": 68,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "harmonypy",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 13:07:31",
+      "submit": "2025-03-19 21:46:44",
       "exit_code": 0,
-      "duration_sec": 51.4,
-      "cpu_pct": 114.6,
-      "peak_memory_mb": 3175,
+      "duration_sec": 51,
+      "cpu_pct": 105.9,
+      "peak_memory_mb": 1844,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "harmonypy",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 13:07:31",
+      "submit": "2025-03-19 21:46:44",
       "exit_code": 0,
-      "duration_sec": 14.4,
-      "cpu_pct": 1067.6,
-      "peak_memory_mb": 1536,
+      "duration_sec": 91.4,
+      "cpu_pct": 2278.3,
+      "peak_memory_mb": 5632,
       "disk_read_mb": 62,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 13:04:51",
+      "submit": "2025-03-19 21:46:34",
       "exit_code": 0,
-      "duration_sec": 24.2,
-      "cpu_pct": 130.6,
-      "peak_memory_mb": 2970,
+      "duration_sec": 20.8,
+      "cpu_pct": 122.5,
+      "peak_memory_mb": 1639,
       "disk_read_mb": 70,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 13:04:51",
+      "submit": "2025-03-19 21:46:34",
       "exit_code": 0,
-      "duration_sec": 55.6,
-      "cpu_pct": 103.2,
-      "peak_memory_mb": 4506,
+      "duration_sec": 50.8,
+      "cpu_pct": 106,
+      "peak_memory_mb": 1844,
       "disk_read_mb": 124,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 13:04:51",
+      "submit": "2025-03-19 21:46:34",
       "exit_code": 0,
-      "duration_sec": 14.2,
-      "cpu_pct": 1092.6,
+      "duration_sec": 13.8,
+      "cpu_pct": 1106.2,
       "peak_memory_mb": 1536,
       "disk_read_mb": 62,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "no_integration",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 13:06:50",
+      "submit": "2025-03-19 21:43:54",
       "exit_code": 0,
-      "duration_sec": 22.6,
-      "cpu_pct": 143.6,
-      "peak_memory_mb": 2970,
+      "duration_sec": 27,
+      "cpu_pct": 168.8,
+      "peak_memory_mb": 5632,
       "disk_read_mb": 68,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "no_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 13:06:50",
+      "submit": "2025-03-19 21:43:53",
       "exit_code": 0,
-      "duration_sec": 51.4,
-      "cpu_pct": 115.8,
-      "peak_memory_mb": 3175,
+      "duration_sec": 51.8,
+      "cpu_pct": 104.6,
+      "peak_memory_mb": 1844,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "no_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 13:06:50",
+      "submit": "2025-03-19 21:43:54",
       "exit_code": 0,
-      "duration_sec": 14.6,
-      "cpu_pct": 1074.6,
-      "peak_memory_mb": 1536,
+      "duration_sec": 172,
+      "cpu_pct": 1809,
+      "peak_memory_mb": 5632,
       "disk_read_mb": 62,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "perfect_integration",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 13:06:20",
+      "submit": "2025-03-19 21:46:34",
       "exit_code": 0,
-      "duration_sec": 20.8,
-      "cpu_pct": 124.6,
+      "duration_sec": 21.2,
+      "cpu_pct": 123.6,
       "peak_memory_mb": 1639,
       "disk_read_mb": 68,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "perfect_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 13:06:20",
+      "submit": "2025-03-19 21:46:34",
       "exit_code": 0,
-      "duration_sec": 51.2,
-      "cpu_pct": 105.7,
+      "duration_sec": 51.6,
+      "cpu_pct": 105.2,
       "peak_memory_mb": 1844,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "perfect_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 13:06:20",
+      "submit": "2025-03-19 21:46:34",
       "exit_code": 0,
-      "duration_sec": 15,
-      "cpu_pct": 1121,
+      "duration_sec": 14.8,
+      "cpu_pct": 1093.6,
       "peak_memory_mb": 1536,
       "disk_read_mb": 60,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "shuffle_integration",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 13:07:30",
+      "submit": "2025-03-19 21:43:34",
       "exit_code": 0,
-      "duration_sec": 20.4,
-      "cpu_pct": 123.9,
+      "duration_sec": 20.2,
+      "cpu_pct": 125.2,
       "peak_memory_mb": 1639,
       "disk_read_mb": 68,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "shuffle_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 13:07:30",
+      "submit": "2025-03-19 21:43:34",
       "exit_code": 0,
-      "duration_sec": 62.6,
-      "cpu_pct": 113.8,
-      "peak_memory_mb": 4506,
+      "duration_sec": 51.4,
+      "cpu_pct": 105.2,
+      "peak_memory_mb": 1844,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "shuffle_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 13:07:30",
+      "submit": "2025-03-19 21:43:34",
       "exit_code": 0,
-      "duration_sec": 14.8,
-      "cpu_pct": 1097.9,
+      "duration_sec": 14.2,
+      "cpu_pct": 1094.2,
       "peak_memory_mb": 1536,
       "disk_read_mb": 62,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 13:07:31",
+      "submit": "2025-03-19 21:46:44",
       "exit_code": 0,
       "duration_sec": 20.6,
-      "cpu_pct": 122.8,
+      "cpu_pct": 122.9,
       "peak_memory_mb": 1639,
       "disk_read_mb": 68,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 13:07:31",
+      "submit": "2025-03-19 21:46:44",
       "exit_code": 0,
-      "duration_sec": 51.2,
-      "cpu_pct": 105.7,
+      "duration_sec": 50.6,
+      "cpu_pct": 106.7,
       "peak_memory_mb": 1844,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 13:07:31",
+      "submit": "2025-03-19 21:46:44",
       "exit_code": 0,
-      "duration_sec": 54,
-      "cpu_pct": 3713.7,
-      "peak_memory_mb": 4301,
+      "duration_sec": 14.6,
+      "cpu_pct": 1076.6,
+      "peak_memory_mb": 1536,
       "disk_read_mb": 62,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 13:07:31",
+      "submit": "2025-03-19 21:43:34",
       "exit_code": 0,
-      "duration_sec": 21,
-      "cpu_pct": 156.7,
-      "peak_memory_mb": 2970,
+      "duration_sec": 20.6,
+      "cpu_pct": 123,
+      "peak_memory_mb": 1639,
       "disk_read_mb": 68,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 13:07:31",
+      "submit": "2025-03-19 21:43:34",
       "exit_code": 0,
-      "duration_sec": 61.8,
-      "cpu_pct": 108.5,
-      "peak_memory_mb": 4506,
+      "duration_sec": 51.8,
+      "cpu_pct": 105.8,
+      "peak_memory_mb": 1844,
       "disk_read_mb": 122,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 13:07:31",
+      "submit": "2025-03-19 21:43:34",
       "exit_code": 0,
-      "duration_sec": 14.2,
-      "cpu_pct": 1082.9,
-      "peak_memory_mb": 1536,
+      "duration_sec": 172,
+      "cpu_pct": 1822.8,
+      "peak_memory_mb": 5632,
       "disk_read_mb": 62,
       "disk_write_mb": 2
     }
diff --git a/results/cyto_batch_integration/data/metric_info.json b/results/cyto_batch_integration/data/metric_info.json
index 50b4fc9e..050023bd 100644
--- a/results/cyto_batch_integration/data/metric_info.json
+++ b/results/cyto_batch_integration/data/metric_info.json
@@ -8,10 +8,10 @@
     "metric_description": "Earth Mover Distance (EMD) is a metric designed for comparing two distributions.\nIt is also known as the Wasserstein metric.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb",
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d",
     "maximize": false
   },
   {
@@ -23,10 +23,10 @@
     "metric_description": "Earth Mover Distance (EMD) is a metric designed for comparing two distributions.\nIt is also known as the Wasserstein metric.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb",
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d",
     "maximize": false
   },
   {
@@ -38,10 +38,10 @@
     "metric_description": "The metric compares the number of marker expression peaks between the validation and batch-normalized data. \nThe number of peaks is calculated using the `scipy.signal.find_peaks` function. \nThe metric is calculated as the absolute difference between the number of peaks in the validation and batch-normalized data.\nThe marker expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),\nand then peaks are then identified using the `scipy.signal.find_peaks` function.\nFor peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density.\n",
     "references_doi": "10.1038/s41592-019-0686-2",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/metrics/n_inconsistent_peaks",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/metrics/n_inconsistent_peaks",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/n_inconsistent_peaks:build_main",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb",
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d",
     "maximize": false
   },
   {
@@ -53,10 +53,10 @@
     "metric_description": "The metric compares the number of cell type specific marker expression peaks between the validation and batch-normalized data. \nThe number of peaks is calculated using the `scipy.signal.find_peaks` function. \nThe metric is calculated as the absolute difference between the number of peaks in the validation and batch-normalized data.\nThe (cell type) marker expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),\nand then peaks are then identified using the `scipy.signal.find_peaks` function.\nFor peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density.\n",
     "references_doi": "10.1038/s41592-019-0686-2",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/metrics/n_inconsistent_peaks",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/metrics/n_inconsistent_peaks",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/n_inconsistent_peaks:build_main",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb",
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d",
     "maximize": false
   },
   {
@@ -68,10 +68,10 @@
     "metric_description": "First, a simple linear model `sklearn.linear_model.LinearRegression` is fitted for each paired sample and marker to determine the fraction of variance (R^2) explained by the batch covariate B. |\nThe average batch R_squared is then computed as the average of the $R^2$ values across all paired samples, markers. |\nAs a result, $\\overline{R^2_B}_{global}$ quantifies how much of the total variability in the data is driven by batch effects. Consequently, lower values are desirable. |\n\n$\\overline{R^2_B}_{global} = \\frac{1}{N*M}\\sum_{\\substack{(x_{\\mathrm{int}},\\,x_{\\mathrm{val}})\\\\ \\text{paired samples}}}^{N} \\sum_{i=1}^{M} \\,R^2\\!\\bigl(\\mathrm{marker}_i \\mid B\\bigr)$\n\nWhere:\n- $N$ is the number of paired samples, where x_{\\mathrm{int}} is the replicate that has been batch-corrected and x_{\\mathrm{val}} is replicate used for validation. Paired samples belong to different batches.\n- $M$ is the number of markers\n- $B$ is the batch covariate\n\nA higher value of $\\overline{R^2_B}_{global}$ indicates that the batch variable explains more of the variance in the data, which indicates a higher level of batch effects. |\n",
     "references_doi": null,
     "references_bibtex": "@book{draper1998applied,\ntitle={Applied regression analysis},\nauthor={Draper, Norman R and Smith, Harry},\npublisher={John Wiley \\& Sons}\n}\n",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/metrics/average_batch_r2",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/metrics/average_batch_r2",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/average_batch_r2:build_main",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb",
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d",
     "maximize": false
   },
   {
@@ -83,10 +83,10 @@
     "metric_description": "First, a simple linear model `sklearn.linear_model.LinearRegression` is fitted for each paired sample, marker and cell type to determine the fraction of variance (R^2) explained by the batch covariate B. |\nThe average batch R_squared is then computed as the average of the $R^2$ values across all paired samples, markers and cell types. |\nAs a result, $\\overline{R^2_B}_{cell\\ type}$ quantifies how much of the total variability in the data is driven by batch effects. Consequently, lower values are desirable. |\n\n$\\overline{R^2_B}_{cell\\ type} = \\frac{1}{N*C*M}\\sum_{\\substack{(x_{\\mathrm{int}},\\,x_{\\mathrm{val}})\\\\ \\text{paired samples}}}^{N} \\sum_{j=1}^{C} \\sum_{i=1}^{M}\\,R^2\\!\\bigl(\\mathrm{marker}_i \\mid B\\bigr)$\n\nWhere:\n- $N$ is the number of paired samples, where x_{\\mathrm{int}} is the replicate that has been batch-corrected and x_{\\mathrm{val}} is replicate used for validation. Paired samples belong to different batches.\n- $C$ is the number of cell types\n- $M$ is the number of markers\n- $B$ is the batch covariate\n\nThe $\\overline{Rˆ2_B}_{global}$ is a variation of the latter metric, where the average is computed across paired samples and markers only, without taking into account the cell types. |\n\nA higher value of $\\overline{R^2_B}_{global}$ or $\\overline{R^2_B}_{cell\\ type}$ indicates that the batch variable explains more of the variance in the data, which indicates a higher level of batch effects. |\n\nA good performance on $\\overline{R^2_B}_{global}$ but not on $\\overline{R^2_B}_{cell\\ type}$ might indicate that the batch effect correction is discarding cell type specific batch effects. |\n",
     "references_doi": null,
     "references_bibtex": "@book{draper1998applied,\ntitle={Applied regression analysis},\nauthor={Draper, Norman R and Smith, Harry},\npublisher={John Wiley \\& Sons}\n}\n",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/4ba62f679e8d99c5884bbc3f3f941654b076b4eb/src/metrics/average_batch_r2",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/metrics/average_batch_r2",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/average_batch_r2:build_main",
     "code_version": "build_main",
-    "commit_sha": "4ba62f679e8d99c5884bbc3f3f941654b076b4eb",
+    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d",
     "maximize": false
   }
 ]
diff --git a/results/cyto_batch_integration/data/quality_control.json b/results/cyto_batch_integration/data/quality_control.json
index 3a6fb7ec..204d3c06 100644
--- a/results/cyto_batch_integration/data/quality_control.json
+++ b/results/cyto_batch_integration/data/quality_control.json
@@ -243,11 +243,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw data", 
         "name": "Number of results", 
-        "value": 18, 
+        "value": 9, 
         "severity": 0, 
-        "severity_value": -10.0, 
+        "severity_value": 0.0, 
         "code": "len(results) == len(method_info) * len(metric_info) * len(dataset_info)", 
-        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: task_cyto_batch_integration\n  Number of results: 18\n  Number of methods: 9\n  Number of metrics: 6\n  Number of datasets: 1\n"
+        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: task_cyto_batch_integration\n  Number of results: 9\n  Number of methods: 9\n  Number of metrics: 6\n  Number of datasets: 1\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -402,32 +402,32 @@
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
-        "name": "Dataset 'XXXXX' %missing", 
+        "name": "Dataset 'cyto_spleen_subset' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  dataset id: XXXXX\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  dataset id: cyto_spleen_subset\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration emd_mean", 
-        "value": 0.0, 
+        "value": 0.0734, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.0734, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean\n  Worst score: 0.0734%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration emd_mean", 
-        "value": 0.0627, 
+        "value": 0.0734, 
         "severity": 0, 
-        "severity_value": 0.03135, 
+        "severity_value": 0.0367, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean\n  Best score: 0.0627%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean\n  Best score: 0.0734%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -453,91 +453,91 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type emd_mean", 
-        "value": 0.0, 
+        "value": 0.5949, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5949, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Worst score: 0.5949%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type emd_mean", 
-        "value": 0.5937, 
+        "value": 0.5949, 
         "severity": 0, 
-        "severity_value": 0.29685, 
+        "severity_value": 0.29745, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Best score: 0.5937%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Best score: 0.5949%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy emd_mean", 
-        "value": 0.0, 
+        "value": 0.5774, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5774, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean\n  Worst score: 0.5774%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy emd_mean", 
-        "value": 0.5738, 
+        "value": 0.5774, 
         "severity": 0, 
-        "severity_value": 0.2869, 
+        "severity_value": 0.2887, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean\n  Best score: 0.5738%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean\n  Best score: 0.5774%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect emd_mean", 
-        "value": 0.0, 
+        "value": 0.5541, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5541, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean\n  Worst score: 0.5541%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect emd_mean", 
-        "value": 0.5504, 
+        "value": 0.5541, 
         "severity": 0, 
-        "severity_value": 0.2752, 
+        "severity_value": 0.27705, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean\n  Best score: 0.5504%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean\n  Best score: 0.5541%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration emd_mean", 
-        "value": 0.0, 
+        "value": 0.5025, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5025, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean\n  Worst score: 0.5025%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration emd_mean", 
-        "value": 0.4983, 
+        "value": 0.5025, 
         "severity": 0, 
-        "severity_value": 0.24915, 
+        "severity_value": 0.25125, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean\n  Best score: 0.4983%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean\n  Best score: 0.5025%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score perfect_integration emd_mean", 
-        "value": 0, 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean\n  Worst score: 0%\n"
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -553,111 +553,111 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat emd_mean", 
-        "value": 0.0, 
+        "value": 0.5582, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5582, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean\n  Worst score: 0.5582%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat emd_mean", 
-        "value": 0.5545, 
+        "value": 0.5582, 
         "severity": 0, 
-        "severity_value": 0.27725, 
+        "severity_value": 0.2791, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean\n  Best score: 0.5545%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean\n  Best score: 0.5582%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols emd_mean", 
-        "value": 0.0, 
+        "value": 0.5691, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5691, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean\n  Worst score: 0.0%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean\n  Worst score: 0.5691%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols emd_mean", 
-        "value": 0.5655, 
+        "value": 0.5691, 
         "severity": 0, 
-        "severity_value": 0.28275, 
+        "severity_value": 0.28455, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean\n  Best score: 0.5655%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean\n  Best score: 0.5691%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration emd_max", 
-        "value": 0.0, 
+        "value": 0.0316, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.0316, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max\n  Worst score: 0.0316%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration emd_max", 
-        "value": 0.199, 
+        "value": 0.0316, 
         "severity": 0, 
-        "severity_value": 0.0995, 
+        "severity_value": 0.0158, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max\n  Best score: 0.199%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max\n  Best score: 0.0316%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_batch emd_max", 
-        "value": 0.0, 
+        "value": 0.1293, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.1293, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max\n  Worst score: 0.1293%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_batch emd_max", 
-        "value": 0.1417, 
+        "value": 0.1293, 
         "severity": 0, 
-        "severity_value": 0.07085, 
+        "severity_value": 0.06465, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max\n  Best score: 0.1417%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max\n  Best score: 0.1293%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type emd_max", 
-        "value": 0.0, 
+        "value": 0.2276, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.2276, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Worst score: 0.2276%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type emd_max", 
-        "value": 0.2848, 
+        "value": 0.2276, 
         "severity": 0, 
-        "severity_value": 0.1424, 
+        "severity_value": 0.1138, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Best score: 0.2848%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Best score: 0.2276%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy emd_max", 
-        "value": 0.0, 
+        "value": 0.2317, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.2317, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max\n  Worst score: 0.0%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max\n  Worst score: 0.2317%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -673,11 +673,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect emd_max", 
-        "value": 0.0, 
+        "value": 0.0315, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.0315, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max\n  Worst score: 0.0%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max\n  Worst score: 0.0315%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -713,11 +713,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score perfect_integration emd_max", 
-        "value": 0, 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max\n  Worst score: 0%\n"
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -733,11 +733,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat emd_max", 
-        "value": 0.0, 
+        "value": 0.0744, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.0744, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max\n  Worst score: 0.0%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max\n  Worst score: 0.0744%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -793,111 +793,111 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_batch n_inconsistent_peaks", 
-        "value": 0.0, 
+        "value": 0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_batch n_inconsistent_peaks", 
-        "value": 0.1818, 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": 0.0909, 
+        "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks\n  Best score: 0.1818%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks\n  Best score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type n_inconsistent_peaks", 
-        "value": 0.0, 
+        "value": 0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type n_inconsistent_peaks", 
-        "value": 0.1364, 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": 0.0682, 
+        "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Best score: 0.1364%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Best score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy n_inconsistent_peaks", 
-        "value": 0.0, 
+        "value": 0.3684, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.3684, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.3684%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy n_inconsistent_peaks", 
-        "value": 0.4545, 
+        "value": 0.3684, 
         "severity": 0, 
-        "severity_value": 0.22725, 
+        "severity_value": 0.1842, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Best score: 0.4545%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Best score: 0.3684%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect n_inconsistent_peaks", 
-        "value": 0.0, 
+        "value": 0.2105, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.2105, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.2105%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect n_inconsistent_peaks", 
-        "value": 0.3182, 
+        "value": 0.2105, 
         "severity": 0, 
-        "severity_value": 0.1591, 
+        "severity_value": 0.10525, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Best score: 0.3182%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Best score: 0.2105%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration n_inconsistent_peaks", 
-        "value": 0.0, 
+        "value": 0.2105, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.2105, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.2105%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration n_inconsistent_peaks", 
-        "value": 0.3182, 
+        "value": 0.2105, 
         "severity": 0, 
-        "severity_value": 0.1591, 
+        "severity_value": 0.10525, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Best score: 0.3182%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Best score: 0.2105%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score perfect_integration n_inconsistent_peaks", 
-        "value": 0, 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 0%\n"
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -913,41 +913,41 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat n_inconsistent_peaks", 
-        "value": 0.0, 
+        "value": 0.1579, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.1579, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.1579%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat n_inconsistent_peaks", 
-        "value": 0.2727, 
+        "value": 0.1579, 
         "severity": 0, 
-        "severity_value": 0.13635, 
+        "severity_value": 0.07895, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Best score: 0.2727%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Best score: 0.1579%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols n_inconsistent_peaks", 
-        "value": 0.0, 
+        "value": 0.2105, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.2105, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.0%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.2105%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols n_inconsistent_peaks", 
-        "value": 0.3182, 
+        "value": 0.2105, 
         "severity": 0, 
-        "severity_value": 0.1591, 
+        "severity_value": 0.10525, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Best score: 0.3182%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Best score: 0.2105%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -973,111 +973,111 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_batch n_inconsistent_peaks_ct", 
-        "value": 0.0, 
+        "value": 0.3, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.3, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.3%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_batch n_inconsistent_peaks_ct", 
-        "value": 0.3548, 
+        "value": 0.3, 
         "severity": 0, 
-        "severity_value": 0.1774, 
+        "severity_value": 0.15, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.3548%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.3%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type n_inconsistent_peaks_ct", 
-        "value": 0.0, 
+        "value": 0.4667, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.4667, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.4667%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type n_inconsistent_peaks_ct", 
-        "value": 0.5161, 
+        "value": 0.4667, 
         "severity": 0, 
-        "severity_value": 0.25805, 
+        "severity_value": 0.23335, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5161%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.4667%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy n_inconsistent_peaks_ct", 
-        "value": 0.0, 
+        "value": 0.6, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.6, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.6%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy n_inconsistent_peaks_ct", 
-        "value": 0.6129, 
+        "value": 0.6, 
         "severity": 0, 
-        "severity_value": 0.30645, 
+        "severity_value": 0.3, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.6129%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.6%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect n_inconsistent_peaks_ct", 
-        "value": 0.0, 
+        "value": 0.5833, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5833, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.5833%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect n_inconsistent_peaks_ct", 
-        "value": 0.5968, 
+        "value": 0.5833, 
         "severity": 0, 
-        "severity_value": 0.2984, 
+        "severity_value": 0.29165, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5968%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5833%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration n_inconsistent_peaks_ct", 
-        "value": 0.0, 
+        "value": 0.5833, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5833, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.5833%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration n_inconsistent_peaks_ct", 
-        "value": 0.5968, 
+        "value": 0.5833, 
         "severity": 0, 
-        "severity_value": 0.2984, 
+        "severity_value": 0.29165, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5968%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5833%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score perfect_integration n_inconsistent_peaks_ct", 
-        "value": 0, 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0%\n"
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1093,61 +1093,61 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat n_inconsistent_peaks_ct", 
-        "value": 0.0, 
+        "value": 0.5667, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5667, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.5667%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat n_inconsistent_peaks_ct", 
-        "value": 0.5806, 
+        "value": 0.5667, 
         "severity": 0, 
-        "severity_value": 0.2903, 
+        "severity_value": 0.28335, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5806%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5667%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols n_inconsistent_peaks_ct", 
-        "value": 0.0, 
+        "value": 0.5667, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5667, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.5667%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols n_inconsistent_peaks_ct", 
-        "value": 0.5806, 
+        "value": 0.5667, 
         "severity": 0, 
-        "severity_value": 0.2903, 
+        "severity_value": 0.28335, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5806%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5667%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration average_batch_r2_global", 
-        "value": 0.0, 
+        "value": 0.5577, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5577, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.5577%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration average_batch_r2_global", 
-        "value": 0.5761, 
+        "value": 0.5577, 
         "severity": 0, 
-        "severity_value": 0.28805, 
+        "severity_value": 0.27885, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.5761%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.5577%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1173,91 +1173,91 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type average_batch_r2_global", 
-        "value": 0.0, 
+        "value": 0.7038, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.7038, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Worst score: 0.7038%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type average_batch_r2_global", 
-        "value": 0.6962, 
+        "value": 0.7038, 
         "severity": 0, 
-        "severity_value": 0.3481, 
+        "severity_value": 0.3519, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Best score: 0.6962%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Best score: 0.7038%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy average_batch_r2_global", 
-        "value": 0.0, 
+        "value": 0.6141, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.6141, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Worst score: 0.6141%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy average_batch_r2_global", 
-        "value": 0.6218, 
+        "value": 0.6141, 
         "severity": 0, 
-        "severity_value": 0.3109, 
+        "severity_value": 0.30705, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Best score: 0.6218%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Best score: 0.6141%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect average_batch_r2_global", 
-        "value": 0.0, 
+        "value": 0.6504, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.6504, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Worst score: 0.6504%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect average_batch_r2_global", 
-        "value": 0.6575, 
+        "value": 0.6504, 
         "severity": 0, 
-        "severity_value": 0.32875, 
+        "severity_value": 0.3252, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Best score: 0.6575%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Best score: 0.6504%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration average_batch_r2_global", 
-        "value": 0.0, 
+        "value": 0.1541, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.1541, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.1541%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration average_batch_r2_global", 
-        "value": 0.1712, 
+        "value": 0.1541, 
         "severity": 0, 
-        "severity_value": 0.0856, 
+        "severity_value": 0.07705, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.1712%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.1541%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score perfect_integration average_batch_r2_global", 
-        "value": 0, 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0%\n"
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: average_batch_r2_global\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1273,61 +1273,61 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat average_batch_r2_global", 
-        "value": 0.0, 
+        "value": 0.6358, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.6358, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Worst score: 0.6358%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat average_batch_r2_global", 
-        "value": 0.6432, 
+        "value": 0.6358, 
         "severity": 0, 
-        "severity_value": 0.3216, 
+        "severity_value": 0.3179, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Best score: 0.6432%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Best score: 0.6358%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols average_batch_r2_global", 
-        "value": 0.0, 
+        "value": 0.4123, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.4123, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Worst score: 0.0%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Worst score: 0.4123%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols average_batch_r2_global", 
-        "value": 0.4241, 
+        "value": 0.4123, 
         "severity": 0, 
-        "severity_value": 0.21205, 
+        "severity_value": 0.20615, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Best score: 0.4241%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Best score: 0.4123%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration average_batch_r2_ct", 
-        "value": 0.0, 
+        "value": 0.1618, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.1618, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.1618%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration average_batch_r2_ct", 
-        "value": 0.1429, 
+        "value": 0.1618, 
         "severity": 0, 
-        "severity_value": 0.07145, 
+        "severity_value": 0.0809, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.1429%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.1618%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1353,11 +1353,11 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type average_batch_r2_ct", 
-        "value": 0.0, 
+        "value": 0.7318, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.7318, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7318%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1373,71 +1373,71 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy average_batch_r2_ct", 
-        "value": 0.0, 
+        "value": 0.5694, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5694, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Worst score: 0.5694%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy average_batch_r2_ct", 
-        "value": 0.57, 
+        "value": 0.5694, 
         "severity": 0, 
-        "severity_value": 0.285, 
+        "severity_value": 0.2847, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Best score: 0.57%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Best score: 0.5694%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect average_batch_r2_ct", 
-        "value": 0.0, 
+        "value": 0.4971, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.4971, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Worst score: 0.4971%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect average_batch_r2_ct", 
-        "value": 0.4977, 
+        "value": 0.4971, 
         "severity": 0, 
-        "severity_value": 0.24885, 
+        "severity_value": 0.24855, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Best score: 0.4977%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Best score: 0.4971%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration average_batch_r2_ct", 
-        "value": 0.0, 
+        "value": 0.449, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.449, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.449%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration average_batch_r2_ct", 
-        "value": 0.4497, 
+        "value": 0.449, 
         "severity": 0, 
-        "severity_value": 0.22485, 
+        "severity_value": 0.2245, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.4497%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.449%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score perfect_integration average_batch_r2_ct", 
-        "value": 0, 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0%\n"
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1453,40 +1453,40 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat average_batch_r2_ct", 
-        "value": 0.0, 
+        "value": 0.5196, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5196, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Worst score: 0.5196%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat average_batch_r2_ct", 
-        "value": 0.5202, 
+        "value": 0.5196, 
         "severity": 0, 
-        "severity_value": 0.2601, 
+        "severity_value": 0.2598, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Best score: 0.5202%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Best score: 0.5196%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols average_batch_r2_ct", 
-        "value": 0.0, 
+        "value": 0.5818, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5818, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Worst score: 0.5818%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols average_batch_r2_ct", 
-        "value": 0.5823, 
+        "value": 0.5818, 
         "severity": 0, 
-        "severity_value": 0.29115, 
+        "severity_value": 0.2909, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Best score: 0.5823%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Best score: 0.5818%\n"
     }
 ]
\ No newline at end of file
diff --git a/results/cyto_batch_integration/data/results.json b/results/cyto_batch_integration/data/results.json
index 169f5c16..a98e407d 100644
--- a/results/cyto_batch_integration/data/results.json
+++ b/results/cyto_batch_integration/data/results.json
@@ -1,6 +1,6 @@
 [
   {
-    "dataset_id": "XXXXX",
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "combat",
     "metric_values": {
       "average_batch_r2_ct": 0.1219,
@@ -11,18 +11,26 @@
       "n_inconsistent_peaks_ct": 26
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.5202,
-      "average_batch_r2_global": 0.6432,
+      "average_batch_r2_ct": 0.5196,
+      "average_batch_r2_global": 0.6358,
       "emd_max": 0.0744,
-      "emd_mean": 0.5545,
-      "n_inconsistent_peaks": 0.2727,
-      "n_inconsistent_peaks_ct": 0.5806
+      "emd_mean": 0.5582,
+      "n_inconsistent_peaks": 0.1579,
+      "n_inconsistent_peaks_ct": 0.5667
     },
-    "mean_score": 0.4409,
-    "resources": {}
+    "mean_score": 0.4188,
+    "resources": {
+      "submit": "2025-03-19 21:42:34",
+      "exit_code": 0,
+      "duration_sec": 4.9,
+      "cpu_pct": 136.7,
+      "peak_memory_mb": 1844,
+      "disk_read_mb": 48,
+      "disk_write_mb": 3
+    }
   },
   {
-    "dataset_id": "XXXXX",
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "cycombine_nocontrols",
     "metric_values": {
       "average_batch_r2_ct": 0.1061,
@@ -33,18 +41,26 @@
       "n_inconsistent_peaks_ct": 26
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.5823,
-      "average_batch_r2_global": 0.4241,
+      "average_batch_r2_ct": 0.5818,
+      "average_batch_r2_global": 0.4123,
       "emd_max": 0,
-      "emd_mean": 0.5655,
-      "n_inconsistent_peaks": 0.3182,
-      "n_inconsistent_peaks_ct": 0.5806
+      "emd_mean": 0.5691,
+      "n_inconsistent_peaks": 0.2105,
+      "n_inconsistent_peaks_ct": 0.5667
     },
-    "mean_score": 0.4118,
-    "resources": {}
+    "mean_score": 0.3901,
+    "resources": {
+      "submit": "2025-03-19 21:42:34",
+      "exit_code": 0,
+      "duration_sec": 14.2,
+      "cpu_pct": 129.1,
+      "peak_memory_mb": 2151,
+      "disk_read_mb": 55,
+      "disk_write_mb": 3
+    }
   },
   {
-    "dataset_id": "XXXXX",
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "harmonypy",
     "metric_values": {
       "average_batch_r2_ct": 0.1093,
@@ -55,18 +71,26 @@
       "n_inconsistent_peaks_ct": 24
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.57,
-      "average_batch_r2_global": 0.6218,
+      "average_batch_r2_ct": 0.5694,
+      "average_batch_r2_global": 0.6141,
       "emd_max": 0.2317,
-      "emd_mean": 0.5738,
-      "n_inconsistent_peaks": 0.4545,
-      "n_inconsistent_peaks_ct": 0.6129
+      "emd_mean": 0.5774,
+      "n_inconsistent_peaks": 0.3684,
+      "n_inconsistent_peaks_ct": 0.6
     },
-    "mean_score": 0.5108,
-    "resources": {}
+    "mean_score": 0.4935,
+    "resources": {
+      "submit": "2025-03-19 21:42:34",
+      "exit_code": 0,
+      "duration_sec": 9,
+      "cpu_pct": 752.8,
+      "peak_memory_mb": 2560,
+      "disk_read_mb": 34,
+      "disk_write_mb": 2
+    }
   },
   {
-    "dataset_id": "XXXXX",
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "limma_remove_batch_effect",
     "metric_values": {
       "average_batch_r2_ct": 0.1276,
@@ -77,18 +101,26 @@
       "n_inconsistent_peaks_ct": 25
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.4977,
-      "average_batch_r2_global": 0.6575,
+      "average_batch_r2_ct": 0.4971,
+      "average_batch_r2_global": 0.6504,
       "emd_max": 0.0315,
-      "emd_mean": 0.5504,
-      "n_inconsistent_peaks": 0.3182,
-      "n_inconsistent_peaks_ct": 0.5968
+      "emd_mean": 0.5541,
+      "n_inconsistent_peaks": 0.2105,
+      "n_inconsistent_peaks_ct": 0.5833
     },
-    "mean_score": 0.442,
-    "resources": {}
+    "mean_score": 0.4212,
+    "resources": {
+      "submit": "2025-03-19 21:42:35",
+      "exit_code": 0,
+      "duration_sec": 4.1,
+      "cpu_pct": 214,
+      "peak_memory_mb": 1127,
+      "disk_read_mb": 30,
+      "disk_write_mb": 2
+    }
   },
   {
-    "dataset_id": "XXXXX",
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "no_integration",
     "metric_values": {
       "average_batch_r2_ct": 0.1398,
@@ -99,18 +131,26 @@
       "n_inconsistent_peaks_ct": 25
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.4497,
-      "average_batch_r2_global": 0.1712,
+      "average_batch_r2_ct": 0.449,
+      "average_batch_r2_global": 0.1541,
       "emd_max": 0,
-      "emd_mean": 0.4983,
-      "n_inconsistent_peaks": 0.3182,
-      "n_inconsistent_peaks_ct": 0.5968
+      "emd_mean": 0.5025,
+      "n_inconsistent_peaks": 0.2105,
+      "n_inconsistent_peaks_ct": 0.5833
     },
-    "mean_score": 0.339,
-    "resources": {}
+    "mean_score": 0.3166,
+    "resources": {
+      "submit": "2025-03-19 21:42:34",
+      "exit_code": 0,
+      "duration_sec": 1.9,
+      "cpu_pct": 250.3,
+      "peak_memory_mb": 764,
+      "disk_read_mb": 20,
+      "disk_write_mb": 2
+    }
   },
   {
-    "dataset_id": "XXXXX",
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "perfect_integration",
     "metric_values": {
       "average_batch_r2_ct": 2.3291e-19,
@@ -129,340 +169,102 @@
       "n_inconsistent_peaks_ct": 1
     },
     "mean_score": 1,
-    "resources": {}
-  },
-  {
-    "dataset_id": "XXXXX",
-    "method_id": "shuffle_integration",
-    "metric_values": {
-      "average_batch_r2_ct": 0.2178,
-      "average_batch_r2_global": 0.0319,
-      "emd_max": 28,
-      "emd_mean": 6.7046,
-      "n_inconsistent_peaks": 22,
-      "n_inconsistent_peaks_ct": 62
-    },
-    "scaled_scores": {
-      "average_batch_r2_ct": 0.1429,
-      "average_batch_r2_global": 0.5761,
-      "emd_max": 0.199,
-      "emd_mean": 0.0627,
-      "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0
-    },
-    "mean_score": 0.1635,
-    "resources": {}
-  },
-  {
-    "dataset_id": "XXXXX",
-    "method_id": "shuffle_integration_by_batch",
-    "metric_values": {
-      "average_batch_r2_ct": 0.2541,
-      "average_batch_r2_global": 0.0752,
-      "emd_max": 30,
-      "emd_mean": 7.1534,
-      "n_inconsistent_peaks": 18,
-      "n_inconsistent_peaks_ct": 40
-    },
-    "scaled_scores": {
-      "average_batch_r2_ct": 0,
-      "average_batch_r2_global": 0,
-      "emd_max": 0.1417,
-      "emd_mean": 0,
-      "n_inconsistent_peaks": 0.1818,
-      "n_inconsistent_peaks_ct": 0.3548
-    },
-    "mean_score": 0.1131,
-    "resources": {}
-  },
-  {
-    "dataset_id": "XXXXX",
-    "method_id": "shuffle_integration_by_cell_type",
-    "metric_values": {
-      "average_batch_r2_ct": 0.0681,
-      "average_batch_r2_global": 0.0228,
-      "emd_max": 25,
-      "emd_mean": 2.9063,
-      "n_inconsistent_peaks": 19,
-      "n_inconsistent_peaks_ct": 30
-    },
-    "scaled_scores": {
-      "average_batch_r2_ct": 0.7318,
-      "average_batch_r2_global": 0.6962,
-      "emd_max": 0.2848,
-      "emd_mean": 0.5937,
-      "n_inconsistent_peaks": 0.1364,
-      "n_inconsistent_peaks_ct": 0.5161
-    },
-    "mean_score": 0.4932,
-    "resources": {}
-  },
-  {
-    "dataset_id": null,
-    "method_id": "combat",
-    "metric_values": {
-      "average_batch_r2_global": "NA",
-      "average_batch_r2_ct": "NA",
-      "n_inconsistent_peaks": "NA",
-      "n_inconsistent_peaks_ct": "NA",
-      "emd_mean": "NA",
-      "emd_max": "NA"
-    },
-    "scaled_scores": {
-      "average_batch_r2_global": 0,
-      "average_batch_r2_ct": 0,
-      "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0,
-      "emd_mean": 0,
-      "emd_max": 0
-    },
-    "mean_score": 0,
-    "resources": {
-      "submit": "2025-03-19 13:03:51",
-      "exit_code": 0,
-      "duration_sec": 4.6,
-      "cpu_pct": 144.5,
-      "peak_memory_mb": 1844,
-      "disk_read_mb": 48,
-      "disk_write_mb": 3
-    }
-  },
-  {
-    "dataset_id": null,
-    "method_id": "cycombine_nocontrols",
-    "metric_values": {
-      "average_batch_r2_global": "NA",
-      "average_batch_r2_ct": "NA",
-      "n_inconsistent_peaks": "NA",
-      "n_inconsistent_peaks_ct": "NA",
-      "emd_mean": "NA",
-      "emd_max": "NA"
-    },
-    "scaled_scores": {
-      "average_batch_r2_global": 0,
-      "average_batch_r2_ct": 0,
-      "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0,
-      "emd_mean": 0,
-      "emd_max": 0
-    },
-    "mean_score": 0,
-    "resources": {
-      "submit": "2025-03-19 13:03:51",
-      "exit_code": 0,
-      "duration_sec": 15,
-      "cpu_pct": 156.4,
-      "peak_memory_mb": 3584,
-      "disk_read_mb": 55,
-      "disk_write_mb": 3
-    }
-  },
-  {
-    "dataset_id": null,
-    "method_id": "harmonypy",
-    "metric_values": {
-      "average_batch_r2_global": "NA",
-      "average_batch_r2_ct": "NA",
-      "n_inconsistent_peaks": "NA",
-      "n_inconsistent_peaks_ct": "NA",
-      "emd_mean": "NA",
-      "emd_max": "NA"
-    },
-    "scaled_scores": {
-      "average_batch_r2_global": 0,
-      "average_batch_r2_ct": 0,
-      "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0,
-      "emd_mean": 0,
-      "emd_max": 0
-    },
-    "mean_score": 0,
-    "resources": {
-      "submit": "2025-03-19 13:03:51",
-      "exit_code": 0,
-      "duration_sec": 9.1,
-      "cpu_pct": 737.3,
-      "peak_memory_mb": 2560,
-      "disk_read_mb": 34,
-      "disk_write_mb": 2
-    }
-  },
-  {
-    "dataset_id": null,
-    "method_id": "limma_remove_batch_effect",
-    "metric_values": {
-      "average_batch_r2_global": "NA",
-      "average_batch_r2_ct": "NA",
-      "n_inconsistent_peaks": "NA",
-      "n_inconsistent_peaks_ct": "NA",
-      "emd_mean": "NA",
-      "emd_max": "NA"
-    },
-    "scaled_scores": {
-      "average_batch_r2_global": 0,
-      "average_batch_r2_ct": 0,
-      "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0,
-      "emd_mean": 0,
-      "emd_max": 0
-    },
-    "mean_score": 0,
-    "resources": {
-      "submit": "2025-03-19 13:03:51",
-      "exit_code": 0,
-      "duration_sec": 4.3,
-      "cpu_pct": 206.5,
-      "peak_memory_mb": 1844,
-      "disk_read_mb": 30,
-      "disk_write_mb": 2
-    }
-  },
-  {
-    "dataset_id": null,
-    "method_id": "no_integration",
-    "metric_values": {
-      "average_batch_r2_global": "NA",
-      "average_batch_r2_ct": "NA",
-      "n_inconsistent_peaks": "NA",
-      "n_inconsistent_peaks_ct": "NA",
-      "emd_mean": "NA",
-      "emd_max": "NA"
-    },
-    "scaled_scores": {
-      "average_batch_r2_global": 0,
-      "average_batch_r2_ct": 0,
-      "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0,
-      "emd_mean": 0,
-      "emd_max": 0
-    },
-    "mean_score": 0,
-    "resources": {
-      "submit": "2025-03-19 13:03:51",
-      "exit_code": 0,
-      "duration_sec": 1.7,
-      "cpu_pct": 268.7,
-      "peak_memory_mb": 768,
-      "disk_read_mb": 20,
-      "disk_write_mb": 2
-    }
-  },
-  {
-    "dataset_id": null,
-    "method_id": "perfect_integration",
-    "metric_values": {
-      "average_batch_r2_global": "NA",
-      "average_batch_r2_ct": "NA",
-      "n_inconsistent_peaks": "NA",
-      "n_inconsistent_peaks_ct": "NA",
-      "emd_mean": "NA",
-      "emd_max": "NA"
-    },
-    "scaled_scores": {
-      "average_batch_r2_global": 0,
-      "average_batch_r2_ct": 0,
-      "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0,
-      "emd_mean": 0,
-      "emd_max": 0
-    },
-    "mean_score": 0,
     "resources": {
-      "submit": "2025-03-19 13:03:51",
+      "submit": "2025-03-19 21:42:35",
       "exit_code": 0,
-      "duration_sec": 1.7,
-      "cpu_pct": 271.4,
-      "peak_memory_mb": 767,
+      "duration_sec": 1.6,
+      "cpu_pct": 287.3,
+      "peak_memory_mb": 770,
       "disk_read_mb": 19,
       "disk_write_mb": 1
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "shuffle_integration",
     "metric_values": {
-      "average_batch_r2_global": "NA",
-      "average_batch_r2_ct": "NA",
-      "n_inconsistent_peaks": "NA",
-      "n_inconsistent_peaks_ct": "NA",
-      "emd_mean": "NA",
-      "emd_max": "NA"
+      "average_batch_r2_ct": 0.2127,
+      "average_batch_r2_global": 0.0326,
+      "emd_max": 33.85,
+      "emd_mean": 6.684,
+      "n_inconsistent_peaks": 19,
+      "n_inconsistent_peaks_ct": 60
     },
     "scaled_scores": {
-      "average_batch_r2_global": 0,
-      "average_batch_r2_ct": 0,
+      "average_batch_r2_ct": 0.1618,
+      "average_batch_r2_global": 0.5577,
+      "emd_max": 0.0316,
+      "emd_mean": 0.0734,
       "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0,
-      "emd_mean": 0,
-      "emd_max": 0
+      "n_inconsistent_peaks_ct": 0
     },
-    "mean_score": 0,
+    "mean_score": 0.1374,
     "resources": {
-      "submit": "2025-03-19 13:03:51",
+      "submit": "2025-03-19 21:42:34",
       "exit_code": 0,
-      "duration_sec": 1.9,
-      "cpu_pct": 204.8,
-      "peak_memory_mb": 764,
+      "duration_sec": 2,
+      "cpu_pct": 197.9,
+      "peak_memory_mb": 760,
       "disk_read_mb": 20,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "shuffle_integration_by_batch",
     "metric_values": {
-      "average_batch_r2_global": "NA",
-      "average_batch_r2_ct": "NA",
-      "n_inconsistent_peaks": "NA",
-      "n_inconsistent_peaks_ct": "NA",
-      "emd_mean": "NA",
-      "emd_max": "NA"
+      "average_batch_r2_ct": 0.2538,
+      "average_batch_r2_global": 0.0737,
+      "emd_max": 30.4333,
+      "emd_mean": 7.2138,
+      "n_inconsistent_peaks": 19,
+      "n_inconsistent_peaks_ct": 42
     },
     "scaled_scores": {
-      "average_batch_r2_global": 0,
       "average_batch_r2_ct": 0,
-      "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0,
+      "average_batch_r2_global": 0,
+      "emd_max": 0.1293,
       "emd_mean": 0,
-      "emd_max": 0
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 0.3
     },
-    "mean_score": 0,
+    "mean_score": 0.0716,
     "resources": {
-      "submit": "2025-03-19 13:03:51",
+      "submit": "2025-03-19 21:42:35",
       "exit_code": 0,
-      "duration_sec": 1.9,
-      "cpu_pct": 185,
-      "peak_memory_mb": 763,
+      "duration_sec": 1.7,
+      "cpu_pct": 274.1,
+      "peak_memory_mb": 768,
       "disk_read_mb": 20,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": null,
+    "dataset_id": "cyto_spleen_subset",
     "method_id": "shuffle_integration_by_cell_type",
     "metric_values": {
-      "average_batch_r2_global": "NA",
-      "average_batch_r2_ct": "NA",
-      "n_inconsistent_peaks": "NA",
-      "n_inconsistent_peaks_ct": "NA",
-      "emd_mean": "NA",
-      "emd_max": "NA"
+      "average_batch_r2_ct": 0.0681,
+      "average_batch_r2_global": 0.0218,
+      "emd_max": 27,
+      "emd_mean": 2.9222,
+      "n_inconsistent_peaks": 19,
+      "n_inconsistent_peaks_ct": 32
     },
     "scaled_scores": {
-      "average_batch_r2_global": 0,
-      "average_batch_r2_ct": 0,
+      "average_batch_r2_ct": 0.7318,
+      "average_batch_r2_global": 0.7038,
+      "emd_max": 0.2276,
+      "emd_mean": 0.5949,
       "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0,
-      "emd_mean": 0,
-      "emd_max": 0
+      "n_inconsistent_peaks_ct": 0.4667
     },
-    "mean_score": 0,
+    "mean_score": 0.4541,
     "resources": {
-      "submit": "2025-03-19 13:03:51",
+      "submit": "2025-03-19 21:42:34",
       "exit_code": 0,
-      "duration_sec": 1.8,
-      "cpu_pct": 261.1,
-      "peak_memory_mb": 765,
+      "duration_sec": 2.1,
+      "cpu_pct": 178.5,
+      "peak_memory_mb": 755,
       "disk_read_mb": 20,
       "disk_write_mb": 2
     }

From 658737c8e63e7d9eafe08a5a1f905c1aabbd732f Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Wed, 19 Mar 2025 23:17:04 +0100
Subject: [PATCH 08/14] trigger update

---
 results/cyto_batch_integration/index.qmd | 1 +
 1 file changed, 1 insertion(+)

diff --git a/results/cyto_batch_integration/index.qmd b/results/cyto_batch_integration/index.qmd
index 8103f8a9..b00f887a 100644
--- a/results/cyto_batch_integration/index.qmd
+++ b/results/cyto_batch_integration/index.qmd
@@ -20,3 +20,4 @@ params <- list(data_dir = "./data")
 ```
 
 {{< include ../_include/_task_template.qmd >}}
+

From 7d69d2f2fd96a999b885dcc8fc4ddb170490fbf2 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Thu, 20 Mar 2025 12:30:22 +0100
Subject: [PATCH 09/14] fix subtitle

---
 results/cyto_batch_integration/index.qmd | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/results/cyto_batch_integration/index.qmd b/results/cyto_batch_integration/index.qmd
index b00f887a..a5bb8857 100644
--- a/results/cyto_batch_integration/index.qmd
+++ b/results/cyto_batch_integration/index.qmd
@@ -1,6 +1,6 @@
 ---
 title: "Cyto Batch Integration"
-subtitle: "A one sentence summary of purpose and methodology. Used for creating an overview tables."
+subtitle: "Benchmarking of batch integration algorithms for cytometry data."
 image: thumbnail.svg
 page-layout: full
 css: ../_include/task_template.css
@@ -20,4 +20,3 @@ params <- list(data_dir = "./data")
 ```
 
 {{< include ../_include/_task_template.qmd >}}
-

From c0d63dab5824e98e0e929bfbb5870d73bbd4376b Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Fri, 16 May 2025 15:40:02 +0200
Subject: [PATCH 10/14] update results

---
 .../data/dataset_info.json                    |   16 +-
 .../data/method_info.json                     |   68 +-
 .../data/metric_execution_info.json           |  424 +++---
 .../data/metric_info.json                     |   70 +-
 .../data/quality_control.json                 | 1326 +++++++++++++----
 .../cyto_batch_integration/data/results.json  |  440 +++---
 6 files changed, 1657 insertions(+), 687 deletions(-)

diff --git a/results/cyto_batch_integration/data/dataset_info.json b/results/cyto_batch_integration/data/dataset_info.json
index 965ca937..519fa0e1 100644
--- a/results/cyto_batch_integration/data/dataset_info.json
+++ b/results/cyto_batch_integration/data/dataset_info.json
@@ -1,12 +1,12 @@
 [
   {
-    "dataset_id": "cyto_spleen_subset",
-    "dataset_name": "Cytometry Spleen Subset",
-    "dataset_summary": "Flow cytometry data of spleens of 9 mice, subsampled to 1000 cells per sample.",
-    "dataset_description": "Flow cytometry data of spleens from 4 WT (IKK2 fl/fl CD11c-cre +/+) and 5 KO (IKK2 fl/fl CD11c-cre Tg/+) B6 mice,  measured with a 22-color panel on 2 different instrument settings. Subsampled to 1000 cells per sample. Data has been preprocessed (compensated witha batch-specific compensation matrix, logicle transformed, cleaned with PeacoQC and pregated on live single CD45+ cells).",
-    "data_reference": null,
-    "data_url": null,
-    "date_created": "19-03-2025",
-    "file_size": 1447383
+    "dataset_id": "leomazzi_cyto_spleen",
+    "dataset_name": "Leomazzi Spleen Cytometry",
+    "dataset_summary": "Flow cytometry data of spleens of 8 mice. For each mouse, aliquots of the same original sample were divided into 2 batches and measured with 2 different instrument settings to allow the creation of sample-paired replicates for benchmarking purposes.",
+    "dataset_description": "Flow cytometry data of spleens from 4 WT (IKK2 fl/fl CD11c-cre +/+) and 4 KO (IKK2 fl/fl CD11c-cre Tg/+) B6 mice, measured with a 22-color panel and 2 different instrument settings. Data has been preprocessed (compensated with a batch-specific compensation matrix, logicle transformed, cleaned with PeacoQC and pregated on live single CD45+ cells).",
+    "data_reference": [],
+    "data_url": "https://saeyslab.sites.vib.be/en",
+    "date_created": "15-05-2025",
+    "file_size": 489781536
   }
 ]
diff --git a/results/cyto_batch_integration/data/method_info.json b/results/cyto_batch_integration/data/method_info.json
index eb1c93d5..a0ec19eb 100644
--- a/results/cyto_batch_integration/data/method_info.json
+++ b/results/cyto_batch_integration/data/method_info.json
@@ -11,9 +11,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/control_methods/shuffle_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/control_methods/shuffle_integration",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
   },
   {
     "task_id": "control_methods",
@@ -27,9 +27,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_batch:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/control_methods/shuffle_integration_by_batch",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/control_methods/shuffle_integration_by_batch",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
   },
   {
     "task_id": "control_methods",
@@ -43,9 +43,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_cell_type:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/control_methods/shuffle_integration_by_cell_type",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/control_methods/shuffle_integration_by_cell_type",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
   },
   {
     "task_id": "methods",
@@ -59,9 +59,9 @@
     "code_url": "https://github.com/slowkow/harmonypy",
     "documentation_url": "https://portals.broadinstitute.org/harmony",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/harmonypy:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/methods/harmonypy",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/methods/harmonypy",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
   },
   {
     "task_id": "methods",
@@ -75,9 +75,9 @@
     "code_url": "https://github.com/bioc/limma",
     "documentation_url": "https://bioinf.wehi.edu.au/limma",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/limma_remove_batch_effect:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/methods/limma_remove_batch_effect",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/methods/limma_remove_batch_effect",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
   },
   {
     "task_id": "control_methods",
@@ -91,9 +91,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/no_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/control_methods/no_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/control_methods/no_integration",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
   },
   {
     "task_id": "control_methods",
@@ -107,9 +107,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/perfect_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/control_methods/perfect_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/control_methods/perfect_integration",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
   },
   {
     "task_id": "methods",
@@ -123,9 +123,9 @@
     "code_url": "https://github.com/brentp/combat.py",
     "documentation_url": "https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/combat:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/methods/combat",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/methods/combat",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
   },
   {
     "task_id": "methods",
@@ -139,8 +139,40 @@
     "code_url": "https://github.com/biosurf/cyCombine",
     "documentation_url": "https://biosurf.org/cyCombine.html",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/cycombine_nocontrols:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/methods/cycombine_nocontrols",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/methods/cycombine_nocontrols",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d"
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+  },
+  {
+    "task_id": "methods",
+    "method_id": "gaussnorm",
+    "method_name": "GaussNorm",
+    "method_summary": "Batch effect correction using a per‐channel basis normalization method (gaussNorm)",
+    "method_description": "This method batch-normalizes a set of cytometry data samples by identifying and aligning the high density regions (landmarks or peaks) for each channel.\nThe data of each channel is shifted in such a way that the identified high density regions are moved to fixed locations called base landmarks.\nNormalization is achieved in three phases:\n1. identifying high-density regions (landmarks) for each flowFrame in the flowSet for a single channel\n2. computing the best matching between the landmarks and a set of fixed reference landmarks for each channel called base landmarks\n3. manipulating the data of each channel in such a way that each landmark is moved to its matching base landmark. Please note that this normalization is on a channel-by-channel basis\n\nNOTE: The default implementation uses `max.lms=2`, although for some channels it is not possible to compute 2 landmarks, resulting in an error.\nIn order to fully automate the batch normalization process, this implementation checks whether it is possible to compute 2 landmarks, and if not, it sets `max.lms=1` for that channel.\n",
+    "is_baseline": false,
+    "references_doi": "10.1002/cyto.a.20823",
+    "references_bibtex": null,
+    "code_url": "https://github.com/RGLab/flowStats",
+    "documentation_url": "https://rdrr.io/bioc/flowStats/src/R/gaussNorm.R",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/gaussnorm:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/methods/gaussnorm",
+    "code_version": "build_main",
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+  },
+  {
+    "task_id": "methods",
+    "method_id": "cytonorm_controls",
+    "method_name": "CytoNorm with controls",
+    "method_summary": "CytoNorm Batch normalization algorithm which uses shared controls across batches.",
+    "method_description": "CytoNorm corrects batch effects by using reference control samples (aliquots of one sample, \ntechnical replicates) included with each batch. \nIt clusters cells, then trains a model on the control samples to learn how marker \nexpression distributions differ across batches for each population.\nIt then uses splines to align these distributions to a common reference (either the mean\nof batches or to a single batch).\nIn this CytoNorm version, batches are aligned to the mean of the batches.\nClustering was performed by FlowSOM, using the default parameters provided by CytoNorm.\n",
+    "is_baseline": false,
+    "references_doi": "10.1002/cyto.a.23904",
+    "references_bibtex": null,
+    "code_url": "https://github.com/saeyslab/CytoNorm",
+    "documentation_url": "https://github.com/saeyslab/CytoNorm",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/cytonorm_controls:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/methods/cytonorm_controls",
+    "code_version": "build_main",
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
   }
 ]
diff --git a/results/cyto_batch_integration/data/metric_execution_info.json b/results/cyto_batch_integration/data/metric_execution_info.json
index acf703cd..bcd66343 100644
--- a/results/cyto_batch_integration/data/metric_execution_info.json
+++ b/results/cyto_batch_integration/data/metric_execution_info.json
@@ -1,379 +1,463 @@
 [
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "combat",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 21:44:34",
+      "submit": "2025-05-15 11:10:54",
       "exit_code": 0,
-      "duration_sec": 20.8,
-      "cpu_pct": 121.9,
-      "peak_memory_mb": 1639,
-      "disk_read_mb": 70,
+      "duration_sec": 192,
+      "cpu_pct": 422.4,
+      "peak_memory_mb": 9728,
+      "disk_read_mb": 3278,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "combat",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 21:44:34",
+      "submit": "2025-05-15 11:10:54",
       "exit_code": 0,
-      "duration_sec": 51.6,
-      "cpu_pct": 104.8,
-      "peak_memory_mb": 1844,
-      "disk_read_mb": 124,
-      "disk_write_mb": 2
+      "duration_sec": 488,
+      "cpu_pct": 101.8,
+      "peak_memory_mb": 10445,
+      "disk_read_mb": 6556,
+      "disk_write_mb": 4
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "combat",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 21:44:34",
+      "submit": "2025-05-15 11:10:54",
       "exit_code": 0,
-      "duration_sec": 180,
-      "cpu_pct": 1834.9,
-      "peak_memory_mb": 5632,
-      "disk_read_mb": 64,
+      "duration_sec": 2170,
+      "cpu_pct": 2227.8,
+      "peak_memory_mb": 9524,
+      "disk_read_mb": 3278,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 21:45:54",
+      "submit": "2025-05-15 11:15:44",
       "exit_code": 0,
-      "duration_sec": 20,
-      "cpu_pct": 126.4,
-      "peak_memory_mb": 1639,
-      "disk_read_mb": 70,
+      "duration_sec": 214,
+      "cpu_pct": 378.6,
+      "peak_memory_mb": 9728,
+      "disk_read_mb": 3278,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 21:45:54",
+      "submit": "2025-05-15 11:15:44",
       "exit_code": 0,
-      "duration_sec": 50.6,
-      "cpu_pct": 107,
-      "peak_memory_mb": 1844,
-      "disk_read_mb": 124,
-      "disk_write_mb": 2
+      "duration_sec": 500,
+      "cpu_pct": 101.8,
+      "peak_memory_mb": 10445,
+      "disk_read_mb": 6556,
+      "disk_write_mb": 4
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 21:45:54",
+      "submit": "2025-05-15 11:15:44",
       "exit_code": 0,
-      "duration_sec": 14.2,
-      "cpu_pct": 1097.9,
-      "peak_memory_mb": 1536,
-      "disk_read_mb": 64,
+      "duration_sec": 1838,
+      "cpu_pct": 3022.1,
+      "peak_memory_mb": 9421,
+      "disk_read_mb": 3278,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "cytonorm_controls",
+    "metric_component_name": "average_batch_r2",
+    "resources": {
+      "submit": "2025-05-15 11:23:44",
+      "exit_code": 0,
+      "duration_sec": 228,
+      "cpu_pct": 283.7,
+      "peak_memory_mb": 9728,
+      "disk_read_mb": 2664,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "cytonorm_controls",
+    "metric_component_name": "emd",
+    "resources": {
+      "submit": "2025-05-15 11:23:44",
+      "exit_code": 0,
+      "duration_sec": 508,
+      "cpu_pct": 102,
+      "peak_memory_mb": 10343,
+      "disk_read_mb": 5328,
+      "disk_write_mb": 4
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "cytonorm_controls",
+    "metric_component_name": "n_inconsistent_peaks",
+    "resources": {
+      "submit": "2025-05-15 11:23:44",
+      "exit_code": 0,
+      "duration_sec": 1850,
+      "cpu_pct": 2815.2,
+      "peak_memory_mb": 9421,
+      "disk_read_mb": 2664,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "gaussnorm",
+    "metric_component_name": "average_batch_r2",
+    "resources": {
+      "submit": "2025-05-15 11:16:54",
+      "exit_code": 0,
+      "duration_sec": 220,
+      "cpu_pct": 274.3,
+      "peak_memory_mb": 9728,
+      "disk_read_mb": 2868,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "gaussnorm",
+    "metric_component_name": "emd",
+    "resources": {
+      "submit": "2025-05-15 11:16:54",
+      "exit_code": 0,
+      "duration_sec": 468,
+      "cpu_pct": 105.4,
+      "peak_memory_mb": 10445,
+      "disk_read_mb": 5736,
+      "disk_write_mb": 4
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "gaussnorm",
+    "metric_component_name": "n_inconsistent_peaks",
+    "resources": {
+      "submit": "2025-05-15 11:16:54",
+      "exit_code": 0,
+      "duration_sec": 2180,
+      "cpu_pct": 1892.7,
+      "peak_memory_mb": 9421,
+      "disk_read_mb": 2868,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "harmonypy",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 21:46:44",
+      "submit": "2025-05-15 14:03:04",
       "exit_code": 0,
-      "duration_sec": 20.2,
-      "cpu_pct": 124,
-      "peak_memory_mb": 1639,
-      "disk_read_mb": 68,
+      "duration_sec": 200,
+      "cpu_pct": 408.9,
+      "peak_memory_mb": 8602,
+      "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "harmonypy",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 21:46:44",
+      "submit": "2025-05-15 14:03:04",
       "exit_code": 0,
-      "duration_sec": 51,
-      "cpu_pct": 105.9,
-      "peak_memory_mb": 1844,
-      "disk_read_mb": 122,
-      "disk_write_mb": 2
+      "duration_sec": 456,
+      "cpu_pct": 107.1,
+      "peak_memory_mb": 9626,
+      "disk_read_mb": 4916,
+      "disk_write_mb": 4
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "harmonypy",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 21:46:44",
+      "submit": "2025-05-15 14:03:04",
       "exit_code": 0,
-      "duration_sec": 91.4,
-      "cpu_pct": 2278.3,
-      "peak_memory_mb": 5632,
-      "disk_read_mb": 62,
+      "duration_sec": 1322,
+      "cpu_pct": 3551.4,
+      "peak_memory_mb": 8500,
+      "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 21:46:34",
+      "submit": "2025-05-15 11:11:14",
       "exit_code": 0,
-      "duration_sec": 20.8,
-      "cpu_pct": 122.5,
-      "peak_memory_mb": 1639,
-      "disk_read_mb": 70,
+      "duration_sec": 200,
+      "cpu_pct": 391.8,
+      "peak_memory_mb": 9728,
+      "disk_read_mb": 2664,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 21:46:34",
+      "submit": "2025-05-15 11:11:14",
       "exit_code": 0,
-      "duration_sec": 50.8,
+      "duration_sec": 452,
       "cpu_pct": 106,
-      "peak_memory_mb": 1844,
-      "disk_read_mb": 124,
-      "disk_write_mb": 2
+      "peak_memory_mb": 10445,
+      "disk_read_mb": 5328,
+      "disk_write_mb": 4
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 21:46:34",
+      "submit": "2025-05-15 11:11:14",
       "exit_code": 0,
-      "duration_sec": 13.8,
-      "cpu_pct": 1106.2,
-      "peak_memory_mb": 1536,
-      "disk_read_mb": 62,
+      "duration_sec": 1400,
+      "cpu_pct": 2885.9,
+      "peak_memory_mb": 9524,
+      "disk_read_mb": 2664,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "no_integration",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 21:43:54",
+      "submit": "2025-05-15 11:10:04",
       "exit_code": 0,
-      "duration_sec": 27,
-      "cpu_pct": 168.8,
-      "peak_memory_mb": 5632,
-      "disk_read_mb": 68,
+      "duration_sec": 196,
+      "cpu_pct": 410.6,
+      "peak_memory_mb": 8602,
+      "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "no_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 21:43:53",
+      "submit": "2025-05-15 11:10:04",
       "exit_code": 0,
-      "duration_sec": 51.8,
-      "cpu_pct": 104.6,
-      "peak_memory_mb": 1844,
-      "disk_read_mb": 122,
-      "disk_write_mb": 2
+      "duration_sec": 456,
+      "cpu_pct": 102,
+      "peak_memory_mb": 9626,
+      "disk_read_mb": 4916,
+      "disk_write_mb": 4
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "no_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 21:43:54",
+      "submit": "2025-05-15 11:10:04",
       "exit_code": 0,
-      "duration_sec": 172,
-      "cpu_pct": 1809,
-      "peak_memory_mb": 5632,
-      "disk_read_mb": 62,
+      "duration_sec": 2168,
+      "cpu_pct": 2291.6,
+      "peak_memory_mb": 8500,
+      "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "perfect_integration",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 21:46:34",
+      "submit": "2025-05-15 11:07:54",
       "exit_code": 0,
-      "duration_sec": 21.2,
-      "cpu_pct": 123.6,
-      "peak_memory_mb": 1639,
-      "disk_read_mb": 68,
+      "duration_sec": 250,
+      "cpu_pct": 465.5,
+      "peak_memory_mb": 8397,
+      "disk_read_mb": 2254,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "perfect_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 21:46:34",
+      "submit": "2025-05-15 11:07:54",
       "exit_code": 0,
-      "duration_sec": 51.6,
-      "cpu_pct": 105.2,
-      "peak_memory_mb": 1844,
-      "disk_read_mb": 122,
-      "disk_write_mb": 2
+      "duration_sec": 408,
+      "cpu_pct": 106.3,
+      "peak_memory_mb": 9319,
+      "disk_read_mb": 4508,
+      "disk_write_mb": 4
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "perfect_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 21:46:34",
+      "submit": "2025-05-15 11:07:54",
       "exit_code": 0,
-      "duration_sec": 14.8,
-      "cpu_pct": 1093.6,
-      "peak_memory_mb": 1536,
-      "disk_read_mb": 60,
+      "duration_sec": 2486,
+      "cpu_pct": 2042,
+      "peak_memory_mb": 8295,
+      "disk_read_mb": 2254,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 21:43:34",
+      "submit": "2025-05-15 11:08:24",
       "exit_code": 0,
-      "duration_sec": 20.2,
-      "cpu_pct": 125.2,
-      "peak_memory_mb": 1639,
-      "disk_read_mb": 68,
+      "duration_sec": 210,
+      "cpu_pct": 398.9,
+      "peak_memory_mb": 8602,
+      "disk_read_mb": 2664,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 21:43:34",
+      "submit": "2025-05-15 11:08:24",
       "exit_code": 0,
-      "duration_sec": 51.4,
-      "cpu_pct": 105.2,
-      "peak_memory_mb": 1844,
-      "disk_read_mb": 122,
-      "disk_write_mb": 2
+      "duration_sec": 544,
+      "cpu_pct": 103.9,
+      "peak_memory_mb": 9626,
+      "disk_read_mb": 4916,
+      "disk_write_mb": 4
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 21:43:34",
+      "submit": "2025-05-15 11:08:24",
       "exit_code": 0,
-      "duration_sec": 14.2,
-      "cpu_pct": 1094.2,
-      "peak_memory_mb": 1536,
-      "disk_read_mb": 62,
+      "duration_sec": 1654,
+      "cpu_pct": 3033.8,
+      "peak_memory_mb": 8500,
+      "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 21:46:44",
+      "submit": "2025-05-15 11:08:24",
       "exit_code": 0,
-      "duration_sec": 20.6,
-      "cpu_pct": 122.9,
-      "peak_memory_mb": 1639,
-      "disk_read_mb": 68,
+      "duration_sec": 202,
+      "cpu_pct": 396.9,
+      "peak_memory_mb": 8602,
+      "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 21:46:44",
+      "submit": "2025-05-15 11:08:24",
       "exit_code": 0,
-      "duration_sec": 50.6,
-      "cpu_pct": 106.7,
-      "peak_memory_mb": 1844,
-      "disk_read_mb": 122,
-      "disk_write_mb": 2
+      "duration_sec": 432,
+      "cpu_pct": 106.9,
+      "peak_memory_mb": 9626,
+      "disk_read_mb": 4916,
+      "disk_write_mb": 4
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 21:46:44",
+      "submit": "2025-05-15 11:08:24",
       "exit_code": 0,
-      "duration_sec": 14.6,
-      "cpu_pct": 1076.6,
-      "peak_memory_mb": 1536,
-      "disk_read_mb": 62,
+      "duration_sec": 1634,
+      "cpu_pct": 2856.4,
+      "peak_memory_mb": 8500,
+      "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-03-19 21:43:34",
+      "submit": "2025-05-15 11:08:04",
       "exit_code": 0,
-      "duration_sec": 20.6,
-      "cpu_pct": 123,
-      "peak_memory_mb": 1639,
-      "disk_read_mb": 68,
+      "duration_sec": 204,
+      "cpu_pct": 386.4,
+      "peak_memory_mb": 8602,
+      "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-03-19 21:43:34",
+      "submit": "2025-05-15 11:08:04",
       "exit_code": 0,
-      "duration_sec": 51.8,
-      "cpu_pct": 105.8,
-      "peak_memory_mb": 1844,
-      "disk_read_mb": 122,
-      "disk_write_mb": 2
+      "duration_sec": 488,
+      "cpu_pct": 104.9,
+      "peak_memory_mb": 9626,
+      "disk_read_mb": 4916,
+      "disk_write_mb": 4
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-03-19 21:43:34",
+      "submit": "2025-05-15 11:08:04",
       "exit_code": 0,
-      "duration_sec": 172,
-      "cpu_pct": 1822.8,
-      "peak_memory_mb": 5632,
-      "disk_read_mb": 62,
+      "duration_sec": 2506,
+      "cpu_pct": 2001.9,
+      "peak_memory_mb": 8500,
+      "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
   }
diff --git a/results/cyto_batch_integration/data/metric_info.json b/results/cyto_batch_integration/data/metric_info.json
index 050023bd..28835c1a 100644
--- a/results/cyto_batch_integration/data/metric_info.json
+++ b/results/cyto_batch_integration/data/metric_info.json
@@ -2,31 +2,61 @@
   {
     "task_id": "metrics",
     "component_name": "emd",
-    "metric_id": "emd_mean",
-    "metric_name": "EMD Mean",
-    "metric_summary": "Mean Earth Mover Distance to compute differences in distribution of marker expressions.",
-    "metric_description": "Earth Mover Distance (EMD) is a metric designed for comparing two distributions.\nIt is also known as the Wasserstein metric.\n",
+    "metric_id": "emd_mean_ct",
+    "metric_name": "EMD Mean CT",
+    "metric_summary": "Mean Earth Mover Distance across cell types and markers.",
+    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample, cell type, and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same cell type, marker, and sample pair.\nThis is then repeated for every cell type, marker, and paired sample.\nFinally, the average of all these EMD values is computed to produce an overall metric score EMD Mean CT.\n\nA high score indicates large overall differences in the distributions of marker expressions \nbetween the paired samples, suggesting poor batch integration.\nA low score indicates small differences in marker expression distributions between batches, \nsuggesting good batch integration.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d",
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
     "maximize": false
   },
   {
     "task_id": "metrics",
     "component_name": "emd",
-    "metric_id": "emd_max",
-    "metric_name": "EMD Max",
-    "metric_summary": "Max Earth Mover Distance to compute differences in distribution of marker expressions.",
-    "metric_description": "Earth Mover Distance (EMD) is a metric designed for comparing two distributions.\nIt is also known as the Wasserstein metric.\n",
+    "metric_id": "emd_max_ct",
+    "metric_name": "EMD Max CT",
+    "metric_summary": "Max Earth Mover Distance across cell types and markers.",
+    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample, cell type, and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same cell type, marker, and sample pair.\nThis is then repeated for every cell type, marker, and paired sample.\nFinally, the maximum of all these EMD values is computed as EMD Max CT.\n\nThe EMD Max CT score reflects the largest difference in marker expression distributions across all cell types, \nmarkers, and paired samples.\nA high score indicates that at least one marker, cell type, or sample pair has a large difference in \ndistribution after batch integration.\nA low score means that even the most poorly corrected marker expression is well integrated across batches.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d",
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
+    "maximize": false
+  },
+  {
+    "task_id": "metrics",
+    "component_name": "emd",
+    "metric_id": "emd_mean_global",
+    "metric_name": "EMD Mean Global",
+    "metric_summary": "Mean Earth Mover Distance across samples and markers.",
+    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same marker and sample pair.\nThis is then repeated for every marker and paired sample.\nFinally, the average of all these EMD values is computed to produce an overall metric score EMD Mean Global.\n\nA high score indicates large overall differences in the distributions of marker expressions \nbetween the paired samples, suggesting poor batch integration.\nA low score indicates small differences in marker expression distributions between batches, \nsuggesting good batch integration.\n",
+    "references_doi": "10.1023/A:1026543900054",
+    "references_bibtex": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/emd",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
+    "code_version": "build_main",
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
+    "maximize": false
+  },
+  {
+    "task_id": "metrics",
+    "component_name": "emd",
+    "metric_id": "emd_max_global",
+    "metric_name": "EMD Max Global",
+    "metric_summary": "Max Earth Mover Distance across donors and markers.",
+    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same marker and sample pair.\nThis is then repeated for every marker and paired sample.\nFinally, the maximum of all these EMD values is computed as EMD Max Global.\n\nThe EMD Max Global score reflects the largest difference in marker expression distributions \nacross all markers and paired samples.\nA high score indicates that at least one marker in a given sample pair has a large difference in \ndistribution after batch integration.\nA low score means that even the most poorly corrected marker expression is well integrated across batches.\n",
+    "references_doi": "10.1023/A:1026543900054",
+    "references_bibtex": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/emd",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
+    "code_version": "build_main",
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
     "maximize": false
   },
   {
@@ -38,10 +68,10 @@
     "metric_description": "The metric compares the number of marker expression peaks between the validation and batch-normalized data. \nThe number of peaks is calculated using the `scipy.signal.find_peaks` function. \nThe metric is calculated as the absolute difference between the number of peaks in the validation and batch-normalized data.\nThe marker expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),\nand then peaks are then identified using the `scipy.signal.find_peaks` function.\nFor peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density.\n",
     "references_doi": "10.1038/s41592-019-0686-2",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/metrics/n_inconsistent_peaks",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/n_inconsistent_peaks",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/n_inconsistent_peaks:build_main",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d",
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
     "maximize": false
   },
   {
@@ -53,10 +83,10 @@
     "metric_description": "The metric compares the number of cell type specific marker expression peaks between the validation and batch-normalized data. \nThe number of peaks is calculated using the `scipy.signal.find_peaks` function. \nThe metric is calculated as the absolute difference between the number of peaks in the validation and batch-normalized data.\nThe (cell type) marker expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),\nand then peaks are then identified using the `scipy.signal.find_peaks` function.\nFor peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density.\n",
     "references_doi": "10.1038/s41592-019-0686-2",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/metrics/n_inconsistent_peaks",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/n_inconsistent_peaks",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/n_inconsistent_peaks:build_main",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d",
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
     "maximize": false
   },
   {
@@ -68,10 +98,10 @@
     "metric_description": "First, a simple linear model `sklearn.linear_model.LinearRegression` is fitted for each paired sample and marker to determine the fraction of variance (R^2) explained by the batch covariate B. |\nThe average batch R_squared is then computed as the average of the $R^2$ values across all paired samples, markers. |\nAs a result, $\\overline{R^2_B}_{global}$ quantifies how much of the total variability in the data is driven by batch effects. Consequently, lower values are desirable. |\n\n$\\overline{R^2_B}_{global} = \\frac{1}{N*M}\\sum_{\\substack{(x_{\\mathrm{int}},\\,x_{\\mathrm{val}})\\\\ \\text{paired samples}}}^{N} \\sum_{i=1}^{M} \\,R^2\\!\\bigl(\\mathrm{marker}_i \\mid B\\bigr)$\n\nWhere:\n- $N$ is the number of paired samples, where x_{\\mathrm{int}} is the replicate that has been batch-corrected and x_{\\mathrm{val}} is replicate used for validation. Paired samples belong to different batches.\n- $M$ is the number of markers\n- $B$ is the batch covariate\n\nA higher value of $\\overline{R^2_B}_{global}$ indicates that the batch variable explains more of the variance in the data, which indicates a higher level of batch effects. |\n",
     "references_doi": null,
     "references_bibtex": "@book{draper1998applied,\ntitle={Applied regression analysis},\nauthor={Draper, Norman R and Smith, Harry},\npublisher={John Wiley \\& Sons}\n}\n",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/metrics/average_batch_r2",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/average_batch_r2",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/average_batch_r2:build_main",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d",
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
     "maximize": false
   },
   {
@@ -83,10 +113,10 @@
     "metric_description": "First, a simple linear model `sklearn.linear_model.LinearRegression` is fitted for each paired sample, marker and cell type to determine the fraction of variance (R^2) explained by the batch covariate B. |\nThe average batch R_squared is then computed as the average of the $R^2$ values across all paired samples, markers and cell types. |\nAs a result, $\\overline{R^2_B}_{cell\\ type}$ quantifies how much of the total variability in the data is driven by batch effects. Consequently, lower values are desirable. |\n\n$\\overline{R^2_B}_{cell\\ type} = \\frac{1}{N*C*M}\\sum_{\\substack{(x_{\\mathrm{int}},\\,x_{\\mathrm{val}})\\\\ \\text{paired samples}}}^{N} \\sum_{j=1}^{C} \\sum_{i=1}^{M}\\,R^2\\!\\bigl(\\mathrm{marker}_i \\mid B\\bigr)$\n\nWhere:\n- $N$ is the number of paired samples, where x_{\\mathrm{int}} is the replicate that has been batch-corrected and x_{\\mathrm{val}} is replicate used for validation. Paired samples belong to different batches.\n- $C$ is the number of cell types\n- $M$ is the number of markers\n- $B$ is the batch covariate\n\nThe $\\overline{Rˆ2_B}_{global}$ is a variation of the latter metric, where the average is computed across paired samples and markers only, without taking into account the cell types. |\n\nA higher value of $\\overline{R^2_B}_{global}$ or $\\overline{R^2_B}_{cell\\ type}$ indicates that the batch variable explains more of the variance in the data, which indicates a higher level of batch effects. |\n\nA good performance on $\\overline{R^2_B}_{global}$ but not on $\\overline{R^2_B}_{cell\\ type}$ might indicate that the batch effect correction is discarding cell type specific batch effects. |\n",
     "references_doi": null,
     "references_bibtex": "@book{draper1998applied,\ntitle={Applied regression analysis},\nauthor={Draper, Norman R and Smith, Harry},\npublisher={John Wiley \\& Sons}\n}\n",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/24589bd4d6b6318aa15c07ef5214c2ff52204f5d/src/metrics/average_batch_r2",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/average_batch_r2",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/average_batch_r2:build_main",
     "code_version": "build_main",
-    "commit_sha": "24589bd4d6b6318aa15c07ef5214c2ff52204f5d",
+    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
     "maximize": false
   }
 ]
diff --git a/results/cyto_batch_integration/data/quality_control.json b/results/cyto_batch_integration/data/quality_control.json
index 204d3c06..ae72f161 100644
--- a/results/cyto_batch_integration/data/quality_control.json
+++ b/results/cyto_batch_integration/data/quality_control.json
@@ -93,7 +93,7 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Method info", 
         "name": "Pct 'paper_reference' missing", 
-        "value": 0.4444444444444444, 
+        "value": 0.5454545454545454, 
         "severity": 2, 
         "severity_value": 3.0, 
         "code": "percent_missing(method_info, field)", 
@@ -243,31 +243,51 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw data", 
         "name": "Number of results", 
-        "value": 9, 
+        "value": 11, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "len(results) == len(method_info) * len(metric_info) * len(dataset_info)", 
-        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: task_cyto_batch_integration\n  Number of results: 9\n  Number of methods: 9\n  Number of metrics: 6\n  Number of datasets: 1\n"
+        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: task_cyto_batch_integration\n  Number of results: 11\n  Number of methods: 11\n  Number of metrics: 8\n  Number of datasets: 1\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
-        "name": "Metric 'emd_mean' %missing", 
+        "name": "Metric 'emd_mean_ct' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_mean\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_mean_ct\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
-        "name": "Metric 'emd_max' %missing", 
+        "name": "Metric 'emd_max_ct' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_max\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_max_ct\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Metric 'emd_mean_global' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_mean_global\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Metric 'emd_max_global' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_max_global\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -402,372 +422,912 @@
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
-        "name": "Dataset 'cyto_spleen_subset' %missing", 
+        "name": "Method 'gaussnorm' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: gaussnorm\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Method 'cytonorm_controls' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: cytonorm_controls\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Dataset 'leomazzi_cyto_spleen' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  dataset id: cyto_spleen_subset\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  dataset id: leomazzi_cyto_spleen\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration emd_mean", 
-        "value": 0.0734, 
+        "name": "Worst score shuffle_integration emd_mean_ct", 
+        "value": 0.0241, 
         "severity": 0, 
-        "severity_value": -0.0734, 
+        "severity_value": -0.0241, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean\n  Worst score: 0.0734%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_ct\n  Worst score: 0.0241%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration emd_mean", 
-        "value": 0.0734, 
+        "name": "Best score shuffle_integration emd_mean_ct", 
+        "value": 0.0241, 
         "severity": 0, 
-        "severity_value": 0.0367, 
+        "severity_value": 0.01205, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean\n  Best score: 0.0734%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_ct\n  Best score: 0.0241%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_batch emd_mean", 
+        "name": "Worst score shuffle_integration_by_batch emd_mean_ct", 
         "value": 0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean\n  Worst score: 0%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_ct\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_batch emd_mean", 
+        "name": "Best score shuffle_integration_by_batch emd_mean_ct", 
         "value": 0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean\n  Best score: 0%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_ct\n  Best score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_cell_type emd_mean", 
-        "value": 0.5949, 
+        "name": "Worst score shuffle_integration_by_cell_type emd_mean_ct", 
+        "value": 0.779, 
         "severity": 0, 
-        "severity_value": -0.5949, 
+        "severity_value": -0.779, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Worst score: 0.5949%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_ct\n  Worst score: 0.779%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_cell_type emd_mean", 
-        "value": 0.5949, 
+        "name": "Best score shuffle_integration_by_cell_type emd_mean_ct", 
+        "value": 0.779, 
         "severity": 0, 
-        "severity_value": 0.29745, 
+        "severity_value": 0.3895, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean\n  Best score: 0.5949%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_ct\n  Best score: 0.779%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score harmonypy emd_mean", 
-        "value": 0.5774, 
+        "name": "Worst score harmonypy emd_mean_ct", 
+        "value": 0.7864, 
         "severity": 0, 
-        "severity_value": -0.5774, 
+        "severity_value": -0.7864, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean\n  Worst score: 0.5774%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_ct\n  Worst score: 0.7864%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score harmonypy emd_mean", 
-        "value": 0.5774, 
+        "name": "Best score harmonypy emd_mean_ct", 
+        "value": 0.7864, 
         "severity": 0, 
-        "severity_value": 0.2887, 
+        "severity_value": 0.3932, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean\n  Best score: 0.5774%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_ct\n  Best score: 0.7864%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score limma_remove_batch_effect emd_mean", 
-        "value": 0.5541, 
+        "name": "Worst score limma_remove_batch_effect emd_mean_ct", 
+        "value": 0.7724, 
         "severity": 0, 
-        "severity_value": -0.5541, 
+        "severity_value": -0.7724, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean\n  Worst score: 0.5541%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_ct\n  Worst score: 0.7724%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score limma_remove_batch_effect emd_mean", 
-        "value": 0.5541, 
+        "name": "Best score limma_remove_batch_effect emd_mean_ct", 
+        "value": 0.7724, 
         "severity": 0, 
-        "severity_value": 0.27705, 
+        "severity_value": 0.3862, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean\n  Best score: 0.5541%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_ct\n  Best score: 0.7724%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score no_integration emd_mean", 
-        "value": 0.5025, 
+        "name": "Worst score no_integration emd_mean_ct", 
+        "value": 0.7454, 
         "severity": 0, 
-        "severity_value": -0.5025, 
+        "severity_value": -0.7454, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean\n  Worst score: 0.5025%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_ct\n  Worst score: 0.7454%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score no_integration emd_mean", 
-        "value": 0.5025, 
+        "name": "Best score no_integration emd_mean_ct", 
+        "value": 0.7454, 
         "severity": 0, 
-        "severity_value": 0.25125, 
+        "severity_value": 0.3727, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean\n  Best score: 0.5025%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_ct\n  Best score: 0.7454%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score perfect_integration emd_mean", 
+        "name": "Worst score perfect_integration emd_mean_ct", 
         "value": 1, 
         "severity": 0, 
         "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean\n  Worst score: 1%\n"
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean_ct\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score perfect_integration emd_mean", 
+        "name": "Best score perfect_integration emd_mean_ct", 
         "value": 1, 
         "severity": 0, 
         "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean\n  Best score: 1%\n"
+        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean_ct\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat emd_mean_ct", 
+        "value": 0.7767, 
+        "severity": 0, 
+        "severity_value": -0.7767, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_ct\n  Worst score: 0.7767%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat emd_mean_ct", 
+        "value": 0.7767, 
+        "severity": 0, 
+        "severity_value": 0.38835, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_ct\n  Best score: 0.7767%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score combat emd_mean", 
-        "value": 0.5582, 
+        "name": "Worst score cycombine_nocontrols emd_mean_ct", 
+        "value": -0.945, 
         "severity": 0, 
-        "severity_value": -0.5582, 
+        "severity_value": 0.945, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean\n  Worst score: 0.5582%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_ct\n  Worst score: -0.945%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score combat emd_mean", 
-        "value": 0.5582, 
+        "name": "Best score cycombine_nocontrols emd_mean_ct", 
+        "value": -0.945, 
         "severity": 0, 
-        "severity_value": 0.2791, 
+        "severity_value": -0.4725, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean\n  Best score: 0.5582%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_ct\n  Best score: -0.945%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cycombine_nocontrols emd_mean", 
-        "value": 0.5691, 
+        "name": "Worst score gaussnorm emd_mean_ct", 
+        "value": 0.7424, 
         "severity": 0, 
-        "severity_value": -0.5691, 
+        "severity_value": -0.7424, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean\n  Worst score: 0.5691%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_ct\n  Worst score: 0.7424%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cycombine_nocontrols emd_mean", 
-        "value": 0.5691, 
+        "name": "Best score gaussnorm emd_mean_ct", 
+        "value": 0.7424, 
         "severity": 0, 
-        "severity_value": 0.28455, 
+        "severity_value": 0.3712, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean\n  Best score: 0.5691%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_ct\n  Best score: 0.7424%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration emd_max", 
-        "value": 0.0316, 
+        "name": "Worst score cytonorm_controls emd_mean_ct", 
+        "value": 0.8328, 
         "severity": 0, 
-        "severity_value": -0.0316, 
+        "severity_value": -0.8328, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max\n  Worst score: 0.0316%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_ct\n  Worst score: 0.8328%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration emd_max", 
-        "value": 0.0316, 
+        "name": "Best score cytonorm_controls emd_mean_ct", 
+        "value": 0.8328, 
         "severity": 0, 
-        "severity_value": 0.0158, 
+        "severity_value": 0.4164, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max\n  Best score: 0.0316%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_ct\n  Best score: 0.8328%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_batch emd_max", 
-        "value": 0.1293, 
+        "name": "Worst score shuffle_integration emd_max_ct", 
+        "value": 0.0451, 
         "severity": 0, 
-        "severity_value": -0.1293, 
+        "severity_value": -0.0451, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max\n  Worst score: 0.1293%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_ct\n  Worst score: 0.0451%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_batch emd_max", 
-        "value": 0.1293, 
+        "name": "Best score shuffle_integration emd_max_ct", 
+        "value": 0.0451, 
         "severity": 0, 
-        "severity_value": 0.06465, 
+        "severity_value": 0.02255, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max\n  Best score: 0.1293%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_ct\n  Best score: 0.0451%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_cell_type emd_max", 
-        "value": 0.2276, 
+        "name": "Worst score shuffle_integration_by_batch emd_max_ct", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": -0.2276, 
+        "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Worst score: 0.2276%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_ct\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_cell_type emd_max", 
-        "value": 0.2276, 
+        "name": "Best score shuffle_integration_by_batch emd_max_ct", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": 0.1138, 
+        "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max\n  Best score: 0.2276%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_ct\n  Best score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score harmonypy emd_max", 
-        "value": 0.2317, 
+        "name": "Worst score shuffle_integration_by_cell_type emd_max_ct", 
+        "value": 0.5406, 
         "severity": 0, 
-        "severity_value": -0.2317, 
+        "severity_value": -0.5406, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max\n  Worst score: 0.2317%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_ct\n  Worst score: 0.5406%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score harmonypy emd_max", 
-        "value": 0.2317, 
+        "name": "Best score shuffle_integration_by_cell_type emd_max_ct", 
+        "value": 0.5406, 
         "severity": 0, 
-        "severity_value": 0.11585, 
+        "severity_value": 0.2703, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max\n  Best score: 0.2317%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_ct\n  Best score: 0.5406%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score limma_remove_batch_effect emd_max", 
-        "value": 0.0315, 
+        "name": "Worst score harmonypy emd_max_ct", 
+        "value": 0.5602, 
         "severity": 0, 
-        "severity_value": -0.0315, 
+        "severity_value": -0.5602, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max\n  Worst score: 0.0315%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_ct\n  Worst score: 0.5602%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score limma_remove_batch_effect emd_max", 
-        "value": 0.0315, 
+        "name": "Best score harmonypy emd_max_ct", 
+        "value": 0.5602, 
         "severity": 0, 
-        "severity_value": 0.01575, 
+        "severity_value": 0.2801, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max\n  Best score: 0.0315%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_ct\n  Best score: 0.5602%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect emd_max_ct", 
+        "value": 0.5541, 
+        "severity": 0, 
+        "severity_value": -0.5541, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_ct\n  Worst score: 0.5541%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score no_integration emd_max", 
+        "name": "Best score limma_remove_batch_effect emd_max_ct", 
+        "value": 0.5541, 
+        "severity": 0, 
+        "severity_value": 0.27705, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_ct\n  Best score: 0.5541%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration emd_max_ct", 
+        "value": 0.5386, 
+        "severity": 0, 
+        "severity_value": -0.5386, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_ct\n  Worst score: 0.5386%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration emd_max_ct", 
+        "value": 0.5386, 
+        "severity": 0, 
+        "severity_value": 0.2693, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_ct\n  Best score: 0.5386%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration emd_max_ct", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": -1.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max_ct\n  Worst score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration emd_max_ct", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max_ct\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat emd_max_ct", 
+        "value": 0.5453, 
+        "severity": 0, 
+        "severity_value": -0.5453, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_ct\n  Worst score: 0.5453%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat emd_max_ct", 
+        "value": 0.5453, 
+        "severity": 0, 
+        "severity_value": 0.27265, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_ct\n  Best score: 0.5453%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cycombine_nocontrols emd_max_ct", 
+        "value": -0.1133, 
+        "severity": 0, 
+        "severity_value": 0.1133, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_ct\n  Worst score: -0.1133%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cycombine_nocontrols emd_max_ct", 
+        "value": -0.1133, 
+        "severity": 0, 
+        "severity_value": -0.05665, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_ct\n  Best score: -0.1133%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score gaussnorm emd_max_ct", 
+        "value": 0.5412, 
+        "severity": 0, 
+        "severity_value": -0.5412, 
+        "code": "worst_score >= -1", 
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_ct\n  Worst score: 0.5412%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score gaussnorm emd_max_ct", 
+        "value": 0.5412, 
+        "severity": 0, 
+        "severity_value": 0.2706, 
+        "code": "best_score <= 2", 
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_ct\n  Best score: 0.5412%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cytonorm_controls emd_max_ct", 
+        "value": 0.6823, 
+        "severity": 0, 
+        "severity_value": -0.6823, 
+        "code": "worst_score >= -1", 
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_ct\n  Worst score: 0.6823%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cytonorm_controls emd_max_ct", 
+        "value": 0.6823, 
+        "severity": 0, 
+        "severity_value": 0.34115, 
+        "code": "best_score <= 2", 
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_ct\n  Best score: 0.6823%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration emd_mean_global", 
+        "value": 0.1994, 
+        "severity": 0, 
+        "severity_value": -0.1994, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_global\n  Worst score: 0.1994%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration emd_mean_global", 
+        "value": 0.1994, 
+        "severity": 0, 
+        "severity_value": 0.0997, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_global\n  Best score: 0.1994%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_batch emd_mean_global", 
         "value": 0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max\n  Worst score: 0%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_global\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score no_integration emd_max", 
+        "name": "Best score shuffle_integration_by_batch emd_mean_global", 
         "value": 0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max\n  Best score: 0%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_global\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_cell_type emd_mean_global", 
+        "value": 0.5174, 
+        "severity": 0, 
+        "severity_value": -0.5174, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_global\n  Worst score: 0.5174%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_cell_type emd_mean_global", 
+        "value": 0.5174, 
+        "severity": 0, 
+        "severity_value": 0.2587, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_global\n  Best score: 0.5174%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score harmonypy emd_mean_global", 
+        "value": 0.5999, 
+        "severity": 0, 
+        "severity_value": -0.5999, 
+        "code": "worst_score >= -1", 
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_global\n  Worst score: 0.5999%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score harmonypy emd_mean_global", 
+        "value": 0.5999, 
+        "severity": 0, 
+        "severity_value": 0.29995, 
+        "code": "best_score <= 2", 
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_global\n  Best score: 0.5999%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect emd_mean_global", 
+        "value": 0.5893, 
+        "severity": 0, 
+        "severity_value": -0.5893, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_global\n  Worst score: 0.5893%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score limma_remove_batch_effect emd_mean_global", 
+        "value": 0.5893, 
+        "severity": 0, 
+        "severity_value": 0.29465, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_global\n  Best score: 0.5893%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration emd_mean_global", 
+        "value": 0.3614, 
+        "severity": 0, 
+        "severity_value": -0.3614, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_global\n  Worst score: 0.3614%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration emd_mean_global", 
+        "value": 0.3614, 
+        "severity": 0, 
+        "severity_value": 0.1807, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_global\n  Best score: 0.3614%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score perfect_integration emd_max", 
+        "name": "Worst score perfect_integration emd_mean_global", 
         "value": 1, 
         "severity": 0, 
         "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max\n  Worst score: 1%\n"
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean_global\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score perfect_integration emd_max", 
+        "name": "Best score perfect_integration emd_mean_global", 
         "value": 1, 
         "severity": 0, 
         "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max\n  Best score: 1%\n"
+        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean_global\n  Best score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score combat emd_max", 
-        "value": 0.0744, 
+        "name": "Worst score combat emd_mean_global", 
+        "value": 0.6014, 
         "severity": 0, 
-        "severity_value": -0.0744, 
+        "severity_value": -0.6014, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max\n  Worst score: 0.0744%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_global\n  Worst score: 0.6014%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score combat emd_max", 
-        "value": 0.0744, 
+        "name": "Best score combat emd_mean_global", 
+        "value": 0.6014, 
         "severity": 0, 
-        "severity_value": 0.0372, 
+        "severity_value": 0.3007, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max\n  Best score: 0.0744%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_global\n  Best score: 0.6014%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cycombine_nocontrols emd_max", 
+        "name": "Worst score cycombine_nocontrols emd_mean_global", 
+        "value": -5.9154, 
+        "severity": 3, 
+        "severity_value": 5.9154, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_global\n  Worst score: -5.9154%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cycombine_nocontrols emd_mean_global", 
+        "value": -5.9154, 
+        "severity": 0, 
+        "severity_value": -2.9577, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_global\n  Best score: -5.9154%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score gaussnorm emd_mean_global", 
+        "value": 0.457, 
+        "severity": 0, 
+        "severity_value": -0.457, 
+        "code": "worst_score >= -1", 
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_global\n  Worst score: 0.457%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score gaussnorm emd_mean_global", 
+        "value": 0.457, 
+        "severity": 0, 
+        "severity_value": 0.2285, 
+        "code": "best_score <= 2", 
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_global\n  Best score: 0.457%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cytonorm_controls emd_mean_global", 
+        "value": 0.6441, 
+        "severity": 0, 
+        "severity_value": -0.6441, 
+        "code": "worst_score >= -1", 
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_global\n  Worst score: 0.6441%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cytonorm_controls emd_mean_global", 
+        "value": 0.6441, 
+        "severity": 0, 
+        "severity_value": 0.32205, 
+        "code": "best_score <= 2", 
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_global\n  Best score: 0.6441%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration emd_max_global", 
+        "value": 0.1301, 
+        "severity": 0, 
+        "severity_value": -0.1301, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_global\n  Worst score: 0.1301%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration emd_max_global", 
+        "value": 0.1301, 
+        "severity": 0, 
+        "severity_value": 0.06505, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_global\n  Best score: 0.1301%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_batch emd_max_global", 
         "value": 0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max\n  Worst score: 0%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_global\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cycombine_nocontrols emd_max", 
+        "name": "Best score shuffle_integration_by_batch emd_max_global", 
         "value": 0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max\n  Best score: 0%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_global\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_cell_type emd_max_global", 
+        "value": 0.5869, 
+        "severity": 0, 
+        "severity_value": -0.5869, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_global\n  Worst score: 0.5869%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_cell_type emd_max_global", 
+        "value": 0.5869, 
+        "severity": 0, 
+        "severity_value": 0.29345, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_global\n  Best score: 0.5869%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score harmonypy emd_max_global", 
+        "value": 0.5859, 
+        "severity": 0, 
+        "severity_value": -0.5859, 
+        "code": "worst_score >= -1", 
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_global\n  Worst score: 0.5859%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score harmonypy emd_max_global", 
+        "value": 0.5859, 
+        "severity": 0, 
+        "severity_value": 0.29295, 
+        "code": "best_score <= 2", 
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_global\n  Best score: 0.5859%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect emd_max_global", 
+        "value": 0.5722, 
+        "severity": 0, 
+        "severity_value": -0.5722, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_global\n  Worst score: 0.5722%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score limma_remove_batch_effect emd_max_global", 
+        "value": 0.5722, 
+        "severity": 0, 
+        "severity_value": 0.2861, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_global\n  Best score: 0.5722%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration emd_max_global", 
+        "value": 0.237, 
+        "severity": 0, 
+        "severity_value": -0.237, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_global\n  Worst score: 0.237%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration emd_max_global", 
+        "value": 0.237, 
+        "severity": 0, 
+        "severity_value": 0.1185, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_global\n  Best score: 0.237%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration emd_max_global", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": -1.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max_global\n  Worst score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration emd_max_global", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max_global\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat emd_max_global", 
+        "value": 0.5295, 
+        "severity": 0, 
+        "severity_value": -0.5295, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_global\n  Worst score: 0.5295%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat emd_max_global", 
+        "value": 0.5295, 
+        "severity": 0, 
+        "severity_value": 0.26475, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_global\n  Best score: 0.5295%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cycombine_nocontrols emd_max_global", 
+        "value": -2.5698, 
+        "severity": 2, 
+        "severity_value": 2.5698, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_global\n  Worst score: -2.5698%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cycombine_nocontrols emd_max_global", 
+        "value": -2.5698, 
+        "severity": 0, 
+        "severity_value": -1.2849, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_global\n  Best score: -2.5698%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score gaussnorm emd_max_global", 
+        "value": 0.4733, 
+        "severity": 0, 
+        "severity_value": -0.4733, 
+        "code": "worst_score >= -1", 
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_global\n  Worst score: 0.4733%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score gaussnorm emd_max_global", 
+        "value": 0.4733, 
+        "severity": 0, 
+        "severity_value": 0.23665, 
+        "code": "best_score <= 2", 
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_global\n  Best score: 0.4733%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cytonorm_controls emd_max_global", 
+        "value": 0.6241, 
+        "severity": 0, 
+        "severity_value": -0.6241, 
+        "code": "worst_score >= -1", 
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_global\n  Worst score: 0.6241%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cytonorm_controls emd_max_global", 
+        "value": 0.6241, 
+        "severity": 0, 
+        "severity_value": 0.31205, 
+        "code": "best_score <= 2", 
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_global\n  Best score: 0.6241%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -813,81 +1373,81 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type n_inconsistent_peaks", 
-        "value": 0, 
+        "value": 0.5, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Worst score: 0%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.5%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type n_inconsistent_peaks", 
-        "value": 0, 
+        "value": 0.5, 
         "severity": 0, 
-        "severity_value": 0.0, 
+        "severity_value": 0.25, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Best score: 0%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Best score: 0.5%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy n_inconsistent_peaks", 
-        "value": 0.3684, 
+        "value": 0.75, 
         "severity": 0, 
-        "severity_value": -0.3684, 
+        "severity_value": -0.75, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.3684%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy n_inconsistent_peaks", 
-        "value": 0.3684, 
+        "value": 0.75, 
         "severity": 0, 
-        "severity_value": 0.1842, 
+        "severity_value": 0.375, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Best score: 0.3684%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect n_inconsistent_peaks", 
-        "value": 0.2105, 
+        "value": 0.75, 
         "severity": 0, 
-        "severity_value": -0.2105, 
+        "severity_value": -0.75, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.2105%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect n_inconsistent_peaks", 
-        "value": 0.2105, 
+        "value": 0.75, 
         "severity": 0, 
-        "severity_value": 0.10525, 
+        "severity_value": 0.375, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Best score: 0.2105%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration n_inconsistent_peaks", 
-        "value": 0.2105, 
+        "value": 0.75, 
         "severity": 0, 
-        "severity_value": -0.2105, 
+        "severity_value": -0.75, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.2105%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration n_inconsistent_peaks", 
-        "value": 0.2105, 
+        "value": 0.75, 
         "severity": 0, 
-        "severity_value": 0.10525, 
+        "severity_value": 0.375, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Best score: 0.2105%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -913,161 +1473,201 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat n_inconsistent_peaks", 
-        "value": 0.1579, 
+        "value": 0.625, 
         "severity": 0, 
-        "severity_value": -0.1579, 
+        "severity_value": -0.625, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.1579%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.625%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat n_inconsistent_peaks", 
-        "value": 0.1579, 
+        "value": 0.625, 
         "severity": 0, 
-        "severity_value": 0.07895, 
+        "severity_value": 0.3125, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Best score: 0.1579%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Best score: 0.625%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols n_inconsistent_peaks", 
-        "value": 0.2105, 
+        "value": 0.75, 
         "severity": 0, 
-        "severity_value": -0.2105, 
+        "severity_value": -0.75, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.2105%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols n_inconsistent_peaks", 
-        "value": 0.2105, 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": 0.375, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score gaussnorm n_inconsistent_peaks", 
+        "value": 0.625, 
         "severity": 0, 
-        "severity_value": 0.10525, 
+        "severity_value": -0.625, 
+        "code": "worst_score >= -1", 
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.625%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score gaussnorm n_inconsistent_peaks", 
+        "value": 0.625, 
+        "severity": 0, 
+        "severity_value": 0.3125, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Best score: 0.2105%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks\n  Best score: 0.625%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cytonorm_controls n_inconsistent_peaks", 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": -0.75, 
+        "code": "worst_score >= -1", 
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cytonorm_controls n_inconsistent_peaks", 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": 0.375, 
+        "code": "best_score <= 2", 
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration n_inconsistent_peaks_ct", 
-        "value": 0, 
+        "value": 0.0272, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.0272, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0272%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration n_inconsistent_peaks_ct", 
-        "value": 0, 
+        "value": 0.0272, 
         "severity": 0, 
-        "severity_value": 0.0, 
+        "severity_value": 0.0136, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.0272%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_batch n_inconsistent_peaks_ct", 
-        "value": 0.3, 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": -0.3, 
+        "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.3%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_batch n_inconsistent_peaks_ct", 
-        "value": 0.3, 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": 0.15, 
+        "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.3%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type n_inconsistent_peaks_ct", 
-        "value": 0.4667, 
+        "value": 0.7687, 
         "severity": 0, 
-        "severity_value": -0.4667, 
+        "severity_value": -0.7687, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.4667%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.7687%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type n_inconsistent_peaks_ct", 
-        "value": 0.4667, 
+        "value": 0.7687, 
         "severity": 0, 
-        "severity_value": 0.23335, 
+        "severity_value": 0.38435, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.4667%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.7687%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy n_inconsistent_peaks_ct", 
-        "value": 0.6, 
+        "value": 0.8741, 
         "severity": 0, 
-        "severity_value": -0.6, 
+        "severity_value": -0.8741, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.6%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8741%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy n_inconsistent_peaks_ct", 
-        "value": 0.6, 
+        "value": 0.8741, 
         "severity": 0, 
-        "severity_value": 0.3, 
+        "severity_value": 0.43705, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.6%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8741%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect n_inconsistent_peaks_ct", 
-        "value": 0.5833, 
+        "value": 0.8707, 
         "severity": 0, 
-        "severity_value": -0.5833, 
+        "severity_value": -0.8707, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.5833%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8707%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect n_inconsistent_peaks_ct", 
-        "value": 0.5833, 
+        "value": 0.8707, 
         "severity": 0, 
-        "severity_value": 0.29165, 
+        "severity_value": 0.43535, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5833%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8707%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration n_inconsistent_peaks_ct", 
-        "value": 0.5833, 
+        "value": 0.8707, 
         "severity": 0, 
-        "severity_value": -0.5833, 
+        "severity_value": -0.8707, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.5833%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8707%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration n_inconsistent_peaks_ct", 
-        "value": 0.5833, 
+        "value": 0.8707, 
         "severity": 0, 
-        "severity_value": 0.29165, 
+        "severity_value": 0.43535, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5833%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8707%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1093,61 +1693,101 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat n_inconsistent_peaks_ct", 
-        "value": 0.5667, 
+        "value": 0.8673, 
         "severity": 0, 
-        "severity_value": -0.5667, 
+        "severity_value": -0.8673, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.5667%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8673%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat n_inconsistent_peaks_ct", 
-        "value": 0.5667, 
+        "value": 0.8673, 
         "severity": 0, 
-        "severity_value": 0.28335, 
+        "severity_value": 0.43365, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5667%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8673%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols n_inconsistent_peaks_ct", 
-        "value": 0.5667, 
+        "value": 0.8265, 
         "severity": 0, 
-        "severity_value": -0.5667, 
+        "severity_value": -0.8265, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.5667%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8265%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols n_inconsistent_peaks_ct", 
-        "value": 0.5667, 
+        "value": 0.8265, 
+        "severity": 0, 
+        "severity_value": 0.41325, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8265%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score gaussnorm n_inconsistent_peaks_ct", 
+        "value": 0.8844, 
+        "severity": 0, 
+        "severity_value": -0.8844, 
+        "code": "worst_score >= -1", 
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8844%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score gaussnorm n_inconsistent_peaks_ct", 
+        "value": 0.8844, 
+        "severity": 0, 
+        "severity_value": 0.4422, 
+        "code": "best_score <= 2", 
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8844%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cytonorm_controls n_inconsistent_peaks_ct", 
+        "value": 0.881, 
         "severity": 0, 
-        "severity_value": 0.28335, 
+        "severity_value": -0.881, 
+        "code": "worst_score >= -1", 
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.881%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cytonorm_controls n_inconsistent_peaks_ct", 
+        "value": 0.881, 
+        "severity": 0, 
+        "severity_value": 0.4405, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.5667%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.881%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration average_batch_r2_global", 
-        "value": 0.5577, 
+        "value": 0.5227, 
         "severity": 0, 
-        "severity_value": -0.5577, 
+        "severity_value": -0.5227, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.5577%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.5227%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration average_batch_r2_global", 
-        "value": 0.5577, 
+        "value": 0.5227, 
         "severity": 0, 
-        "severity_value": 0.27885, 
+        "severity_value": 0.26135, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.5577%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.5227%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1173,81 +1813,81 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type average_batch_r2_global", 
-        "value": 0.7038, 
+        "value": 0.7149, 
         "severity": 0, 
-        "severity_value": -0.7038, 
+        "severity_value": -0.7149, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Worst score: 0.7038%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Worst score: 0.7149%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type average_batch_r2_global", 
-        "value": 0.7038, 
+        "value": 0.7149, 
         "severity": 0, 
-        "severity_value": 0.3519, 
+        "severity_value": 0.35745, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Best score: 0.7038%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Best score: 0.7149%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy average_batch_r2_global", 
-        "value": 0.6141, 
+        "value": 0.7588, 
         "severity": 0, 
-        "severity_value": -0.6141, 
+        "severity_value": -0.7588, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Worst score: 0.6141%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Worst score: 0.7588%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy average_batch_r2_global", 
-        "value": 0.6141, 
+        "value": 0.7588, 
         "severity": 0, 
-        "severity_value": 0.30705, 
+        "severity_value": 0.3794, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Best score: 0.6141%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Best score: 0.7588%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect average_batch_r2_global", 
-        "value": 0.6504, 
+        "value": 0.7621, 
         "severity": 0, 
-        "severity_value": -0.6504, 
+        "severity_value": -0.7621, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Worst score: 0.6504%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Worst score: 0.7621%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect average_batch_r2_global", 
-        "value": 0.6504, 
+        "value": 0.7621, 
         "severity": 0, 
-        "severity_value": 0.3252, 
+        "severity_value": 0.38105, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Best score: 0.6504%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Best score: 0.7621%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration average_batch_r2_global", 
-        "value": 0.1541, 
+        "value": 0.2169, 
         "severity": 0, 
-        "severity_value": -0.1541, 
+        "severity_value": -0.2169, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.1541%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.2169%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration average_batch_r2_global", 
-        "value": 0.1541, 
+        "value": 0.2169, 
         "severity": 0, 
-        "severity_value": 0.07705, 
+        "severity_value": 0.10845, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.1541%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.2169%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1273,61 +1913,101 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat average_batch_r2_global", 
-        "value": 0.6358, 
+        "value": 0.7542, 
         "severity": 0, 
-        "severity_value": -0.6358, 
+        "severity_value": -0.7542, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Worst score: 0.6358%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Worst score: 0.7542%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat average_batch_r2_global", 
-        "value": 0.6358, 
+        "value": 0.7542, 
         "severity": 0, 
-        "severity_value": 0.3179, 
+        "severity_value": 0.3771, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Best score: 0.6358%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Best score: 0.7542%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols average_batch_r2_global", 
-        "value": 0.4123, 
-        "severity": 0, 
-        "severity_value": -0.4123, 
+        "value": -9.7697, 
+        "severity": 3, 
+        "severity_value": 9.7697, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Worst score: 0.4123%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Worst score: -9.7697%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols average_batch_r2_global", 
-        "value": 0.4123, 
+        "value": -9.7697, 
+        "severity": 0, 
+        "severity_value": -4.88485, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Best score: -9.7697%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score gaussnorm average_batch_r2_global", 
+        "value": 0.5404, 
+        "severity": 0, 
+        "severity_value": -0.5404, 
+        "code": "worst_score >= -1", 
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_global\n  Worst score: 0.5404%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score gaussnorm average_batch_r2_global", 
+        "value": 0.5404, 
         "severity": 0, 
-        "severity_value": 0.20615, 
+        "severity_value": 0.2702, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Best score: 0.4123%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_global\n  Best score: 0.5404%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cytonorm_controls average_batch_r2_global", 
+        "value": 0.7639, 
+        "severity": 0, 
+        "severity_value": -0.7639, 
+        "code": "worst_score >= -1", 
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_global\n  Worst score: 0.7639%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cytonorm_controls average_batch_r2_global", 
+        "value": 0.7639, 
+        "severity": 0, 
+        "severity_value": 0.38195, 
+        "code": "best_score <= 2", 
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_global\n  Best score: 0.7639%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration average_batch_r2_ct", 
-        "value": 0.1618, 
+        "value": 0.0627, 
         "severity": 0, 
-        "severity_value": -0.1618, 
+        "severity_value": -0.0627, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.1618%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0627%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration average_batch_r2_ct", 
-        "value": 0.1618, 
+        "value": 0.0627, 
         "severity": 0, 
-        "severity_value": 0.0809, 
+        "severity_value": 0.03135, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.1618%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.0627%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1353,81 +2033,81 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type average_batch_r2_ct", 
-        "value": 0.7318, 
+        "value": 0.8416, 
         "severity": 0, 
-        "severity_value": -0.7318, 
+        "severity_value": -0.8416, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7318%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Worst score: 0.8416%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type average_batch_r2_ct", 
-        "value": 0.7318, 
+        "value": 0.8416, 
         "severity": 0, 
-        "severity_value": 0.3659, 
+        "severity_value": 0.4208, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Best score: 0.7318%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Best score: 0.8416%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy average_batch_r2_ct", 
-        "value": 0.5694, 
+        "value": 0.7975, 
         "severity": 0, 
-        "severity_value": -0.5694, 
+        "severity_value": -0.7975, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Worst score: 0.5694%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7975%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy average_batch_r2_ct", 
-        "value": 0.5694, 
+        "value": 0.7975, 
         "severity": 0, 
-        "severity_value": 0.2847, 
+        "severity_value": 0.39875, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Best score: 0.5694%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Best score: 0.7975%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect average_batch_r2_ct", 
-        "value": 0.4971, 
+        "value": 0.755, 
         "severity": 0, 
-        "severity_value": -0.4971, 
+        "severity_value": -0.755, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Worst score: 0.4971%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Worst score: 0.755%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect average_batch_r2_ct", 
-        "value": 0.4971, 
+        "value": 0.755, 
         "severity": 0, 
-        "severity_value": 0.24855, 
+        "severity_value": 0.3775, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Best score: 0.4971%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Best score: 0.755%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration average_batch_r2_ct", 
-        "value": 0.449, 
+        "value": 0.7073, 
         "severity": 0, 
-        "severity_value": -0.449, 
+        "severity_value": -0.7073, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.449%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7073%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration average_batch_r2_ct", 
-        "value": 0.449, 
+        "value": 0.7073, 
         "severity": 0, 
-        "severity_value": 0.2245, 
+        "severity_value": 0.35365, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.449%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.7073%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1453,40 +2133,80 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat average_batch_r2_ct", 
-        "value": 0.5196, 
+        "value": 0.7595, 
         "severity": 0, 
-        "severity_value": -0.5196, 
+        "severity_value": -0.7595, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Worst score: 0.5196%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7595%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat average_batch_r2_ct", 
-        "value": 0.5196, 
+        "value": 0.7595, 
         "severity": 0, 
-        "severity_value": 0.2598, 
+        "severity_value": 0.37975, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Best score: 0.5196%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Best score: 0.7595%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols average_batch_r2_ct", 
-        "value": 0.5818, 
-        "severity": 0, 
-        "severity_value": -0.5818, 
+        "value": -1.3847, 
+        "severity": 1, 
+        "severity_value": 1.3847, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Worst score: 0.5818%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Worst score: -1.3847%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols average_batch_r2_ct", 
-        "value": 0.5818, 
+        "value": -1.3847, 
+        "severity": 0, 
+        "severity_value": -0.69235, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Best score: -1.3847%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score gaussnorm average_batch_r2_ct", 
+        "value": 0.7243, 
+        "severity": 0, 
+        "severity_value": -0.7243, 
+        "code": "worst_score >= -1", 
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7243%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score gaussnorm average_batch_r2_ct", 
+        "value": 0.7243, 
+        "severity": 0, 
+        "severity_value": 0.36215, 
+        "code": "best_score <= 2", 
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_ct\n  Best score: 0.7243%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cytonorm_controls average_batch_r2_ct", 
+        "value": 0.8645, 
+        "severity": 0, 
+        "severity_value": -0.8645, 
+        "code": "worst_score >= -1", 
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_ct\n  Worst score: 0.8645%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cytonorm_controls average_batch_r2_ct", 
+        "value": 0.8645, 
         "severity": 0, 
-        "severity_value": 0.2909, 
+        "severity_value": 0.43225, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Best score: 0.5818%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_ct\n  Best score: 0.8645%\n"
     }
 ]
\ No newline at end of file
diff --git a/results/cyto_batch_integration/data/results.json b/results/cyto_batch_integration/data/results.json
index a98e407d..c51e6461 100644
--- a/results/cyto_batch_integration/data/results.json
+++ b/results/cyto_batch_integration/data/results.json
@@ -1,272 +1,376 @@
 [
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "combat",
     "metric_values": {
-      "average_batch_r2_ct": 0.1219,
-      "average_batch_r2_global": 0.0268,
-      "emd_max": 32.3545,
-      "emd_mean": 3.1871,
-      "n_inconsistent_peaks": 16,
-      "n_inconsistent_peaks_ct": 26
+      "average_batch_r2_ct": 0.0523,
+      "average_batch_r2_global": 0.008,
+      "emd_max_ct": 1.2193,
+      "emd_max_global": 0.3177,
+      "emd_mean_ct": 0.1348,
+      "emd_mean_global": 0.0749,
+      "n_inconsistent_peaks": 3,
+      "n_inconsistent_peaks_ct": 39
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.5196,
-      "average_batch_r2_global": 0.6358,
-      "emd_max": 0.0744,
-      "emd_mean": 0.5582,
-      "n_inconsistent_peaks": 0.1579,
-      "n_inconsistent_peaks_ct": 0.5667
+      "average_batch_r2_ct": 0.7595,
+      "average_batch_r2_global": 0.7542,
+      "emd_max_ct": 0.5453,
+      "emd_max_global": 0.5295,
+      "emd_mean_ct": 0.7767,
+      "emd_mean_global": 0.6014,
+      "n_inconsistent_peaks": 0.625,
+      "n_inconsistent_peaks_ct": 0.8673
     },
-    "mean_score": 0.4188,
+    "mean_score": 0.6824,
     "resources": {
-      "submit": "2025-03-19 21:42:34",
+      "submit": "2025-05-15 11:06:35",
       "exit_code": 0,
-      "duration_sec": 4.9,
-      "cpu_pct": 136.7,
-      "peak_memory_mb": 1844,
-      "disk_read_mb": 48,
-      "disk_write_mb": 3
+      "duration_sec": 100,
+      "cpu_pct": 238.2,
+      "peak_memory_mb": 10752,
+      "disk_read_mb": 512,
+      "disk_write_mb": 808
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "cycombine_nocontrols",
     "metric_values": {
-      "average_batch_r2_ct": 0.1061,
-      "average_batch_r2_global": 0.0433,
-      "emd_max": 34.9545,
-      "emd_mean": 3.1082,
-      "n_inconsistent_peaks": 15,
-      "n_inconsistent_peaks_ct": 26
+      "average_batch_r2_ct": 0.5186,
+      "average_batch_r2_global": 0.3526,
+      "emd_max_ct": 2.9856,
+      "emd_max_global": 2.4106,
+      "emd_mean_ct": 1.1746,
+      "emd_mean_global": 1.2999,
+      "n_inconsistent_peaks": 2,
+      "n_inconsistent_peaks_ct": 51
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.5818,
-      "average_batch_r2_global": 0.4123,
-      "emd_max": 0,
-      "emd_mean": 0.5691,
-      "n_inconsistent_peaks": 0.2105,
-      "n_inconsistent_peaks_ct": 0.5667
+      "average_batch_r2_ct": -1.3847,
+      "average_batch_r2_global": -9.7697,
+      "emd_max_ct": -0.1133,
+      "emd_max_global": -2.5698,
+      "emd_mean_ct": -0.945,
+      "emd_mean_global": -5.9154,
+      "n_inconsistent_peaks": 0.75,
+      "n_inconsistent_peaks_ct": 0.8265
     },
-    "mean_score": 0.3901,
+    "mean_score": 0.1971,
     "resources": {
-      "submit": "2025-03-19 21:42:34",
+      "submit": "2025-05-15 11:06:35",
       "exit_code": 0,
-      "duration_sec": 14.2,
-      "cpu_pct": 129.1,
-      "peak_memory_mb": 2151,
-      "disk_read_mb": 55,
-      "disk_write_mb": 3
+      "duration_sec": 363,
+      "cpu_pct": 103.3,
+      "peak_memory_mb": 15156,
+      "disk_read_mb": 520,
+      "disk_write_mb": 816
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "cytonorm_controls",
+    "metric_values": {
+      "average_batch_r2_ct": 0.0295,
+      "average_batch_r2_global": 0.0077,
+      "emd_max_ct": 0.8521,
+      "emd_max_global": 0.2538,
+      "emd_mean_ct": 0.101,
+      "emd_mean_global": 0.0669,
+      "n_inconsistent_peaks": 2,
+      "n_inconsistent_peaks_ct": 35
+    },
+    "scaled_scores": {
+      "average_batch_r2_ct": 0.8645,
+      "average_batch_r2_global": 0.7639,
+      "emd_max_ct": 0.6823,
+      "emd_max_global": 0.6241,
+      "emd_mean_ct": 0.8328,
+      "emd_mean_global": 0.6441,
+      "n_inconsistent_peaks": 0.75,
+      "n_inconsistent_peaks_ct": 0.881
+    },
+    "mean_score": 0.7553,
+    "resources": {
+      "submit": "2025-05-15 11:06:35",
+      "exit_code": 0,
+      "duration_sec": 835,
+      "cpu_pct": 101.4,
+      "peak_memory_mb": 13312,
+      "disk_read_mb": 2151,
+      "disk_write_mb": 2356
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "gaussnorm",
+    "metric_values": {
+      "average_batch_r2_ct": 0.06,
+      "average_batch_r2_global": 0.015,
+      "emd_max_ct": 1.2303,
+      "emd_max_global": 0.3556,
+      "emd_mean_ct": 0.1555,
+      "emd_mean_global": 0.1021,
+      "n_inconsistent_peaks": 3,
+      "n_inconsistent_peaks_ct": 34
+    },
+    "scaled_scores": {
+      "average_batch_r2_ct": 0.7243,
+      "average_batch_r2_global": 0.5404,
+      "emd_max_ct": 0.5412,
+      "emd_max_global": 0.4733,
+      "emd_mean_ct": 0.7424,
+      "emd_mean_global": 0.457,
+      "n_inconsistent_peaks": 0.625,
+      "n_inconsistent_peaks_ct": 0.8844
+    },
+    "mean_score": 0.6235,
+    "resources": {
+      "submit": "2025-05-15 11:06:35",
+      "exit_code": 0,
+      "duration_sec": 429,
+      "cpu_pct": 98.7,
+      "peak_memory_mb": 12391,
+      "disk_read_mb": 957,
+      "disk_write_mb": 1127
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "harmonypy",
     "metric_values": {
-      "average_batch_r2_ct": 0.1093,
-      "average_batch_r2_global": 0.0284,
-      "emd_max": 26.8545,
-      "emd_mean": 3.0487,
-      "n_inconsistent_peaks": 12,
-      "n_inconsistent_peaks_ct": 24
+      "average_batch_r2_ct": 0.044,
+      "average_batch_r2_global": 0.0079,
+      "emd_max_ct": 1.1795,
+      "emd_max_global": 0.2796,
+      "emd_mean_ct": 0.129,
+      "emd_mean_global": 0.0752,
+      "n_inconsistent_peaks": 2,
+      "n_inconsistent_peaks_ct": 37
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.5694,
-      "average_batch_r2_global": 0.6141,
-      "emd_max": 0.2317,
-      "emd_mean": 0.5774,
-      "n_inconsistent_peaks": 0.3684,
-      "n_inconsistent_peaks_ct": 0.6
+      "average_batch_r2_ct": 0.7975,
+      "average_batch_r2_global": 0.7588,
+      "emd_max_ct": 0.5602,
+      "emd_max_global": 0.5859,
+      "emd_mean_ct": 0.7864,
+      "emd_mean_global": 0.5999,
+      "n_inconsistent_peaks": 0.75,
+      "n_inconsistent_peaks_ct": 0.8741
     },
-    "mean_score": 0.4935,
+    "mean_score": 0.7141,
     "resources": {
-      "submit": "2025-03-19 21:42:34",
+      "submit": "2025-05-15 11:06:35",
       "exit_code": 0,
-      "duration_sec": 9,
-      "cpu_pct": 752.8,
-      "peak_memory_mb": 2560,
-      "disk_read_mb": 34,
-      "disk_write_mb": 2
+      "duration_sec": 4939,
+      "cpu_pct": 484.8,
+      "peak_memory_mb": 30823,
+      "disk_read_mb": 498,
+      "disk_write_mb": 466
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "limma_remove_batch_effect",
     "metric_values": {
-      "average_batch_r2_ct": 0.1276,
-      "average_batch_r2_global": 0.0258,
-      "emd_max": 33.8545,
-      "emd_mean": 3.2163,
-      "n_inconsistent_peaks": 15,
-      "n_inconsistent_peaks_ct": 25
+      "average_batch_r2_ct": 0.0533,
+      "average_batch_r2_global": 0.0078,
+      "emd_max_ct": 1.1958,
+      "emd_max_global": 0.2889,
+      "emd_mean_ct": 0.1375,
+      "emd_mean_global": 0.0772,
+      "n_inconsistent_peaks": 2,
+      "n_inconsistent_peaks_ct": 38
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.4971,
-      "average_batch_r2_global": 0.6504,
-      "emd_max": 0.0315,
-      "emd_mean": 0.5541,
-      "n_inconsistent_peaks": 0.2105,
-      "n_inconsistent_peaks_ct": 0.5833
+      "average_batch_r2_ct": 0.755,
+      "average_batch_r2_global": 0.7621,
+      "emd_max_ct": 0.5541,
+      "emd_max_global": 0.5722,
+      "emd_mean_ct": 0.7724,
+      "emd_mean_global": 0.5893,
+      "n_inconsistent_peaks": 0.75,
+      "n_inconsistent_peaks_ct": 0.8707
     },
-    "mean_score": 0.4212,
+    "mean_score": 0.7032,
     "resources": {
-      "submit": "2025-03-19 21:42:35",
+      "submit": "2025-05-15 11:06:35",
       "exit_code": 0,
-      "duration_sec": 4.1,
-      "cpu_pct": 214,
-      "peak_memory_mb": 1127,
-      "disk_read_mb": 30,
-      "disk_write_mb": 2
+      "duration_sec": 102,
+      "cpu_pct": 110.9,
+      "peak_memory_mb": 11572,
+      "disk_read_mb": 495,
+      "disk_write_mb": 554
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "no_integration",
     "metric_values": {
-      "average_batch_r2_ct": 0.1398,
-      "average_batch_r2_global": 0.0623,
-      "emd_max": 34.9545,
-      "emd_mean": 3.5885,
-      "n_inconsistent_peaks": 15,
-      "n_inconsistent_peaks_ct": 25
+      "average_batch_r2_ct": 0.0637,
+      "average_batch_r2_global": 0.0256,
+      "emd_max_ct": 1.2375,
+      "emd_max_global": 0.5152,
+      "emd_mean_ct": 0.1538,
+      "emd_mean_global": 0.1201,
+      "n_inconsistent_peaks": 2,
+      "n_inconsistent_peaks_ct": 38
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.449,
-      "average_batch_r2_global": 0.1541,
-      "emd_max": 0,
-      "emd_mean": 0.5025,
-      "n_inconsistent_peaks": 0.2105,
-      "n_inconsistent_peaks_ct": 0.5833
+      "average_batch_r2_ct": 0.7073,
+      "average_batch_r2_global": 0.2169,
+      "emd_max_ct": 0.5386,
+      "emd_max_global": 0.237,
+      "emd_mean_ct": 0.7454,
+      "emd_mean_global": 0.3614,
+      "n_inconsistent_peaks": 0.75,
+      "n_inconsistent_peaks_ct": 0.8707
     },
-    "mean_score": 0.3166,
+    "mean_score": 0.5534,
     "resources": {
-      "submit": "2025-03-19 21:42:34",
+      "submit": "2025-05-15 11:06:35",
       "exit_code": 0,
-      "duration_sec": 1.9,
-      "cpu_pct": 250.3,
-      "peak_memory_mb": 764,
-      "disk_read_mb": 20,
-      "disk_write_mb": 2
+      "duration_sec": 48.4,
+      "cpu_pct": 116.7,
+      "peak_memory_mb": 6656,
+      "disk_read_mb": 486,
+      "disk_write_mb": 466
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "perfect_integration",
     "metric_values": {
-      "average_batch_r2_ct": 2.3291e-19,
+      "average_batch_r2_ct": -3.7381e-19,
       "average_batch_r2_global": 0,
-      "emd_max": 0,
-      "emd_mean": 0,
+      "emd_max_ct": 0,
+      "emd_max_global": 0,
+      "emd_mean_ct": 0,
+      "emd_mean_global": 0,
       "n_inconsistent_peaks": 0,
       "n_inconsistent_peaks_ct": 0
     },
     "scaled_scores": {
       "average_batch_r2_ct": 1,
       "average_batch_r2_global": 1,
-      "emd_max": 1,
-      "emd_mean": 1,
+      "emd_max_ct": 1,
+      "emd_max_global": 1,
+      "emd_mean_ct": 1,
+      "emd_mean_global": 1,
       "n_inconsistent_peaks": 1,
       "n_inconsistent_peaks_ct": 1
     },
     "mean_score": 1,
     "resources": {
-      "submit": "2025-03-19 21:42:35",
+      "submit": "2025-05-15 11:06:35",
       "exit_code": 0,
-      "duration_sec": 1.6,
-      "cpu_pct": 287.3,
-      "peak_memory_mb": 770,
-      "disk_read_mb": 19,
-      "disk_write_mb": 1
+      "duration_sec": 35.1,
+      "cpu_pct": 109.9,
+      "peak_memory_mb": 6247,
+      "disk_read_mb": 321,
+      "disk_write_mb": 302
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration",
     "metric_values": {
-      "average_batch_r2_ct": 0.2127,
-      "average_batch_r2_global": 0.0326,
-      "emd_max": 33.85,
-      "emd_mean": 6.684,
-      "n_inconsistent_peaks": 19,
-      "n_inconsistent_peaks_ct": 60
+      "average_batch_r2_ct": 0.2038,
+      "average_batch_r2_global": 0.0156,
+      "emd_max_ct": 2.5607,
+      "emd_max_global": 0.5874,
+      "emd_mean_ct": 0.5893,
+      "emd_mean_global": 0.1505,
+      "n_inconsistent_peaks": 8,
+      "n_inconsistent_peaks_ct": 286
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.1618,
-      "average_batch_r2_global": 0.5577,
-      "emd_max": 0.0316,
-      "emd_mean": 0.0734,
+      "average_batch_r2_ct": 0.0627,
+      "average_batch_r2_global": 0.5227,
+      "emd_max_ct": 0.0451,
+      "emd_max_global": 0.1301,
+      "emd_mean_ct": 0.0241,
+      "emd_mean_global": 0.1994,
       "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0
+      "n_inconsistent_peaks_ct": 0.0272
     },
-    "mean_score": 0.1374,
+    "mean_score": 0.1264,
     "resources": {
-      "submit": "2025-03-19 21:42:34",
+      "submit": "2025-05-15 11:06:35",
       "exit_code": 0,
-      "duration_sec": 2,
-      "cpu_pct": 197.9,
-      "peak_memory_mb": 760,
-      "disk_read_mb": 20,
-      "disk_write_mb": 2
+      "duration_sec": 54.8,
+      "cpu_pct": 98,
+      "peak_memory_mb": 7783,
+      "disk_read_mb": 486,
+      "disk_write_mb": 481
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration_by_batch",
     "metric_values": {
-      "average_batch_r2_ct": 0.2538,
-      "average_batch_r2_global": 0.0737,
-      "emd_max": 30.4333,
-      "emd_mean": 7.2138,
-      "n_inconsistent_peaks": 19,
-      "n_inconsistent_peaks_ct": 42
+      "average_batch_r2_ct": 0.2175,
+      "average_batch_r2_global": 0.0327,
+      "emd_max_ct": 2.6817,
+      "emd_max_global": 0.6753,
+      "emd_mean_ct": 0.6039,
+      "emd_mean_global": 0.188,
+      "n_inconsistent_peaks": 8,
+      "n_inconsistent_peaks_ct": 294
     },
     "scaled_scores": {
       "average_batch_r2_ct": 0,
       "average_batch_r2_global": 0,
-      "emd_max": 0.1293,
-      "emd_mean": 0,
+      "emd_max_ct": 0,
+      "emd_max_global": 0,
+      "emd_mean_ct": 0,
+      "emd_mean_global": 0,
       "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0.3
+      "n_inconsistent_peaks_ct": 0
     },
-    "mean_score": 0.0716,
+    "mean_score": 0,
     "resources": {
-      "submit": "2025-03-19 21:42:35",
+      "submit": "2025-05-15 11:06:35",
       "exit_code": 0,
-      "duration_sec": 1.7,
-      "cpu_pct": 274.1,
-      "peak_memory_mb": 768,
-      "disk_read_mb": 20,
-      "disk_write_mb": 2
+      "duration_sec": 54,
+      "cpu_pct": 96.9,
+      "peak_memory_mb": 7476,
+      "disk_read_mb": 486,
+      "disk_write_mb": 480
     }
   },
   {
-    "dataset_id": "cyto_spleen_subset",
+    "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration_by_cell_type",
     "metric_values": {
-      "average_batch_r2_ct": 0.0681,
-      "average_batch_r2_global": 0.0218,
-      "emd_max": 27,
-      "emd_mean": 2.9222,
-      "n_inconsistent_peaks": 19,
-      "n_inconsistent_peaks_ct": 32
+      "average_batch_r2_ct": 0.0344,
+      "average_batch_r2_global": 0.0093,
+      "emd_max_ct": 1.232,
+      "emd_max_global": 0.279,
+      "emd_mean_ct": 0.1335,
+      "emd_mean_global": 0.0907,
+      "n_inconsistent_peaks": 4,
+      "n_inconsistent_peaks_ct": 68
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.7318,
-      "average_batch_r2_global": 0.7038,
-      "emd_max": 0.2276,
-      "emd_mean": 0.5949,
-      "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0.4667
+      "average_batch_r2_ct": 0.8416,
+      "average_batch_r2_global": 0.7149,
+      "emd_max_ct": 0.5406,
+      "emd_max_global": 0.5869,
+      "emd_mean_ct": 0.779,
+      "emd_mean_global": 0.5174,
+      "n_inconsistent_peaks": 0.5,
+      "n_inconsistent_peaks_ct": 0.7687
     },
-    "mean_score": 0.4541,
+    "mean_score": 0.6561,
     "resources": {
-      "submit": "2025-03-19 21:42:34",
+      "submit": "2025-05-15 11:06:35",
       "exit_code": 0,
-      "duration_sec": 2.1,
-      "cpu_pct": 178.5,
-      "peak_memory_mb": 755,
-      "disk_read_mb": 20,
-      "disk_write_mb": 2
+      "duration_sec": 58.3,
+      "cpu_pct": 105.1,
+      "peak_memory_mb": 7476,
+      "disk_read_mb": 486,
+      "disk_write_mb": 480
     }
   }
 ]

From 3005a7891b58477221b429b0e714ff067eb44628 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Fri, 16 May 2025 16:44:06 +0200
Subject: [PATCH 11/14] fix data reference

---
 results/cyto_batch_integration/data/dataset_info.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/results/cyto_batch_integration/data/dataset_info.json b/results/cyto_batch_integration/data/dataset_info.json
index 519fa0e1..e85fcd50 100644
--- a/results/cyto_batch_integration/data/dataset_info.json
+++ b/results/cyto_batch_integration/data/dataset_info.json
@@ -4,7 +4,7 @@
     "dataset_name": "Leomazzi Spleen Cytometry",
     "dataset_summary": "Flow cytometry data of spleens of 8 mice. For each mouse, aliquotes of the same original sample were divided into 2 batches and measured with 2 different instrument settings to allow the creation of sample-paired replicates for benchmarking purposes.",
     "dataset_description": "Flow cytometry data of spleens from 4 WT (IKK2 fl/fl CD11c-cre +/+) and 4 KO (IKK2 fl/fl CD11c-cre Tg/+) B6 mice, measured with a 22-color panel and 2 different instrument settings. Data has been preprocessed (compensated with a batch-specific compensation matrix, logicle transformed, cleaned with PeacoQC and pregated on live single CD45+ cells).",
-    "data_reference": [],
+    "data_reference": null,
     "data_url": "https://saeyslab.sites.vib.be/en",
     "date_created": "15-05-2025",
     "file_size": 489781536

From b9c0aa5125fd5ea949b42cfccafef70ff2c5313c Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Fri, 16 May 2025 20:16:55 +0200
Subject: [PATCH 12/14] fix netlify config

---
 _publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_publish.yml b/_publish.yml
index 01f35491..a5c612a3 100644
--- a/_publish.yml
+++ b/_publish.yml
@@ -1,4 +1,4 @@
 - source: project
   netlify:
     - id: 397b6416-708f-4133-afe9-9a07ed2e03bf
-      url: 'https://openproblems.bio'
+      url: 'https://openproblems.netlify.app'

From fd3d59323b584bcd80a7d71246191cbd5341e3c3 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Thu, 22 May 2025 09:39:10 +0200
Subject: [PATCH 13/14] update data

---
 .../data/dataset_info.json                    |   2 +-
 .../data/method_info.json                     |  44 +-
 .../data/metric_execution_info.json           | 254 +++---
 .../data/metric_info.json                     |  32 +-
 .../data/quality_control.json                 | 764 +++++++++---------
 .../cyto_batch_integration/data/results.json  | 278 +++----
 6 files changed, 687 insertions(+), 687 deletions(-)

diff --git a/results/cyto_batch_integration/data/dataset_info.json b/results/cyto_batch_integration/data/dataset_info.json
index e85fcd50..8a42df39 100644
--- a/results/cyto_batch_integration/data/dataset_info.json
+++ b/results/cyto_batch_integration/data/dataset_info.json
@@ -6,7 +6,7 @@
     "dataset_description": "Flow cytometry data of spleens from 4 WT (IKK2 fl/fl CD11c-cre +/+) and 4 KO (IKK2 fl/fl CD11c-cre Tg/+) B6 mice, measured with a 22-color panel and 2 different instrument settings. Data has been preprocessed (compensated with a batch-specific compensation matrix, logicle transformed, cleaned with PeacoQC and pregated on live single CD45+ cells).",
     "data_reference": null,
     "data_url": "https://saeyslab.sites.vib.be/en",
-    "date_created": "15-05-2025",
+    "date_created": "22-05-2025",
     "file_size": 489781536
   }
 ]
diff --git a/results/cyto_batch_integration/data/method_info.json b/results/cyto_batch_integration/data/method_info.json
index a0ec19eb..b8e6339e 100644
--- a/results/cyto_batch_integration/data/method_info.json
+++ b/results/cyto_batch_integration/data/method_info.json
@@ -11,9 +11,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/control_methods/shuffle_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/control_methods/shuffle_integration",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
   },
   {
     "task_id": "control_methods",
@@ -27,9 +27,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_batch:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/control_methods/shuffle_integration_by_batch",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/control_methods/shuffle_integration_by_batch",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
   },
   {
     "task_id": "control_methods",
@@ -43,9 +43,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_cell_type:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/control_methods/shuffle_integration_by_cell_type",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/control_methods/shuffle_integration_by_cell_type",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
   },
   {
     "task_id": "methods",
@@ -59,9 +59,9 @@
     "code_url": "https://github.com/slowkow/harmonypy",
     "documentation_url": "https://portals.broadinstitute.org/harmony",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/harmonypy:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/methods/harmonypy",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/methods/harmonypy",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
   },
   {
     "task_id": "methods",
@@ -75,9 +75,9 @@
     "code_url": "https://github.com/bioc/limma",
     "documentation_url": "https://bioinf.wehi.edu.au/limma",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/limma_remove_batch_effect:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/methods/limma_remove_batch_effect",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/methods/limma_remove_batch_effect",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
   },
   {
     "task_id": "control_methods",
@@ -91,9 +91,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/no_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/control_methods/no_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/control_methods/no_integration",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
   },
   {
     "task_id": "control_methods",
@@ -107,9 +107,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/perfect_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/control_methods/perfect_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/control_methods/perfect_integration",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
   },
   {
     "task_id": "methods",
@@ -123,9 +123,9 @@
     "code_url": "https://github.com/brentp/combat.py",
     "documentation_url": "https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/combat:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/methods/combat",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/methods/combat",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
   },
   {
     "task_id": "methods",
@@ -139,9 +139,9 @@
     "code_url": "https://github.com/biosurf/cyCombine",
     "documentation_url": "https://biosurf.org/cyCombine.html",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/cycombine_nocontrols:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/methods/cycombine_nocontrols",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/methods/cycombine_nocontrols",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
   },
   {
     "task_id": "methods",
@@ -155,9 +155,9 @@
     "code_url": "https://github.com/RGLab/flowStats",
     "documentation_url": "https://rdrr.io/bioc/flowStats/src/R/gaussNorm.R",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/gaussnorm:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/methods/gaussnorm",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/methods/gaussnorm",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
   },
   {
     "task_id": "methods",
@@ -171,8 +171,8 @@
     "code_url": "https://github.com/saeyslab/CytoNorm",
     "documentation_url": "https://github.com/saeyslab/CytoNorm",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/cytonorm_controls:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/methods/cytonorm_controls",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/methods/cytonorm_controls",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb"
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
   }
 ]
diff --git a/results/cyto_batch_integration/data/metric_execution_info.json b/results/cyto_batch_integration/data/metric_execution_info.json
index bcd66343..b11bbe09 100644
--- a/results/cyto_batch_integration/data/metric_execution_info.json
+++ b/results/cyto_batch_integration/data/metric_execution_info.json
@@ -4,10 +4,10 @@
     "method_id": "combat",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-15 11:10:54",
+      "submit": "2025-05-22 05:49:12",
       "exit_code": 0,
-      "duration_sec": 192,
-      "cpu_pct": 422.4,
+      "duration_sec": 224,
+      "cpu_pct": 390,
       "peak_memory_mb": 9728,
       "disk_read_mb": 3278,
       "disk_write_mb": 2
@@ -18,11 +18,11 @@
     "method_id": "combat",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-15 11:10:54",
+      "submit": "2025-05-22 05:49:12",
       "exit_code": 0,
-      "duration_sec": 488,
-      "cpu_pct": 101.8,
-      "peak_memory_mb": 10445,
+      "duration_sec": 444,
+      "cpu_pct": 102.2,
+      "peak_memory_mb": 7680,
       "disk_read_mb": 6556,
       "disk_write_mb": 4
     }
@@ -32,10 +32,10 @@
     "method_id": "combat",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-15 11:10:54",
+      "submit": "2025-05-22 05:49:12",
       "exit_code": 0,
-      "duration_sec": 2170,
-      "cpu_pct": 2227.8,
+      "duration_sec": 1994,
+      "cpu_pct": 2554.4,
       "peak_memory_mb": 9524,
       "disk_read_mb": 3278,
       "disk_write_mb": 2
@@ -46,11 +46,11 @@
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-15 11:15:44",
+      "submit": "2025-05-22 05:58:31",
       "exit_code": 0,
-      "duration_sec": 214,
-      "cpu_pct": 378.6,
-      "peak_memory_mb": 9728,
+      "duration_sec": 190,
+      "cpu_pct": 171,
+      "peak_memory_mb": 5735,
       "disk_read_mb": 3278,
       "disk_write_mb": 2
     }
@@ -60,11 +60,11 @@
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-15 11:15:44",
+      "submit": "2025-05-22 05:58:31",
       "exit_code": 0,
-      "duration_sec": 500,
-      "cpu_pct": 101.8,
-      "peak_memory_mb": 10445,
+      "duration_sec": 448,
+      "cpu_pct": 99.2,
+      "peak_memory_mb": 6349,
       "disk_read_mb": 6556,
       "disk_write_mb": 4
     }
@@ -74,11 +74,11 @@
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-15 11:15:44",
+      "submit": "2025-05-22 05:58:31",
       "exit_code": 0,
-      "duration_sec": 1838,
-      "cpu_pct": 3022.1,
-      "peak_memory_mb": 9421,
+      "duration_sec": 1292,
+      "cpu_pct": 735.3,
+      "peak_memory_mb": 5428,
       "disk_read_mb": 3278,
       "disk_write_mb": 2
     }
@@ -88,11 +88,11 @@
     "method_id": "cytonorm_controls",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-15 11:23:44",
+      "submit": "2025-05-22 06:08:31",
       "exit_code": 0,
-      "duration_sec": 228,
-      "cpu_pct": 283.7,
-      "peak_memory_mb": 9728,
+      "duration_sec": 208,
+      "cpu_pct": 249.5,
+      "peak_memory_mb": 7066,
       "disk_read_mb": 2664,
       "disk_write_mb": 2
     }
@@ -102,11 +102,11 @@
     "method_id": "cytonorm_controls",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-15 11:23:44",
+      "submit": "2025-05-22 06:08:31",
       "exit_code": 0,
-      "duration_sec": 508,
-      "cpu_pct": 102,
-      "peak_memory_mb": 10343,
+      "duration_sec": 460,
+      "cpu_pct": 102.1,
+      "peak_memory_mb": 10445,
       "disk_read_mb": 5328,
       "disk_write_mb": 4
     }
@@ -116,11 +116,11 @@
     "method_id": "cytonorm_controls",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-15 11:23:44",
+      "submit": "2025-05-22 06:08:31",
       "exit_code": 0,
-      "duration_sec": 1850,
-      "cpu_pct": 2815.2,
-      "peak_memory_mb": 9421,
+      "duration_sec": 1318,
+      "cpu_pct": 3544.8,
+      "peak_memory_mb": 9524,
       "disk_read_mb": 2664,
       "disk_write_mb": 2
     }
@@ -130,11 +130,11 @@
     "method_id": "gaussnorm",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-15 11:16:54",
+      "submit": "2025-05-22 05:57:41",
       "exit_code": 0,
-      "duration_sec": 220,
-      "cpu_pct": 274.3,
-      "peak_memory_mb": 9728,
+      "duration_sec": 188,
+      "cpu_pct": 173.5,
+      "peak_memory_mb": 5735,
       "disk_read_mb": 2868,
       "disk_write_mb": 2
     }
@@ -144,11 +144,11 @@
     "method_id": "gaussnorm",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-15 11:16:54",
+      "submit": "2025-05-22 05:57:41",
       "exit_code": 0,
-      "duration_sec": 468,
-      "cpu_pct": 105.4,
-      "peak_memory_mb": 10445,
+      "duration_sec": 456,
+      "cpu_pct": 97,
+      "peak_memory_mb": 6349,
       "disk_read_mb": 5736,
       "disk_write_mb": 4
     }
@@ -158,11 +158,11 @@
     "method_id": "gaussnorm",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-15 11:16:54",
+      "submit": "2025-05-22 05:57:41",
       "exit_code": 0,
-      "duration_sec": 2180,
-      "cpu_pct": 1892.7,
-      "peak_memory_mb": 9421,
+      "duration_sec": 1302,
+      "cpu_pct": 733.7,
+      "peak_memory_mb": 5428,
       "disk_read_mb": 2868,
       "disk_write_mb": 2
     }
@@ -172,11 +172,11 @@
     "method_id": "harmonypy",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-15 14:03:04",
+      "submit": "2025-05-22 06:48:21",
       "exit_code": 0,
-      "duration_sec": 200,
-      "cpu_pct": 408.9,
-      "peak_memory_mb": 8602,
+      "duration_sec": 176,
+      "cpu_pct": 179.5,
+      "peak_memory_mb": 4608,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
@@ -186,11 +186,11 @@
     "method_id": "harmonypy",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-15 14:03:04",
+      "submit": "2025-05-22 06:48:21",
       "exit_code": 0,
-      "duration_sec": 456,
-      "cpu_pct": 107.1,
-      "peak_memory_mb": 9626,
+      "duration_sec": 448,
+      "cpu_pct": 99.9,
+      "peak_memory_mb": 6861,
       "disk_read_mb": 4916,
       "disk_write_mb": 4
     }
@@ -200,11 +200,11 @@
     "method_id": "harmonypy",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-15 14:03:04",
+      "submit": "2025-05-22 06:48:21",
       "exit_code": 0,
-      "duration_sec": 1322,
-      "cpu_pct": 3551.4,
-      "peak_memory_mb": 8500,
+      "duration_sec": 1346,
+      "cpu_pct": 1436.2,
+      "peak_memory_mb": 5837,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
@@ -214,10 +214,10 @@
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-15 11:11:14",
+      "submit": "2025-05-22 05:52:11",
       "exit_code": 0,
-      "duration_sec": 200,
-      "cpu_pct": 391.8,
+      "duration_sec": 204,
+      "cpu_pct": 395.5,
       "peak_memory_mb": 9728,
       "disk_read_mb": 2664,
       "disk_write_mb": 2
@@ -228,11 +228,11 @@
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-15 11:11:14",
+      "submit": "2025-05-22 05:52:11",
       "exit_code": 0,
-      "duration_sec": 452,
-      "cpu_pct": 106,
-      "peak_memory_mb": 10445,
+      "duration_sec": 444,
+      "cpu_pct": 99.3,
+      "peak_memory_mb": 6349,
       "disk_read_mb": 5328,
       "disk_write_mb": 4
     }
@@ -242,11 +242,11 @@
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-15 11:11:14",
+      "submit": "2025-05-22 05:52:11",
       "exit_code": 0,
-      "duration_sec": 1400,
-      "cpu_pct": 2885.9,
-      "peak_memory_mb": 9524,
+      "duration_sec": 1326,
+      "cpu_pct": 1423.4,
+      "peak_memory_mb": 6861,
       "disk_read_mb": 2664,
       "disk_write_mb": 2
     }
@@ -256,11 +256,11 @@
     "method_id": "no_integration",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-15 11:10:04",
+      "submit": "2025-05-22 05:52:21",
       "exit_code": 0,
-      "duration_sec": 196,
-      "cpu_pct": 410.6,
-      "peak_memory_mb": 8602,
+      "duration_sec": 180,
+      "cpu_pct": 177.3,
+      "peak_memory_mb": 4608,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
@@ -270,11 +270,11 @@
     "method_id": "no_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-15 11:10:04",
+      "submit": "2025-05-22 05:52:21",
       "exit_code": 0,
-      "duration_sec": 456,
-      "cpu_pct": 102,
-      "peak_memory_mb": 9626,
+      "duration_sec": 448,
+      "cpu_pct": 99.5,
+      "peak_memory_mb": 6861,
       "disk_read_mb": 4916,
       "disk_write_mb": 4
     }
@@ -284,11 +284,11 @@
     "method_id": "no_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-15 11:10:04",
+      "submit": "2025-05-22 05:52:21",
       "exit_code": 0,
-      "duration_sec": 2168,
-      "cpu_pct": 2291.6,
-      "peak_memory_mb": 8500,
+      "duration_sec": 1328,
+      "cpu_pct": 1441.4,
+      "peak_memory_mb": 5837,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
@@ -298,11 +298,11 @@
     "method_id": "perfect_integration",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-15 11:07:54",
+      "submit": "2025-05-22 05:52:01",
       "exit_code": 0,
-      "duration_sec": 250,
-      "cpu_pct": 465.5,
-      "peak_memory_mb": 8397,
+      "duration_sec": 226,
+      "cpu_pct": 205.1,
+      "peak_memory_mb": 4404,
       "disk_read_mb": 2254,
       "disk_write_mb": 2
     }
@@ -312,11 +312,11 @@
     "method_id": "perfect_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-15 11:07:54",
+      "submit": "2025-05-22 05:52:01",
       "exit_code": 0,
-      "duration_sec": 408,
-      "cpu_pct": 106.3,
-      "peak_memory_mb": 9319,
+      "duration_sec": 380,
+      "cpu_pct": 100,
+      "peak_memory_mb": 5325,
       "disk_read_mb": 4508,
       "disk_write_mb": 4
     }
@@ -326,11 +326,11 @@
     "method_id": "perfect_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-15 11:07:54",
+      "submit": "2025-05-22 05:52:01",
       "exit_code": 0,
-      "duration_sec": 2486,
-      "cpu_pct": 2042,
-      "peak_memory_mb": 8295,
+      "duration_sec": 1306,
+      "cpu_pct": 744.4,
+      "peak_memory_mb": 4301,
       "disk_read_mb": 2254,
       "disk_write_mb": 2
     }
@@ -340,11 +340,11 @@
     "method_id": "shuffle_integration",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-15 11:08:24",
+      "submit": "2025-05-22 05:52:21",
       "exit_code": 0,
-      "duration_sec": 210,
-      "cpu_pct": 398.9,
-      "peak_memory_mb": 8602,
+      "duration_sec": 186,
+      "cpu_pct": 265.4,
+      "peak_memory_mb": 5940,
       "disk_read_mb": 2664,
       "disk_write_mb": 2
     }
@@ -354,11 +354,11 @@
     "method_id": "shuffle_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-15 11:08:24",
+      "submit": "2025-05-22 05:52:21",
       "exit_code": 0,
-      "duration_sec": 544,
-      "cpu_pct": 103.9,
-      "peak_memory_mb": 9626,
+      "duration_sec": 420,
+      "cpu_pct": 100.1,
+      "peak_memory_mb": 5530,
       "disk_read_mb": 4916,
       "disk_write_mb": 4
     }
@@ -368,11 +368,11 @@
     "method_id": "shuffle_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-15 11:08:24",
+      "submit": "2025-05-22 05:52:21",
       "exit_code": 0,
-      "duration_sec": 1654,
-      "cpu_pct": 3033.8,
-      "peak_memory_mb": 8500,
+      "duration_sec": 1324,
+      "cpu_pct": 1441.1,
+      "peak_memory_mb": 5837,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
@@ -382,11 +382,11 @@
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-15 11:08:24",
+      "submit": "2025-05-22 05:51:01",
       "exit_code": 0,
-      "duration_sec": 202,
-      "cpu_pct": 396.9,
-      "peak_memory_mb": 8602,
+      "duration_sec": 178,
+      "cpu_pct": 180,
+      "peak_memory_mb": 4608,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
@@ -396,11 +396,11 @@
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-15 11:08:24",
+      "submit": "2025-05-22 05:51:01",
       "exit_code": 0,
-      "duration_sec": 432,
-      "cpu_pct": 106.9,
-      "peak_memory_mb": 9626,
+      "duration_sec": 436,
+      "cpu_pct": 104,
+      "peak_memory_mb": 6964,
       "disk_read_mb": 4916,
       "disk_write_mb": 4
     }
@@ -410,10 +410,10 @@
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-15 11:08:24",
+      "submit": "2025-05-22 05:51:01",
       "exit_code": 0,
-      "duration_sec": 1634,
-      "cpu_pct": 2856.4,
+      "duration_sec": 1988,
+      "cpu_pct": 2474.6,
       "peak_memory_mb": 8500,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
@@ -424,11 +424,11 @@
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-15 11:08:04",
+      "submit": "2025-05-22 05:51:01",
       "exit_code": 0,
-      "duration_sec": 204,
-      "cpu_pct": 386.4,
-      "peak_memory_mb": 8602,
+      "duration_sec": 180,
+      "cpu_pct": 177.2,
+      "peak_memory_mb": 4608,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
@@ -438,11 +438,11 @@
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-15 11:08:04",
+      "submit": "2025-05-22 05:51:01",
       "exit_code": 0,
-      "duration_sec": 488,
-      "cpu_pct": 104.9,
-      "peak_memory_mb": 9626,
+      "duration_sec": 440,
+      "cpu_pct": 99.5,
+      "peak_memory_mb": 6861,
       "disk_read_mb": 4916,
       "disk_write_mb": 4
     }
@@ -452,10 +452,10 @@
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-15 11:08:04",
+      "submit": "2025-05-22 05:51:01",
       "exit_code": 0,
-      "duration_sec": 2506,
-      "cpu_pct": 2001.9,
+      "duration_sec": 1986,
+      "cpu_pct": 2497.5,
       "peak_memory_mb": 8500,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
diff --git a/results/cyto_batch_integration/data/metric_info.json b/results/cyto_batch_integration/data/metric_info.json
index 28835c1a..d4e63040 100644
--- a/results/cyto_batch_integration/data/metric_info.json
+++ b/results/cyto_batch_integration/data/metric_info.json
@@ -8,10 +8,10 @@
     "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample, cell type, and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same cell type, marker, and a given paired samples.\nThis is then repeated for every cell type, marker, and paired sample.\nFinally, the average of all these EMD values is computed to produce an overall metric score EMD Mean CT.\n\nA high score indicates large overall differences in the distributions of marker expressions \nbetween the paired samples, suggesting poor batch integration.\nA low score means the small differences in marker expression distributions between batches, \nindicating good batch integration.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
     "maximize": false
   },
   {
@@ -23,10 +23,10 @@
     "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample, cell type, and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same cell type, marker, and a given paired samples.\nThis is then repeated for every cell type, marker, and paired sample.\nFinally, the maximum of all these EMD values is computed as EMD Max CT.\n\nEMD Max CT score reflects the largest difference in marker expression distributions across all cell types, \nmarkers, and paired samples.\nA high score indicates that at least one marker, cell type, or sample pair has a large difference in \ndistribution after batch integration.\nA low score means that even the most poorly corrected marker expression is well integrated across batches.    \n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
     "maximize": false
   },
   {
@@ -38,10 +38,10 @@
     "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same cell type, marker, and a given paired samples.\nThis is then repeated for every marker and paired sample.\nFinally, the average of all these EMD values is computed to produce an overall metric score EMD Mean Global.\n\nA high score indicates that at least one marker and cell type in a given sample pair has a \nlarge difference in distribution after batch integration.\nA low score means that the most poorly corrected marker expression is well integrated across batches.   \n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
     "maximize": false
   },
   {
@@ -53,10 +53,10 @@
     "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same cell type, marker, and a given paired samples.\nThis is then repeated for every cell type, marker, and paired sample.\nFinally, the maximum of all these EMD values is computed as EMD Max Global.\n\nEMD Max Global score reflects the largest difference in marker expression distributions \nacross all markers and paired samples.\nA high score indicates that at least one marker in a given sample pair has a large difference in \ndistribution after batch integration.\nA low score means that the most poorly corrected marker expression is well integrated across batches.   \n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
     "maximize": false
   },
   {
@@ -68,10 +68,10 @@
     "metric_description": "The metric compares the number of marker expression peaks between the validation and batch-normalized data. \nThe number of peaks is calculated using the `scipy.signal.find_peaks` function. \nThe metric is calculated as the absolute difference between the number of peaks in the validation and batch-normalized data.\nThe marker expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),\nand then peaks are then identified using the `scipy.signal.find_peaks` function.\nFor peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density.\n",
     "references_doi": "10.1038/s41592-019-0686-2",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/n_inconsistent_peaks",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/n_inconsistent_peaks",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/n_inconsistent_peaks:build_main",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
     "maximize": false
   },
   {
@@ -83,10 +83,10 @@
     "metric_description": "The metric compares the number of cell type specific marker expression peaks between the validation and batch-normalized data. \nThe number of peaks is calculated using the `scipy.signal.find_peaks` function. \nThe metric is calculated as the absolute difference between the number of peaks in the validation and batch-normalized data.\nThe (cell type) marker expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),\nand then peaks are then identified using the `scipy.signal.find_peaks` function.\nFor peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density.\n",
     "references_doi": "10.1038/s41592-019-0686-2",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/n_inconsistent_peaks",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/n_inconsistent_peaks",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/n_inconsistent_peaks:build_main",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
     "maximize": false
   },
   {
@@ -98,10 +98,10 @@
     "metric_description": "First, a simple linear model `sklearn.linear_model.LinearRegression` is fitted for each paired sample and marker to determine the fraction of variance (R^2) explained by the batch covariate B. |\nThe average batch R_squared is then computed as the average of the $R^2$ values across all paired samples, markers. |\nAs a result, $\\overline{R^2_B}_{global}$ quantifies how much of the total variability in the data is driven by batch effects. Consequently, lower values are desirable. |\n\n$\\overline{R^2_B}_{global} = \\frac{1}{N*M}\\sum_{\\substack{(x_{\\mathrm{int}},\\,x_{\\mathrm{val}})\\\\ \\text{paired samples}}}^{N} \\sum_{i=1}^{M} \\,R^2\\!\\bigl(\\mathrm{marker}_i \\mid B\\bigr)$\n\nWhere:\n- $N$ is the number of paired samples, where x_{\\mathrm{int}} is the replicate that has been batch-corrected and x_{\\mathrm{val}} is replicate used for validation. Paired samples belong to different batches.\n- $M$ is the number of markers\n- $B$ is the batch covariate\n\nA higher value of $\\overline{R^2_B}_{global}$ indicates that the batch variable explains more of the variance in the data, which indicates a higher level of batch effects. |\n",
     "references_doi": null,
     "references_bibtex": "@book{draper1998applied,\ntitle={Applied regression analysis},\nauthor={Draper, Norman R and Smith, Harry},\npublisher={John Wiley \\& Sons}\n}\n",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/average_batch_r2",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/average_batch_r2",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/average_batch_r2:build_main",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
     "maximize": false
   },
   {
@@ -113,10 +113,10 @@
     "metric_description": "First, a simple linear model `sklearn.linear_model.LinearRegression` is fitted for each paired sample, marker and cell type to determine the fraction of variance (R^2) explained by the batch covariate B. |\nThe average batch R_squared is then computed as the average of the $R^2$ values across all paired samples, markers and cell types. |\nAs a result, $\\overline{R^2_B}_{cell\\ type}$ quantifies how much of the total variability in the data is driven by batch effects. Consequently, lower values are desirable. |\n\n$\\overline{R^2_B}_{cell\\ type} = \\frac{1}{N*C*M}\\sum_{\\substack{(x_{\\mathrm{int}},\\,x_{\\mathrm{val}})\\\\ \\text{paired samples}}}^{N} \\sum_{j=1}^{C} \\sum_{i=1}^{M}\\,R^2\\!\\bigl(\\mathrm{marker}_i \\mid B\\bigr)$\n\nWhere:\n- $N$ is the number of paired samples, where x_{\\mathrm{int}} is the replicate that has been batch-corrected and x_{\\mathrm{val}} is replicate used for validation. Paired samples belong to different batches.\n- $C$ is the number of cell types\n- $M$ is the number of markers\n- $B$ is the batch covariate\n\nThe $\\overline{Rˆ2_B}_{global}$ is a variation of the latter metric, where the average is computed across paired samples and markers only, without taking into account the cell types. |\n\nA higher value of $\\overline{R^2_B}_{global}$ or $\\overline{R^2_B}_{cell\\ type}$ indicates that the batch variable explains more of the variance in the data, which indicates a higher level of batch effects. |\n\nA good performance on $\\overline{R^2_B}_{global}$ but not on $\\overline{R^2_B}_{cell\\ type}$ might indicate that the batch effect correction is discarding cell type specific batch effects. |\n",
     "references_doi": null,
     "references_bibtex": "@book{draper1998applied,\ntitle={Applied regression analysis},\nauthor={Draper, Norman R and Smith, Harry},\npublisher={John Wiley \\& Sons}\n}\n",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/a353793dddff0b2744140bcbd7917e3c27e1efbb/src/metrics/average_batch_r2",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/average_batch_r2",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/average_batch_r2:build_main",
     "code_version": "build_main",
-    "commit_sha": "a353793dddff0b2744140bcbd7917e3c27e1efbb",
+    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
     "maximize": false
   }
 ]
diff --git a/results/cyto_batch_integration/data/quality_control.json b/results/cyto_batch_integration/data/quality_control.json
index ae72f161..62c4bfda 100644
--- a/results/cyto_batch_integration/data/quality_control.json
+++ b/results/cyto_batch_integration/data/quality_control.json
@@ -453,21 +453,21 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration emd_mean_ct", 
-        "value": 0.0241, 
+        "value": 0.0245, 
         "severity": 0, 
-        "severity_value": -0.0241, 
+        "severity_value": -0.0245, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_ct\n  Worst score: 0.0241%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_ct\n  Worst score: 0.0245%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration emd_mean_ct", 
-        "value": 0.0241, 
+        "value": 0.0245, 
         "severity": 0, 
-        "severity_value": 0.01205, 
+        "severity_value": 0.01225, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_ct\n  Best score: 0.0241%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_ct\n  Best score: 0.0245%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -493,81 +493,81 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type emd_mean_ct", 
-        "value": 0.779, 
+        "value": 0.7809, 
         "severity": 0, 
-        "severity_value": -0.779, 
+        "severity_value": -0.7809, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_ct\n  Worst score: 0.779%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_ct\n  Worst score: 0.7809%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type emd_mean_ct", 
-        "value": 0.779, 
+        "value": 0.7809, 
         "severity": 0, 
-        "severity_value": 0.3895, 
+        "severity_value": 0.39045, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_ct\n  Best score: 0.779%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_ct\n  Best score: 0.7809%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy emd_mean_ct", 
-        "value": 0.7864, 
+        "value": 0.7862, 
         "severity": 0, 
-        "severity_value": -0.7864, 
+        "severity_value": -0.7862, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_ct\n  Worst score: 0.7864%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_ct\n  Worst score: 0.7862%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy emd_mean_ct", 
-        "value": 0.7864, 
+        "value": 0.7862, 
         "severity": 0, 
-        "severity_value": 0.3932, 
+        "severity_value": 0.3931, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_ct\n  Best score: 0.7864%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_ct\n  Best score: 0.7862%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect emd_mean_ct", 
-        "value": 0.7724, 
+        "value": 0.7721, 
         "severity": 0, 
-        "severity_value": -0.7724, 
+        "severity_value": -0.7721, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_ct\n  Worst score: 0.7724%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_ct\n  Worst score: 0.7721%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect emd_mean_ct", 
-        "value": 0.7724, 
+        "value": 0.7721, 
         "severity": 0, 
-        "severity_value": 0.3862, 
+        "severity_value": 0.38605, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_ct\n  Best score: 0.7724%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_ct\n  Best score: 0.7721%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration emd_mean_ct", 
-        "value": 0.7454, 
+        "value": 0.7451, 
         "severity": 0, 
-        "severity_value": -0.7454, 
+        "severity_value": -0.7451, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_ct\n  Worst score: 0.7454%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_ct\n  Worst score: 0.7451%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration emd_mean_ct", 
-        "value": 0.7454, 
+        "value": 0.7451, 
         "severity": 0, 
-        "severity_value": 0.3727, 
+        "severity_value": 0.37255, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_ct\n  Best score: 0.7454%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_ct\n  Best score: 0.7451%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -593,101 +593,101 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat emd_mean_ct", 
-        "value": 0.7767, 
+        "value": 0.7765, 
         "severity": 0, 
-        "severity_value": -0.7767, 
+        "severity_value": -0.7765, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_ct\n  Worst score: 0.7767%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_ct\n  Worst score: 0.7765%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat emd_mean_ct", 
-        "value": 0.7767, 
+        "value": 0.7765, 
         "severity": 0, 
-        "severity_value": 0.38835, 
+        "severity_value": 0.38825, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_ct\n  Best score: 0.7767%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_ct\n  Best score: 0.7765%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols emd_mean_ct", 
-        "value": -0.945, 
+        "value": 0.8229, 
         "severity": 0, 
-        "severity_value": 0.945, 
+        "severity_value": -0.8229, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_ct\n  Worst score: -0.945%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_ct\n  Worst score: 0.8229%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols emd_mean_ct", 
-        "value": -0.945, 
+        "value": 0.8229, 
         "severity": 0, 
-        "severity_value": -0.4725, 
+        "severity_value": 0.41145, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_ct\n  Best score: -0.945%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_ct\n  Best score: 0.8229%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score gaussnorm emd_mean_ct", 
-        "value": 0.7424, 
+        "value": 0.7422, 
         "severity": 0, 
-        "severity_value": -0.7424, 
+        "severity_value": -0.7422, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_ct\n  Worst score: 0.7424%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_ct\n  Worst score: 0.7422%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score gaussnorm emd_mean_ct", 
-        "value": 0.7424, 
+        "value": 0.7422, 
         "severity": 0, 
-        "severity_value": 0.3712, 
+        "severity_value": 0.3711, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_ct\n  Best score: 0.7424%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_ct\n  Best score: 0.7422%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cytonorm_controls emd_mean_ct", 
-        "value": 0.8328, 
+        "value": 0.8327, 
         "severity": 0, 
-        "severity_value": -0.8328, 
+        "severity_value": -0.8327, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_ct\n  Worst score: 0.8328%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_ct\n  Worst score: 0.8327%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cytonorm_controls emd_mean_ct", 
-        "value": 0.8328, 
+        "value": 0.8327, 
         "severity": 0, 
-        "severity_value": 0.4164, 
+        "severity_value": 0.41635, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_ct\n  Best score: 0.8328%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_ct\n  Best score: 0.8327%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration emd_max_ct", 
-        "value": 0.0451, 
+        "value": 0.0446, 
         "severity": 0, 
-        "severity_value": -0.0451, 
+        "severity_value": -0.0446, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_ct\n  Worst score: 0.0451%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_ct\n  Worst score: 0.0446%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration emd_max_ct", 
-        "value": 0.0451, 
+        "value": 0.0446, 
         "severity": 0, 
-        "severity_value": 0.02255, 
+        "severity_value": 0.0223, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_ct\n  Best score: 0.0451%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_ct\n  Best score: 0.0446%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -713,81 +713,81 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type emd_max_ct", 
-        "value": 0.5406, 
+        "value": 0.54, 
         "severity": 0, 
-        "severity_value": -0.5406, 
+        "severity_value": -0.54, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_ct\n  Worst score: 0.5406%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_ct\n  Worst score: 0.54%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type emd_max_ct", 
-        "value": 0.5406, 
+        "value": 0.54, 
         "severity": 0, 
-        "severity_value": 0.2703, 
+        "severity_value": 0.27, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_ct\n  Best score: 0.5406%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_ct\n  Best score: 0.54%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy emd_max_ct", 
-        "value": 0.5602, 
+        "value": 0.5606, 
         "severity": 0, 
-        "severity_value": -0.5602, 
+        "severity_value": -0.5606, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_ct\n  Worst score: 0.5602%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_ct\n  Worst score: 0.5606%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy emd_max_ct", 
-        "value": 0.5602, 
+        "value": 0.5606, 
         "severity": 0, 
-        "severity_value": 0.2801, 
+        "severity_value": 0.2803, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_ct\n  Best score: 0.5602%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_ct\n  Best score: 0.5606%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect emd_max_ct", 
-        "value": 0.5541, 
+        "value": 0.5546, 
         "severity": 0, 
-        "severity_value": -0.5541, 
+        "severity_value": -0.5546, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_ct\n  Worst score: 0.5541%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_ct\n  Worst score: 0.5546%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect emd_max_ct", 
-        "value": 0.5541, 
+        "value": 0.5546, 
         "severity": 0, 
-        "severity_value": 0.27705, 
+        "severity_value": 0.2773, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_ct\n  Best score: 0.5541%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_ct\n  Best score: 0.5546%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration emd_max_ct", 
-        "value": 0.5386, 
+        "value": 0.5391, 
         "severity": 0, 
-        "severity_value": -0.5386, 
+        "severity_value": -0.5391, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_ct\n  Worst score: 0.5386%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_ct\n  Worst score: 0.5391%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration emd_max_ct", 
-        "value": 0.5386, 
+        "value": 0.5391, 
         "severity": 0, 
-        "severity_value": 0.2693, 
+        "severity_value": 0.26955, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_ct\n  Best score: 0.5386%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_ct\n  Best score: 0.5391%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -813,101 +813,101 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat emd_max_ct", 
-        "value": 0.5453, 
+        "value": 0.5458, 
         "severity": 0, 
-        "severity_value": -0.5453, 
+        "severity_value": -0.5458, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_ct\n  Worst score: 0.5453%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_ct\n  Worst score: 0.5458%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat emd_max_ct", 
-        "value": 0.5453, 
+        "value": 0.5458, 
         "severity": 0, 
-        "severity_value": 0.27265, 
+        "severity_value": 0.2729, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_ct\n  Best score: 0.5453%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_ct\n  Best score: 0.5458%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols emd_max_ct", 
-        "value": -0.1133, 
+        "value": 0.6009, 
         "severity": 0, 
-        "severity_value": 0.1133, 
+        "severity_value": -0.6009, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_ct\n  Worst score: -0.1133%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_ct\n  Worst score: 0.6009%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols emd_max_ct", 
-        "value": -0.1133, 
+        "value": 0.6009, 
         "severity": 0, 
-        "severity_value": -0.05665, 
+        "severity_value": 0.30045, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_ct\n  Best score: -0.1133%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_ct\n  Best score: 0.6009%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score gaussnorm emd_max_ct", 
-        "value": 0.5412, 
+        "value": 0.5418, 
         "severity": 0, 
-        "severity_value": -0.5412, 
+        "severity_value": -0.5418, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_ct\n  Worst score: 0.5412%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_ct\n  Worst score: 0.5418%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score gaussnorm emd_max_ct", 
-        "value": 0.5412, 
+        "value": 0.5418, 
         "severity": 0, 
-        "severity_value": 0.2706, 
+        "severity_value": 0.2709, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_ct\n  Best score: 0.5412%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_ct\n  Best score: 0.5418%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cytonorm_controls emd_max_ct", 
-        "value": 0.6823, 
+        "value": 0.6826, 
         "severity": 0, 
-        "severity_value": -0.6823, 
+        "severity_value": -0.6826, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_ct\n  Worst score: 0.6823%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_ct\n  Worst score: 0.6826%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cytonorm_controls emd_max_ct", 
-        "value": 0.6823, 
+        "value": 0.6826, 
         "severity": 0, 
-        "severity_value": 0.34115, 
+        "severity_value": 0.3413, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_ct\n  Best score: 0.6823%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_ct\n  Best score: 0.6826%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration emd_mean_global", 
-        "value": 0.1994, 
+        "value": 0.1979, 
         "severity": 0, 
-        "severity_value": -0.1994, 
+        "severity_value": -0.1979, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_global\n  Worst score: 0.1994%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_global\n  Worst score: 0.1979%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration emd_mean_global", 
-        "value": 0.1994, 
+        "value": 0.1979, 
         "severity": 0, 
-        "severity_value": 0.0997, 
+        "severity_value": 0.09895, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_global\n  Best score: 0.1994%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_global\n  Best score: 0.1979%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -933,81 +933,81 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type emd_mean_global", 
-        "value": 0.5174, 
+        "value": 0.5178, 
         "severity": 0, 
-        "severity_value": -0.5174, 
+        "severity_value": -0.5178, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_global\n  Worst score: 0.5174%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_global\n  Worst score: 0.5178%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type emd_mean_global", 
-        "value": 0.5174, 
+        "value": 0.5178, 
         "severity": 0, 
-        "severity_value": 0.2587, 
+        "severity_value": 0.2589, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_global\n  Best score: 0.5174%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_global\n  Best score: 0.5178%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy emd_mean_global", 
-        "value": 0.5999, 
+        "value": 0.5995, 
         "severity": 0, 
-        "severity_value": -0.5999, 
+        "severity_value": -0.5995, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_global\n  Worst score: 0.5999%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_global\n  Worst score: 0.5995%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy emd_mean_global", 
-        "value": 0.5999, 
+        "value": 0.5995, 
         "severity": 0, 
-        "severity_value": 0.29995, 
+        "severity_value": 0.29975, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_global\n  Best score: 0.5999%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_global\n  Best score: 0.5995%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect emd_mean_global", 
-        "value": 0.5893, 
+        "value": 0.5889, 
         "severity": 0, 
-        "severity_value": -0.5893, 
+        "severity_value": -0.5889, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_global\n  Worst score: 0.5893%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_global\n  Worst score: 0.5889%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect emd_mean_global", 
-        "value": 0.5893, 
+        "value": 0.5889, 
         "severity": 0, 
-        "severity_value": 0.29465, 
+        "severity_value": 0.29445, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_global\n  Best score: 0.5893%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_global\n  Best score: 0.5889%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration emd_mean_global", 
-        "value": 0.3614, 
+        "value": 0.3608, 
         "severity": 0, 
-        "severity_value": -0.3614, 
+        "severity_value": -0.3608, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_global\n  Worst score: 0.3614%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_global\n  Worst score: 0.3608%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration emd_mean_global", 
-        "value": 0.3614, 
+        "value": 0.3608, 
         "severity": 0, 
-        "severity_value": 0.1807, 
+        "severity_value": 0.1804, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_global\n  Best score: 0.3614%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_global\n  Best score: 0.3608%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1033,101 +1033,101 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat emd_mean_global", 
-        "value": 0.6014, 
+        "value": 0.6011, 
         "severity": 0, 
-        "severity_value": -0.6014, 
+        "severity_value": -0.6011, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_global\n  Worst score: 0.6014%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_global\n  Worst score: 0.6011%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat emd_mean_global", 
-        "value": 0.6014, 
+        "value": 0.6011, 
         "severity": 0, 
-        "severity_value": 0.3007, 
+        "severity_value": 0.30055, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_global\n  Best score: 0.6014%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_global\n  Best score: 0.6011%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols emd_mean_global", 
-        "value": -5.9154, 
-        "severity": 3, 
-        "severity_value": 5.9154, 
+        "value": 0.5821, 
+        "severity": 0, 
+        "severity_value": -0.5821, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_global\n  Worst score: -5.9154%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_global\n  Worst score: 0.5821%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols emd_mean_global", 
-        "value": -5.9154, 
+        "value": 0.5821, 
         "severity": 0, 
-        "severity_value": -2.9577, 
+        "severity_value": 0.29105, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_global\n  Best score: -5.9154%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_global\n  Best score: 0.5821%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score gaussnorm emd_mean_global", 
-        "value": 0.457, 
+        "value": 0.4566, 
         "severity": 0, 
-        "severity_value": -0.457, 
+        "severity_value": -0.4566, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_global\n  Worst score: 0.457%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_global\n  Worst score: 0.4566%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score gaussnorm emd_mean_global", 
-        "value": 0.457, 
+        "value": 0.4566, 
         "severity": 0, 
-        "severity_value": 0.2285, 
+        "severity_value": 0.2283, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_global\n  Best score: 0.457%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_global\n  Best score: 0.4566%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cytonorm_controls emd_mean_global", 
-        "value": 0.6441, 
+        "value": 0.6438, 
         "severity": 0, 
-        "severity_value": -0.6441, 
+        "severity_value": -0.6438, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_global\n  Worst score: 0.6441%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_global\n  Worst score: 0.6438%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cytonorm_controls emd_mean_global", 
-        "value": 0.6441, 
+        "value": 0.6438, 
         "severity": 0, 
-        "severity_value": 0.32205, 
+        "severity_value": 0.3219, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_global\n  Best score: 0.6441%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_global\n  Best score: 0.6438%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration emd_max_global", 
-        "value": 0.1301, 
+        "value": 0.1312, 
         "severity": 0, 
-        "severity_value": -0.1301, 
+        "severity_value": -0.1312, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_global\n  Worst score: 0.1301%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_global\n  Worst score: 0.1312%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration emd_max_global", 
-        "value": 0.1301, 
+        "value": 0.1312, 
         "severity": 0, 
-        "severity_value": 0.06505, 
+        "severity_value": 0.0656, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_global\n  Best score: 0.1301%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_global\n  Best score: 0.1312%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1153,81 +1153,81 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type emd_max_global", 
-        "value": 0.5869, 
+        "value": 0.5831, 
         "severity": 0, 
-        "severity_value": -0.5869, 
+        "severity_value": -0.5831, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_global\n  Worst score: 0.5869%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_global\n  Worst score: 0.5831%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type emd_max_global", 
-        "value": 0.5869, 
+        "value": 0.5831, 
         "severity": 0, 
-        "severity_value": 0.29345, 
+        "severity_value": 0.29155, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_global\n  Best score: 0.5869%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_global\n  Best score: 0.5831%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy emd_max_global", 
-        "value": 0.5859, 
+        "value": 0.5861, 
         "severity": 0, 
-        "severity_value": -0.5859, 
+        "severity_value": -0.5861, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_global\n  Worst score: 0.5859%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_global\n  Worst score: 0.5861%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy emd_max_global", 
-        "value": 0.5859, 
+        "value": 0.5861, 
         "severity": 0, 
-        "severity_value": 0.29295, 
+        "severity_value": 0.29305, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_global\n  Best score: 0.5859%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_global\n  Best score: 0.5861%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect emd_max_global", 
-        "value": 0.5722, 
+        "value": 0.5724, 
         "severity": 0, 
-        "severity_value": -0.5722, 
+        "severity_value": -0.5724, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_global\n  Worst score: 0.5722%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_global\n  Worst score: 0.5724%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect emd_max_global", 
-        "value": 0.5722, 
+        "value": 0.5724, 
         "severity": 0, 
-        "severity_value": 0.2861, 
+        "severity_value": 0.2862, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_global\n  Best score: 0.5722%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_global\n  Best score: 0.5724%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration emd_max_global", 
-        "value": 0.237, 
+        "value": 0.2374, 
         "severity": 0, 
-        "severity_value": -0.237, 
+        "severity_value": -0.2374, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_global\n  Worst score: 0.237%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_global\n  Worst score: 0.2374%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration emd_max_global", 
-        "value": 0.237, 
+        "value": 0.2374, 
         "severity": 0, 
-        "severity_value": 0.1185, 
+        "severity_value": 0.1187, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_global\n  Best score: 0.237%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_global\n  Best score: 0.2374%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1253,81 +1253,81 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat emd_max_global", 
-        "value": 0.5295, 
+        "value": 0.5298, 
         "severity": 0, 
-        "severity_value": -0.5295, 
+        "severity_value": -0.5298, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_global\n  Worst score: 0.5295%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_global\n  Worst score: 0.5298%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat emd_max_global", 
-        "value": 0.5295, 
+        "value": 0.5298, 
         "severity": 0, 
-        "severity_value": 0.26475, 
+        "severity_value": 0.2649, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_global\n  Best score: 0.5295%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_global\n  Best score: 0.5298%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols emd_max_global", 
-        "value": -2.5698, 
-        "severity": 2, 
-        "severity_value": 2.5698, 
+        "value": 0.5339, 
+        "severity": 0, 
+        "severity_value": -0.5339, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_global\n  Worst score: -2.5698%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_global\n  Worst score: 0.5339%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols emd_max_global", 
-        "value": -2.5698, 
+        "value": 0.5339, 
         "severity": 0, 
-        "severity_value": -1.2849, 
+        "severity_value": 0.26695, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_global\n  Best score: -2.5698%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_global\n  Best score: 0.5339%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score gaussnorm emd_max_global", 
-        "value": 0.4733, 
+        "value": 0.4736, 
         "severity": 0, 
-        "severity_value": -0.4733, 
+        "severity_value": -0.4736, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_global\n  Worst score: 0.4733%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_global\n  Worst score: 0.4736%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score gaussnorm emd_max_global", 
-        "value": 0.4733, 
+        "value": 0.4736, 
         "severity": 0, 
-        "severity_value": 0.23665, 
+        "severity_value": 0.2368, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_global\n  Best score: 0.4733%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_global\n  Best score: 0.4736%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cytonorm_controls emd_max_global", 
-        "value": 0.6241, 
+        "value": 0.6243, 
         "severity": 0, 
-        "severity_value": -0.6241, 
+        "severity_value": -0.6243, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_global\n  Worst score: 0.6241%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_global\n  Worst score: 0.6243%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cytonorm_controls emd_max_global", 
-        "value": 0.6241, 
+        "value": 0.6243, 
         "severity": 0, 
-        "severity_value": 0.31205, 
+        "severity_value": 0.31215, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_global\n  Best score: 0.6241%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_global\n  Best score: 0.6243%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1553,21 +1553,21 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration n_inconsistent_peaks_ct", 
-        "value": 0.0272, 
+        "value": 0.0278, 
         "severity": 0, 
-        "severity_value": -0.0272, 
+        "severity_value": -0.0278, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0272%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0278%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration n_inconsistent_peaks_ct", 
-        "value": 0.0272, 
+        "value": 0.0278, 
         "severity": 0, 
-        "severity_value": 0.0136, 
+        "severity_value": 0.0139, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.0272%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.0278%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1593,81 +1593,81 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type n_inconsistent_peaks_ct", 
-        "value": 0.7687, 
+        "value": 0.75, 
         "severity": 0, 
-        "severity_value": -0.7687, 
+        "severity_value": -0.75, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.7687%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.75%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type n_inconsistent_peaks_ct", 
-        "value": 0.7687, 
+        "value": 0.75, 
         "severity": 0, 
-        "severity_value": 0.38435, 
+        "severity_value": 0.375, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.7687%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.75%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy n_inconsistent_peaks_ct", 
-        "value": 0.8741, 
+        "value": 0.8715, 
         "severity": 0, 
-        "severity_value": -0.8741, 
+        "severity_value": -0.8715, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8741%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8715%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy n_inconsistent_peaks_ct", 
-        "value": 0.8741, 
+        "value": 0.8715, 
         "severity": 0, 
-        "severity_value": 0.43705, 
+        "severity_value": 0.43575, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8741%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8715%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect n_inconsistent_peaks_ct", 
-        "value": 0.8707, 
+        "value": 0.8681, 
         "severity": 0, 
-        "severity_value": -0.8707, 
+        "severity_value": -0.8681, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8707%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8681%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect n_inconsistent_peaks_ct", 
-        "value": 0.8707, 
+        "value": 0.8681, 
         "severity": 0, 
-        "severity_value": 0.43535, 
+        "severity_value": 0.43405, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8707%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8681%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration n_inconsistent_peaks_ct", 
-        "value": 0.8707, 
+        "value": 0.8681, 
         "severity": 0, 
-        "severity_value": -0.8707, 
+        "severity_value": -0.8681, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8707%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8681%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration n_inconsistent_peaks_ct", 
-        "value": 0.8707, 
+        "value": 0.8681, 
         "severity": 0, 
-        "severity_value": 0.43535, 
+        "severity_value": 0.43405, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8707%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8681%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1693,101 +1693,101 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat n_inconsistent_peaks_ct", 
-        "value": 0.8673, 
+        "value": 0.8646, 
         "severity": 0, 
-        "severity_value": -0.8673, 
+        "severity_value": -0.8646, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8673%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8646%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat n_inconsistent_peaks_ct", 
-        "value": 0.8673, 
+        "value": 0.8646, 
         "severity": 0, 
-        "severity_value": 0.43365, 
+        "severity_value": 0.4323, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8673%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8646%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols n_inconsistent_peaks_ct", 
-        "value": 0.8265, 
+        "value": 0.8646, 
         "severity": 0, 
-        "severity_value": -0.8265, 
+        "severity_value": -0.8646, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8265%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8646%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols n_inconsistent_peaks_ct", 
-        "value": 0.8265, 
+        "value": 0.8646, 
         "severity": 0, 
-        "severity_value": 0.41325, 
+        "severity_value": 0.4323, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8265%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8646%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score gaussnorm n_inconsistent_peaks_ct", 
-        "value": 0.8844, 
+        "value": 0.8819, 
         "severity": 0, 
-        "severity_value": -0.8844, 
+        "severity_value": -0.8819, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8844%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8819%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score gaussnorm n_inconsistent_peaks_ct", 
-        "value": 0.8844, 
+        "value": 0.8819, 
         "severity": 0, 
-        "severity_value": 0.4422, 
+        "severity_value": 0.44095, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8844%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8819%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cytonorm_controls n_inconsistent_peaks_ct", 
-        "value": 0.881, 
+        "value": 0.8785, 
         "severity": 0, 
-        "severity_value": -0.881, 
+        "severity_value": -0.8785, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.881%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8785%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cytonorm_controls n_inconsistent_peaks_ct", 
-        "value": 0.881, 
+        "value": 0.8785, 
         "severity": 0, 
-        "severity_value": 0.4405, 
+        "severity_value": 0.43925, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.881%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8785%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration average_batch_r2_global", 
-        "value": 0.5227, 
+        "value": 0.5228, 
         "severity": 0, 
-        "severity_value": -0.5227, 
+        "severity_value": -0.5228, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.5227%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.5228%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration average_batch_r2_global", 
-        "value": 0.5227, 
+        "value": 0.5228, 
         "severity": 0, 
-        "severity_value": 0.26135, 
+        "severity_value": 0.2614, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.5227%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.5228%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1813,81 +1813,81 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type average_batch_r2_global", 
-        "value": 0.7149, 
+        "value": 0.7144, 
         "severity": 0, 
-        "severity_value": -0.7149, 
+        "severity_value": -0.7144, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Worst score: 0.7149%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Worst score: 0.7144%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type average_batch_r2_global", 
-        "value": 0.7149, 
+        "value": 0.7144, 
         "severity": 0, 
-        "severity_value": 0.35745, 
+        "severity_value": 0.3572, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Best score: 0.7149%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Best score: 0.7144%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy average_batch_r2_global", 
-        "value": 0.7588, 
+        "value": 0.7585, 
         "severity": 0, 
-        "severity_value": -0.7588, 
+        "severity_value": -0.7585, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Worst score: 0.7588%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Worst score: 0.7585%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy average_batch_r2_global", 
-        "value": 0.7588, 
+        "value": 0.7585, 
         "severity": 0, 
-        "severity_value": 0.3794, 
+        "severity_value": 0.37925, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Best score: 0.7588%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Best score: 0.7585%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect average_batch_r2_global", 
-        "value": 0.7621, 
+        "value": 0.7619, 
         "severity": 0, 
-        "severity_value": -0.7621, 
+        "severity_value": -0.7619, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Worst score: 0.7621%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Worst score: 0.7619%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect average_batch_r2_global", 
-        "value": 0.7621, 
+        "value": 0.7619, 
         "severity": 0, 
-        "severity_value": 0.38105, 
+        "severity_value": 0.38095, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Best score: 0.7621%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Best score: 0.7619%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration average_batch_r2_global", 
-        "value": 0.2169, 
+        "value": 0.2159, 
         "severity": 0, 
-        "severity_value": -0.2169, 
+        "severity_value": -0.2159, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.2169%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.2159%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration average_batch_r2_global", 
-        "value": 0.2169, 
+        "value": 0.2159, 
         "severity": 0, 
-        "severity_value": 0.10845, 
+        "severity_value": 0.10795, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.2169%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.2159%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -1913,101 +1913,101 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat average_batch_r2_global", 
-        "value": 0.7542, 
+        "value": 0.754, 
         "severity": 0, 
-        "severity_value": -0.7542, 
+        "severity_value": -0.754, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Worst score: 0.7542%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Worst score: 0.754%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat average_batch_r2_global", 
-        "value": 0.7542, 
+        "value": 0.754, 
         "severity": 0, 
-        "severity_value": 0.3771, 
+        "severity_value": 0.377, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Best score: 0.7542%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Best score: 0.754%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols average_batch_r2_global", 
-        "value": -9.7697, 
-        "severity": 3, 
-        "severity_value": 9.7697, 
+        "value": 0.6772, 
+        "severity": 0, 
+        "severity_value": -0.6772, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Worst score: -9.7697%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Worst score: 0.6772%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols average_batch_r2_global", 
-        "value": -9.7697, 
+        "value": 0.6772, 
         "severity": 0, 
-        "severity_value": -4.88485, 
+        "severity_value": 0.3386, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Best score: -9.7697%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Best score: 0.6772%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score gaussnorm average_batch_r2_global", 
-        "value": 0.5404, 
+        "value": 0.5398, 
         "severity": 0, 
-        "severity_value": -0.5404, 
+        "severity_value": -0.5398, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_global\n  Worst score: 0.5404%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_global\n  Worst score: 0.5398%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score gaussnorm average_batch_r2_global", 
-        "value": 0.5404, 
+        "value": 0.5398, 
         "severity": 0, 
-        "severity_value": 0.2702, 
+        "severity_value": 0.2699, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_global\n  Best score: 0.5404%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_global\n  Best score: 0.5398%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cytonorm_controls average_batch_r2_global", 
-        "value": 0.7639, 
+        "value": 0.7636, 
         "severity": 0, 
-        "severity_value": -0.7639, 
+        "severity_value": -0.7636, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_global\n  Worst score: 0.7639%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_global\n  Worst score: 0.7636%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cytonorm_controls average_batch_r2_global", 
-        "value": 0.7639, 
+        "value": 0.7636, 
         "severity": 0, 
-        "severity_value": 0.38195, 
+        "severity_value": 0.3818, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_global\n  Best score: 0.7639%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_global\n  Best score: 0.7636%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration average_batch_r2_ct", 
-        "value": 0.0627, 
+        "value": 0.0641, 
         "severity": 0, 
-        "severity_value": -0.0627, 
+        "severity_value": -0.0641, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0627%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0641%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration average_batch_r2_ct", 
-        "value": 0.0627, 
+        "value": 0.0641, 
         "severity": 0, 
-        "severity_value": 0.03135, 
+        "severity_value": 0.03205, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.0627%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.0641%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -2033,81 +2033,81 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type average_batch_r2_ct", 
-        "value": 0.8416, 
+        "value": 0.843, 
         "severity": 0, 
-        "severity_value": -0.8416, 
+        "severity_value": -0.843, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Worst score: 0.8416%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Worst score: 0.843%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type average_batch_r2_ct", 
-        "value": 0.8416, 
+        "value": 0.843, 
         "severity": 0, 
-        "severity_value": 0.4208, 
+        "severity_value": 0.4215, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Best score: 0.8416%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Best score: 0.843%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy average_batch_r2_ct", 
-        "value": 0.7975, 
+        "value": 0.7966, 
         "severity": 0, 
-        "severity_value": -0.7975, 
+        "severity_value": -0.7966, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7975%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7966%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy average_batch_r2_ct", 
-        "value": 0.7975, 
+        "value": 0.7966, 
         "severity": 0, 
-        "severity_value": 0.39875, 
+        "severity_value": 0.3983, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Best score: 0.7975%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Best score: 0.7966%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect average_batch_r2_ct", 
-        "value": 0.755, 
+        "value": 0.754, 
         "severity": 0, 
-        "severity_value": -0.755, 
+        "severity_value": -0.754, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Worst score: 0.755%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Worst score: 0.754%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect average_batch_r2_ct", 
-        "value": 0.755, 
+        "value": 0.754, 
         "severity": 0, 
-        "severity_value": 0.3775, 
+        "severity_value": 0.377, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Best score: 0.755%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Best score: 0.754%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration average_batch_r2_ct", 
-        "value": 0.7073, 
+        "value": 0.706, 
         "severity": 0, 
-        "severity_value": -0.7073, 
+        "severity_value": -0.706, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7073%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.706%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration average_batch_r2_ct", 
-        "value": 0.7073, 
+        "value": 0.706, 
         "severity": 0, 
-        "severity_value": 0.35365, 
+        "severity_value": 0.353, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.7073%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.706%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -2133,80 +2133,80 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat average_batch_r2_ct", 
-        "value": 0.7595, 
+        "value": 0.7585, 
         "severity": 0, 
-        "severity_value": -0.7595, 
+        "severity_value": -0.7585, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7595%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7585%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat average_batch_r2_ct", 
-        "value": 0.7595, 
+        "value": 0.7585, 
         "severity": 0, 
-        "severity_value": 0.37975, 
+        "severity_value": 0.37925, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Best score: 0.7595%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Best score: 0.7585%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols average_batch_r2_ct", 
-        "value": -1.3847, 
-        "severity": 1, 
-        "severity_value": 1.3847, 
+        "value": 0.854, 
+        "severity": 0, 
+        "severity_value": -0.854, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Worst score: -1.3847%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Worst score: 0.854%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols average_batch_r2_ct", 
-        "value": -1.3847, 
+        "value": 0.854, 
         "severity": 0, 
-        "severity_value": -0.69235, 
+        "severity_value": 0.427, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Best score: -1.3847%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Best score: 0.854%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score gaussnorm average_batch_r2_ct", 
-        "value": 0.7243, 
+        "value": 0.7231, 
         "severity": 0, 
-        "severity_value": -0.7243, 
+        "severity_value": -0.7231, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7243%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7231%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score gaussnorm average_batch_r2_ct", 
-        "value": 0.7243, 
+        "value": 0.7231, 
         "severity": 0, 
-        "severity_value": 0.36215, 
+        "severity_value": 0.36155, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_ct\n  Best score: 0.7243%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_ct\n  Best score: 0.7231%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cytonorm_controls average_batch_r2_ct", 
-        "value": 0.8645, 
+        "value": 0.8639, 
         "severity": 0, 
-        "severity_value": -0.8645, 
+        "severity_value": -0.8639, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_ct\n  Worst score: 0.8645%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_ct\n  Worst score: 0.8639%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cytonorm_controls average_batch_r2_ct", 
-        "value": 0.8645, 
+        "value": 0.8639, 
         "severity": 0, 
-        "severity_value": 0.43225, 
+        "severity_value": 0.43195, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_ct\n  Best score: 0.8645%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_ct\n  Best score: 0.8639%\n"
     }
 ]
\ No newline at end of file
diff --git a/results/cyto_batch_integration/data/results.json b/results/cyto_batch_integration/data/results.json
index c51e6461..99155bc5 100644
--- a/results/cyto_batch_integration/data/results.json
+++ b/results/cyto_batch_integration/data/results.json
@@ -13,22 +13,22 @@
       "n_inconsistent_peaks_ct": 39
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.7595,
-      "average_batch_r2_global": 0.7542,
-      "emd_max_ct": 0.5453,
-      "emd_max_global": 0.5295,
-      "emd_mean_ct": 0.7767,
-      "emd_mean_global": 0.6014,
+      "average_batch_r2_ct": 0.7585,
+      "average_batch_r2_global": 0.754,
+      "emd_max_ct": 0.5458,
+      "emd_max_global": 0.5298,
+      "emd_mean_ct": 0.7765,
+      "emd_mean_global": 0.6011,
       "n_inconsistent_peaks": 0.625,
-      "n_inconsistent_peaks_ct": 0.8673
+      "n_inconsistent_peaks_ct": 0.8646
     },
-    "mean_score": 0.6824,
+    "mean_score": 0.6819,
     "resources": {
-      "submit": "2025-05-15 11:06:35",
+      "submit": "2025-05-22 05:46:32",
       "exit_code": 0,
-      "duration_sec": 100,
-      "cpu_pct": 238.2,
-      "peak_memory_mb": 10752,
+      "duration_sec": 92,
+      "cpu_pct": 138.9,
+      "peak_memory_mb": 6656,
       "disk_read_mb": 512,
       "disk_write_mb": 808
     }
@@ -37,34 +37,34 @@
     "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "cycombine_nocontrols",
     "metric_values": {
-      "average_batch_r2_ct": 0.5186,
-      "average_batch_r2_global": 0.3526,
-      "emd_max_ct": 2.9856,
-      "emd_max_global": 2.4106,
-      "emd_mean_ct": 1.1746,
-      "emd_mean_global": 1.2999,
+      "average_batch_r2_ct": 0.0316,
+      "average_batch_r2_global": 0.0106,
+      "emd_max_ct": 1.0715,
+      "emd_max_global": 0.3149,
+      "emd_mean_ct": 0.1069,
+      "emd_mean_global": 0.0785,
       "n_inconsistent_peaks": 2,
-      "n_inconsistent_peaks_ct": 51
+      "n_inconsistent_peaks_ct": 39
     },
     "scaled_scores": {
-      "average_batch_r2_ct": -1.3847,
-      "average_batch_r2_global": -9.7697,
-      "emd_max_ct": -0.1133,
-      "emd_max_global": -2.5698,
-      "emd_mean_ct": -0.945,
-      "emd_mean_global": -5.9154,
+      "average_batch_r2_ct": 0.854,
+      "average_batch_r2_global": 0.6772,
+      "emd_max_ct": 0.6009,
+      "emd_max_global": 0.5339,
+      "emd_mean_ct": 0.8229,
+      "emd_mean_global": 0.5821,
       "n_inconsistent_peaks": 0.75,
-      "n_inconsistent_peaks_ct": 0.8265
+      "n_inconsistent_peaks_ct": 0.8646
     },
-    "mean_score": 0.1971,
+    "mean_score": 0.7107,
     "resources": {
-      "submit": "2025-05-15 11:06:35",
+      "submit": "2025-05-22 05:46:32",
       "exit_code": 0,
-      "duration_sec": 363,
-      "cpu_pct": 103.3,
-      "peak_memory_mb": 15156,
+      "duration_sec": 389,
+      "cpu_pct": 102.6,
+      "peak_memory_mb": 15872,
       "disk_read_mb": 520,
-      "disk_write_mb": 816
+      "disk_write_mb": 808
     }
   },
   {
@@ -81,22 +81,22 @@
       "n_inconsistent_peaks_ct": 35
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.8645,
-      "average_batch_r2_global": 0.7639,
-      "emd_max_ct": 0.6823,
-      "emd_max_global": 0.6241,
-      "emd_mean_ct": 0.8328,
-      "emd_mean_global": 0.6441,
+      "average_batch_r2_ct": 0.8639,
+      "average_batch_r2_global": 0.7636,
+      "emd_max_ct": 0.6826,
+      "emd_max_global": 0.6243,
+      "emd_mean_ct": 0.8327,
+      "emd_mean_global": 0.6438,
       "n_inconsistent_peaks": 0.75,
-      "n_inconsistent_peaks_ct": 0.881
+      "n_inconsistent_peaks_ct": 0.8785
     },
-    "mean_score": 0.7553,
+    "mean_score": 0.7549,
     "resources": {
-      "submit": "2025-05-15 11:06:35",
+      "submit": "2025-05-22 05:46:32",
       "exit_code": 0,
-      "duration_sec": 835,
-      "cpu_pct": 101.4,
-      "peak_memory_mb": 13312,
+      "duration_sec": 990,
+      "cpu_pct": 100.6,
+      "peak_memory_mb": 13415,
       "disk_read_mb": 2151,
       "disk_write_mb": 2356
     }
@@ -115,22 +115,22 @@
       "n_inconsistent_peaks_ct": 34
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.7243,
-      "average_batch_r2_global": 0.5404,
-      "emd_max_ct": 0.5412,
-      "emd_max_global": 0.4733,
-      "emd_mean_ct": 0.7424,
-      "emd_mean_global": 0.457,
+      "average_batch_r2_ct": 0.7231,
+      "average_batch_r2_global": 0.5398,
+      "emd_max_ct": 0.5418,
+      "emd_max_global": 0.4736,
+      "emd_mean_ct": 0.7422,
+      "emd_mean_global": 0.4566,
       "n_inconsistent_peaks": 0.625,
-      "n_inconsistent_peaks_ct": 0.8844
+      "n_inconsistent_peaks_ct": 0.8819
     },
-    "mean_score": 0.6235,
+    "mean_score": 0.623,
     "resources": {
-      "submit": "2025-05-15 11:06:35",
+      "submit": "2025-05-22 05:46:32",
       "exit_code": 0,
-      "duration_sec": 429,
-      "cpu_pct": 98.7,
-      "peak_memory_mb": 12391,
+      "duration_sec": 396,
+      "cpu_pct": 101.6,
+      "peak_memory_mb": 9319,
       "disk_read_mb": 957,
       "disk_write_mb": 1127
     }
@@ -149,22 +149,22 @@
       "n_inconsistent_peaks_ct": 37
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.7975,
-      "average_batch_r2_global": 0.7588,
-      "emd_max_ct": 0.5602,
-      "emd_max_global": 0.5859,
-      "emd_mean_ct": 0.7864,
-      "emd_mean_global": 0.5999,
+      "average_batch_r2_ct": 0.7966,
+      "average_batch_r2_global": 0.7585,
+      "emd_max_ct": 0.5606,
+      "emd_max_global": 0.5861,
+      "emd_mean_ct": 0.7862,
+      "emd_mean_global": 0.5995,
       "n_inconsistent_peaks": 0.75,
-      "n_inconsistent_peaks_ct": 0.8741
+      "n_inconsistent_peaks_ct": 0.8715
     },
-    "mean_score": 0.7141,
+    "mean_score": 0.7137,
     "resources": {
-      "submit": "2025-05-15 11:06:35",
+      "submit": "2025-05-22 05:46:32",
       "exit_code": 0,
-      "duration_sec": 4939,
-      "cpu_pct": 484.8,
-      "peak_memory_mb": 30823,
+      "duration_sec": 3454,
+      "cpu_pct": 398.9,
+      "peak_memory_mb": 24884,
       "disk_read_mb": 498,
       "disk_write_mb": 466
     }
@@ -183,22 +183,22 @@
       "n_inconsistent_peaks_ct": 38
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.755,
-      "average_batch_r2_global": 0.7621,
-      "emd_max_ct": 0.5541,
-      "emd_max_global": 0.5722,
-      "emd_mean_ct": 0.7724,
-      "emd_mean_global": 0.5893,
+      "average_batch_r2_ct": 0.754,
+      "average_batch_r2_global": 0.7619,
+      "emd_max_ct": 0.5546,
+      "emd_max_global": 0.5724,
+      "emd_mean_ct": 0.7721,
+      "emd_mean_global": 0.5889,
       "n_inconsistent_peaks": 0.75,
-      "n_inconsistent_peaks_ct": 0.8707
+      "n_inconsistent_peaks_ct": 0.8681
     },
-    "mean_score": 0.7032,
+    "mean_score": 0.7027,
     "resources": {
-      "submit": "2025-05-15 11:06:35",
+      "submit": "2025-05-22 05:46:32",
       "exit_code": 0,
-      "duration_sec": 102,
-      "cpu_pct": 110.9,
-      "peak_memory_mb": 11572,
+      "duration_sec": 94,
+      "cpu_pct": 103.9,
+      "peak_memory_mb": 7578,
       "disk_read_mb": 495,
       "disk_write_mb": 554
     }
@@ -217,21 +217,21 @@
       "n_inconsistent_peaks_ct": 38
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.7073,
-      "average_batch_r2_global": 0.2169,
-      "emd_max_ct": 0.5386,
-      "emd_max_global": 0.237,
-      "emd_mean_ct": 0.7454,
-      "emd_mean_global": 0.3614,
+      "average_batch_r2_ct": 0.706,
+      "average_batch_r2_global": 0.2159,
+      "emd_max_ct": 0.5391,
+      "emd_max_global": 0.2374,
+      "emd_mean_ct": 0.7451,
+      "emd_mean_global": 0.3608,
       "n_inconsistent_peaks": 0.75,
-      "n_inconsistent_peaks_ct": 0.8707
+      "n_inconsistent_peaks_ct": 0.8681
     },
-    "mean_score": 0.5534,
+    "mean_score": 0.5528,
     "resources": {
-      "submit": "2025-05-15 11:06:35",
+      "submit": "2025-05-22 05:46:32",
       "exit_code": 0,
-      "duration_sec": 48.4,
-      "cpu_pct": 116.7,
+      "duration_sec": 52.8,
+      "cpu_pct": 101.9,
       "peak_memory_mb": 6656,
       "disk_read_mb": 486,
       "disk_write_mb": 466
@@ -262,10 +262,10 @@
     },
     "mean_score": 1,
     "resources": {
-      "submit": "2025-05-15 11:06:35",
+      "submit": "2025-05-22 05:46:32",
       "exit_code": 0,
-      "duration_sec": 35.1,
-      "cpu_pct": 109.9,
+      "duration_sec": 38.7,
+      "cpu_pct": 100.7,
       "peak_memory_mb": 6247,
       "disk_read_mb": 321,
       "disk_write_mb": 302
@@ -275,31 +275,31 @@
     "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration",
     "metric_values": {
-      "average_batch_r2_ct": 0.2038,
+      "average_batch_r2_ct": 0.2027,
       "average_batch_r2_global": 0.0156,
-      "emd_max_ct": 2.5607,
-      "emd_max_global": 0.5874,
-      "emd_mean_ct": 0.5893,
-      "emd_mean_global": 0.1505,
+      "emd_max_ct": 2.5651,
+      "emd_max_global": 0.587,
+      "emd_mean_ct": 0.5885,
+      "emd_mean_global": 0.1507,
       "n_inconsistent_peaks": 8,
-      "n_inconsistent_peaks_ct": 286
+      "n_inconsistent_peaks_ct": 280
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.0627,
-      "average_batch_r2_global": 0.5227,
-      "emd_max_ct": 0.0451,
-      "emd_max_global": 0.1301,
-      "emd_mean_ct": 0.0241,
-      "emd_mean_global": 0.1994,
+      "average_batch_r2_ct": 0.0641,
+      "average_batch_r2_global": 0.5228,
+      "emd_max_ct": 0.0446,
+      "emd_max_global": 0.1312,
+      "emd_mean_ct": 0.0245,
+      "emd_mean_global": 0.1979,
       "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0.0272
+      "n_inconsistent_peaks_ct": 0.0278
     },
-    "mean_score": 0.1264,
+    "mean_score": 0.1266,
     "resources": {
-      "submit": "2025-05-15 11:06:35",
+      "submit": "2025-05-22 05:46:32",
       "exit_code": 0,
-      "duration_sec": 54.8,
-      "cpu_pct": 98,
+      "duration_sec": 59.6,
+      "cpu_pct": 96.5,
       "peak_memory_mb": 7783,
       "disk_read_mb": 486,
       "disk_write_mb": 481
@@ -309,14 +309,14 @@
     "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration_by_batch",
     "metric_values": {
-      "average_batch_r2_ct": 0.2175,
+      "average_batch_r2_ct": 0.2165,
       "average_batch_r2_global": 0.0327,
-      "emd_max_ct": 2.6817,
-      "emd_max_global": 0.6753,
-      "emd_mean_ct": 0.6039,
-      "emd_mean_global": 0.188,
+      "emd_max_ct": 2.6847,
+      "emd_max_global": 0.6756,
+      "emd_mean_ct": 0.6033,
+      "emd_mean_global": 0.1878,
       "n_inconsistent_peaks": 8,
-      "n_inconsistent_peaks_ct": 294
+      "n_inconsistent_peaks_ct": 288
     },
     "scaled_scores": {
       "average_batch_r2_ct": 0,
@@ -330,11 +330,11 @@
     },
     "mean_score": 0,
     "resources": {
-      "submit": "2025-05-15 11:06:35",
+      "submit": "2025-05-22 05:46:32",
       "exit_code": 0,
-      "duration_sec": 54,
-      "cpu_pct": 96.9,
-      "peak_memory_mb": 7476,
+      "duration_sec": 53.6,
+      "cpu_pct": 88.7,
+      "peak_memory_mb": 3482,
       "disk_read_mb": 486,
       "disk_write_mb": 480
     }
@@ -343,32 +343,32 @@
     "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration_by_cell_type",
     "metric_values": {
-      "average_batch_r2_ct": 0.0344,
+      "average_batch_r2_ct": 0.034,
       "average_batch_r2_global": 0.0093,
-      "emd_max_ct": 1.232,
-      "emd_max_global": 0.279,
-      "emd_mean_ct": 0.1335,
-      "emd_mean_global": 0.0907,
+      "emd_max_ct": 1.2351,
+      "emd_max_global": 0.2816,
+      "emd_mean_ct": 0.1322,
+      "emd_mean_global": 0.0906,
       "n_inconsistent_peaks": 4,
-      "n_inconsistent_peaks_ct": 68
+      "n_inconsistent_peaks_ct": 72
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.8416,
-      "average_batch_r2_global": 0.7149,
-      "emd_max_ct": 0.5406,
-      "emd_max_global": 0.5869,
-      "emd_mean_ct": 0.779,
-      "emd_mean_global": 0.5174,
+      "average_batch_r2_ct": 0.843,
+      "average_batch_r2_global": 0.7144,
+      "emd_max_ct": 0.54,
+      "emd_max_global": 0.5831,
+      "emd_mean_ct": 0.7809,
+      "emd_mean_global": 0.5178,
       "n_inconsistent_peaks": 0.5,
-      "n_inconsistent_peaks_ct": 0.7687
+      "n_inconsistent_peaks_ct": 0.75
     },
-    "mean_score": 0.6561,
+    "mean_score": 0.6537,
     "resources": {
-      "submit": "2025-05-15 11:06:35",
+      "submit": "2025-05-22 05:46:32",
       "exit_code": 0,
-      "duration_sec": 58.3,
-      "cpu_pct": 105.1,
-      "peak_memory_mb": 7476,
+      "duration_sec": 54.6,
+      "cpu_pct": 95,
+      "peak_memory_mb": 3482,
       "disk_read_mb": 486,
       "disk_write_mb": 480
     }

From f2a388f7cbddade088d1f1aadf723fb4f031056a Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Fri, 23 May 2025 17:02:37 +0200
Subject: [PATCH 14/14] update data

---
 .../data/dataset_info.json                    |    2 +-
 .../data/method_info.json                     |   70 +-
 .../data/metric_execution_info.json           |  498 ++--
 .../data/metric_info.json                     |  107 +-
 .../data/quality_control.json                 | 2276 ++++++++++++-----
 .../cyto_batch_integration/data/results.json  |  452 ++--
 6 files changed, 2351 insertions(+), 1054 deletions(-)

diff --git a/results/cyto_batch_integration/data/dataset_info.json b/results/cyto_batch_integration/data/dataset_info.json
index 8a42df39..1b4f6782 100644
--- a/results/cyto_batch_integration/data/dataset_info.json
+++ b/results/cyto_batch_integration/data/dataset_info.json
@@ -6,7 +6,7 @@
     "dataset_description": "Flow cytometry data of spleens from 4 WT (IKK2 fl/fl CD11c-cre +/+) and 4 KO (IKK2 fl/fl CD11c-cre Tg/+) B6 mice, measured with a 22-color panel and 2 different instrument settings. Data has been preprocessed (compensated with a batch-specific compensation matrix, logicle transformed, cleaned with PeacoQC and pregated on live single CD45+ cells).",
     "data_reference": null,
     "data_url": "https://saeyslab.sites.vib.be/en",
-    "date_created": "22-05-2025",
+    "date_created": "23-05-2025",
     "file_size": 489781536
   }
 ]
diff --git a/results/cyto_batch_integration/data/method_info.json b/results/cyto_batch_integration/data/method_info.json
index b8e6339e..e8343b10 100644
--- a/results/cyto_batch_integration/data/method_info.json
+++ b/results/cyto_batch_integration/data/method_info.json
@@ -11,9 +11,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/control_methods/shuffle_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/control_methods/shuffle_integration",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f"
   },
   {
     "task_id": "control_methods",
@@ -27,9 +27,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_batch:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/control_methods/shuffle_integration_by_batch",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/control_methods/shuffle_integration_by_batch",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f"
   },
   {
     "task_id": "control_methods",
@@ -43,9 +43,9 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/shuffle_integration_by_cell_type:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/control_methods/shuffle_integration_by_cell_type",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/control_methods/shuffle_integration_by_cell_type",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f"
   },
   {
     "task_id": "methods",
@@ -59,9 +59,9 @@
     "code_url": "https://github.com/slowkow/harmonypy",
     "documentation_url": "https://portals.broadinstitute.org/harmony",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/harmonypy:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/methods/harmonypy",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/methods/harmonypy",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f"
   },
   {
     "task_id": "methods",
@@ -75,9 +75,9 @@
     "code_url": "https://github.com/bioc/limma",
     "documentation_url": "https://bioinf.wehi.edu.au/limma",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/limma_remove_batch_effect:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/methods/limma_remove_batch_effect",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/methods/limma_remove_batch_effect",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f"
   },
   {
     "task_id": "control_methods",
@@ -91,25 +91,41 @@
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/no_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/control_methods/no_integration",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/control_methods/no_integration",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f"
   },
   {
     "task_id": "control_methods",
-    "method_id": "perfect_integration",
-    "method_name": "Perfect Integration",
-    "method_summary": "Positive control method which imitates what perfect batch integration.",
-    "method_description": "The method actually just return the validation data but just changing the batch\nand sample ID to those that are in the unintegrated_censored.\nBecause the marker expression is the exactly same as the validation data, there won't\nbe any batch effect present.\n",
+    "method_id": "perfect_integration_horizontal",
+    "method_name": "Perfect Integration Horizontal",
+    "method_summary": "Positive control method for horizontal metrics which represents perfect batch integration.",
+    "method_description": "The method simply returns the validation data, changing only the batch\nand sample ID to those that are in the unintegrated_censored.\nBecause the marker expression is exactly the same as the validation data, there won't\nbe any batch effect present when computing horizontal metrics.\nBatch effect will be present when computing vertical metrics as the validation data\ncontains samples from different batches, unintegrated.\n",
     "is_baseline": true,
     "references_doi": null,
     "references_bibtex": null,
     "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
     "documentation_url": null,
-    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/perfect_integration:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/control_methods/perfect_integration",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/perfect_integration_horizontal:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/control_methods/perfect_integration_horizontal",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f"
+  },
+  {
+    "task_id": "control_methods",
+    "method_id": "perfect_integration_vertical",
+    "method_name": "Perfect Integration Vertical",
+    "method_summary": "Positive control method for vertical metrics reflecting a scenario in which all samples belong to the same batch",
+    "method_description": "This control method returns all samples from batch 1.\nBecause the samples all came from one batch, we do not expect to see any technical\nvariation caused by batch effects, but we still expect a sample-level effect due to the \nunderlying differences in biology of the samples.\nThe vertical metrics should return a good score.\nHowever, poor scores are expected for horizontal metrics because some samples (those\nfrom unintegrated data) will be compared against the validation data, which still \ncontains variation due to batch effect.\n",
+    "is_baseline": true,
+    "references_doi": null,
+    "references_bibtex": null,
+    "code_url": "https://github.com/openproblems-bio/task_cyto_batch_integration",
+    "documentation_url": null,
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/control_methods/perfect_integration_vertical:build_main",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/control_methods/perfect_integration_vertical",
+    "code_version": "build_main",
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f"
   },
   {
     "task_id": "methods",
@@ -123,9 +139,9 @@
     "code_url": "https://github.com/brentp/combat.py",
     "documentation_url": "https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/combat:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/methods/combat",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/methods/combat",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f"
   },
   {
     "task_id": "methods",
@@ -139,9 +155,9 @@
     "code_url": "https://github.com/biosurf/cyCombine",
     "documentation_url": "https://biosurf.org/cyCombine.html",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/cycombine_nocontrols:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/methods/cycombine_nocontrols",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/methods/cycombine_nocontrols",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f"
   },
   {
     "task_id": "methods",
@@ -155,9 +171,9 @@
     "code_url": "https://github.com/RGLab/flowStats",
     "documentation_url": "https://rdrr.io/bioc/flowStats/src/R/gaussNorm.R",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/gaussnorm:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/methods/gaussnorm",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/methods/gaussnorm",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f"
   },
   {
     "task_id": "methods",
@@ -171,8 +187,8 @@
     "code_url": "https://github.com/saeyslab/CytoNorm",
     "documentation_url": "https://github.com/saeyslab/CytoNorm",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/methods/cytonorm_controls:build_main",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/methods/cytonorm_controls",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/methods/cytonorm_controls",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4"
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f"
   }
 ]
diff --git a/results/cyto_batch_integration/data/metric_execution_info.json b/results/cyto_batch_integration/data/metric_execution_info.json
index b11bbe09..967136fc 100644
--- a/results/cyto_batch_integration/data/metric_execution_info.json
+++ b/results/cyto_batch_integration/data/metric_execution_info.json
@@ -4,11 +4,11 @@
     "method_id": "combat",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-22 05:49:12",
+      "submit": "2025-05-23 12:46:51",
       "exit_code": 0,
-      "duration_sec": 224,
-      "cpu_pct": 390,
-      "peak_memory_mb": 9728,
+      "duration_sec": 184,
+      "cpu_pct": 174.1,
+      "peak_memory_mb": 5735,
       "disk_read_mb": 3278,
       "disk_write_mb": 2
     }
@@ -18,13 +18,27 @@
     "method_id": "combat",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-22 05:49:12",
+      "submit": "2025-05-23 12:46:51",
+      "exit_code": 0,
+      "duration_sec": 912,
+      "cpu_pct": 97.7,
+      "peak_memory_mb": 5837,
+      "disk_read_mb": 9834,
+      "disk_write_mb": 6
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "combat",
+    "metric_component_name": "flowsom_mapping_similarity",
+    "resources": {
+      "submit": "2025-05-23 12:46:51",
       "exit_code": 0,
-      "duration_sec": 444,
-      "cpu_pct": 102.2,
-      "peak_memory_mb": 7680,
-      "disk_read_mb": 6556,
-      "disk_write_mb": 4
+      "duration_sec": 795,
+      "cpu_pct": 101,
+      "peak_memory_mb": 10343,
+      "disk_read_mb": 2253,
+      "disk_write_mb": 696
     }
   },
   {
@@ -32,11 +46,11 @@
     "method_id": "combat",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-22 05:49:12",
+      "submit": "2025-05-23 12:46:51",
       "exit_code": 0,
-      "duration_sec": 1994,
-      "cpu_pct": 2554.4,
-      "peak_memory_mb": 9524,
+      "duration_sec": 1292,
+      "cpu_pct": 736.9,
+      "peak_memory_mb": 5530,
       "disk_read_mb": 3278,
       "disk_write_mb": 2
     }
@@ -46,10 +60,10 @@
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-22 05:58:31",
+      "submit": "2025-05-23 12:56:51",
       "exit_code": 0,
-      "duration_sec": 190,
-      "cpu_pct": 171,
+      "duration_sec": 182,
+      "cpu_pct": 175.7,
       "peak_memory_mb": 5735,
       "disk_read_mb": 3278,
       "disk_write_mb": 2
@@ -60,13 +74,27 @@
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-22 05:58:31",
+      "submit": "2025-05-23 12:56:51",
+      "exit_code": 0,
+      "duration_sec": 912,
+      "cpu_pct": 98,
+      "peak_memory_mb": 5940,
+      "disk_read_mb": 9834,
+      "disk_write_mb": 6
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "cycombine_nocontrols",
+    "metric_component_name": "flowsom_mapping_similarity",
+    "resources": {
+      "submit": "2025-05-23 12:56:51",
       "exit_code": 0,
-      "duration_sec": 448,
-      "cpu_pct": 99.2,
-      "peak_memory_mb": 6349,
-      "disk_read_mb": 6556,
-      "disk_write_mb": 4
+      "duration_sec": 773,
+      "cpu_pct": 100.3,
+      "peak_memory_mb": 8704,
+      "disk_read_mb": 2253,
+      "disk_write_mb": 696
     }
   },
   {
@@ -74,10 +102,10 @@
     "method_id": "cycombine_nocontrols",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-22 05:58:31",
+      "submit": "2025-05-23 12:56:51",
       "exit_code": 0,
-      "duration_sec": 1292,
-      "cpu_pct": 735.3,
+      "duration_sec": 1298,
+      "cpu_pct": 733.2,
       "peak_memory_mb": 5428,
       "disk_read_mb": 3278,
       "disk_write_mb": 2
@@ -88,10 +116,10 @@
     "method_id": "cytonorm_controls",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-22 06:08:31",
+      "submit": "2025-05-23 13:03:11",
       "exit_code": 0,
-      "duration_sec": 208,
-      "cpu_pct": 249.5,
+      "duration_sec": 182,
+      "cpu_pct": 260.5,
       "peak_memory_mb": 7066,
       "disk_read_mb": 2664,
       "disk_write_mb": 2
@@ -102,13 +130,27 @@
     "method_id": "cytonorm_controls",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-22 06:08:31",
+      "submit": "2025-05-23 13:03:11",
+      "exit_code": 0,
+      "duration_sec": 912,
+      "cpu_pct": 100.3,
+      "peak_memory_mb": 7271,
+      "disk_read_mb": 7992,
+      "disk_write_mb": 6
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "cytonorm_controls",
+    "metric_component_name": "flowsom_mapping_similarity",
+    "resources": {
+      "submit": "2025-05-23 13:03:11",
       "exit_code": 0,
-      "duration_sec": 460,
-      "cpu_pct": 102.1,
-      "peak_memory_mb": 10445,
-      "disk_read_mb": 5328,
-      "disk_write_mb": 4
+      "duration_sec": 767,
+      "cpu_pct": 100.3,
+      "peak_memory_mb": 9012,
+      "disk_read_mb": 2048,
+      "disk_write_mb": 696
     }
   },
   {
@@ -116,10 +158,10 @@
     "method_id": "cytonorm_controls",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-22 06:08:31",
+      "submit": "2025-05-23 13:03:11",
       "exit_code": 0,
-      "duration_sec": 1318,
-      "cpu_pct": 3544.8,
+      "duration_sec": 1476,
+      "cpu_pct": 3346.1,
       "peak_memory_mb": 9524,
       "disk_read_mb": 2664,
       "disk_write_mb": 2
@@ -130,11 +172,11 @@
     "method_id": "gaussnorm",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-22 05:57:41",
+      "submit": "2025-05-23 12:57:31",
       "exit_code": 0,
-      "duration_sec": 188,
-      "cpu_pct": 173.5,
-      "peak_memory_mb": 5735,
+      "duration_sec": 196,
+      "cpu_pct": 390.3,
+      "peak_memory_mb": 9728,
       "disk_read_mb": 2868,
       "disk_write_mb": 2
     }
@@ -144,13 +186,27 @@
     "method_id": "gaussnorm",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-22 05:57:41",
+      "submit": "2025-05-23 12:57:31",
       "exit_code": 0,
-      "duration_sec": 456,
-      "cpu_pct": 97,
-      "peak_memory_mb": 6349,
-      "disk_read_mb": 5736,
-      "disk_write_mb": 4
+      "duration_sec": 990,
+      "cpu_pct": 104.4,
+      "peak_memory_mb": 9933,
+      "disk_read_mb": 8604,
+      "disk_write_mb": 6
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "gaussnorm",
+    "metric_component_name": "flowsom_mapping_similarity",
+    "resources": {
+      "submit": "2025-05-23 12:57:31",
+      "exit_code": 0,
+      "duration_sec": 942,
+      "cpu_pct": 101.1,
+      "peak_memory_mb": 12903,
+      "disk_read_mb": 2151,
+      "disk_write_mb": 696
     }
   },
   {
@@ -158,10 +214,10 @@
     "method_id": "gaussnorm",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-22 05:57:41",
+      "submit": "2025-05-23 12:57:31",
       "exit_code": 0,
-      "duration_sec": 1302,
-      "cpu_pct": 733.7,
+      "duration_sec": 1290,
+      "cpu_pct": 739.3,
       "peak_memory_mb": 5428,
       "disk_read_mb": 2868,
       "disk_write_mb": 2
@@ -172,10 +228,10 @@
     "method_id": "harmonypy",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-22 06:48:21",
+      "submit": "2025-05-23 13:42:11",
       "exit_code": 0,
       "duration_sec": 176,
-      "cpu_pct": 179.5,
+      "cpu_pct": 176.3,
       "peak_memory_mb": 4608,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
@@ -186,13 +242,27 @@
     "method_id": "harmonypy",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-22 06:48:21",
+      "submit": "2025-05-23 13:42:11",
+      "exit_code": 0,
+      "duration_sec": 858,
+      "cpu_pct": 100,
+      "peak_memory_mb": 4916,
+      "disk_read_mb": 7374,
+      "disk_write_mb": 6
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "harmonypy",
+    "metric_component_name": "flowsom_mapping_similarity",
+    "resources": {
+      "submit": "2025-05-23 13:42:11",
       "exit_code": 0,
-      "duration_sec": 448,
-      "cpu_pct": 99.9,
-      "peak_memory_mb": 6861,
-      "disk_read_mb": 4916,
-      "disk_write_mb": 4
+      "duration_sec": 851,
+      "cpu_pct": 100.6,
+      "peak_memory_mb": 9831,
+      "disk_read_mb": 1946,
+      "disk_write_mb": 696
     }
   },
   {
@@ -200,10 +270,10 @@
     "method_id": "harmonypy",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-22 06:48:21",
+      "submit": "2025-05-23 13:42:11",
       "exit_code": 0,
-      "duration_sec": 1346,
-      "cpu_pct": 1436.2,
+      "duration_sec": 1618,
+      "cpu_pct": 1635.2,
       "peak_memory_mb": 5837,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
@@ -214,11 +284,11 @@
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-22 05:52:11",
+      "submit": "2025-05-23 12:52:31",
       "exit_code": 0,
-      "duration_sec": 204,
-      "cpu_pct": 395.5,
-      "peak_memory_mb": 9728,
+      "duration_sec": 182,
+      "cpu_pct": 175.2,
+      "peak_memory_mb": 5735,
       "disk_read_mb": 2664,
       "disk_write_mb": 2
     }
@@ -228,13 +298,27 @@
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-22 05:52:11",
+      "submit": "2025-05-23 12:52:31",
       "exit_code": 0,
-      "duration_sec": 444,
-      "cpu_pct": 99.3,
-      "peak_memory_mb": 6349,
-      "disk_read_mb": 5328,
-      "disk_write_mb": 4
+      "duration_sec": 918,
+      "cpu_pct": 98.9,
+      "peak_memory_mb": 5940,
+      "disk_read_mb": 7992,
+      "disk_write_mb": 6
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "limma_remove_batch_effect",
+    "metric_component_name": "flowsom_mapping_similarity",
+    "resources": {
+      "submit": "2025-05-23 12:52:31",
+      "exit_code": 0,
+      "duration_sec": 969,
+      "cpu_pct": 101.3,
+      "peak_memory_mb": 12698,
+      "disk_read_mb": 2048,
+      "disk_write_mb": 696
     }
   },
   {
@@ -242,11 +326,11 @@
     "method_id": "limma_remove_batch_effect",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-22 05:52:11",
+      "submit": "2025-05-23 12:52:31",
       "exit_code": 0,
-      "duration_sec": 1326,
-      "cpu_pct": 1423.4,
-      "peak_memory_mb": 6861,
+      "duration_sec": 1834,
+      "cpu_pct": 2662.2,
+      "peak_memory_mb": 9421,
       "disk_read_mb": 2664,
       "disk_write_mb": 2
     }
@@ -256,11 +340,11 @@
     "method_id": "no_integration",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-22 05:52:21",
+      "submit": "2025-05-23 12:46:01",
       "exit_code": 0,
       "duration_sec": 180,
-      "cpu_pct": 177.3,
-      "peak_memory_mb": 4608,
+      "cpu_pct": 271.1,
+      "peak_memory_mb": 5940,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
@@ -270,13 +354,27 @@
     "method_id": "no_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-22 05:52:21",
+      "submit": "2025-05-23 12:46:01",
       "exit_code": 0,
-      "duration_sec": 448,
-      "cpu_pct": 99.5,
-      "peak_memory_mb": 6861,
-      "disk_read_mb": 4916,
-      "disk_write_mb": 4
+      "duration_sec": 870,
+      "cpu_pct": 99.4,
+      "peak_memory_mb": 4916,
+      "disk_read_mb": 7374,
+      "disk_write_mb": 6
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "no_integration",
+    "metric_component_name": "flowsom_mapping_similarity",
+    "resources": {
+      "submit": "2025-05-23 12:46:01",
+      "exit_code": 0,
+      "duration_sec": 799,
+      "cpu_pct": 100.9,
+      "peak_memory_mb": 9933,
+      "disk_read_mb": 1946,
+      "disk_write_mb": 696
     }
   },
   {
@@ -284,67 +382,137 @@
     "method_id": "no_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-22 05:52:21",
+      "submit": "2025-05-23 12:46:01",
       "exit_code": 0,
-      "duration_sec": 1328,
-      "cpu_pct": 1441.4,
-      "peak_memory_mb": 5837,
+      "duration_sec": 1288,
+      "cpu_pct": 735.1,
+      "peak_memory_mb": 4506,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
   },
   {
     "dataset_id": "leomazzi_cyto_spleen",
-    "method_id": "perfect_integration",
+    "method_id": "perfect_integration_horizontal",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-22 05:52:01",
+      "submit": "2025-05-23 12:51:21",
       "exit_code": 0,
-      "duration_sec": 226,
-      "cpu_pct": 205.1,
-      "peak_memory_mb": 4404,
+      "duration_sec": 298,
+      "cpu_pct": 437.8,
+      "peak_memory_mb": 8397,
       "disk_read_mb": 2254,
       "disk_write_mb": 2
     }
   },
   {
     "dataset_id": "leomazzi_cyto_spleen",
-    "method_id": "perfect_integration",
+    "method_id": "perfect_integration_horizontal",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-22 05:52:01",
+      "submit": "2025-05-23 12:51:21",
       "exit_code": 0,
-      "duration_sec": 380,
-      "cpu_pct": 100,
-      "peak_memory_mb": 5325,
-      "disk_read_mb": 4508,
-      "disk_write_mb": 4
+      "duration_sec": 1002,
+      "cpu_pct": 105,
+      "peak_memory_mb": 8704,
+      "disk_read_mb": 6762,
+      "disk_write_mb": 6
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "perfect_integration_horizontal",
+    "metric_component_name": "flowsom_mapping_similarity",
+    "resources": {
+      "submit": "2025-05-23 12:51:21",
+      "exit_code": 0,
+      "duration_sec": 763,
+      "cpu_pct": 100.2,
+      "peak_memory_mb": 7988,
+      "disk_read_mb": 1844,
+      "disk_write_mb": 714
     }
   },
   {
     "dataset_id": "leomazzi_cyto_spleen",
-    "method_id": "perfect_integration",
+    "method_id": "perfect_integration_horizontal",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-22 05:52:01",
+      "submit": "2025-05-23 12:51:21",
       "exit_code": 0,
       "duration_sec": 1306,
-      "cpu_pct": 744.4,
+      "cpu_pct": 746.2,
       "peak_memory_mb": 4301,
       "disk_read_mb": 2254,
       "disk_write_mb": 2
     }
   },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "perfect_integration_vertical",
+    "metric_component_name": "average_batch_r2",
+    "resources": {
+      "submit": "2025-05-23 12:51:11",
+      "exit_code": 0,
+      "duration_sec": 204,
+      "cpu_pct": 208.5,
+      "peak_memory_mb": 5223,
+      "disk_read_mb": 2458,
+      "disk_write_mb": 2
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "perfect_integration_vertical",
+    "metric_component_name": "emd",
+    "resources": {
+      "submit": "2025-05-23 12:51:11",
+      "exit_code": 0,
+      "duration_sec": 876,
+      "cpu_pct": 100.6,
+      "peak_memory_mb": 5325,
+      "disk_read_mb": 7374,
+      "disk_write_mb": 6
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "perfect_integration_vertical",
+    "metric_component_name": "flowsom_mapping_similarity",
+    "resources": {
+      "submit": "2025-05-23 12:51:11",
+      "exit_code": 0,
+      "duration_sec": 969,
+      "cpu_pct": 101.3,
+      "peak_memory_mb": 13005,
+      "disk_read_mb": 1946,
+      "disk_write_mb": 722
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "perfect_integration_vertical",
+    "metric_component_name": "n_inconsistent_peaks",
+    "resources": {
+      "submit": "2025-05-23 12:51:11",
+      "exit_code": 0,
+      "duration_sec": 1448,
+      "cpu_pct": 3488.4,
+      "peak_memory_mb": 9933,
+      "disk_read_mb": 2458,
+      "disk_write_mb": 2
+    }
+  },
   {
     "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-22 05:52:21",
+      "submit": "2025-05-23 12:51:21",
       "exit_code": 0,
-      "duration_sec": 186,
-      "cpu_pct": 265.4,
-      "peak_memory_mb": 5940,
+      "duration_sec": 176,
+      "cpu_pct": 180.3,
+      "peak_memory_mb": 4608,
       "disk_read_mb": 2664,
       "disk_write_mb": 2
     }
@@ -354,13 +522,27 @@
     "method_id": "shuffle_integration",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-22 05:52:21",
+      "submit": "2025-05-23 12:51:21",
       "exit_code": 0,
-      "duration_sec": 420,
-      "cpu_pct": 100.1,
-      "peak_memory_mb": 5530,
-      "disk_read_mb": 4916,
-      "disk_write_mb": 4
+      "duration_sec": 942,
+      "cpu_pct": 101,
+      "peak_memory_mb": 8909,
+      "disk_read_mb": 7374,
+      "disk_write_mb": 6
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "shuffle_integration",
+    "metric_component_name": "flowsom_mapping_similarity",
+    "resources": {
+      "submit": "2025-05-23 12:51:21",
+      "exit_code": 0,
+      "duration_sec": 786,
+      "cpu_pct": 100.8,
+      "peak_memory_mb": 9933,
+      "disk_read_mb": 1946,
+      "disk_write_mb": 696
     }
   },
   {
@@ -368,11 +550,11 @@
     "method_id": "shuffle_integration",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-22 05:52:21",
+      "submit": "2025-05-23 12:51:21",
       "exit_code": 0,
-      "duration_sec": 1324,
-      "cpu_pct": 1441.1,
-      "peak_memory_mb": 5837,
+      "duration_sec": 1282,
+      "cpu_pct": 740.6,
+      "peak_memory_mb": 4506,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
@@ -382,11 +564,11 @@
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-22 05:51:01",
+      "submit": "2025-05-23 12:51:41",
       "exit_code": 0,
-      "duration_sec": 178,
-      "cpu_pct": 180,
-      "peak_memory_mb": 4608,
+      "duration_sec": 172,
+      "cpu_pct": 276.1,
+      "peak_memory_mb": 5940,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
@@ -396,13 +578,27 @@
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-22 05:51:01",
+      "submit": "2025-05-23 12:51:41",
       "exit_code": 0,
-      "duration_sec": 436,
-      "cpu_pct": 104,
-      "peak_memory_mb": 6964,
-      "disk_read_mb": 4916,
-      "disk_write_mb": 4
+      "duration_sec": 876,
+      "cpu_pct": 98.4,
+      "peak_memory_mb": 4916,
+      "disk_read_mb": 7374,
+      "disk_write_mb": 6
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "shuffle_integration_by_batch",
+    "metric_component_name": "flowsom_mapping_similarity",
+    "resources": {
+      "submit": "2025-05-23 12:51:41",
+      "exit_code": 0,
+      "duration_sec": 765,
+      "cpu_pct": 100.3,
+      "peak_memory_mb": 8602,
+      "disk_read_mb": 1946,
+      "disk_write_mb": 696
     }
   },
   {
@@ -410,10 +606,10 @@
     "method_id": "shuffle_integration_by_batch",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-22 05:51:01",
+      "submit": "2025-05-23 12:51:41",
       "exit_code": 0,
-      "duration_sec": 1988,
-      "cpu_pct": 2474.6,
+      "duration_sec": 1824,
+      "cpu_pct": 2834.8,
       "peak_memory_mb": 8500,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
@@ -424,11 +620,11 @@
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "average_batch_r2",
     "resources": {
-      "submit": "2025-05-22 05:51:01",
+      "submit": "2025-05-23 12:51:21",
       "exit_code": 0,
-      "duration_sec": 180,
-      "cpu_pct": 177.2,
-      "peak_memory_mb": 4608,
+      "duration_sec": 186,
+      "cpu_pct": 428.7,
+      "peak_memory_mb": 8602,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
@@ -438,13 +634,27 @@
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "emd",
     "resources": {
-      "submit": "2025-05-22 05:51:01",
+      "submit": "2025-05-23 12:51:21",
       "exit_code": 0,
-      "duration_sec": 440,
+      "duration_sec": 870,
       "cpu_pct": 99.5,
-      "peak_memory_mb": 6861,
-      "disk_read_mb": 4916,
-      "disk_write_mb": 4
+      "peak_memory_mb": 4916,
+      "disk_read_mb": 7374,
+      "disk_write_mb": 6
+    }
+  },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "shuffle_integration_by_cell_type",
+    "metric_component_name": "flowsom_mapping_similarity",
+    "resources": {
+      "submit": "2025-05-23 12:51:21",
+      "exit_code": 0,
+      "duration_sec": 771,
+      "cpu_pct": 100.4,
+      "peak_memory_mb": 8295,
+      "disk_read_mb": 1946,
+      "disk_write_mb": 696
     }
   },
   {
@@ -452,11 +662,11 @@
     "method_id": "shuffle_integration_by_cell_type",
     "metric_component_name": "n_inconsistent_peaks",
     "resources": {
-      "submit": "2025-05-22 05:51:01",
+      "submit": "2025-05-23 12:51:21",
       "exit_code": 0,
-      "duration_sec": 1986,
-      "cpu_pct": 2497.5,
-      "peak_memory_mb": 8500,
+      "duration_sec": 1284,
+      "cpu_pct": 740.3,
+      "peak_memory_mb": 4506,
       "disk_read_mb": 2458,
       "disk_write_mb": 2
     }
diff --git a/results/cyto_batch_integration/data/metric_info.json b/results/cyto_batch_integration/data/metric_info.json
index d4e63040..3b2e119b 100644
--- a/results/cyto_batch_integration/data/metric_info.json
+++ b/results/cyto_batch_integration/data/metric_info.json
@@ -2,61 +2,91 @@
   {
     "task_id": "metrics",
     "component_name": "emd",
-    "metric_id": "emd_mean_ct",
-    "metric_name": "EMD Mean CT",
-    "metric_summary": "Mean Earth Mover Distance across cell types and markers.",
-    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample, cell type, and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same cell type, marker, and a given paired samples.\nThis is then repeated for every cell type, marker, and paired sample.\nFinally, the average of all these EMD values is computed to produce an overall metric score EMD Mean CT.\n\nA high score indicates large overall differences in the distributions of marker expressions \nbetween the paired samples, suggesting poor batch integration.\nA low score means the small differences in marker expression distributions between batches, \nindicating good batch integration.\n",
+    "metric_id": "emd_mean_ct_horiz",
+    "metric_name": "EMD Mean CT Horizontal",
+    "metric_summary": "Mean Earth Mover Distance calculated horizontally across donors for each cell type and marker.",
+    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample, cell type, and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same cell type, marker, and sample pair.\nThis is then repeated for every cell type, marker, and paired sample.\nFinally, the average of all these EMD values is computed and reported as the metric score.\n\nA high score indicates large overall differences in the distributions of marker expressions \nbetween the paired samples, suggesting poor batch integration.\nA low score means small differences in marker expression distributions between batches, \nindicating good batch integration.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f",
     "maximize": false
   },
   {
     "task_id": "metrics",
     "component_name": "emd",
-    "metric_id": "emd_max_ct",
+    "metric_id": "emd_max_ct_horiz",
     "metric_name": "EMD Max CT",
-    "metric_summary": "Max Earth Mover Distance across cell types and markers.",
-    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample, cell type, and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same cell type, marker, and a given paired samples.\nThis is then repeated for every cell type, marker, and paired sample.\nFinally, the maximum of all these EMD values is computed as EMD Max CT.\n\nEMD Max CT score reflects the largest difference in marker expression distributions across all cell types, \nmarkers, and paired samples.\nA high score indicates that at least one marker, cell type, or sample pair has a large difference in \ndistribution after batch integration.\nA low score means that even the most poorly corrected marker expression is well integrated across batches.    \n",
+    "metric_summary": "Max Earth Mover Distance calculated horizontally across donors for each cell type and marker.",
+    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample, cell type, and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same cell type, marker, and sample pair.\nThis is then repeated for every cell type, marker, and paired sample.\nFinally, the maximum of all these EMD values is computed and reported as the metric score.\n\nThe EMD Max CT score reflects the largest difference in marker expression distributions across all cell types, \nmarkers, and paired samples.\nA high score indicates that at least one marker, cell type, or sample pair has a large difference in \ndistribution after batch integration.\nA low score means that even the most poorly corrected marker expression is well integrated across batches.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f",
     "maximize": false
   },
   {
     "task_id": "metrics",
     "component_name": "emd",
-    "metric_id": "emd_mean_global",
-    "metric_name": "EMD Mean Global",
-    "metric_summary": "Mean Earth Mover Distance across samples and markers.",
-    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same cell type, marker, and a given paired samples.\nThis is then repeated for every marker and paired sample.\nFinally, the average of all these EMD values is computed to produce an overall metric score EMD Mean Global.\n\nA high score indicates that at least one marker and cell type in a given sample pair has a \nlarge difference in distribution after batch integration.\nA low score means that the most poorly corrected marker expression is well integrated across batches.   \n",
+    "metric_id": "emd_mean_global_horiz",
+    "metric_name": "EMD Mean Global Horizontal",
+    "metric_summary": "Mean Earth Mover Distance calculated horizontally across donors for each marker.",
+    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same marker and sample pair.\nThis is then repeated for every marker and paired sample.\nFinally, the average of all these EMD values is computed and reported as the metric score.\n\nThe key difference between this and `emd_mean_ct_horiz` is that the EMD values are\ncomputed agnostic of cell types.\n\nA high score indicates large overall differences in the distributions of marker expressions \nbetween the paired samples, suggesting poor batch integration.\nA low score means small differences in marker expression distributions between batches, \nindicating good batch integration.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f",
     "maximize": false
   },
   {
     "task_id": "metrics",
     "component_name": "emd",
-    "metric_id": "emd_max_global",
-    "metric_name": "EMD Max Global",
-    "metric_summary": "Max Earth Mover Distance across donors and markers.",
-    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same cell type, marker, and a given paired samples.\nThis is then repeated for every cell type, marker, and paired sample.\nFinally, the maximum of all these EMD values is computed as EMD Max Global.\n\nEMD Max Global score reflects the largest difference in marker expression distributions \nacross all markers and paired samples.\nA high score indicates that at least one marker in a given sample pair has a large difference in \ndistribution after batch integration.\nA low score means that the most poorly corrected marker expression is well integrated across batches.   \n",
+    "metric_id": "emd_max_global_horiz",
+    "metric_name": "EMD Max Global Horizontal",
+    "metric_summary": "Max Earth Mover Distance calculated horizontally across donors for each marker.",
+    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between paired samples from the same donor \nquantified across two different batches. \nFor each paired sample and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same marker and sample pair.\nThis is then repeated for every marker and paired sample.\nFinally, the maximum of all these EMD values is computed and reported as the metric score.\n\nThe key difference between this and `emd_max_ct_horiz` is that the EMD values are\ncomputed agnostic of cell types.\n\nA high score indicates that at least one marker in a given sample pair has a large difference in \ndistribution after batch integration.\nA low score means that even the most poorly corrected marker expression is well integrated across batches.\n",
     "references_doi": "10.1023/A:1026543900054",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/emd",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/metrics/emd",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f",
+    "maximize": false
+  },
+  {
+    "task_id": "metrics",
+    "component_name": "emd",
+    "metric_id": "emd_mean_global_vert",
+    "metric_name": "EMD Mean Global Vertical",
+    "metric_summary": "Mean Earth Mover Distance across batch corrected samples and markers.",
+    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between all integrated samples.\nFor each pair of samples and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same marker and pair of samples.\nThis is then repeated for every marker and pair of samples.\nFinally, the average of all these EMD values is computed and reported as the metric score.\n\nA high score indicates that, overall, there are large differences in the distributions of marker expression after batch integration.\nA low score means that, overall, the samples are well integrated.\n",
+    "references_doi": "10.1023/A:1026543900054",
+    "references_bibtex": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/metrics/emd",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
+    "code_version": "build_main",
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f",
+    "maximize": false
+  },
+  {
+    "task_id": "metrics",
+    "component_name": "emd",
+    "metric_id": "emd_max_global_vert",
+    "metric_name": "EMD Max Global Vertical",
+    "metric_summary": "Max Earth Mover Distance across batch corrected samples and markers.",
+    "metric_description": "Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference \nbetween two probability distributions. \n\nHere, EMD is used to compare marker expression distributions between all integrated samples.\nFor each pair of samples and marker, the marker expression values are first converted into \nprobability distributions. \nThis is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.\nThe `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two \nprobability distributions belonging to the same marker and pair of samples.\nThis is then repeated for every marker and pair of samples.\nFinally, the maximum of all these EMD values is computed and reported as the metric score.\n\nA high score indicates that at least one marker in at least one pair of samples shows a large difference in distribution after batch integration.\nA low score means that even the worst-integrated combination of sample pair and marker is well integrated.\n",
+    "references_doi": "10.1023/A:1026543900054",
+    "references_bibtex": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/metrics/emd",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/emd:build_main",
+    "code_version": "build_main",
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f",
     "maximize": false
   },
   {
@@ -68,10 +98,10 @@
     "metric_description": "The metric compares the number of marker expression peaks between the validation and batch-normalized data. \nThe number of peaks is calculated using the `scipy.signal.find_peaks` function. \nThe metric is calculated as the absolute difference between the number of peaks in the validation and batch-normalized data.\nThe marker expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),\nand then peaks are then identified using the `scipy.signal.find_peaks` function.\nFor peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density.\n",
     "references_doi": "10.1038/s41592-019-0686-2",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/n_inconsistent_peaks",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/metrics/n_inconsistent_peaks",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/n_inconsistent_peaks:build_main",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f",
     "maximize": false
   },
   {
@@ -83,10 +113,10 @@
     "metric_description": "The metric compares the number of cell type specific marker expression peaks between the validation and batch-normalized data. \nThe number of peaks is calculated using the `scipy.signal.find_peaks` function. \nThe metric is calculated as the absolute difference between the number of peaks in the validation and batch-normalized data.\nThe (cell type) marker expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),\nand then peaks are then identified using the `scipy.signal.find_peaks` function.\nFor peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density.\n",
     "references_doi": "10.1038/s41592-019-0686-2",
     "references_bibtex": null,
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/n_inconsistent_peaks",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/metrics/n_inconsistent_peaks",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/n_inconsistent_peaks:build_main",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f",
     "maximize": false
   },
   {
@@ -98,10 +128,10 @@
     "metric_description": "First, a simple linear model `sklearn.linear_model.LinearRegression` is fitted for each paired sample and marker to determine the fraction of variance (R^2) explained by the batch covariate B. |\nThe average batch R_squared is then computed as the average of the $R^2$ values across all paired samples, markers. |\nAs a result, $\\overline{R^2_B}_{global}$ quantifies how much of the total variability in the data is driven by batch effects. Consequently, lower values are desirable. |\n\n$\\overline{R^2_B}_{global} = \\frac{1}{N*M}\\sum_{\\substack{(x_{\\mathrm{int}},\\,x_{\\mathrm{val}})\\\\ \\text{paired samples}}}^{N} \\sum_{i=1}^{M} \\,R^2\\!\\bigl(\\mathrm{marker}_i \\mid B\\bigr)$\n\nWhere:\n- $N$ is the number of paired samples, where x_{\\mathrm{int}} is the replicate that has been batch-corrected and x_{\\mathrm{val}} is replicate used for validation. Paired samples belong to different batches.\n- $M$ is the number of markers\n- $B$ is the batch covariate\n\nA higher value of $\\overline{R^2_B}_{global}$ indicates that the batch variable explains more of the variance in the data, which indicates a higher level of batch effects. |\n",
     "references_doi": null,
     "references_bibtex": "@book{draper1998applied,\ntitle={Applied regression analysis},\nauthor={Draper, Norman R and Smith, Harry},\npublisher={John Wiley \\& Sons}\n}\n",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/average_batch_r2",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/metrics/average_batch_r2",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/average_batch_r2:build_main",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f",
     "maximize": false
   },
   {
@@ -113,10 +143,25 @@
     "metric_description": "First, a simple linear model `sklearn.linear_model.LinearRegression` is fitted for each paired sample, marker and cell type to determine the fraction of variance (R^2) explained by the batch covariate B. |\nThe average batch R_squared is then computed as the average of the $R^2$ values across all paired samples, markers and cell types. |\nAs a result, $\\overline{R^2_B}_{cell\\ type}$ quantifies how much of the total variability in the data is driven by batch effects. Consequently, lower values are desirable. |\n\n$\\overline{R^2_B}_{cell\\ type} = \\frac{1}{N*C*M}\\sum_{\\substack{(x_{\\mathrm{int}},\\,x_{\\mathrm{val}})\\\\ \\text{paired samples}}}^{N} \\sum_{j=1}^{C} \\sum_{i=1}^{M}\\,R^2\\!\\bigl(\\mathrm{marker}_i \\mid B\\bigr)$\n\nWhere:\n- $N$ is the number of paired samples, where x_{\\mathrm{int}} is the replicate that has been batch-corrected and x_{\\mathrm{val}} is replicate used for validation. Paired samples belong to different batches.\n- $C$ is the number of cell types\n- $M$ is the number of markers\n- $B$ is the batch covariate\n\nThe $\\overline{Rˆ2_B}_{global}$ is a variation of the latter metric, where the average is computed across paired samples and markers only, without taking into account the cell types. |\n\nA higher value of $\\overline{R^2_B}_{global}$ or $\\overline{R^2_B}_{cell\\ type}$ indicates that the batch variable explains more of the variance in the data, which indicates a higher level of batch effects. |\n\nA good performance on $\\overline{R^2_B}_{global}$ but not on $\\overline{R^2_B}_{cell\\ type}$ might indicate that the batch effect correction is discarding cell type specific batch effects. |\n",
     "references_doi": null,
     "references_bibtex": "@book{draper1998applied,\ntitle={Applied regression analysis},\nauthor={Draper, Norman R and Smith, Harry},\npublisher={John Wiley \\& Sons}\n}\n",
-    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/8c9d7dac5bb329aa1b788a76154f7035ae4b83b4/src/metrics/average_batch_r2",
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/metrics/average_batch_r2",
     "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/average_batch_r2:build_main",
     "code_version": "build_main",
-    "commit_sha": "8c9d7dac5bb329aa1b788a76154f7035ae4b83b4",
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f",
     "maximize": false
+  },
+  {
+    "task_id": "metrics",
+    "component_name": "flowsom_mapping_similarity",
+    "metric_id": "flowsom_mean_mapping_similarity",
+    "metric_name": "FlowSOM Mean Mapping Similarity",
+    "metric_summary": "Assess the similarity between FlowSOM trees of integrated and validation samples.",
+    "metric_description": "The metric is based on the FlowSOM algorithm, a popular method which uses self-organizing maps for the visualization/interpretation/clustering of cytometry data. \nThe FlowSOM algorithm creates a tree structure that represents the relationships between different cell populations in the data.\n\nFor each paired sample (where 'int' is the batch-integrated sample and 'val' is the validation sample)\n1. A FlowSOM tree is created using validation data.\n2. Data from the integrated sample is mapped onto the FlowSOM tree created in step 1.\n3. A similarity measure is computed by comparing cell type proportions of 'val' and 'int' in each metacluster.\n\nIdeally, the proportions of cell types in the metaclusters of the integrated sample should be very similar to those in the validation sample,\nas we assume that only technical variability is present between these two samples.\n\nThe FlowSOM mapping similarity measure can be expressed as follows:\n$\\text{FlowSOM mapping similarity} = 100 - \\text{FlowSOM mapping dissimilarity}$\n\nThe $\\text{FlowSOM mapping dissimilarity}$ is:\n\n$\\text{FlowSOM mapping dissimilarity} = \\sum_{m=1}^{M}w_{m}\\sum_{c=1}^{C}\\left|P^{val}_{m,c} - P^{int}_{m,c}\\right|$\n\nWhere:\n- $M$ is the number of metaclusters\n- $C$ is the number of cell types\n- $w_{m}$ is the weight of metacluster $m$ (the number of cells in metacluster $m$, for both validation and integrated samples, divided by the total number of cells)\n- $P^{val}_{m,c}$ is the percentage of cell type $c$ in metacluster $m$ of the validation sample\n- $P^{int}_{m,c}$ is the percentage of cell type $c$ in metacluster $m$ of the integrated sample\n\nThe average FlowSOM mapping similarity among all paired samples is computed and used as the final metric value.\nIt is a horizontal metric.\n",
+    "references_doi": ["10.18129/B9.bioc.FlowSOM", "10.1002/cyto.a.22625"],
+    "references_bibtex": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_cyto_batch_integration/blob/37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f/src/metrics/flowsom_mapping_similarity",
+    "image": "https://ghcr.io/openproblems-bio/task_cyto_batch_integration/metrics/flowsom_mapping_similarity:build_main",
+    "code_version": "build_main",
+    "commit_sha": "37dcf0c34b0aa64c16d7d82bc631ff6684e37c5f",
+    "maximize": true
   }
 ]
diff --git a/results/cyto_batch_integration/data/quality_control.json b/results/cyto_batch_integration/data/quality_control.json
index 62c4bfda..c24f2ab7 100644
--- a/results/cyto_batch_integration/data/quality_control.json
+++ b/results/cyto_batch_integration/data/quality_control.json
@@ -93,7 +93,7 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Method info", 
         "name": "Pct 'paper_reference' missing", 
-        "value": 0.5454545454545454, 
+        "value": 0.5, 
         "severity": 2, 
         "severity_value": 3.0, 
         "code": "percent_missing(method_info, field)", 
@@ -243,51 +243,71 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw data", 
         "name": "Number of results", 
-        "value": 11, 
+        "value": 12, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "len(results) == len(method_info) * len(metric_info) * len(dataset_info)", 
-        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: task_cyto_batch_integration\n  Number of results: 11\n  Number of methods: 11\n  Number of metrics: 8\n  Number of datasets: 1\n"
+        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: task_cyto_batch_integration\n  Number of results: 12\n  Number of methods: 12\n  Number of metrics: 11\n  Number of datasets: 1\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Metric 'emd_mean_ct_horiz' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_mean_ct_horiz\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Metric 'emd_max_ct_horiz' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_max_ct_horiz\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
-        "name": "Metric 'emd_mean_ct' %missing", 
+        "name": "Metric 'emd_mean_global_horiz' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_mean_ct\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_mean_global_horiz\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
-        "name": "Metric 'emd_max_ct' %missing", 
+        "name": "Metric 'emd_max_global_horiz' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_max_ct\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_max_global_horiz\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
-        "name": "Metric 'emd_mean_global' %missing", 
+        "name": "Metric 'emd_mean_global_vert' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_mean_global\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_mean_global_vert\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
-        "name": "Metric 'emd_max_global' %missing", 
+        "name": "Metric 'emd_max_global_vert' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_max_global\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: emd_max_global_vert\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -329,6 +349,16 @@
         "code": "pct_missing <= .1", 
         "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: average_batch_r2_ct\n  Percentage missing: 0%\n"
     }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Metric 'flowsom_mean_mapping_similarity' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  Metric id: flowsom_mean_mapping_similarity\n  Percentage missing: 0%\n"
+    }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
@@ -392,12 +422,22 @@
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Raw results", 
-        "name": "Method 'perfect_integration' %missing", 
+        "name": "Method 'perfect_integration_horizontal' %missing", 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "pct_missing <= .1", 
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: perfect_integration_horizontal\n  Percentage missing: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Raw results", 
+        "name": "Method 'perfect_integration_vertical' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: perfect_integration\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_cyto_batch_integration\n  method id: perfect_integration_vertical\n  Percentage missing: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -452,1562 +492,2182 @@
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration emd_mean_ct", 
-        "value": 0.0245, 
+        "name": "Worst score shuffle_integration emd_mean_ct_horiz", 
+        "value": 0.0241, 
         "severity": 0, 
-        "severity_value": -0.0245, 
+        "severity_value": -0.0241, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_ct\n  Worst score: 0.0245%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_ct_horiz\n  Worst score: 0.0241%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration emd_mean_ct", 
-        "value": 0.0245, 
+        "name": "Best score shuffle_integration emd_mean_ct_horiz", 
+        "value": 0.0241, 
         "severity": 0, 
-        "severity_value": 0.01225, 
+        "severity_value": 0.01205, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_ct\n  Best score: 0.0245%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_ct_horiz\n  Best score: 0.0241%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_batch emd_mean_ct", 
+        "name": "Worst score shuffle_integration_by_batch emd_mean_ct_horiz", 
         "value": 0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_ct\n  Worst score: 0%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_ct_horiz\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_batch emd_mean_ct", 
+        "name": "Best score shuffle_integration_by_batch emd_mean_ct_horiz", 
         "value": 0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_ct\n  Best score: 0%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_ct_horiz\n  Best score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_cell_type emd_mean_ct", 
-        "value": 0.7809, 
+        "name": "Worst score shuffle_integration_by_cell_type emd_mean_ct_horiz", 
+        "value": 0.78, 
         "severity": 0, 
-        "severity_value": -0.7809, 
+        "severity_value": -0.78, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_ct\n  Worst score: 0.7809%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_ct_horiz\n  Worst score: 0.78%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_cell_type emd_mean_ct", 
-        "value": 0.7809, 
+        "name": "Best score shuffle_integration_by_cell_type emd_mean_ct_horiz", 
+        "value": 0.78, 
         "severity": 0, 
-        "severity_value": 0.39045, 
+        "severity_value": 0.39, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_ct\n  Best score: 0.7809%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_ct_horiz\n  Best score: 0.78%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score harmonypy emd_mean_ct", 
-        "value": 0.7862, 
+        "name": "Worst score harmonypy emd_mean_ct_horiz", 
+        "value": 0.7864, 
         "severity": 0, 
-        "severity_value": -0.7862, 
+        "severity_value": -0.7864, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_ct\n  Worst score: 0.7862%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_ct_horiz\n  Worst score: 0.7864%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score harmonypy emd_mean_ct", 
-        "value": 0.7862, 
+        "name": "Best score harmonypy emd_mean_ct_horiz", 
+        "value": 0.7864, 
         "severity": 0, 
-        "severity_value": 0.3931, 
+        "severity_value": 0.3932, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_ct\n  Best score: 0.7862%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_ct_horiz\n  Best score: 0.7864%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score limma_remove_batch_effect emd_mean_ct", 
-        "value": 0.7721, 
+        "name": "Worst score limma_remove_batch_effect emd_mean_ct_horiz", 
+        "value": 0.7723, 
         "severity": 0, 
-        "severity_value": -0.7721, 
+        "severity_value": -0.7723, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_ct\n  Worst score: 0.7721%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_ct_horiz\n  Worst score: 0.7723%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score limma_remove_batch_effect emd_mean_ct", 
-        "value": 0.7721, 
+        "name": "Best score limma_remove_batch_effect emd_mean_ct_horiz", 
+        "value": 0.7723, 
         "severity": 0, 
-        "severity_value": 0.38605, 
+        "severity_value": 0.38615, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_ct\n  Best score: 0.7721%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_ct_horiz\n  Best score: 0.7723%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score no_integration emd_mean_ct", 
-        "value": 0.7451, 
+        "name": "Worst score no_integration emd_mean_ct_horiz", 
+        "value": 0.7453, 
         "severity": 0, 
-        "severity_value": -0.7451, 
+        "severity_value": -0.7453, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_ct\n  Worst score: 0.7451%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_ct_horiz\n  Worst score: 0.7453%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score no_integration emd_mean_ct", 
-        "value": 0.7451, 
+        "name": "Best score no_integration emd_mean_ct_horiz", 
+        "value": 0.7453, 
         "severity": 0, 
-        "severity_value": 0.37255, 
+        "severity_value": 0.37265, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_ct\n  Best score: 0.7451%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_ct_horiz\n  Best score: 0.7453%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score perfect_integration emd_mean_ct", 
+        "name": "Worst score perfect_integration_horizontal emd_mean_ct_horiz", 
         "value": 1, 
         "severity": 0, 
         "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean_ct\n  Worst score: 1%\n"
+        "message": "Method perfect_integration_horizontal performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: emd_mean_ct_horiz\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score perfect_integration emd_mean_ct", 
+        "name": "Best score perfect_integration_horizontal emd_mean_ct_horiz", 
         "value": 1, 
         "severity": 0, 
         "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean_ct\n  Best score: 1%\n"
-    }, 
-    {
-        "task_id": "task_cyto_batch_integration", 
-        "category": "Scaling", 
-        "name": "Worst score combat emd_mean_ct", 
-        "value": 0.7765, 
-        "severity": 0, 
-        "severity_value": -0.7765, 
-        "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_ct\n  Worst score: 0.7765%\n"
-    }, 
-    {
-        "task_id": "task_cyto_batch_integration", 
-        "category": "Scaling", 
-        "name": "Best score combat emd_mean_ct", 
-        "value": 0.7765, 
-        "severity": 0, 
-        "severity_value": 0.38825, 
-        "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_ct\n  Best score: 0.7765%\n"
+        "message": "Method perfect_integration_horizontal performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: emd_mean_ct_horiz\n  Best score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cycombine_nocontrols emd_mean_ct", 
-        "value": 0.8229, 
+        "name": "Worst score perfect_integration_vertical emd_mean_ct_horiz", 
+        "value": 0.8783, 
         "severity": 0, 
-        "severity_value": -0.8229, 
+        "severity_value": -0.8783, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_ct\n  Worst score: 0.8229%\n"
+        "message": "Method perfect_integration_vertical performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: emd_mean_ct_horiz\n  Worst score: 0.8783%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cycombine_nocontrols emd_mean_ct", 
-        "value": 0.8229, 
+        "name": "Best score perfect_integration_vertical emd_mean_ct_horiz", 
+        "value": 0.8783, 
         "severity": 0, 
-        "severity_value": 0.41145, 
+        "severity_value": 0.43915, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_ct\n  Best score: 0.8229%\n"
+        "message": "Method perfect_integration_vertical performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: emd_mean_ct_horiz\n  Best score: 0.8783%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score gaussnorm emd_mean_ct", 
-        "value": 0.7422, 
+        "name": "Worst score combat emd_mean_ct_horiz", 
+        "value": 0.7766, 
         "severity": 0, 
-        "severity_value": -0.7422, 
+        "severity_value": -0.7766, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_ct\n  Worst score: 0.7422%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_ct_horiz\n  Worst score: 0.7766%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score gaussnorm emd_mean_ct", 
-        "value": 0.7422, 
+        "name": "Best score combat emd_mean_ct_horiz", 
+        "value": 0.7766, 
         "severity": 0, 
-        "severity_value": 0.3711, 
+        "severity_value": 0.3883, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_ct\n  Best score: 0.7422%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_ct_horiz\n  Best score: 0.7766%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cytonorm_controls emd_mean_ct", 
-        "value": 0.8327, 
+        "name": "Worst score cycombine_nocontrols emd_mean_ct_horiz", 
+        "value": 0.823, 
         "severity": 0, 
-        "severity_value": -0.8327, 
+        "severity_value": -0.823, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_ct\n  Worst score: 0.8327%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_ct_horiz\n  Worst score: 0.823%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cytonorm_controls emd_mean_ct", 
-        "value": 0.8327, 
+        "name": "Best score cycombine_nocontrols emd_mean_ct_horiz", 
+        "value": 0.823, 
         "severity": 0, 
-        "severity_value": 0.41635, 
+        "severity_value": 0.4115, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_ct\n  Best score: 0.8327%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_ct_horiz\n  Best score: 0.823%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration emd_max_ct", 
-        "value": 0.0446, 
+        "name": "Worst score gaussnorm emd_mean_ct_horiz", 
+        "value": 0.7423, 
         "severity": 0, 
-        "severity_value": -0.0446, 
+        "severity_value": -0.7423, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_ct\n  Worst score: 0.0446%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_ct_horiz\n  Worst score: 0.7423%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration emd_max_ct", 
-        "value": 0.0446, 
+        "name": "Best score gaussnorm emd_mean_ct_horiz", 
+        "value": 0.7423, 
         "severity": 0, 
-        "severity_value": 0.0223, 
+        "severity_value": 0.37115, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_ct\n  Best score: 0.0446%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_ct_horiz\n  Best score: 0.7423%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_batch emd_max_ct", 
-        "value": 0, 
+        "name": "Worst score cytonorm_controls emd_mean_ct_horiz", 
+        "value": 0.8328, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.8328, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_ct\n  Worst score: 0%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_ct_horiz\n  Worst score: 0.8328%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_batch emd_max_ct", 
-        "value": 0, 
+        "name": "Best score cytonorm_controls emd_mean_ct_horiz", 
+        "value": 0.8328, 
         "severity": 0, 
-        "severity_value": 0.0, 
+        "severity_value": 0.4164, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_ct\n  Best score: 0%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_ct_horiz\n  Best score: 0.8328%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_cell_type emd_max_ct", 
-        "value": 0.54, 
+        "name": "Worst score shuffle_integration emd_max_ct_horiz", 
+        "value": 0.0338, 
         "severity": 0, 
-        "severity_value": -0.54, 
+        "severity_value": -0.0338, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_ct\n  Worst score: 0.54%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_ct_horiz\n  Worst score: 0.0338%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_cell_type emd_max_ct", 
-        "value": 0.54, 
+        "name": "Best score shuffle_integration emd_max_ct_horiz", 
+        "value": 0.0338, 
         "severity": 0, 
-        "severity_value": 0.27, 
+        "severity_value": 0.0169, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_ct\n  Best score: 0.54%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_ct_horiz\n  Best score: 0.0338%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score harmonypy emd_max_ct", 
-        "value": 0.5606, 
+        "name": "Worst score shuffle_integration_by_batch emd_max_ct_horiz", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": -0.5606, 
+        "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_ct\n  Worst score: 0.5606%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_ct_horiz\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score harmonypy emd_max_ct", 
-        "value": 0.5606, 
+        "name": "Best score shuffle_integration_by_batch emd_max_ct_horiz", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": 0.2803, 
+        "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_ct\n  Best score: 0.5606%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_ct_horiz\n  Best score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score limma_remove_batch_effect emd_max_ct", 
-        "value": 0.5546, 
+        "name": "Worst score shuffle_integration_by_cell_type emd_max_ct_horiz", 
+        "value": 0.5382, 
         "severity": 0, 
-        "severity_value": -0.5546, 
+        "severity_value": -0.5382, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_ct\n  Worst score: 0.5546%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_ct_horiz\n  Worst score: 0.5382%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score limma_remove_batch_effect emd_max_ct", 
-        "value": 0.5546, 
+        "name": "Best score shuffle_integration_by_cell_type emd_max_ct_horiz", 
+        "value": 0.5382, 
         "severity": 0, 
-        "severity_value": 0.2773, 
+        "severity_value": 0.2691, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_ct\n  Best score: 0.5546%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_ct_horiz\n  Best score: 0.5382%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score no_integration emd_max_ct", 
-        "value": 0.5391, 
+        "name": "Worst score harmonypy emd_max_ct_horiz", 
+        "value": 0.559, 
         "severity": 0, 
-        "severity_value": -0.5391, 
+        "severity_value": -0.559, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_ct\n  Worst score: 0.5391%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_ct_horiz\n  Worst score: 0.559%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score no_integration emd_max_ct", 
-        "value": 0.5391, 
+        "name": "Best score harmonypy emd_max_ct_horiz", 
+        "value": 0.559, 
         "severity": 0, 
-        "severity_value": 0.26955, 
+        "severity_value": 0.2795, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_ct\n  Best score: 0.5391%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_ct_horiz\n  Best score: 0.559%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score perfect_integration emd_max_ct", 
-        "value": 1, 
+        "name": "Worst score limma_remove_batch_effect emd_max_ct_horiz", 
+        "value": 0.5529, 
         "severity": 0, 
-        "severity_value": -1.0, 
+        "severity_value": -0.5529, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max_ct\n  Worst score: 1%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_ct_horiz\n  Worst score: 0.5529%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score perfect_integration emd_max_ct", 
-        "value": 1, 
+        "name": "Best score limma_remove_batch_effect emd_max_ct_horiz", 
+        "value": 0.5529, 
         "severity": 0, 
-        "severity_value": 0.5, 
+        "severity_value": 0.27645, 
         "code": "best_score <= 2", 
-        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max_ct\n  Best score: 1%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_ct_horiz\n  Best score: 0.5529%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score combat emd_max_ct", 
-        "value": 0.5458, 
+        "name": "Worst score no_integration emd_max_ct_horiz", 
+        "value": 0.5373, 
         "severity": 0, 
-        "severity_value": -0.5458, 
+        "severity_value": -0.5373, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_ct\n  Worst score: 0.5458%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_ct_horiz\n  Worst score: 0.5373%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score combat emd_max_ct", 
-        "value": 0.5458, 
+        "name": "Best score no_integration emd_max_ct_horiz", 
+        "value": 0.5373, 
         "severity": 0, 
-        "severity_value": 0.2729, 
+        "severity_value": 0.26865, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_ct\n  Best score: 0.5458%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_ct_horiz\n  Best score: 0.5373%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cycombine_nocontrols emd_max_ct", 
-        "value": 0.6009, 
+        "name": "Worst score perfect_integration_horizontal emd_max_ct_horiz", 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": -0.6009, 
+        "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_ct\n  Worst score: 0.6009%\n"
+        "message": "Method perfect_integration_horizontal performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: emd_max_ct_horiz\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cycombine_nocontrols emd_max_ct", 
-        "value": 0.6009, 
+        "name": "Best score perfect_integration_horizontal emd_max_ct_horiz", 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": 0.30045, 
+        "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_ct\n  Best score: 0.6009%\n"
+        "message": "Method perfect_integration_horizontal performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: emd_max_ct_horiz\n  Best score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score gaussnorm emd_max_ct", 
-        "value": 0.5418, 
+        "name": "Worst score perfect_integration_vertical emd_max_ct_horiz", 
+        "value": 0.5979, 
         "severity": 0, 
-        "severity_value": -0.5418, 
+        "severity_value": -0.5979, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_ct\n  Worst score: 0.5418%\n"
+        "message": "Method perfect_integration_vertical performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: emd_max_ct_horiz\n  Worst score: 0.5979%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score gaussnorm emd_max_ct", 
-        "value": 0.5418, 
+        "name": "Best score perfect_integration_vertical emd_max_ct_horiz", 
+        "value": 0.5979, 
         "severity": 0, 
-        "severity_value": 0.2709, 
+        "severity_value": 0.29895, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_ct\n  Best score: 0.5418%\n"
+        "message": "Method perfect_integration_vertical performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: emd_max_ct_horiz\n  Best score: 0.5979%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cytonorm_controls emd_max_ct", 
-        "value": 0.6826, 
+        "name": "Worst score combat emd_max_ct_horiz", 
+        "value": 0.5441, 
         "severity": 0, 
-        "severity_value": -0.6826, 
+        "severity_value": -0.5441, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_ct\n  Worst score: 0.6826%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_ct_horiz\n  Worst score: 0.5441%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cytonorm_controls emd_max_ct", 
-        "value": 0.6826, 
+        "name": "Best score combat emd_max_ct_horiz", 
+        "value": 0.5441, 
         "severity": 0, 
-        "severity_value": 0.3413, 
+        "severity_value": 0.27205, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_ct\n  Best score: 0.6826%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_ct_horiz\n  Best score: 0.5441%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration emd_mean_global", 
-        "value": 0.1979, 
+        "name": "Worst score cycombine_nocontrols emd_max_ct_horiz", 
+        "value": 0.5993, 
         "severity": 0, 
-        "severity_value": -0.1979, 
+        "severity_value": -0.5993, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_global\n  Worst score: 0.1979%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_ct_horiz\n  Worst score: 0.5993%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration emd_mean_global", 
-        "value": 0.1979, 
+        "name": "Best score cycombine_nocontrols emd_max_ct_horiz", 
+        "value": 0.5993, 
         "severity": 0, 
-        "severity_value": 0.09895, 
+        "severity_value": 0.29965, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_global\n  Best score: 0.1979%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_ct_horiz\n  Best score: 0.5993%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_batch emd_mean_global", 
-        "value": 0, 
+        "name": "Worst score gaussnorm emd_max_ct_horiz", 
+        "value": 0.54, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.54, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_global\n  Worst score: 0%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_ct_horiz\n  Worst score: 0.54%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_batch emd_mean_global", 
-        "value": 0, 
+        "name": "Best score gaussnorm emd_max_ct_horiz", 
+        "value": 0.54, 
         "severity": 0, 
-        "severity_value": 0.0, 
+        "severity_value": 0.27, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_global\n  Best score: 0%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_ct_horiz\n  Best score: 0.54%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_cell_type emd_mean_global", 
-        "value": 0.5178, 
+        "name": "Worst score cytonorm_controls emd_max_ct_horiz", 
+        "value": 0.6814, 
         "severity": 0, 
-        "severity_value": -0.5178, 
+        "severity_value": -0.6814, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_global\n  Worst score: 0.5178%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_ct_horiz\n  Worst score: 0.6814%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_cell_type emd_mean_global", 
-        "value": 0.5178, 
+        "name": "Best score cytonorm_controls emd_max_ct_horiz", 
+        "value": 0.6814, 
         "severity": 0, 
-        "severity_value": 0.2589, 
+        "severity_value": 0.3407, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_global\n  Best score: 0.5178%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_ct_horiz\n  Best score: 0.6814%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score harmonypy emd_mean_global", 
-        "value": 0.5995, 
+        "name": "Worst score shuffle_integration emd_mean_global_horiz", 
+        "value": 0.2001, 
         "severity": 0, 
-        "severity_value": -0.5995, 
+        "severity_value": -0.2001, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_global\n  Worst score: 0.5995%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_global_horiz\n  Worst score: 0.2001%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score harmonypy emd_mean_global", 
-        "value": 0.5995, 
+        "name": "Best score shuffle_integration emd_mean_global_horiz", 
+        "value": 0.2001, 
         "severity": 0, 
-        "severity_value": 0.29975, 
+        "severity_value": 0.10005, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_global\n  Best score: 0.5995%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_global_horiz\n  Best score: 0.2001%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score limma_remove_batch_effect emd_mean_global", 
-        "value": 0.5889, 
+        "name": "Worst score shuffle_integration_by_batch emd_mean_global_horiz", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": -0.5889, 
+        "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_global\n  Worst score: 0.5889%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_global_horiz\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score limma_remove_batch_effect emd_mean_global", 
-        "value": 0.5889, 
+        "name": "Best score shuffle_integration_by_batch emd_mean_global_horiz", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": 0.29445, 
+        "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_global\n  Best score: 0.5889%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_global_horiz\n  Best score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score no_integration emd_mean_global", 
-        "value": 0.3608, 
+        "name": "Worst score shuffle_integration_by_cell_type emd_mean_global_horiz", 
+        "value": 0.518, 
         "severity": 0, 
-        "severity_value": -0.3608, 
+        "severity_value": -0.518, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_global\n  Worst score: 0.3608%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_global_horiz\n  Worst score: 0.518%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score no_integration emd_mean_global", 
-        "value": 0.3608, 
+        "name": "Best score shuffle_integration_by_cell_type emd_mean_global_horiz", 
+        "value": 0.518, 
         "severity": 0, 
-        "severity_value": 0.1804, 
+        "severity_value": 0.259, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_global\n  Best score: 0.3608%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_global_horiz\n  Best score: 0.518%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score perfect_integration emd_mean_global", 
-        "value": 1, 
+        "name": "Worst score harmonypy emd_mean_global_horiz", 
+        "value": 0.6002, 
         "severity": 0, 
-        "severity_value": -1.0, 
+        "severity_value": -0.6002, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean_global\n  Worst score: 1%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_global_horiz\n  Worst score: 0.6002%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score perfect_integration emd_mean_global", 
-        "value": 1, 
+        "name": "Best score harmonypy emd_mean_global_horiz", 
+        "value": 0.6002, 
         "severity": 0, 
-        "severity_value": 0.5, 
+        "severity_value": 0.3001, 
         "code": "best_score <= 2", 
-        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_mean_global\n  Best score: 1%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_global_horiz\n  Best score: 0.6002%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score combat emd_mean_global", 
-        "value": 0.6011, 
+        "name": "Worst score limma_remove_batch_effect emd_mean_global_horiz", 
+        "value": 0.5896, 
         "severity": 0, 
-        "severity_value": -0.6011, 
+        "severity_value": -0.5896, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_global\n  Worst score: 0.6011%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_global_horiz\n  Worst score: 0.5896%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score combat emd_mean_global", 
-        "value": 0.6011, 
+        "name": "Best score limma_remove_batch_effect emd_mean_global_horiz", 
+        "value": 0.5896, 
         "severity": 0, 
-        "severity_value": 0.30055, 
+        "severity_value": 0.2948, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_global\n  Best score: 0.6011%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_global_horiz\n  Best score: 0.5896%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cycombine_nocontrols emd_mean_global", 
-        "value": 0.5821, 
+        "name": "Worst score no_integration emd_mean_global_horiz", 
+        "value": 0.3619, 
         "severity": 0, 
-        "severity_value": -0.5821, 
+        "severity_value": -0.3619, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_global\n  Worst score: 0.5821%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_global_horiz\n  Worst score: 0.3619%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cycombine_nocontrols emd_mean_global", 
-        "value": 0.5821, 
+        "name": "Best score no_integration emd_mean_global_horiz", 
+        "value": 0.3619, 
         "severity": 0, 
-        "severity_value": 0.29105, 
+        "severity_value": 0.18095, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_global\n  Best score: 0.5821%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_global_horiz\n  Best score: 0.3619%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score gaussnorm emd_mean_global", 
-        "value": 0.4566, 
+        "name": "Worst score perfect_integration_horizontal emd_mean_global_horiz", 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": -0.4566, 
+        "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_global\n  Worst score: 0.4566%\n"
+        "message": "Method perfect_integration_horizontal performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: emd_mean_global_horiz\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score gaussnorm emd_mean_global", 
-        "value": 0.4566, 
+        "name": "Best score perfect_integration_horizontal emd_mean_global_horiz", 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": 0.2283, 
+        "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_global\n  Best score: 0.4566%\n"
+        "message": "Method perfect_integration_horizontal performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: emd_mean_global_horiz\n  Best score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cytonorm_controls emd_mean_global", 
-        "value": 0.6438, 
+        "name": "Worst score perfect_integration_vertical emd_mean_global_horiz", 
+        "value": 0.6847, 
         "severity": 0, 
-        "severity_value": -0.6438, 
+        "severity_value": -0.6847, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_global\n  Worst score: 0.6438%\n"
+        "message": "Method perfect_integration_vertical performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: emd_mean_global_horiz\n  Worst score: 0.6847%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cytonorm_controls emd_mean_global", 
-        "value": 0.6438, 
+        "name": "Best score perfect_integration_vertical emd_mean_global_horiz", 
+        "value": 0.6847, 
         "severity": 0, 
-        "severity_value": 0.3219, 
+        "severity_value": 0.34235, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_global\n  Best score: 0.6438%\n"
+        "message": "Method perfect_integration_vertical performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: emd_mean_global_horiz\n  Best score: 0.6847%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration emd_max_global", 
-        "value": 0.1312, 
+        "name": "Worst score combat emd_mean_global_horiz", 
+        "value": 0.6018, 
         "severity": 0, 
-        "severity_value": -0.1312, 
+        "severity_value": -0.6018, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_global\n  Worst score: 0.1312%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_global_horiz\n  Worst score: 0.6018%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration emd_max_global", 
-        "value": 0.1312, 
+        "name": "Best score combat emd_mean_global_horiz", 
+        "value": 0.6018, 
         "severity": 0, 
-        "severity_value": 0.0656, 
+        "severity_value": 0.3009, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_global\n  Best score: 0.1312%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_global_horiz\n  Best score: 0.6018%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_batch emd_max_global", 
-        "value": 0, 
+        "name": "Worst score cycombine_nocontrols emd_mean_global_horiz", 
+        "value": 0.5829, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5829, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_global\n  Worst score: 0%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_global_horiz\n  Worst score: 0.5829%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_batch emd_max_global", 
-        "value": 0, 
+        "name": "Best score cycombine_nocontrols emd_mean_global_horiz", 
+        "value": 0.5829, 
         "severity": 0, 
-        "severity_value": 0.0, 
+        "severity_value": 0.29145, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_global\n  Best score: 0%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_global_horiz\n  Best score: 0.5829%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_cell_type emd_max_global", 
-        "value": 0.5831, 
+        "name": "Worst score gaussnorm emd_mean_global_horiz", 
+        "value": 0.4575, 
         "severity": 0, 
-        "severity_value": -0.5831, 
+        "severity_value": -0.4575, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_global\n  Worst score: 0.5831%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_global_horiz\n  Worst score: 0.4575%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_cell_type emd_max_global", 
-        "value": 0.5831, 
+        "name": "Best score gaussnorm emd_mean_global_horiz", 
+        "value": 0.4575, 
         "severity": 0, 
-        "severity_value": 0.29155, 
+        "severity_value": 0.22875, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_global\n  Best score: 0.5831%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_global_horiz\n  Best score: 0.4575%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score harmonypy emd_max_global", 
-        "value": 0.5861, 
+        "name": "Worst score cytonorm_controls emd_mean_global_horiz", 
+        "value": 0.6444, 
         "severity": 0, 
-        "severity_value": -0.5861, 
+        "severity_value": -0.6444, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_global\n  Worst score: 0.5861%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_global_horiz\n  Worst score: 0.6444%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score harmonypy emd_max_global", 
-        "value": 0.5861, 
+        "name": "Best score cytonorm_controls emd_mean_global_horiz", 
+        "value": 0.6444, 
         "severity": 0, 
-        "severity_value": 0.29305, 
+        "severity_value": 0.3222, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_global\n  Best score: 0.5861%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_global_horiz\n  Best score: 0.6444%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score limma_remove_batch_effect emd_max_global", 
-        "value": 0.5724, 
+        "name": "Worst score shuffle_integration emd_max_global_horiz", 
+        "value": 0.1315, 
         "severity": 0, 
-        "severity_value": -0.5724, 
+        "severity_value": -0.1315, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_global\n  Worst score: 0.5724%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_global_horiz\n  Worst score: 0.1315%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score limma_remove_batch_effect emd_max_global", 
-        "value": 0.5724, 
+        "name": "Best score shuffle_integration emd_max_global_horiz", 
+        "value": 0.1315, 
         "severity": 0, 
-        "severity_value": 0.2862, 
+        "severity_value": 0.06575, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_global\n  Best score: 0.5724%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_global_horiz\n  Best score: 0.1315%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score no_integration emd_max_global", 
-        "value": 0.2374, 
+        "name": "Worst score shuffle_integration_by_batch emd_max_global_horiz", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": -0.2374, 
+        "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_global\n  Worst score: 0.2374%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_global_horiz\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score no_integration emd_max_global", 
-        "value": 0.2374, 
+        "name": "Best score shuffle_integration_by_batch emd_max_global_horiz", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": 0.1187, 
+        "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_global\n  Best score: 0.2374%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_global_horiz\n  Best score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score perfect_integration emd_max_global", 
-        "value": 1, 
+        "name": "Worst score shuffle_integration_by_cell_type emd_max_global_horiz", 
+        "value": 0.583, 
         "severity": 0, 
-        "severity_value": -1.0, 
+        "severity_value": -0.583, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max_global\n  Worst score: 1%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_global_horiz\n  Worst score: 0.583%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score perfect_integration emd_max_global", 
-        "value": 1, 
+        "name": "Best score shuffle_integration_by_cell_type emd_max_global_horiz", 
+        "value": 0.583, 
         "severity": 0, 
-        "severity_value": 0.5, 
+        "severity_value": 0.2915, 
         "code": "best_score <= 2", 
-        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: emd_max_global\n  Best score: 1%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_global_horiz\n  Best score: 0.583%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score combat emd_max_global", 
-        "value": 0.5298, 
+        "name": "Worst score harmonypy emd_max_global_horiz", 
+        "value": 0.5859, 
         "severity": 0, 
-        "severity_value": -0.5298, 
+        "severity_value": -0.5859, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_global\n  Worst score: 0.5298%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_global_horiz\n  Worst score: 0.5859%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score combat emd_max_global", 
-        "value": 0.5298, 
+        "name": "Best score harmonypy emd_max_global_horiz", 
+        "value": 0.5859, 
         "severity": 0, 
-        "severity_value": 0.2649, 
+        "severity_value": 0.29295, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_global\n  Best score: 0.5298%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_global_horiz\n  Best score: 0.5859%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cycombine_nocontrols emd_max_global", 
-        "value": 0.5339, 
+        "name": "Worst score limma_remove_batch_effect emd_max_global_horiz", 
+        "value": 0.5721, 
         "severity": 0, 
-        "severity_value": -0.5339, 
+        "severity_value": -0.5721, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_global\n  Worst score: 0.5339%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_global_horiz\n  Worst score: 0.5721%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cycombine_nocontrols emd_max_global", 
-        "value": 0.5339, 
+        "name": "Best score limma_remove_batch_effect emd_max_global_horiz", 
+        "value": 0.5721, 
         "severity": 0, 
-        "severity_value": 0.26695, 
+        "severity_value": 0.28605, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_global\n  Best score: 0.5339%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_global_horiz\n  Best score: 0.5721%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score gaussnorm emd_max_global", 
-        "value": 0.4736, 
+        "name": "Worst score no_integration emd_max_global_horiz", 
+        "value": 0.2369, 
         "severity": 0, 
-        "severity_value": -0.4736, 
+        "severity_value": -0.2369, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_global\n  Worst score: 0.4736%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_global_horiz\n  Worst score: 0.2369%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score gaussnorm emd_max_global", 
-        "value": 0.4736, 
+        "name": "Best score no_integration emd_max_global_horiz", 
+        "value": 0.2369, 
         "severity": 0, 
-        "severity_value": 0.2368, 
+        "severity_value": 0.11845, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_global\n  Best score: 0.4736%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_global_horiz\n  Best score: 0.2369%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cytonorm_controls emd_max_global", 
-        "value": 0.6243, 
+        "name": "Worst score perfect_integration_horizontal emd_max_global_horiz", 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": -0.6243, 
+        "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_global\n  Worst score: 0.6243%\n"
+        "message": "Method perfect_integration_horizontal performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: emd_max_global_horiz\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cytonorm_controls emd_max_global", 
-        "value": 0.6243, 
+        "name": "Best score perfect_integration_horizontal emd_max_global_horiz", 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": 0.31215, 
+        "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_global\n  Best score: 0.6243%\n"
+        "message": "Method perfect_integration_horizontal performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: emd_max_global_horiz\n  Best score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration n_inconsistent_peaks", 
-        "value": 0, 
+        "name": "Worst score perfect_integration_vertical emd_max_global_horiz", 
+        "value": 0.2369, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.2369, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 0%\n"
+        "message": "Method perfect_integration_vertical performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: emd_max_global_horiz\n  Worst score: 0.2369%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration n_inconsistent_peaks", 
-        "value": 0, 
+        "name": "Best score perfect_integration_vertical emd_max_global_horiz", 
+        "value": 0.2369, 
         "severity": 0, 
-        "severity_value": 0.0, 
+        "severity_value": 0.11845, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks\n  Best score: 0%\n"
+        "message": "Method perfect_integration_vertical performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: emd_max_global_horiz\n  Best score: 0.2369%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_batch n_inconsistent_peaks", 
-        "value": 0, 
+        "name": "Worst score combat emd_max_global_horiz", 
+        "value": 0.5295, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.5295, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks\n  Worst score: 0%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_global_horiz\n  Worst score: 0.5295%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_batch n_inconsistent_peaks", 
-        "value": 0, 
+        "name": "Best score combat emd_max_global_horiz", 
+        "value": 0.5295, 
         "severity": 0, 
-        "severity_value": 0.0, 
+        "severity_value": 0.26475, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks\n  Best score: 0%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_global_horiz\n  Best score: 0.5295%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_cell_type n_inconsistent_peaks", 
-        "value": 0.5, 
+        "name": "Worst score cycombine_nocontrols emd_max_global_horiz", 
+        "value": 0.5337, 
         "severity": 0, 
-        "severity_value": -0.5, 
+        "severity_value": -0.5337, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.5%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_global_horiz\n  Worst score: 0.5337%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_cell_type n_inconsistent_peaks", 
-        "value": 0.5, 
+        "name": "Best score cycombine_nocontrols emd_max_global_horiz", 
+        "value": 0.5337, 
         "severity": 0, 
-        "severity_value": 0.25, 
+        "severity_value": 0.26685, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Best score: 0.5%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_global_horiz\n  Best score: 0.5337%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score harmonypy n_inconsistent_peaks", 
-        "value": 0.75, 
+        "name": "Worst score gaussnorm emd_max_global_horiz", 
+        "value": 0.4733, 
         "severity": 0, 
-        "severity_value": -0.75, 
+        "severity_value": -0.4733, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_global_horiz\n  Worst score: 0.4733%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score harmonypy n_inconsistent_peaks", 
-        "value": 0.75, 
+        "name": "Best score gaussnorm emd_max_global_horiz", 
+        "value": 0.4733, 
         "severity": 0, 
-        "severity_value": 0.375, 
+        "severity_value": 0.23665, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_global_horiz\n  Best score: 0.4733%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score limma_remove_batch_effect n_inconsistent_peaks", 
-        "value": 0.75, 
+        "name": "Worst score cytonorm_controls emd_max_global_horiz", 
+        "value": 0.6241, 
         "severity": 0, 
-        "severity_value": -0.75, 
+        "severity_value": -0.6241, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_global_horiz\n  Worst score: 0.6241%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score limma_remove_batch_effect n_inconsistent_peaks", 
-        "value": 0.75, 
+        "name": "Best score cytonorm_controls emd_max_global_horiz", 
+        "value": 0.6241, 
         "severity": 0, 
-        "severity_value": 0.375, 
+        "severity_value": 0.31205, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_global_horiz\n  Best score: 0.6241%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score no_integration n_inconsistent_peaks", 
-        "value": 0.75, 
+        "name": "Worst score shuffle_integration emd_mean_global_vert", 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": -0.75, 
+        "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_global_vert\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score no_integration n_inconsistent_peaks", 
-        "value": 0.75, 
+        "name": "Best score shuffle_integration emd_mean_global_vert", 
+        "value": 1, 
         "severity": 0, 
-        "severity_value": 0.375, 
+        "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_mean_global_vert\n  Best score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score perfect_integration n_inconsistent_peaks", 
-        "value": 1, 
+        "name": "Worst score shuffle_integration_by_batch emd_mean_global_vert", 
+        "value": 0.6413, 
         "severity": 0, 
-        "severity_value": -1.0, 
+        "severity_value": -0.6413, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 1%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_global_vert\n  Worst score: 0.6413%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score perfect_integration n_inconsistent_peaks", 
-        "value": 1, 
+        "name": "Best score shuffle_integration_by_batch emd_mean_global_vert", 
+        "value": 0.6413, 
         "severity": 0, 
-        "severity_value": 0.5, 
+        "severity_value": 0.32065, 
         "code": "best_score <= 2", 
-        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: n_inconsistent_peaks\n  Best score: 1%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_mean_global_vert\n  Best score: 0.6413%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score combat n_inconsistent_peaks", 
-        "value": 0.625, 
+        "name": "Worst score shuffle_integration_by_cell_type emd_mean_global_vert", 
+        "value": 0.4341, 
         "severity": 0, 
-        "severity_value": -0.625, 
+        "severity_value": -0.4341, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.625%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_global_vert\n  Worst score: 0.4341%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score combat n_inconsistent_peaks", 
-        "value": 0.625, 
+        "name": "Best score shuffle_integration_by_cell_type emd_mean_global_vert", 
+        "value": 0.4341, 
         "severity": 0, 
-        "severity_value": 0.3125, 
+        "severity_value": 0.21705, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Best score: 0.625%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_mean_global_vert\n  Best score: 0.4341%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cycombine_nocontrols n_inconsistent_peaks", 
-        "value": 0.75, 
+        "name": "Worst score harmonypy emd_mean_global_vert", 
+        "value": 0.2491, 
         "severity": 0, 
-        "severity_value": -0.75, 
+        "severity_value": -0.2491, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_global_vert\n  Worst score: 0.2491%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cycombine_nocontrols n_inconsistent_peaks", 
-        "value": 0.75, 
+        "name": "Best score harmonypy emd_mean_global_vert", 
+        "value": 0.2491, 
         "severity": 0, 
-        "severity_value": 0.375, 
+        "severity_value": 0.12455, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_mean_global_vert\n  Best score: 0.2491%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score gaussnorm n_inconsistent_peaks", 
-        "value": 0.625, 
+        "name": "Worst score limma_remove_batch_effect emd_mean_global_vert", 
+        "value": 0.2382, 
         "severity": 0, 
-        "severity_value": -0.625, 
+        "severity_value": -0.2382, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.625%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_global_vert\n  Worst score: 0.2382%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score gaussnorm n_inconsistent_peaks", 
-        "value": 0.625, 
+        "name": "Best score limma_remove_batch_effect emd_mean_global_vert", 
+        "value": 0.2382, 
         "severity": 0, 
-        "severity_value": 0.3125, 
+        "severity_value": 0.1191, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks\n  Best score: 0.625%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_mean_global_vert\n  Best score: 0.2382%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cytonorm_controls n_inconsistent_peaks", 
-        "value": 0.75, 
+        "name": "Worst score no_integration emd_mean_global_vert", 
+        "value": 0.0543, 
         "severity": 0, 
-        "severity_value": -0.75, 
+        "severity_value": -0.0543, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_global_vert\n  Worst score: 0.0543%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cytonorm_controls n_inconsistent_peaks", 
-        "value": 0.75, 
+        "name": "Best score no_integration emd_mean_global_vert", 
+        "value": 0.0543, 
         "severity": 0, 
-        "severity_value": 0.375, 
+        "severity_value": 0.02715, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_mean_global_vert\n  Best score: 0.0543%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration n_inconsistent_peaks_ct", 
-        "value": 0.0278, 
+        "name": "Worst score perfect_integration_horizontal emd_mean_global_vert", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": -0.0278, 
+        "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0278%\n"
+        "message": "Method perfect_integration_horizontal performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: emd_mean_global_vert\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration n_inconsistent_peaks_ct", 
-        "value": 0.0278, 
+        "name": "Best score perfect_integration_horizontal emd_mean_global_vert", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": 0.0139, 
+        "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.0278%\n"
+        "message": "Method perfect_integration_horizontal performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: emd_mean_global_vert\n  Best score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_batch n_inconsistent_peaks_ct", 
-        "value": 0, 
+        "name": "Worst score perfect_integration_vertical emd_mean_global_vert", 
+        "value": 0.1673, 
         "severity": 0, 
-        "severity_value": -0.0, 
+        "severity_value": -0.1673, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0%\n"
+        "message": "Method perfect_integration_vertical performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: emd_mean_global_vert\n  Worst score: 0.1673%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_batch n_inconsistent_peaks_ct", 
-        "value": 0, 
+        "name": "Best score perfect_integration_vertical emd_mean_global_vert", 
+        "value": 0.1673, 
         "severity": 0, 
-        "severity_value": 0.0, 
+        "severity_value": 0.08365, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0%\n"
+        "message": "Method perfect_integration_vertical performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: emd_mean_global_vert\n  Best score: 0.1673%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_cell_type n_inconsistent_peaks_ct", 
-        "value": 0.75, 
+        "name": "Worst score combat emd_mean_global_vert", 
+        "value": 0.2513, 
         "severity": 0, 
-        "severity_value": -0.75, 
+        "severity_value": -0.2513, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.75%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_global_vert\n  Worst score: 0.2513%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_cell_type n_inconsistent_peaks_ct", 
-        "value": 0.75, 
+        "name": "Best score combat emd_mean_global_vert", 
+        "value": 0.2513, 
         "severity": 0, 
-        "severity_value": 0.375, 
+        "severity_value": 0.12565, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.75%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_mean_global_vert\n  Best score: 0.2513%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score harmonypy n_inconsistent_peaks_ct", 
-        "value": 0.8715, 
+        "name": "Worst score cycombine_nocontrols emd_mean_global_vert", 
+        "value": 0.2659, 
         "severity": 0, 
-        "severity_value": -0.8715, 
+        "severity_value": -0.2659, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8715%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_global_vert\n  Worst score: 0.2659%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score harmonypy n_inconsistent_peaks_ct", 
-        "value": 0.8715, 
+        "name": "Best score cycombine_nocontrols emd_mean_global_vert", 
+        "value": 0.2659, 
         "severity": 0, 
-        "severity_value": 0.43575, 
+        "severity_value": 0.13295, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8715%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_mean_global_vert\n  Best score: 0.2659%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score limma_remove_batch_effect n_inconsistent_peaks_ct", 
-        "value": 0.8681, 
+        "name": "Worst score gaussnorm emd_mean_global_vert", 
+        "value": 0.2413, 
         "severity": 0, 
-        "severity_value": -0.8681, 
+        "severity_value": -0.2413, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8681%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_global_vert\n  Worst score: 0.2413%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score limma_remove_batch_effect n_inconsistent_peaks_ct", 
-        "value": 0.8681, 
+        "name": "Best score gaussnorm emd_mean_global_vert", 
+        "value": 0.2413, 
         "severity": 0, 
-        "severity_value": 0.43405, 
+        "severity_value": 0.12065, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8681%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_mean_global_vert\n  Best score: 0.2413%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score no_integration n_inconsistent_peaks_ct", 
-        "value": 0.8681, 
+        "name": "Worst score cytonorm_controls emd_mean_global_vert", 
+        "value": 0.2422, 
         "severity": 0, 
-        "severity_value": -0.8681, 
+        "severity_value": -0.2422, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8681%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_global_vert\n  Worst score: 0.2422%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score no_integration n_inconsistent_peaks_ct", 
-        "value": 0.8681, 
+        "name": "Best score cytonorm_controls emd_mean_global_vert", 
+        "value": 0.2422, 
         "severity": 0, 
-        "severity_value": 0.43405, 
+        "severity_value": 0.1211, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8681%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_mean_global_vert\n  Best score: 0.2422%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score perfect_integration n_inconsistent_peaks_ct", 
+        "name": "Worst score shuffle_integration emd_max_global_vert", 
         "value": 1, 
         "severity": 0, 
         "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 1%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_global_vert\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score perfect_integration n_inconsistent_peaks_ct", 
+        "name": "Best score shuffle_integration emd_max_global_vert", 
         "value": 1, 
         "severity": 0, 
         "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 1%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: emd_max_global_vert\n  Best score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score combat n_inconsistent_peaks_ct", 
-        "value": 0.8646, 
+        "name": "Worst score shuffle_integration_by_batch emd_max_global_vert", 
+        "value": 0.4225, 
         "severity": 0, 
-        "severity_value": -0.8646, 
+        "severity_value": -0.4225, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8646%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_global_vert\n  Worst score: 0.4225%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score combat n_inconsistent_peaks_ct", 
-        "value": 0.8646, 
+        "name": "Best score shuffle_integration_by_batch emd_max_global_vert", 
+        "value": 0.4225, 
         "severity": 0, 
-        "severity_value": 0.4323, 
+        "severity_value": 0.21125, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8646%\n"
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: emd_max_global_vert\n  Best score: 0.4225%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cycombine_nocontrols n_inconsistent_peaks_ct", 
-        "value": 0.8646, 
+        "name": "Worst score shuffle_integration_by_cell_type emd_max_global_vert", 
+        "value": 0.2146, 
         "severity": 0, 
-        "severity_value": -0.8646, 
+        "severity_value": -0.2146, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8646%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_global_vert\n  Worst score: 0.2146%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cycombine_nocontrols n_inconsistent_peaks_ct", 
-        "value": 0.8646, 
+        "name": "Best score shuffle_integration_by_cell_type emd_max_global_vert", 
+        "value": 0.2146, 
         "severity": 0, 
-        "severity_value": 0.4323, 
+        "severity_value": 0.1073, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8646%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: emd_max_global_vert\n  Best score: 0.2146%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score gaussnorm n_inconsistent_peaks_ct", 
-        "value": 0.8819, 
+        "name": "Worst score harmonypy emd_max_global_vert", 
+        "value": 0.1647, 
         "severity": 0, 
-        "severity_value": -0.8819, 
+        "severity_value": -0.1647, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8819%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_global_vert\n  Worst score: 0.1647%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score gaussnorm n_inconsistent_peaks_ct", 
-        "value": 0.8819, 
+        "name": "Best score harmonypy emd_max_global_vert", 
+        "value": 0.1647, 
         "severity": 0, 
-        "severity_value": 0.44095, 
+        "severity_value": 0.08235, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8819%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: emd_max_global_vert\n  Best score: 0.1647%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cytonorm_controls n_inconsistent_peaks_ct", 
-        "value": 0.8785, 
+        "name": "Worst score limma_remove_batch_effect emd_max_global_vert", 
+        "value": 0.1784, 
         "severity": 0, 
-        "severity_value": -0.8785, 
+        "severity_value": -0.1784, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8785%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_global_vert\n  Worst score: 0.1784%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score cytonorm_controls n_inconsistent_peaks_ct", 
-        "value": 0.8785, 
+        "name": "Best score limma_remove_batch_effect emd_max_global_vert", 
+        "value": 0.1784, 
         "severity": 0, 
-        "severity_value": 0.43925, 
+        "severity_value": 0.0892, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8785%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: emd_max_global_vert\n  Best score: 0.1784%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration average_batch_r2_global", 
-        "value": 0.5228, 
+        "name": "Worst score no_integration emd_max_global_vert", 
+        "value": 0.0084, 
         "severity": 0, 
-        "severity_value": -0.5228, 
+        "severity_value": -0.0084, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.5228%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_global_vert\n  Worst score: 0.0084%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration average_batch_r2_global", 
-        "value": 0.5228, 
+        "name": "Best score no_integration emd_max_global_vert", 
+        "value": 0.0084, 
         "severity": 0, 
-        "severity_value": 0.2614, 
+        "severity_value": 0.0042, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.5228%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: emd_max_global_vert\n  Best score: 0.0084%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_batch average_batch_r2_global", 
+        "name": "Worst score perfect_integration_horizontal emd_max_global_vert", 
         "value": 0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: average_batch_r2_global\n  Worst score: 0%\n"
+        "message": "Method perfect_integration_horizontal performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: emd_max_global_vert\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_batch average_batch_r2_global", 
+        "name": "Best score perfect_integration_horizontal emd_max_global_vert", 
         "value": 0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: average_batch_r2_global\n  Best score: 0%\n"
+        "message": "Method perfect_integration_horizontal performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: emd_max_global_vert\n  Best score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score shuffle_integration_by_cell_type average_batch_r2_global", 
-        "value": 0.7144, 
+        "name": "Worst score perfect_integration_vertical emd_max_global_vert", 
+        "value": 0.1784, 
         "severity": 0, 
-        "severity_value": -0.7144, 
+        "severity_value": -0.1784, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Worst score: 0.7144%\n"
+        "message": "Method perfect_integration_vertical performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: emd_max_global_vert\n  Worst score: 0.1784%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score shuffle_integration_by_cell_type average_batch_r2_global", 
-        "value": 0.7144, 
+        "name": "Best score perfect_integration_vertical emd_max_global_vert", 
+        "value": 0.1784, 
         "severity": 0, 
-        "severity_value": 0.3572, 
+        "severity_value": 0.0892, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Best score: 0.7144%\n"
+        "message": "Method perfect_integration_vertical performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: emd_max_global_vert\n  Best score: 0.1784%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score harmonypy average_batch_r2_global", 
-        "value": 0.7585, 
+        "name": "Worst score combat emd_max_global_vert", 
+        "value": 0.1831, 
         "severity": 0, 
-        "severity_value": -0.7585, 
+        "severity_value": -0.1831, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Worst score: 0.7585%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_global_vert\n  Worst score: 0.1831%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score harmonypy average_batch_r2_global", 
-        "value": 0.7585, 
+        "name": "Best score combat emd_max_global_vert", 
+        "value": 0.1831, 
         "severity": 0, 
-        "severity_value": 0.37925, 
+        "severity_value": 0.09155, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Best score: 0.7585%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: emd_max_global_vert\n  Best score: 0.1831%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score limma_remove_batch_effect average_batch_r2_global", 
-        "value": 0.7619, 
+        "name": "Worst score cycombine_nocontrols emd_max_global_vert", 
+        "value": 0.176, 
         "severity": 0, 
-        "severity_value": -0.7619, 
+        "severity_value": -0.176, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Worst score: 0.7619%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_global_vert\n  Worst score: 0.176%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score limma_remove_batch_effect average_batch_r2_global", 
-        "value": 0.7619, 
+        "name": "Best score cycombine_nocontrols emd_max_global_vert", 
+        "value": 0.176, 
         "severity": 0, 
-        "severity_value": 0.38095, 
+        "severity_value": 0.088, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Best score: 0.7619%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: emd_max_global_vert\n  Best score: 0.176%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score no_integration average_batch_r2_global", 
-        "value": 0.2159, 
+        "name": "Worst score gaussnorm emd_max_global_vert", 
+        "value": 0.0301, 
         "severity": 0, 
-        "severity_value": -0.2159, 
+        "severity_value": -0.0301, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.2159%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_global_vert\n  Worst score: 0.0301%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score no_integration average_batch_r2_global", 
-        "value": 0.2159, 
+        "name": "Best score gaussnorm emd_max_global_vert", 
+        "value": 0.0301, 
         "severity": 0, 
-        "severity_value": 0.10795, 
+        "severity_value": 0.01505, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.2159%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: emd_max_global_vert\n  Best score: 0.0301%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score perfect_integration average_batch_r2_global", 
-        "value": 1, 
+        "name": "Worst score cytonorm_controls emd_max_global_vert", 
+        "value": 0.1853, 
         "severity": 0, 
-        "severity_value": -1.0, 
+        "severity_value": -0.1853, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: average_batch_r2_global\n  Worst score: 1%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_global_vert\n  Worst score: 0.1853%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score perfect_integration average_batch_r2_global", 
-        "value": 1, 
+        "name": "Best score cytonorm_controls emd_max_global_vert", 
+        "value": 0.1853, 
         "severity": 0, 
-        "severity_value": 0.5, 
+        "severity_value": 0.09265, 
         "code": "best_score <= 2", 
-        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: average_batch_r2_global\n  Best score: 1%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: emd_max_global_vert\n  Best score: 0.1853%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score combat average_batch_r2_global", 
-        "value": 0.754, 
+        "name": "Worst score shuffle_integration n_inconsistent_peaks", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": -0.754, 
+        "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Worst score: 0.754%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score combat average_batch_r2_global", 
-        "value": 0.754, 
+        "name": "Best score shuffle_integration n_inconsistent_peaks", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": 0.377, 
+        "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Best score: 0.754%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks\n  Best score: 0%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score cycombine_nocontrols average_batch_r2_global", 
-        "value": 0.6772, 
+        "name": "Worst score shuffle_integration_by_batch n_inconsistent_peaks", 
+        "value": 0, 
         "severity": 0, 
-        "severity_value": -0.6772, 
+        "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Worst score: 0.6772%\n"
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_batch n_inconsistent_peaks", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_cell_type n_inconsistent_peaks", 
+        "value": 0.5, 
+        "severity": 0, 
+        "severity_value": -0.5, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.5%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_cell_type n_inconsistent_peaks", 
+        "value": 0.5, 
+        "severity": 0, 
+        "severity_value": 0.25, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks\n  Best score: 0.5%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score harmonypy n_inconsistent_peaks", 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": -0.75, 
+        "code": "worst_score >= -1", 
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score harmonypy n_inconsistent_peaks", 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": 0.375, 
+        "code": "best_score <= 2", 
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect n_inconsistent_peaks", 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": -0.75, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score limma_remove_batch_effect n_inconsistent_peaks", 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": 0.375, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration n_inconsistent_peaks", 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": -0.75, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration n_inconsistent_peaks", 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": 0.375, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration_horizontal n_inconsistent_peaks", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": -1.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration_horizontal performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: n_inconsistent_peaks\n  Worst score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration_horizontal n_inconsistent_peaks", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration_horizontal performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: n_inconsistent_peaks\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration_vertical n_inconsistent_peaks", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": -1.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration_vertical performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: n_inconsistent_peaks\n  Worst score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration_vertical n_inconsistent_peaks", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration_vertical performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: n_inconsistent_peaks\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat n_inconsistent_peaks", 
+        "value": 0.625, 
+        "severity": 0, 
+        "severity_value": -0.625, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.625%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat n_inconsistent_peaks", 
+        "value": 0.625, 
+        "severity": 0, 
+        "severity_value": 0.3125, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks\n  Best score: 0.625%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cycombine_nocontrols n_inconsistent_peaks", 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": -0.75, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cycombine_nocontrols n_inconsistent_peaks", 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": 0.375, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score gaussnorm n_inconsistent_peaks", 
+        "value": 0.625, 
+        "severity": 0, 
+        "severity_value": -0.625, 
+        "code": "worst_score >= -1", 
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.625%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score gaussnorm n_inconsistent_peaks", 
+        "value": 0.625, 
+        "severity": 0, 
+        "severity_value": 0.3125, 
+        "code": "best_score <= 2", 
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks\n  Best score: 0.625%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cytonorm_controls n_inconsistent_peaks", 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": -0.75, 
+        "code": "worst_score >= -1", 
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks\n  Worst score: 0.75%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cytonorm_controls n_inconsistent_peaks", 
+        "value": 0.75, 
+        "severity": 0, 
+        "severity_value": 0.375, 
+        "code": "best_score <= 2", 
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks\n  Best score: 0.75%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration n_inconsistent_peaks_ct", 
+        "value": 0.0034, 
+        "severity": 0, 
+        "severity_value": -0.0034, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.0034%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration n_inconsistent_peaks_ct", 
+        "value": 0.0034, 
+        "severity": 0, 
+        "severity_value": 0.0017, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.0034%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_batch n_inconsistent_peaks_ct", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_batch n_inconsistent_peaks_ct", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_cell_type n_inconsistent_peaks_ct", 
+        "value": 0.7655, 
+        "severity": 0, 
+        "severity_value": -0.7655, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.7655%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_cell_type n_inconsistent_peaks_ct", 
+        "value": 0.7655, 
+        "severity": 0, 
+        "severity_value": 0.38275, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.7655%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score harmonypy n_inconsistent_peaks_ct", 
+        "value": 0.8724, 
+        "severity": 0, 
+        "severity_value": -0.8724, 
+        "code": "worst_score >= -1", 
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8724%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score harmonypy n_inconsistent_peaks_ct", 
+        "value": 0.8724, 
+        "severity": 0, 
+        "severity_value": 0.4362, 
+        "code": "best_score <= 2", 
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8724%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect n_inconsistent_peaks_ct", 
+        "value": 0.869, 
+        "severity": 0, 
+        "severity_value": -0.869, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.869%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score limma_remove_batch_effect n_inconsistent_peaks_ct", 
+        "value": 0.869, 
+        "severity": 0, 
+        "severity_value": 0.4345, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.869%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration n_inconsistent_peaks_ct", 
+        "value": 0.869, 
+        "severity": 0, 
+        "severity_value": -0.869, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.869%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration n_inconsistent_peaks_ct", 
+        "value": 0.869, 
+        "severity": 0, 
+        "severity_value": 0.4345, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.869%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration_horizontal n_inconsistent_peaks_ct", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": -1.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration_horizontal performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration_horizontal n_inconsistent_peaks_ct", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration_horizontal performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration_vertical n_inconsistent_peaks_ct", 
+        "value": 0.9345, 
+        "severity": 0, 
+        "severity_value": -0.9345, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration_vertical performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.9345%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration_vertical n_inconsistent_peaks_ct", 
+        "value": 0.9345, 
+        "severity": 0, 
+        "severity_value": 0.46725, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration_vertical performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.9345%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat n_inconsistent_peaks_ct", 
+        "value": 0.8655, 
+        "severity": 0, 
+        "severity_value": -0.8655, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8655%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat n_inconsistent_peaks_ct", 
+        "value": 0.8655, 
+        "severity": 0, 
+        "severity_value": 0.43275, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8655%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cycombine_nocontrols n_inconsistent_peaks_ct", 
+        "value": 0.8655, 
+        "severity": 0, 
+        "severity_value": -0.8655, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8655%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cycombine_nocontrols n_inconsistent_peaks_ct", 
+        "value": 0.8655, 
+        "severity": 0, 
+        "severity_value": 0.43275, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8655%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score gaussnorm n_inconsistent_peaks_ct", 
+        "value": 0.8828, 
+        "severity": 0, 
+        "severity_value": -0.8828, 
+        "code": "worst_score >= -1", 
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8828%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score gaussnorm n_inconsistent_peaks_ct", 
+        "value": 0.8828, 
+        "severity": 0, 
+        "severity_value": 0.4414, 
+        "code": "best_score <= 2", 
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8828%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cytonorm_controls n_inconsistent_peaks_ct", 
+        "value": 0.8793, 
+        "severity": 0, 
+        "severity_value": -0.8793, 
+        "code": "worst_score >= -1", 
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks_ct\n  Worst score: 0.8793%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cytonorm_controls n_inconsistent_peaks_ct", 
+        "value": 0.8793, 
+        "severity": 0, 
+        "severity_value": 0.43965, 
+        "code": "best_score <= 2", 
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: n_inconsistent_peaks_ct\n  Best score: 0.8793%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration average_batch_r2_global", 
+        "value": 0.5241, 
+        "severity": 0, 
+        "severity_value": -0.5241, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.5241%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration average_batch_r2_global", 
+        "value": 0.5241, 
+        "severity": 0, 
+        "severity_value": 0.26205, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.5241%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_batch average_batch_r2_global", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: average_batch_r2_global\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_batch average_batch_r2_global", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: average_batch_r2_global\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_cell_type average_batch_r2_global", 
+        "value": 0.7157, 
+        "severity": 0, 
+        "severity_value": -0.7157, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Worst score: 0.7157%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_cell_type average_batch_r2_global", 
+        "value": 0.7157, 
+        "severity": 0, 
+        "severity_value": 0.35785, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_global\n  Best score: 0.7157%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score harmonypy average_batch_r2_global", 
+        "value": 0.759, 
+        "severity": 0, 
+        "severity_value": -0.759, 
+        "code": "worst_score >= -1", 
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Worst score: 0.759%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score harmonypy average_batch_r2_global", 
+        "value": 0.759, 
+        "severity": 0, 
+        "severity_value": 0.3795, 
+        "code": "best_score <= 2", 
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_global\n  Best score: 0.759%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect average_batch_r2_global", 
+        "value": 0.7624, 
+        "severity": 0, 
+        "severity_value": -0.7624, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Worst score: 0.7624%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score limma_remove_batch_effect average_batch_r2_global", 
+        "value": 0.7624, 
+        "severity": 0, 
+        "severity_value": 0.3812, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_global\n  Best score: 0.7624%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration average_batch_r2_global", 
+        "value": 0.2176, 
+        "severity": 0, 
+        "severity_value": -0.2176, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Worst score: 0.2176%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration average_batch_r2_global", 
+        "value": 0.2176, 
+        "severity": 0, 
+        "severity_value": 0.1088, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_global\n  Best score: 0.2176%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration_horizontal average_batch_r2_global", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": -1.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration_horizontal performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: average_batch_r2_global\n  Worst score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration_horizontal average_batch_r2_global", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration_horizontal performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: average_batch_r2_global\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration_vertical average_batch_r2_global", 
+        "value": 0.5898, 
+        "severity": 0, 
+        "severity_value": -0.5898, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration_vertical performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: average_batch_r2_global\n  Worst score: 0.5898%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration_vertical average_batch_r2_global", 
+        "value": 0.5898, 
+        "severity": 0, 
+        "severity_value": 0.2949, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration_vertical performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: average_batch_r2_global\n  Best score: 0.5898%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat average_batch_r2_global", 
+        "value": 0.7545, 
+        "severity": 0, 
+        "severity_value": -0.7545, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Worst score: 0.7545%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat average_batch_r2_global", 
+        "value": 0.7545, 
+        "severity": 0, 
+        "severity_value": 0.37725, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_global\n  Best score: 0.7545%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cycombine_nocontrols average_batch_r2_global", 
+        "value": 0.6779, 
+        "severity": 0, 
+        "severity_value": -0.6779, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Worst score: 0.6779%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols average_batch_r2_global", 
-        "value": 0.6772, 
+        "value": 0.6779, 
         "severity": 0, 
-        "severity_value": 0.3386, 
+        "severity_value": 0.33895, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Best score: 0.6772%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_global\n  Best score: 0.6779%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score gaussnorm average_batch_r2_global", 
-        "value": 0.5398, 
+        "value": 0.5408, 
         "severity": 0, 
-        "severity_value": -0.5398, 
+        "severity_value": -0.5408, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_global\n  Worst score: 0.5398%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_global\n  Worst score: 0.5408%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score gaussnorm average_batch_r2_global", 
-        "value": 0.5398, 
+        "value": 0.5408, 
         "severity": 0, 
-        "severity_value": 0.2699, 
+        "severity_value": 0.2704, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_global\n  Best score: 0.5398%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_global\n  Best score: 0.5408%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cytonorm_controls average_batch_r2_global", 
-        "value": 0.7636, 
+        "value": 0.7641, 
         "severity": 0, 
-        "severity_value": -0.7636, 
+        "severity_value": -0.7641, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_global\n  Worst score: 0.7636%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_global\n  Worst score: 0.7641%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cytonorm_controls average_batch_r2_global", 
-        "value": 0.7636, 
+        "value": 0.7641, 
         "severity": 0, 
-        "severity_value": 0.3818, 
+        "severity_value": 0.38205, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_global\n  Best score: 0.7636%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_global\n  Best score: 0.7641%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration average_batch_r2_ct", 
-        "value": 0.0641, 
+        "value": 0.0598, 
         "severity": 0, 
-        "severity_value": -0.0641, 
+        "severity_value": -0.0598, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0641%\n"
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.0598%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration average_batch_r2_ct", 
-        "value": 0.0641, 
+        "value": 0.0598, 
         "severity": 0, 
-        "severity_value": 0.03205, 
+        "severity_value": 0.0299, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.0641%\n"
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.0598%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
@@ -2033,180 +2693,440 @@
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score shuffle_integration_by_cell_type average_batch_r2_ct", 
-        "value": 0.843, 
+        "value": 0.8421, 
         "severity": 0, 
-        "severity_value": -0.843, 
+        "severity_value": -0.8421, 
         "code": "worst_score >= -1", 
-        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Worst score: 0.843%\n"
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Worst score: 0.8421%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score shuffle_integration_by_cell_type average_batch_r2_ct", 
-        "value": 0.843, 
+        "value": 0.8421, 
         "severity": 0, 
-        "severity_value": 0.4215, 
+        "severity_value": 0.42105, 
         "code": "best_score <= 2", 
-        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Best score: 0.843%\n"
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: average_batch_r2_ct\n  Best score: 0.8421%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score harmonypy average_batch_r2_ct", 
-        "value": 0.7966, 
+        "value": 0.7969, 
         "severity": 0, 
-        "severity_value": -0.7966, 
+        "severity_value": -0.7969, 
         "code": "worst_score >= -1", 
-        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7966%\n"
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7969%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score harmonypy average_batch_r2_ct", 
-        "value": 0.7966, 
+        "value": 0.7969, 
         "severity": 0, 
-        "severity_value": 0.3983, 
+        "severity_value": 0.39845, 
         "code": "best_score <= 2", 
-        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Best score: 0.7966%\n"
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: average_batch_r2_ct\n  Best score: 0.7969%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score limma_remove_batch_effect average_batch_r2_ct", 
-        "value": 0.754, 
+        "value": 0.7542, 
         "severity": 0, 
-        "severity_value": -0.754, 
+        "severity_value": -0.7542, 
         "code": "worst_score >= -1", 
-        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Worst score: 0.754%\n"
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7542%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score limma_remove_batch_effect average_batch_r2_ct", 
-        "value": 0.754, 
+        "value": 0.7542, 
         "severity": 0, 
-        "severity_value": 0.377, 
+        "severity_value": 0.3771, 
         "code": "best_score <= 2", 
-        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Best score: 0.754%\n"
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: average_batch_r2_ct\n  Best score: 0.7542%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score no_integration average_batch_r2_ct", 
-        "value": 0.706, 
+        "value": 0.7063, 
         "severity": 0, 
-        "severity_value": -0.706, 
+        "severity_value": -0.7063, 
         "code": "worst_score >= -1", 
-        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.706%\n"
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7063%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score no_integration average_batch_r2_ct", 
-        "value": 0.706, 
+        "value": 0.7063, 
         "severity": 0, 
-        "severity_value": 0.353, 
+        "severity_value": 0.35315, 
         "code": "best_score <= 2", 
-        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.706%\n"
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: average_batch_r2_ct\n  Best score: 0.7063%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Worst score perfect_integration average_batch_r2_ct", 
+        "name": "Worst score perfect_integration_horizontal average_batch_r2_ct", 
         "value": 1, 
         "severity": 0, 
         "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method perfect_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: average_batch_r2_ct\n  Worst score: 1%\n"
+        "message": "Method perfect_integration_horizontal performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: average_batch_r2_ct\n  Worst score: 1%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
-        "name": "Best score perfect_integration average_batch_r2_ct", 
+        "name": "Best score perfect_integration_horizontal average_batch_r2_ct", 
         "value": 1, 
         "severity": 0, 
         "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method perfect_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration\n  Metric id: average_batch_r2_ct\n  Best score: 1%\n"
+        "message": "Method perfect_integration_horizontal performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: average_batch_r2_ct\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration_vertical average_batch_r2_ct", 
+        "value": 0.8482, 
+        "severity": 0, 
+        "severity_value": -0.8482, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration_vertical performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: average_batch_r2_ct\n  Worst score: 0.8482%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration_vertical average_batch_r2_ct", 
+        "value": 0.8482, 
+        "severity": 0, 
+        "severity_value": 0.4241, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration_vertical performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: average_batch_r2_ct\n  Best score: 0.8482%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score combat average_batch_r2_ct", 
-        "value": 0.7585, 
+        "value": 0.7587, 
         "severity": 0, 
-        "severity_value": -0.7585, 
+        "severity_value": -0.7587, 
         "code": "worst_score >= -1", 
-        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7585%\n"
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7587%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score combat average_batch_r2_ct", 
-        "value": 0.7585, 
+        "value": 0.7587, 
         "severity": 0, 
-        "severity_value": 0.37925, 
+        "severity_value": 0.37935, 
         "code": "best_score <= 2", 
-        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Best score: 0.7585%\n"
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: average_batch_r2_ct\n  Best score: 0.7587%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cycombine_nocontrols average_batch_r2_ct", 
-        "value": 0.854, 
+        "value": 0.8541, 
         "severity": 0, 
-        "severity_value": -0.854, 
+        "severity_value": -0.8541, 
         "code": "worst_score >= -1", 
-        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Worst score: 0.854%\n"
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Worst score: 0.8541%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cycombine_nocontrols average_batch_r2_ct", 
-        "value": 0.854, 
+        "value": 0.8541, 
         "severity": 0, 
-        "severity_value": 0.427, 
+        "severity_value": 0.42705, 
         "code": "best_score <= 2", 
-        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Best score: 0.854%\n"
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: average_batch_r2_ct\n  Best score: 0.8541%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score gaussnorm average_batch_r2_ct", 
-        "value": 0.7231, 
+        "value": 0.7234, 
         "severity": 0, 
-        "severity_value": -0.7231, 
+        "severity_value": -0.7234, 
         "code": "worst_score >= -1", 
-        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7231%\n"
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_ct\n  Worst score: 0.7234%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score gaussnorm average_batch_r2_ct", 
-        "value": 0.7231, 
+        "value": 0.7234, 
         "severity": 0, 
-        "severity_value": 0.36155, 
+        "severity_value": 0.3617, 
         "code": "best_score <= 2", 
-        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_ct\n  Best score: 0.7231%\n"
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: average_batch_r2_ct\n  Best score: 0.7234%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Worst score cytonorm_controls average_batch_r2_ct", 
-        "value": 0.8639, 
+        "value": 0.8641, 
         "severity": 0, 
-        "severity_value": -0.8639, 
+        "severity_value": -0.8641, 
         "code": "worst_score >= -1", 
-        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_ct\n  Worst score: 0.8639%\n"
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_ct\n  Worst score: 0.8641%\n"
     }, 
     {
         "task_id": "task_cyto_batch_integration", 
         "category": "Scaling", 
         "name": "Best score cytonorm_controls average_batch_r2_ct", 
-        "value": 0.8639, 
+        "value": 0.8641, 
+        "severity": 0, 
+        "severity_value": 0.43205, 
+        "code": "best_score <= 2", 
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_ct\n  Best score: 0.8641%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration flowsom_mean_mapping_similarity", 
+        "value": 0.0002, 
+        "severity": 0, 
+        "severity_value": -0.0002, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: flowsom_mean_mapping_similarity\n  Worst score: 0.0002%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration flowsom_mean_mapping_similarity", 
+        "value": 0.0002, 
+        "severity": 0, 
+        "severity_value": 0.0001, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration\n  Metric id: flowsom_mean_mapping_similarity\n  Best score: 0.0002%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_batch flowsom_mean_mapping_similarity", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": -0.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_batch performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: flowsom_mean_mapping_similarity\n  Worst score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_batch flowsom_mean_mapping_similarity", 
+        "value": 0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_batch performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_batch\n  Metric id: flowsom_mean_mapping_similarity\n  Best score: 0%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score shuffle_integration_by_cell_type flowsom_mean_mapping_similarity", 
+        "value": 0.9761, 
+        "severity": 0, 
+        "severity_value": -0.9761, 
+        "code": "worst_score >= -1", 
+        "message": "Method shuffle_integration_by_cell_type performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: flowsom_mean_mapping_similarity\n  Worst score: 0.9761%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score shuffle_integration_by_cell_type flowsom_mean_mapping_similarity", 
+        "value": 0.9761, 
+        "severity": 0, 
+        "severity_value": 0.48805, 
+        "code": "best_score <= 2", 
+        "message": "Method shuffle_integration_by_cell_type performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: shuffle_integration_by_cell_type\n  Metric id: flowsom_mean_mapping_similarity\n  Best score: 0.9761%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score harmonypy flowsom_mean_mapping_similarity", 
+        "value": 0.9817, 
+        "severity": 0, 
+        "severity_value": -0.9817, 
+        "code": "worst_score >= -1", 
+        "message": "Method harmonypy performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: flowsom_mean_mapping_similarity\n  Worst score: 0.9817%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score harmonypy flowsom_mean_mapping_similarity", 
+        "value": 0.9817, 
+        "severity": 0, 
+        "severity_value": 0.49085, 
+        "code": "best_score <= 2", 
+        "message": "Method harmonypy performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: harmonypy\n  Metric id: flowsom_mean_mapping_similarity\n  Best score: 0.9817%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score limma_remove_batch_effect flowsom_mean_mapping_similarity", 
+        "value": 0.9809, 
+        "severity": 0, 
+        "severity_value": -0.9809, 
+        "code": "worst_score >= -1", 
+        "message": "Method limma_remove_batch_effect performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: flowsom_mean_mapping_similarity\n  Worst score: 0.9809%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score limma_remove_batch_effect flowsom_mean_mapping_similarity", 
+        "value": 0.9809, 
+        "severity": 0, 
+        "severity_value": 0.49045, 
+        "code": "best_score <= 2", 
+        "message": "Method limma_remove_batch_effect performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: limma_remove_batch_effect\n  Metric id: flowsom_mean_mapping_similarity\n  Best score: 0.9809%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score no_integration flowsom_mean_mapping_similarity", 
+        "value": 0.9803, 
+        "severity": 0, 
+        "severity_value": -0.9803, 
+        "code": "worst_score >= -1", 
+        "message": "Method no_integration performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: flowsom_mean_mapping_similarity\n  Worst score: 0.9803%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score no_integration flowsom_mean_mapping_similarity", 
+        "value": 0.9803, 
+        "severity": 0, 
+        "severity_value": 0.49015, 
+        "code": "best_score <= 2", 
+        "message": "Method no_integration performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: no_integration\n  Metric id: flowsom_mean_mapping_similarity\n  Best score: 0.9803%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration_horizontal flowsom_mean_mapping_similarity", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": -1.0, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration_horizontal performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: flowsom_mean_mapping_similarity\n  Worst score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration_horizontal flowsom_mean_mapping_similarity", 
+        "value": 1, 
+        "severity": 0, 
+        "severity_value": 0.5, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration_horizontal performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_horizontal\n  Metric id: flowsom_mean_mapping_similarity\n  Best score: 1%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score perfect_integration_vertical flowsom_mean_mapping_similarity", 
+        "value": 0.9923, 
+        "severity": 0, 
+        "severity_value": -0.9923, 
+        "code": "worst_score >= -1", 
+        "message": "Method perfect_integration_vertical performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: flowsom_mean_mapping_similarity\n  Worst score: 0.9923%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score perfect_integration_vertical flowsom_mean_mapping_similarity", 
+        "value": 0.9923, 
+        "severity": 0, 
+        "severity_value": 0.49615, 
+        "code": "best_score <= 2", 
+        "message": "Method perfect_integration_vertical performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: perfect_integration_vertical\n  Metric id: flowsom_mean_mapping_similarity\n  Best score: 0.9923%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score combat flowsom_mean_mapping_similarity", 
+        "value": 0.981, 
+        "severity": 0, 
+        "severity_value": -0.981, 
+        "code": "worst_score >= -1", 
+        "message": "Method combat performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: flowsom_mean_mapping_similarity\n  Worst score: 0.981%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score combat flowsom_mean_mapping_similarity", 
+        "value": 0.981, 
+        "severity": 0, 
+        "severity_value": 0.4905, 
+        "code": "best_score <= 2", 
+        "message": "Method combat performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: combat\n  Metric id: flowsom_mean_mapping_similarity\n  Best score: 0.981%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cycombine_nocontrols flowsom_mean_mapping_similarity", 
+        "value": 0.9835, 
+        "severity": 0, 
+        "severity_value": -0.9835, 
+        "code": "worst_score >= -1", 
+        "message": "Method cycombine_nocontrols performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: flowsom_mean_mapping_similarity\n  Worst score: 0.9835%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cycombine_nocontrols flowsom_mean_mapping_similarity", 
+        "value": 0.9835, 
+        "severity": 0, 
+        "severity_value": 0.49175, 
+        "code": "best_score <= 2", 
+        "message": "Method cycombine_nocontrols performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cycombine_nocontrols\n  Metric id: flowsom_mean_mapping_similarity\n  Best score: 0.9835%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score gaussnorm flowsom_mean_mapping_similarity", 
+        "value": 0.9759, 
+        "severity": 0, 
+        "severity_value": -0.9759, 
+        "code": "worst_score >= -1", 
+        "message": "Method gaussnorm performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: flowsom_mean_mapping_similarity\n  Worst score: 0.9759%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score gaussnorm flowsom_mean_mapping_similarity", 
+        "value": 0.9759, 
+        "severity": 0, 
+        "severity_value": 0.48795, 
+        "code": "best_score <= 2", 
+        "message": "Method gaussnorm performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: gaussnorm\n  Metric id: flowsom_mean_mapping_similarity\n  Best score: 0.9759%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Worst score cytonorm_controls flowsom_mean_mapping_similarity", 
+        "value": 0.9838, 
+        "severity": 0, 
+        "severity_value": -0.9838, 
+        "code": "worst_score >= -1", 
+        "message": "Method cytonorm_controls performs much worse than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: flowsom_mean_mapping_similarity\n  Worst score: 0.9838%\n"
+    }, 
+    {
+        "task_id": "task_cyto_batch_integration", 
+        "category": "Scaling", 
+        "name": "Best score cytonorm_controls flowsom_mean_mapping_similarity", 
+        "value": 0.9838, 
         "severity": 0, 
-        "severity_value": 0.43195, 
+        "severity_value": 0.4919, 
         "code": "best_score <= 2", 
-        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: average_batch_r2_ct\n  Best score: 0.8639%\n"
+        "message": "Method cytonorm_controls performs a lot better than baselines.\n  Task id: task_cyto_batch_integration\n  Method id: cytonorm_controls\n  Metric id: flowsom_mean_mapping_similarity\n  Best score: 0.9838%\n"
     }
 ]
\ No newline at end of file
diff --git a/results/cyto_batch_integration/data/results.json b/results/cyto_batch_integration/data/results.json
index 99155bc5..469e6506 100644
--- a/results/cyto_batch_integration/data/results.json
+++ b/results/cyto_batch_integration/data/results.json
@@ -5,29 +5,35 @@
     "metric_values": {
       "average_batch_r2_ct": 0.0523,
       "average_batch_r2_global": 0.008,
-      "emd_max_ct": 1.2193,
-      "emd_max_global": 0.3177,
-      "emd_mean_ct": 0.1348,
-      "emd_mean_global": 0.0749,
+      "emd_max_ct_horiz": 1.2193,
+      "emd_max_global_horiz": 0.3177,
+      "emd_max_global_vert": 0.6408,
+      "emd_mean_ct_horiz": 0.1348,
+      "emd_mean_global_horiz": 0.0749,
+      "emd_mean_global_vert": 0.1587,
+      "flowsom_mean_mapping_similarity": 97.8579,
       "n_inconsistent_peaks": 3,
       "n_inconsistent_peaks_ct": 39
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.7585,
-      "average_batch_r2_global": 0.754,
-      "emd_max_ct": 0.5458,
-      "emd_max_global": 0.5298,
-      "emd_mean_ct": 0.7765,
-      "emd_mean_global": 0.6011,
+      "average_batch_r2_ct": 0.7587,
+      "average_batch_r2_global": 0.7545,
+      "emd_max_ct_horiz": 0.5441,
+      "emd_max_global_horiz": 0.5295,
+      "emd_max_global_vert": 0.1831,
+      "emd_mean_ct_horiz": 0.7766,
+      "emd_mean_global_horiz": 0.6018,
+      "emd_mean_global_vert": 0.2513,
+      "flowsom_mean_mapping_similarity": 0.981,
       "n_inconsistent_peaks": 0.625,
-      "n_inconsistent_peaks_ct": 0.8646
+      "n_inconsistent_peaks_ct": 0.8655
     },
-    "mean_score": 0.6819,
+    "mean_score": 0.6246,
     "resources": {
-      "submit": "2025-05-22 05:46:32",
+      "submit": "2025-05-23 12:44:42",
       "exit_code": 0,
-      "duration_sec": 92,
-      "cpu_pct": 138.9,
+      "duration_sec": 91,
+      "cpu_pct": 140.2,
       "peak_memory_mb": 6656,
       "disk_read_mb": 512,
       "disk_write_mb": 808
@@ -39,31 +45,37 @@
     "metric_values": {
       "average_batch_r2_ct": 0.0316,
       "average_batch_r2_global": 0.0106,
-      "emd_max_ct": 1.0715,
-      "emd_max_global": 0.3149,
-      "emd_mean_ct": 0.1069,
-      "emd_mean_global": 0.0785,
+      "emd_max_ct_horiz": 1.0715,
+      "emd_max_global_horiz": 0.3149,
+      "emd_max_global_vert": 0.6463,
+      "emd_mean_ct_horiz": 0.1069,
+      "emd_mean_global_horiz": 0.0785,
+      "emd_mean_global_vert": 0.1556,
+      "flowsom_mean_mapping_similarity": 98.144,
       "n_inconsistent_peaks": 2,
       "n_inconsistent_peaks_ct": 39
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.854,
-      "average_batch_r2_global": 0.6772,
-      "emd_max_ct": 0.6009,
-      "emd_max_global": 0.5339,
-      "emd_mean_ct": 0.8229,
-      "emd_mean_global": 0.5821,
+      "average_batch_r2_ct": 0.8541,
+      "average_batch_r2_global": 0.6779,
+      "emd_max_ct_horiz": 0.5993,
+      "emd_max_global_horiz": 0.5337,
+      "emd_max_global_vert": 0.176,
+      "emd_mean_ct_horiz": 0.823,
+      "emd_mean_global_horiz": 0.5829,
+      "emd_mean_global_vert": 0.2659,
+      "flowsom_mean_mapping_similarity": 0.9835,
       "n_inconsistent_peaks": 0.75,
-      "n_inconsistent_peaks_ct": 0.8646
+      "n_inconsistent_peaks_ct": 0.8655
     },
-    "mean_score": 0.7107,
+    "mean_score": 0.6465,
     "resources": {
-      "submit": "2025-05-22 05:46:32",
+      "submit": "2025-05-23 12:44:43",
       "exit_code": 0,
-      "duration_sec": 389,
-      "cpu_pct": 102.6,
-      "peak_memory_mb": 15872,
-      "disk_read_mb": 520,
+      "duration_sec": 357,
+      "cpu_pct": 102.1,
+      "peak_memory_mb": 13312,
+      "disk_read_mb": 519,
       "disk_write_mb": 808
     }
   },
@@ -73,30 +85,36 @@
     "metric_values": {
       "average_batch_r2_ct": 0.0295,
       "average_batch_r2_global": 0.0077,
-      "emd_max_ct": 0.8521,
-      "emd_max_global": 0.2538,
-      "emd_mean_ct": 0.101,
-      "emd_mean_global": 0.0669,
+      "emd_max_ct_horiz": 0.8521,
+      "emd_max_global_horiz": 0.2538,
+      "emd_max_global_vert": 0.6391,
+      "emd_mean_ct_horiz": 0.101,
+      "emd_mean_global_horiz": 0.0669,
+      "emd_mean_global_vert": 0.1606,
+      "flowsom_mean_mapping_similarity": 98.1757,
       "n_inconsistent_peaks": 2,
       "n_inconsistent_peaks_ct": 35
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.8639,
-      "average_batch_r2_global": 0.7636,
-      "emd_max_ct": 0.6826,
-      "emd_max_global": 0.6243,
-      "emd_mean_ct": 0.8327,
-      "emd_mean_global": 0.6438,
+      "average_batch_r2_ct": 0.8641,
+      "average_batch_r2_global": 0.7641,
+      "emd_max_ct_horiz": 0.6814,
+      "emd_max_global_horiz": 0.6241,
+      "emd_max_global_vert": 0.1853,
+      "emd_mean_ct_horiz": 0.8328,
+      "emd_mean_global_horiz": 0.6444,
+      "emd_mean_global_vert": 0.2422,
+      "flowsom_mean_mapping_similarity": 0.9838,
       "n_inconsistent_peaks": 0.75,
-      "n_inconsistent_peaks_ct": 0.8785
+      "n_inconsistent_peaks_ct": 0.8793
     },
-    "mean_score": 0.7549,
+    "mean_score": 0.6774,
     "resources": {
-      "submit": "2025-05-22 05:46:32",
+      "submit": "2025-05-23 12:44:42",
       "exit_code": 0,
-      "duration_sec": 990,
-      "cpu_pct": 100.6,
-      "peak_memory_mb": 13415,
+      "duration_sec": 752,
+      "cpu_pct": 100.5,
+      "peak_memory_mb": 9319,
       "disk_read_mb": 2151,
       "disk_write_mb": 2356
     }
@@ -107,30 +125,36 @@
     "metric_values": {
       "average_batch_r2_ct": 0.06,
       "average_batch_r2_global": 0.015,
-      "emd_max_ct": 1.2303,
-      "emd_max_global": 0.3556,
-      "emd_mean_ct": 0.1555,
-      "emd_mean_global": 0.1021,
+      "emd_max_ct_horiz": 1.2303,
+      "emd_max_global_horiz": 0.3556,
+      "emd_max_global_vert": 0.7597,
+      "emd_mean_ct_horiz": 0.1555,
+      "emd_mean_global_horiz": 0.1021,
+      "emd_mean_global_vert": 0.1608,
+      "flowsom_mean_mapping_similarity": 97.2821,
       "n_inconsistent_peaks": 3,
       "n_inconsistent_peaks_ct": 34
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.7231,
-      "average_batch_r2_global": 0.5398,
-      "emd_max_ct": 0.5418,
-      "emd_max_global": 0.4736,
-      "emd_mean_ct": 0.7422,
-      "emd_mean_global": 0.4566,
+      "average_batch_r2_ct": 0.7234,
+      "average_batch_r2_global": 0.5408,
+      "emd_max_ct_horiz": 0.54,
+      "emd_max_global_horiz": 0.4733,
+      "emd_max_global_vert": 0.0301,
+      "emd_mean_ct_horiz": 0.7423,
+      "emd_mean_global_horiz": 0.4575,
+      "emd_mean_global_vert": 0.2413,
+      "flowsom_mean_mapping_similarity": 0.9759,
       "n_inconsistent_peaks": 0.625,
-      "n_inconsistent_peaks_ct": 0.8819
+      "n_inconsistent_peaks_ct": 0.8828
     },
-    "mean_score": 0.623,
+    "mean_score": 0.5666,
     "resources": {
-      "submit": "2025-05-22 05:46:32",
+      "submit": "2025-05-23 12:44:43",
       "exit_code": 0,
       "duration_sec": 396,
-      "cpu_pct": 101.6,
-      "peak_memory_mb": 9319,
+      "cpu_pct": 101.2,
+      "peak_memory_mb": 9421,
       "disk_read_mb": 957,
       "disk_write_mb": 1127
     }
@@ -141,30 +165,36 @@
     "metric_values": {
       "average_batch_r2_ct": 0.044,
       "average_batch_r2_global": 0.0079,
-      "emd_max_ct": 1.1795,
-      "emd_max_global": 0.2796,
-      "emd_mean_ct": 0.129,
-      "emd_mean_global": 0.0752,
+      "emd_max_ct_horiz": 1.1795,
+      "emd_max_global_horiz": 0.2796,
+      "emd_max_global_vert": 0.6551,
+      "emd_mean_ct_horiz": 0.129,
+      "emd_mean_global_horiz": 0.0752,
+      "emd_mean_global_vert": 0.1592,
+      "flowsom_mean_mapping_similarity": 97.9356,
       "n_inconsistent_peaks": 2,
       "n_inconsistent_peaks_ct": 37
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.7966,
-      "average_batch_r2_global": 0.7585,
-      "emd_max_ct": 0.5606,
-      "emd_max_global": 0.5861,
-      "emd_mean_ct": 0.7862,
-      "emd_mean_global": 0.5995,
+      "average_batch_r2_ct": 0.7969,
+      "average_batch_r2_global": 0.759,
+      "emd_max_ct_horiz": 0.559,
+      "emd_max_global_horiz": 0.5859,
+      "emd_max_global_vert": 0.1647,
+      "emd_mean_ct_horiz": 0.7864,
+      "emd_mean_global_horiz": 0.6002,
+      "emd_mean_global_vert": 0.2491,
+      "flowsom_mean_mapping_similarity": 0.9817,
       "n_inconsistent_peaks": 0.75,
-      "n_inconsistent_peaks_ct": 0.8715
+      "n_inconsistent_peaks_ct": 0.8724
     },
-    "mean_score": 0.7137,
+    "mean_score": 0.6459,
     "resources": {
-      "submit": "2025-05-22 05:46:32",
+      "submit": "2025-05-23 12:44:43",
       "exit_code": 0,
-      "duration_sec": 3454,
-      "cpu_pct": 398.9,
-      "peak_memory_mb": 24884,
+      "duration_sec": 3076,
+      "cpu_pct": 305.1,
+      "peak_memory_mb": 22733,
       "disk_read_mb": 498,
       "disk_write_mb": 466
     }
@@ -175,29 +205,35 @@
     "metric_values": {
       "average_batch_r2_ct": 0.0533,
       "average_batch_r2_global": 0.0078,
-      "emd_max_ct": 1.1958,
-      "emd_max_global": 0.2889,
-      "emd_mean_ct": 0.1375,
-      "emd_mean_global": 0.0772,
+      "emd_max_ct_horiz": 1.1958,
+      "emd_max_global_horiz": 0.2889,
+      "emd_max_global_vert": 0.6444,
+      "emd_mean_ct_horiz": 0.1375,
+      "emd_mean_global_horiz": 0.0772,
+      "emd_mean_global_vert": 0.1614,
+      "flowsom_mean_mapping_similarity": 97.8486,
       "n_inconsistent_peaks": 2,
       "n_inconsistent_peaks_ct": 38
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.754,
-      "average_batch_r2_global": 0.7619,
-      "emd_max_ct": 0.5546,
-      "emd_max_global": 0.5724,
-      "emd_mean_ct": 0.7721,
-      "emd_mean_global": 0.5889,
+      "average_batch_r2_ct": 0.7542,
+      "average_batch_r2_global": 0.7624,
+      "emd_max_ct_horiz": 0.5529,
+      "emd_max_global_horiz": 0.5721,
+      "emd_max_global_vert": 0.1784,
+      "emd_mean_ct_horiz": 0.7723,
+      "emd_mean_global_horiz": 0.5896,
+      "emd_mean_global_vert": 0.2382,
+      "flowsom_mean_mapping_similarity": 0.9809,
       "n_inconsistent_peaks": 0.75,
-      "n_inconsistent_peaks_ct": 0.8681
+      "n_inconsistent_peaks_ct": 0.869
     },
-    "mean_score": 0.7027,
+    "mean_score": 0.6382,
     "resources": {
-      "submit": "2025-05-22 05:46:32",
+      "submit": "2025-05-23 12:44:43",
       "exit_code": 0,
       "duration_sec": 94,
-      "cpu_pct": 103.9,
+      "cpu_pct": 104.6,
       "peak_memory_mb": 7578,
       "disk_read_mb": 495,
       "disk_write_mb": 554
@@ -209,98 +245,156 @@
     "metric_values": {
       "average_batch_r2_ct": 0.0637,
       "average_batch_r2_global": 0.0256,
-      "emd_max_ct": 1.2375,
-      "emd_max_global": 0.5152,
-      "emd_mean_ct": 0.1538,
-      "emd_mean_global": 0.1201,
+      "emd_max_ct_horiz": 1.2375,
+      "emd_max_global_horiz": 0.5152,
+      "emd_max_global_vert": 0.7766,
+      "emd_mean_ct_horiz": 0.1538,
+      "emd_mean_global_horiz": 0.1201,
+      "emd_mean_global_vert": 0.2,
+      "flowsom_mean_mapping_similarity": 97.7805,
       "n_inconsistent_peaks": 2,
       "n_inconsistent_peaks_ct": 38
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.706,
-      "average_batch_r2_global": 0.2159,
-      "emd_max_ct": 0.5391,
-      "emd_max_global": 0.2374,
-      "emd_mean_ct": 0.7451,
-      "emd_mean_global": 0.3608,
+      "average_batch_r2_ct": 0.7063,
+      "average_batch_r2_global": 0.2176,
+      "emd_max_ct_horiz": 0.5373,
+      "emd_max_global_horiz": 0.2369,
+      "emd_max_global_vert": 0.0084,
+      "emd_mean_ct_horiz": 0.7453,
+      "emd_mean_global_horiz": 0.3619,
+      "emd_mean_global_vert": 0.0543,
+      "flowsom_mean_mapping_similarity": 0.9803,
       "n_inconsistent_peaks": 0.75,
-      "n_inconsistent_peaks_ct": 0.8681
+      "n_inconsistent_peaks_ct": 0.869
     },
-    "mean_score": 0.5528,
+    "mean_score": 0.497,
     "resources": {
-      "submit": "2025-05-22 05:46:32",
+      "submit": "2025-05-23 12:44:42",
       "exit_code": 0,
-      "duration_sec": 52.8,
-      "cpu_pct": 101.9,
-      "peak_memory_mb": 6656,
+      "duration_sec": 42.1,
+      "cpu_pct": 113.9,
+      "peak_memory_mb": 3994,
       "disk_read_mb": 486,
       "disk_write_mb": 466
     }
   },
   {
     "dataset_id": "leomazzi_cyto_spleen",
-    "method_id": "perfect_integration",
+    "method_id": "perfect_integration_horizontal",
     "metric_values": {
       "average_batch_r2_ct": -3.7381e-19,
       "average_batch_r2_global": 0,
-      "emd_max_ct": 0,
-      "emd_max_global": 0,
-      "emd_mean_ct": 0,
-      "emd_mean_global": 0,
+      "emd_max_ct_horiz": 0,
+      "emd_max_global_horiz": 0,
+      "emd_max_global_vert": 0.7831,
+      "emd_mean_ct_horiz": 0,
+      "emd_mean_global_horiz": 0,
+      "emd_mean_global_vert": 0.2113,
+      "flowsom_mean_mapping_similarity": 100,
       "n_inconsistent_peaks": 0,
       "n_inconsistent_peaks_ct": 0
     },
     "scaled_scores": {
       "average_batch_r2_ct": 1,
       "average_batch_r2_global": 1,
-      "emd_max_ct": 1,
-      "emd_max_global": 1,
-      "emd_mean_ct": 1,
-      "emd_mean_global": 1,
+      "emd_max_ct_horiz": 1,
+      "emd_max_global_horiz": 1,
+      "emd_max_global_vert": 0,
+      "emd_mean_ct_horiz": 1,
+      "emd_mean_global_horiz": 1,
+      "emd_mean_global_vert": 0,
+      "flowsom_mean_mapping_similarity": 1,
       "n_inconsistent_peaks": 1,
       "n_inconsistent_peaks_ct": 1
     },
-    "mean_score": 1,
+    "mean_score": 0.8182,
     "resources": {
-      "submit": "2025-05-22 05:46:32",
+      "submit": "2025-05-23 12:44:42",
       "exit_code": 0,
-      "duration_sec": 38.7,
-      "cpu_pct": 100.7,
-      "peak_memory_mb": 6247,
+      "duration_sec": 27.2,
+      "cpu_pct": 107.5,
+      "peak_memory_mb": 2253,
       "disk_read_mb": 321,
       "disk_write_mb": 302
     }
   },
+  {
+    "dataset_id": "leomazzi_cyto_spleen",
+    "method_id": "perfect_integration_vertical",
+    "metric_values": {
+      "average_batch_r2_ct": 0.0329,
+      "average_batch_r2_global": 0.0134,
+      "emd_max_ct_horiz": 1.0755,
+      "emd_max_global_horiz": 0.5152,
+      "emd_max_global_vert": 0.6444,
+      "emd_mean_ct_horiz": 0.0734,
+      "emd_mean_global_horiz": 0.0593,
+      "emd_mean_global_vert": 0.1763,
+      "flowsom_mean_mapping_similarity": 99.1373,
+      "n_inconsistent_peaks": 0,
+      "n_inconsistent_peaks_ct": 19
+    },
+    "scaled_scores": {
+      "average_batch_r2_ct": 0.8482,
+      "average_batch_r2_global": 0.5898,
+      "emd_max_ct_horiz": 0.5979,
+      "emd_max_global_horiz": 0.2369,
+      "emd_max_global_vert": 0.1784,
+      "emd_mean_ct_horiz": 0.8783,
+      "emd_mean_global_horiz": 0.6847,
+      "emd_mean_global_vert": 0.1673,
+      "flowsom_mean_mapping_similarity": 0.9923,
+      "n_inconsistent_peaks": 1,
+      "n_inconsistent_peaks_ct": 0.9345
+    },
+    "mean_score": 0.6462,
+    "resources": {
+      "submit": "2025-05-23 12:44:42",
+      "exit_code": 0,
+      "duration_sec": 49.2,
+      "cpu_pct": 101.5,
+      "peak_memory_mb": 3994,
+      "disk_read_mb": 788,
+      "disk_write_mb": 407
+    }
+  },
   {
     "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration",
     "metric_values": {
-      "average_batch_r2_ct": 0.2027,
+      "average_batch_r2_ct": 0.2038,
       "average_batch_r2_global": 0.0156,
-      "emd_max_ct": 2.5651,
-      "emd_max_global": 0.587,
-      "emd_mean_ct": 0.5885,
-      "emd_mean_global": 0.1507,
+      "emd_max_ct_horiz": 2.584,
+      "emd_max_global_horiz": 0.5864,
+      "emd_max_global_vert": 0.0058,
+      "emd_mean_ct_horiz": 0.5891,
+      "emd_mean_global_horiz": 0.1505,
+      "emd_mean_global_vert": 0.0019,
+      "flowsom_mean_mapping_similarity": -12.6513,
       "n_inconsistent_peaks": 8,
-      "n_inconsistent_peaks_ct": 280
+      "n_inconsistent_peaks_ct": 289
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.0641,
-      "average_batch_r2_global": 0.5228,
-      "emd_max_ct": 0.0446,
-      "emd_max_global": 0.1312,
-      "emd_mean_ct": 0.0245,
-      "emd_mean_global": 0.1979,
+      "average_batch_r2_ct": 0.0598,
+      "average_batch_r2_global": 0.5241,
+      "emd_max_ct_horiz": 0.0338,
+      "emd_max_global_horiz": 0.1315,
+      "emd_max_global_vert": 1,
+      "emd_mean_ct_horiz": 0.0241,
+      "emd_mean_global_horiz": 0.2001,
+      "emd_mean_global_vert": 1,
+      "flowsom_mean_mapping_similarity": 0.0002,
       "n_inconsistent_peaks": 0,
-      "n_inconsistent_peaks_ct": 0.0278
+      "n_inconsistent_peaks_ct": 0.0034
     },
-    "mean_score": 0.1266,
+    "mean_score": 0.2706,
     "resources": {
-      "submit": "2025-05-22 05:46:32",
+      "submit": "2025-05-23 12:44:43",
       "exit_code": 0,
-      "duration_sec": 59.6,
-      "cpu_pct": 96.5,
-      "peak_memory_mb": 7783,
+      "duration_sec": 52.7,
+      "cpu_pct": 90.7,
+      "peak_memory_mb": 3687,
       "disk_read_mb": 486,
       "disk_write_mb": 481
     }
@@ -309,31 +403,37 @@
     "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration_by_batch",
     "metric_values": {
-      "average_batch_r2_ct": 0.2165,
-      "average_batch_r2_global": 0.0327,
-      "emd_max_ct": 2.6847,
-      "emd_max_global": 0.6756,
-      "emd_mean_ct": 0.6033,
-      "emd_mean_global": 0.1878,
+      "average_batch_r2_ct": 0.2168,
+      "average_batch_r2_global": 0.0328,
+      "emd_max_ct_horiz": 2.6744,
+      "emd_max_global_horiz": 0.6752,
+      "emd_max_global_vert": 0.4547,
+      "emd_mean_ct_horiz": 0.6037,
+      "emd_mean_global_horiz": 0.1881,
+      "emd_mean_global_vert": 0.077,
+      "flowsom_mean_mapping_similarity": -12.6693,
       "n_inconsistent_peaks": 8,
-      "n_inconsistent_peaks_ct": 288
+      "n_inconsistent_peaks_ct": 290
     },
     "scaled_scores": {
       "average_batch_r2_ct": 0,
       "average_batch_r2_global": 0,
-      "emd_max_ct": 0,
-      "emd_max_global": 0,
-      "emd_mean_ct": 0,
-      "emd_mean_global": 0,
+      "emd_max_ct_horiz": 0,
+      "emd_max_global_horiz": 0,
+      "emd_max_global_vert": 0.4225,
+      "emd_mean_ct_horiz": 0,
+      "emd_mean_global_horiz": 0,
+      "emd_mean_global_vert": 0.6413,
+      "flowsom_mean_mapping_similarity": 0,
       "n_inconsistent_peaks": 0,
       "n_inconsistent_peaks_ct": 0
     },
-    "mean_score": 0,
+    "mean_score": 0.0967,
     "resources": {
-      "submit": "2025-05-22 05:46:32",
+      "submit": "2025-05-23 12:44:43",
       "exit_code": 0,
-      "duration_sec": 53.6,
-      "cpu_pct": 88.7,
+      "duration_sec": 44.6,
+      "cpu_pct": 106.6,
       "peak_memory_mb": 3482,
       "disk_read_mb": 486,
       "disk_write_mb": 480
@@ -343,31 +443,37 @@
     "dataset_id": "leomazzi_cyto_spleen",
     "method_id": "shuffle_integration_by_cell_type",
     "metric_values": {
-      "average_batch_r2_ct": 0.034,
+      "average_batch_r2_ct": 0.0342,
       "average_batch_r2_global": 0.0093,
-      "emd_max_ct": 1.2351,
-      "emd_max_global": 0.2816,
-      "emd_mean_ct": 0.1322,
-      "emd_mean_global": 0.0906,
+      "emd_max_ct_horiz": 1.235,
+      "emd_max_global_horiz": 0.2816,
+      "emd_max_global_vert": 0.6163,
+      "emd_mean_ct_horiz": 0.1328,
+      "emd_mean_global_horiz": 0.0907,
+      "emd_mean_global_vert": 0.1204,
+      "flowsom_mean_mapping_similarity": 97.3047,
       "n_inconsistent_peaks": 4,
-      "n_inconsistent_peaks_ct": 72
+      "n_inconsistent_peaks_ct": 68
     },
     "scaled_scores": {
-      "average_batch_r2_ct": 0.843,
-      "average_batch_r2_global": 0.7144,
-      "emd_max_ct": 0.54,
-      "emd_max_global": 0.5831,
-      "emd_mean_ct": 0.7809,
-      "emd_mean_global": 0.5178,
+      "average_batch_r2_ct": 0.8421,
+      "average_batch_r2_global": 0.7157,
+      "emd_max_ct_horiz": 0.5382,
+      "emd_max_global_horiz": 0.583,
+      "emd_max_global_vert": 0.2146,
+      "emd_mean_ct_horiz": 0.78,
+      "emd_mean_global_horiz": 0.518,
+      "emd_mean_global_vert": 0.4341,
+      "flowsom_mean_mapping_similarity": 0.9761,
       "n_inconsistent_peaks": 0.5,
-      "n_inconsistent_peaks_ct": 0.75
+      "n_inconsistent_peaks_ct": 0.7655
     },
-    "mean_score": 0.6537,
+    "mean_score": 0.6243,
     "resources": {
-      "submit": "2025-05-22 05:46:32",
+      "submit": "2025-05-23 12:44:42",
       "exit_code": 0,
-      "duration_sec": 54.6,
-      "cpu_pct": 95,
+      "duration_sec": 56,
+      "cpu_pct": 92.4,
       "peak_memory_mb": 3482,
       "disk_read_mb": 486,
       "disk_write_mb": 480