diff --git a/.github/workflows/ab_tests.yml b/.github/workflows/ab_tests.yml index 9865e0e08e..e4c51ab296 100644 --- a/.github/workflows/ab_tests.yml +++ b/.github/workflows/ab_tests.yml @@ -20,101 +20,52 @@ jobs: name: Discover A/B environments runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 - - uses: actions/setup-python@v4 + + - name: Install Python + uses: actions/setup-python@v4 with: python-version: '3.10' - - id: set-matrix + + - name: Install dependencies + run: pip install PyYaml + + - name: Generate dynamic matrix + id: set-matrix run: echo "::set-output name=matrix::$(python ci/scripts/discover_ab_environments.py)" + outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} + matrix: ${{ steps.set-matrix.outputs.matrix }} # Everything below this point runs iff there are files matching - # AB_environments/AB_*.conda.yaml - # AB_environments/AB_*.dask.yaml + # AB_environments/AB_*.{conda,dask}.yaml + # and AB_environments/config.yaml set repeat > 0 - software: - name: Setup - runs-on: ubuntu-latest + tests: + name: A/B Tests - ${{ matrix.category }} ${{ matrix.runtime-version }} ${{ matrix.os }} py${{ matrix.python-version }} needs: discover_ab_envs - if: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} - strategy: - fail-fast: false - matrix: - python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Build Coiled Software Environment - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - run: | - PYTHON_VERSION_FORMATTED=$(echo "${{ matrix.python-version }}" | sed 's/\.//g' ) - NAME_HEAD=dask-engineering/coiled-runtime-${{ github.event_name }} - NAME_TAIL=$GITHUB_RUN_ID-${{ matrix.runtime-version }}-py$PYTHON_VERSION_FORMATTED - if [[ ${{ github.event_name }} = 'pull_request' ]] - then - NAME_MID=${{ github.event.number }} - else - NAME_MID=$GITHUB_REF_TYPE-$(echo "$GITHUB_REF_NAME" | sed 's/\./-/g' ) - fi - # env name can only contain lowercase ASCII letters, numbers, hyphens and underscores - COILED_SOFTWARE_NAME=$NAME_HEAD-$(echo $NAME_MID-$NAME_TAIL | tr 'A-Z' 'a-z' | sed -r 's/[^a-z0-9_-]/_/g') - - cp AB_environments/${{ matrix.runtime-version }}.conda.yaml coiled_software_environment.yaml - COILED_SOFTWARE_ENV=$(python ci/scripts/dask_config_to_env.py AB_environments/${{ matrix.runtime-version }}.dask.yaml) - ENV_FILE=coiled_software_environment.yaml - cat $ENV_FILE - - mamba install coiled - echo "Creating Coiled software environment for $COILED_SOFTWARE_NAME" - echo "Environment parameters: $COILED_SOFTWARE_ENV" - coiled env create --name $COILED_SOFTWARE_NAME --conda $ENV_FILE $COILED_SOFTWARE_ENV - - # Put COILED_SOFTWARE_NAME into a file so it can be downloaded in subsequent workflow jobs - echo $COILED_SOFTWARE_NAME > software_name.txt - - # Dummy for compatibility with tests.yml - echo false > test_upstream.txt - - - name: Upload environment file - uses: actions/upload-artifact@v3 - with: - name: software-environment-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: | - coiled_software_environment.yaml - software_name.txt - test_upstream.txt - - runtime: - name: Runtime - ${{ matrix.os }}, 
Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - needs: [discover_ab_envs, software] runs-on: ${{ matrix.os }} timeout-minutes: 120 strategy: fail-fast: false + # AWS implements limiters to how many EC2 instances you can spawn in parallel *on + # the same AWS account*. If such limit is reached, jobs will randomly fail when + # trying to create the Coiled clusters, and restarting failed jobs won't fix the + # problem. + max-parallel: 20 matrix: os: [ubuntu-latest] python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + category: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).category }} + runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} + repeat: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).repeat }} steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 @@ -127,16 +78,21 @@ jobs: python-version: ${{ matrix.python-version }} environment-file: ci/environment.yml - - name: Download software environment assets - if: matrix.runtime-version == 'latest' || startsWith(matrix.runtime-version, 'AB_') - uses: actions/download-artifact@v3 - with: - name: software-environment-${{ matrix.runtime-version }}-py${{ matrix.python-version }} + - name: Create null hypothesis as a copy of baseline + if: matrix.runtime-version == 'AB_null_hypothesis' + run: | + cd AB_environments + cp AB_baseline.conda.yaml AB_null_hypothesis.conda.yaml + cp AB_baseline.dask.yaml AB_null_hypothesis.dask.yaml - name: Install coiled-runtime env: COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: source ci/scripts/install_coiled_runtime.sh + run: | + source ci/scripts/install_coiled_runtime.sh AB_environments/${{ matrix.runtime-version }}.conda.yaml + + - name: Convert dask config into environment variables + run: python ci/scripts/dask_config_to_env.py AB_environments/${{ matrix.runtime-version }}.dask.yaml >> $GITHUB_ENV - name: Run Coiled Runtime Tests id: test @@ -145,166 +101,22 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - BENCHMARK: true - run: bash ci/scripts/run_tests.sh tests/runtime - - - name: Upload benchmark results - uses: actions/upload-artifact@v3 - if: always() - with: - name: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - benchmarks: - name: Benchmarks - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - needs: [discover_ab_envs, software] - runs-on: ${{ matrix.os }} - timeout-minutes: 120 - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Download software environment assets - uses: actions/download-artifact@v3 - with: - name: software-environment-${{ matrix.runtime-version 
}}-py${{ matrix.python-version }} - - - name: Install coiled-runtime - env: - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: source ci/scripts/install_coiled_runtime.sh - - - name: Run benchmarking tests - id: benchmarking_tests - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - BENCHMARK: true - run: bash ci/scripts/run_tests.sh tests/benchmarks - - - name: Upload benchmark results - uses: actions/upload-artifact@v3 - if: always() - with: - name: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - stability: - name: Stability - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - needs: [discover_ab_envs, software] - runs-on: ${{ matrix.os }} - timeout-minutes: 120 - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Download software environment assets - if: matrix.runtime-version == 'latest' || startsWith(matrix.runtime-version, 'AB_') - uses: actions/download-artifact@v3 - with: - name: software-environment-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - - - name: Install coiled-runtime - env: - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: source ci/scripts/install_coiled_runtime.sh - - - name: Run stability tests - id: stability_tests - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db + DB_NAME: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-${{ matrix.repeat }}-py${{ matrix.python-version }}.db BENCHMARK: true CLUSTER_DUMP: true - run: bash ci/scripts/run_tests.sh tests/stability + run: bash ci/scripts/run_tests.sh tests/${{ matrix.category }} - name: Upload benchmark results uses: actions/upload-artifact@v3 if: always() with: - name: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - cleanup: - needs: [discover_ab_envs, software, runtime, benchmarks, stability] - if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} - name: Cleanup - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} - - steps: - - uses: actions/checkout@v2 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: ${{ 
matrix.python-version }} - - - name: Install coiled - run: python -m pip install coiled - - - name: Download software environment assets - uses: actions/download-artifact@v3 - with: - name: software-environment-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - - - name: Remove Coiled software environment - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - run: | - SOFTWARE_NAME=$(cat software_name.txt) - echo "Deleting $SOFTWARE_NAME" - coiled env delete $SOFTWARE_NAME + name: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-${{ matrix.repeat }}-py${{ matrix.python-version }} + path: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-${{ matrix.repeat }}-py${{ matrix.python-version }}.db process-results: - needs: [discover_ab_envs, runtime, benchmarks, stability] + needs: [discover_ab_envs, tests] name: Combine separate benchmark results - if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} runs-on: ubuntu-latest concurrency: # Fairly strict concurrency rule to avoid stepping on benchmark db. @@ -312,14 +124,17 @@ jobs: group: process-benchmarks cancel-in-progress: false steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 - - uses: actions/setup-python@v4 + - name: Install Python + uses: actions/setup-python@v4 - name: Install dependencies run: pip install alembic - - uses: actions/download-artifact@v3 + - name: Download artifacts + uses: actions/download-artifact@v3 with: path: benchmarks @@ -331,23 +146,25 @@ jobs: - name: Upload benchmark results as artifact uses: actions/upload-artifact@v3 with: - name: benchmark.db + name: benchmark path: benchmark.db static-site: needs: [discover_ab_envs, process-results] # Always generate the site, as this can be skipped even if an indirect dependency fails (like a test run) - if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} name: Build static dashboards runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 - - uses: actions/download-artifact@v3 + - name: Download artifacts + uses: actions/download-artifact@v3 with: - name: benchmark.db + name: benchmark - name: Set up environment uses: conda-incubator/setup-miniconda@v2 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 194f1e64e3..b4db13c326 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -22,19 +22,65 @@ defaults: shell: bash -l {0} jobs: - runtime: - name: Runtime - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} + tests: + name: Tests - ${{ matrix.category }} ${{ matrix.runtime-version }} ${{ matrix.os }} py${{ matrix.python-version }} runs-on: ${{ matrix.os }} timeout-minutes: 120 strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: [ubuntu-latest] python-version: ["3.9"] - runtime-version: ["upstream", "latest", "0.0.4", "0.1.0"] + category: [runtime, benchmarks, stability] + runtime-version: [upstream, latest, "0.0.4", "0.1.0"] + include: + # Run stability tests on Python 3.8 + - category: stability + python-version: "3.8" + runtime-version: upstream + os: ubuntu-latest + - category: stability + python-version: "3.8" + runtime-version: latest + os: ubuntu-latest + - category: stability + python-version: 
"3.8" + runtime-version: "0.0.4" + os: ubuntu-latest + - category: stability + python-version: "3.8" + runtime-version: "0.1.0" + os: ubuntu-latest + # Run stability tests on Python 3.10 + - category: stability + python-version: "3.10" + runtime-version: upstream + os: ubuntu-latest + - category: stability + python-version: "3.10" + runtime-version: latest + os: ubuntu-latest + - category: stability + python-version: "3.10" + runtime-version: "0.0.4" + os: ubuntu-latest + - category: stability + python-version: "3.10" + runtime-version: "0.1.0" + os: ubuntu-latest + # Run stability tests on Python Windows and MacOS (latest py39 only) + - category: stability + python-version: "3.9" + runtime-version: latest + os: windows-latest + - category: stability + python-version: "3.9" + runtime-version: latest + os: macos-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 @@ -52,7 +98,7 @@ jobs: COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} run: | python ci/create_runtime_meta.py - source ci/scripts/install_coiled_runtime.sh + source ci/scripts/install_coiled_runtime.sh coiled_software_environment.yaml - name: Run Coiled Runtime Tests id: test @@ -61,127 +107,20 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - BENCHMARK: true - run: bash ci/scripts/run_tests.sh tests/runtime - - - name: Upload benchmark results - uses: actions/upload-artifact@v3 - if: always() - with: - name: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - benchmarks: - name: Benchmarks - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - runs-on: ${{ matrix.os }} - timeout-minutes: 120 - strategy: - fail-fast: false - matrix: - os: ["ubuntu-latest"] - python-version: ["3.9"] - runtime-version: ["upstream", "latest", "0.0.4", "0.1.0"] - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Install coiled-runtime - env: - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: | - python ci/create_runtime_meta.py - source ci/scripts/install_coiled_runtime.sh - - - name: Run benchmarking tests - id: benchmarking_tests - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - BENCHMARK: true - run: bash ci/scripts/run_tests.sh tests/benchmarks - - - name: Upload benchmark results - uses: actions/upload-artifact@v3 - if: always() - with: - name: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - stability: - name: 
Stability - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - runs-on: ${{ matrix.os }} - timeout-minutes: 120 - strategy: - fail-fast: false - matrix: - os: ["ubuntu-latest"] - python-version: ["3.8", "3.9", "3.10"] - runtime-version: ["upstream", "latest", "0.0.4", "0.1.0"] - include: - - python-version: "3.9" - runtime-version: "latest" - os: "windows-latest" - - python-version: "3.9" - runtime-version: "latest" - os: "macos-latest" - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Install coiled-runtime - env: - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: | - python ci/create_runtime_meta.py - source ci/scripts/install_coiled_runtime.sh - - - name: Run stability tests - id: stability_tests - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db + DB_NAME: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db BENCHMARK: true CLUSTER_DUMP: true - run: bash ci/scripts/run_tests.sh tests/stability + run: bash ci/scripts/run_tests.sh tests/${{ matrix.category }} - name: Upload benchmark results uses: actions/upload-artifact@v3 if: always() with: - name: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db + name: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} + path: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db process-results: - needs: [runtime, benchmarks, stability] + needs: tests name: Combine separate benchmark results if: always() && github.repository == 'coiled/coiled-runtime' runs-on: ubuntu-latest @@ -191,14 +130,17 @@ jobs: group: process-benchmarks cancel-in-progress: false steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 - - uses: actions/setup-python@v4 + - name: Install Python + uses: actions/setup-python@v4 - name: Install dependencies run: pip install alembic - - uses: actions/download-artifact@v3 + - name: Download artifacts + uses: actions/download-artifact@v3 with: path: benchmarks @@ -239,7 +181,8 @@ jobs: name: Detect regressions runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 @@ -271,15 +214,12 @@ jobs: report: name: report - needs: [runtime, benchmarks, stability, regressions] + needs: [tests, regressions] if: | always() && github.event_name != 'pull_request' && github.repository == 'coiled/coiled-runtime' - && (needs.runtime.result == 'failure' || - needs.benchmarks.result == 'failure' || - needs.stability.result == 'failure' || - needs.regressions.result == 'failure') + && (needs.tests.result == 'failure' || needs.regressions.result == 'failure') runs-on: ubuntu-latest defaults: @@ -302,7 +242,6 @@ jobs: labels: 
["ci-failure"], }) - static-site: needs: process-results # Always generate the site, as this can be skipped even if an indirect dependency fails (like a test run) @@ -310,11 +249,13 @@ jobs: name: Build static dashboards runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 - - uses: actions/download-artifact@v3 + - name: Download tests database + uses: actions/download-artifact@v3 with: name: benchmark @@ -327,7 +268,7 @@ jobs: environment-file: ci/environment-dashboard.yml - name: Generate dashboards - run: python dashboard.py -d benchmark.db -o static -b coiled-latest-py3.9 coiled-upstream-py3.9 + run: python dashboard.py -d benchmark.db -o static -b coiled-latest-py3.9 coiled-0.1.0-py3.9 - name: Upload artifact uses: actions/upload-artifact@v3 diff --git a/AB_environments/AB_baseline.conda.yaml.rename_me b/AB_environments/AB_baseline.conda.yaml similarity index 87% rename from AB_environments/AB_baseline.conda.yaml.rename_me rename to AB_environments/AB_baseline.conda.yaml index 8485352382..2beac715c6 100644 --- a/AB_environments/AB_baseline.conda.yaml.rename_me +++ b/AB_environments/AB_baseline.conda.yaml @@ -1,5 +1,5 @@ # Special environment file for A/B testing, used as the baseline environment. -# Change contents as needed and remove the .rename_me suffix. +# Change contents, but do not rename. channels: - conda-forge dependencies: @@ -14,6 +14,6 @@ dependencies: # - You can point to your own git fork instead # For example, if you want to test a PR before it's merged into main, you should # change this to the dask/dask and/or dask/distributed git tip - - dask==2022.8.1 - - distributed=2022.8.1 + - dask==2022.9.0 + - distributed==2022.9.0 # - git+https://github.com/dask/distributed@dd81b424971e81616e1a52fa09ce4698a5002d41 diff --git a/AB_environments/AB_baseline.dask.yaml.rename_me b/AB_environments/AB_baseline.dask.yaml similarity index 68% rename from AB_environments/AB_baseline.dask.yaml.rename_me rename to AB_environments/AB_baseline.dask.yaml index 8c296301be..cd1d2e38d3 100644 --- a/AB_environments/AB_baseline.dask.yaml.rename_me +++ b/AB_environments/AB_baseline.dask.yaml @@ -1,3 +1,3 @@ # Special environment file for A/B testing, used as the baseline environment. -# Change contents as needed and remove the .rename_me suffix. +# Change contents, but do not rename. # Leave empty if you don't want to override anything. 
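For reference, the `AB_*.dask.yaml` files above are not uploaded anywhere: the new `Convert dask config into environment variables` step flattens each of them into `DASK_*` variables appended to `$GITHUB_ENV`, while the matching conda file is applied with `mamba env update`. A minimal sketch of that flattening, assuming a plain nested mapping (the real `ci/scripts/dask_config_to_env.py` also handles lists and empty files):

```python
# Illustrative sketch only: shows how nested dask config keys map onto DASK_*
# environment variables (e.g. distributed.scheduler.work-stealing becomes
# DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING). The real logic lives in
# ci/scripts/dask_config_to_env.py.
import yaml


def flatten(node, path=()):
    """Yield ENV=VALUE strings for every leaf of a nested dask config."""
    if isinstance(node, dict):
        for key, value in node.items():
            yield from flatten(value, path + (key,))
    else:
        name = "DASK_" + "__".join(k.upper().replace("-", "_") for k in path)
        yield f"{name}={node}"


cfg = yaml.safe_load("distributed:\n  scheduler:\n    work-stealing: False\n")
print("\n".join(flatten(cfg)))
# -> DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING=False
```

In CI this output goes into `$GITHUB_ENV`, so the variables are visible to the test job and, through the `dask_env_variables` fixture added in `conftest.py`, end up on the Coiled clusters.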
diff --git a/AB_environments/AB_sample.conda.yaml.rename_me b/AB_environments/AB_sample.conda.yaml similarity index 68% rename from AB_environments/AB_sample.conda.yaml.rename_me rename to AB_environments/AB_sample.conda.yaml index 87b6409f3f..46dbb07913 100644 --- a/AB_environments/AB_sample.conda.yaml.rename_me +++ b/AB_environments/AB_sample.conda.yaml @@ -10,5 +10,6 @@ dependencies: - python=3.9 - coiled-runtime=0.1.0 - pip: - - dask==2022.8.1 - - git+https://github.com/dask/distributed@dd81b424971e81616e1a52fa09ce4698a5002d41 + - dask==2022.9.0 + # - distributed==2022.9.0 + - git+https://github.com/dask/distributed@1fd07f03cacee6fde81d13282568a727bce789b9 diff --git a/AB_environments/AB_sample.dask.yaml.rename_me b/AB_environments/AB_sample.dask.yaml similarity index 100% rename from AB_environments/AB_sample.dask.yaml.rename_me rename to AB_environments/AB_sample.dask.yaml diff --git a/AB_environments/README.md b/AB_environments/README.md index ddc056c2a2..a647c7be85 100644 --- a/AB_environments/README.md +++ b/AB_environments/README.md @@ -34,8 +34,8 @@ dependencies: - python=3.9 - coiled-runtime=0.1.0 - pip: - - dask==2022.8.1 - - distributed=2022.8.1 + - dask==2022.9.0 + - distributed==2022.9.0 ``` In this example it's using `coiled-runtime` as a base, but it doesn't have to. If you do use `coiled-runtime` though, you must install any conflicting packages with pip; in the @@ -47,8 +47,8 @@ arbitrary forks, e.g. ```yaml - pip: - - dask==2022.8.1 - - git+https://github.com/yourname/distributed@dd81b424971e81616e1a52fa09ce4698a5002d41 + - dask==2022.9.0 + - git+https://github.com/yourname/distributed@1fd07f03cacee6fde81d13282568a727bce789b9 ``` The second file in each pair is a dask config file. If you don't want to change the config, you must create an empty file. @@ -66,8 +66,32 @@ If you create *any* files in `AB_environments/`, you *must* create the baseline - `AB_baseline.conda.yaml` - `AB_baseline.dask.yaml` -#### Complete example -We want to test the impact of disabling work stealing. We create 4 files: +### 4. Tweak configuration file +Open `AB_environments/config.yaml` and set the `repeat` setting to a number higher than 0. +This enables the A/B tests. +Setting a low number of repeated runs is faster and cheaper, but will result in higher +variance. + +`repeat` must remain set to 0 in the main branch, thus completely disabling +A/B tests, in order to avoid unnecessary runs. + +In the same file, you can also set the `test_null_hypothesis` flag to true to +automatically create a verbatim copy of AB_baseline and then compare the two in the A/B +tests. Set it to false to save some money if you are already confident that the 'repeat' +setting is high enough. + +Finally, the file offers a `categories` list. These are the subdirectories of `tests/` +which you wish to run. + +### 5. (optional) Tweak tests +Nothing prevents you from changing the tests themselves. + +For example, you may be interested in a single test, but you don't want to run its +whole category; all you need to do is open the test files and delete what you don't care +about. + +### Complete example +You want to test the impact of disabling work stealing. 
You'll create at least 4 files: - `AB_environments/AB_baseline.conda.yaml`: ```yaml @@ -77,8 +101,8 @@ dependencies: - python=3.9 - coiled-runtime=0.1.0 - pip: - - dask==2022.8.1 - - distributed=2022.8.1 + - dask==2022.9.0 + - distributed==2022.9.0 ``` - `AB_environments/AB_baseline.dask.yaml`: (empty file) - `AB_environments/AB_no_steal.conda.yaml`: (same as baseline) @@ -89,8 +113,18 @@ distributed: work-stealing: False ``` -### 4. Run CI -- `git push`. Note: we are *not* creating a PR. +- `AB_environments/config.yaml`: +```yaml +repeat: 5 +test_null_hypothesis: true +categories: + - runtime + - benchmarks + - stability +``` + +### 6. Run CI +- `git push`. Note: you should *not* open a Pull Request. - Open https://github.com/coiled/coiled-runtime/actions/workflows/ab_tests.yml and wait for the run to complete. - Open the run from the link above. In the Summary tab, scroll down and download the @@ -98,9 +132,11 @@ distributed: Note: artifacts will appear only after the run is complete. - Decompress `static-dashboard.zip` and open `index.html` in your browser. -### 5. Clean up + +### 7. Clean up Remember to delete the branch once you're done. + ### Troubleshooting #### Problem: diff --git a/AB_environments/config.yaml b/AB_environments/config.yaml new file mode 100644 index 0000000000..e9e4f5f3b1 --- /dev/null +++ b/AB_environments/config.yaml @@ -0,0 +1,16 @@ +# Number of times to run each test suite. +# Lower values are faster and cheaper but will result in higher variance. +# This must remain set to 0 in the main branch, thus completely disabling +# A/B tests, in order to avoid unnecessary runs. +repeat: 0 + +# Set to true to automatically create a verbatim copy of AB_baseline and then compare +# the two in the A/B tests. Set to false to save some money if you are already confident +# that the 'repeat' setting is high enough. +test_null_hypothesis: true + +# Tests categories to run. These are subdirectories of tests/. +categories: + - benchmarks + # - runtime + # - stability diff --git a/ci/scripts/dask_config_to_env.py b/ci/scripts/dask_config_to_env.py index 74897fc45d..b151ee5920 100755 --- a/ci/scripts/dask_config_to_env.py +++ b/ci/scripts/dask_config_to_env.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -"""Read a dask config file and print it out in the format `-e ENV=VALUE ENV=VALUE ...` +"""Read a dask config file and print it out in the format `ENV=VALUE\nENV=VALUE ...` This script is a work-around to not being able to upload dask config files to `conda env create`. 
""" @@ -14,9 +14,8 @@ def main(fname: str) -> None: with open(fname) as fh: cfg = yaml.safe_load(fh) - # Print nothing in case of empty file, comments only, or empty dict if cfg: - print("-e " + " ".join(traverse(cfg, []))) + print("\n".join(traverse(cfg, []))) def traverse(node: dict | list | str | float | None, path: list[str]) -> Iterator[str]: diff --git a/ci/scripts/discover_ab_environments.py b/ci/scripts/discover_ab_environments.py index 60db39bf9a..f13610338d 100644 --- a/ci/scripts/discover_ab_environments.py +++ b/ci/scripts/discover_ab_environments.py @@ -1,22 +1,47 @@ +from __future__ import annotations + import glob import json import os.path +import yaml + -def main(): - envs = [] +def build_json() -> dict[str, list[int]]: + with open("AB_environments/config.yaml") as fh: + cfg = yaml.safe_load(fh) + if not isinstance(cfg.get("repeat"), int) or cfg["repeat"] < 0: + raise ValueError("AB_environments/config.yaml: missing key {repeat: N}") + if not cfg["repeat"]: + return {"repeat": [], "runtime": [], "category": []} + + runtimes = [] for conda_fname in sorted(glob.glob("AB_environments/AB_*.conda.yaml")): env_name = os.path.basename(conda_fname)[: -len(".conda.yaml")] dask_fname = f"AB_environments/{env_name}.dask.yaml" # Raise FileNotFoundError if missing open(dask_fname).close() - envs.append(env_name) + runtimes.append(env_name) + + if not runtimes: + return {"repeat": [], "runtime": [], "category": []} - if envs and "AB_baseline" not in envs: + if "AB_baseline" not in runtimes: # If any A/B environments are defined, AB_baseline is required raise FileNotFoundError("AB_environments/AB_baseline.conda.yaml") - print(json.dumps(envs)) + if cfg["test_null_hypothesis"]: + runtimes += ["AB_null_hypothesis"] + + return { + "repeat": list(range(1, cfg["repeat"] + 1)), + "runtime": runtimes, + "category": cfg["categories"], + } + + +def main() -> None: + print(json.dumps(build_json())) if __name__ == "__main__": diff --git a/ci/scripts/install_coiled_runtime.sh b/ci/scripts/install_coiled_runtime.sh index a1044279cc..a38099ce08 100644 --- a/ci/scripts/install_coiled_runtime.sh +++ b/ci/scripts/install_coiled_runtime.sh @@ -7,8 +7,8 @@ set -o xtrace if [[ "$COILED_RUNTIME_VERSION" =~ upstream|latest|AB_ ]] then - cat coiled_software_environment.yaml - mamba env update --file coiled_software_environment.yaml + cat $1 + mamba env update --file $1 else mamba install -c conda-forge coiled-runtime=$COILED_RUNTIME_VERSION fi diff --git a/conftest.py b/conftest.py index ac6e4efa1b..7b8fe33871 100644 --- a/conftest.py +++ b/conftest.py @@ -397,8 +397,13 @@ def test_name_uuid(request): return f"{request.node.originalname}-{uuid.uuid4().hex}" +@pytest.fixture(scope="session") +def dask_env_variables(): + return {k: v for k, v in os.environ.items() if k.startswith("DASK_")} + + @pytest.fixture(scope="module") -def small_cluster(request): +def small_cluster(request, dask_env_variables): # Extract `backend_options` for cluster from `backend_options` markers backend_options = merge( m.kwargs for m in request.node.iter_markers(name="backend_options") @@ -411,6 +416,7 @@ def small_cluster(request): scheduler_vm_types=["t3.xlarge"], backend_options=backend_options, package_sync=True, + environ=dask_env_variables, ) as cluster: yield cluster diff --git a/dashboard.py b/dashboard.py index ff819dee91..5fcd3a7424 100644 --- a/dashboard.py +++ b/dashboard.py @@ -4,15 +4,19 @@ import glob import importlib import inspect +import operator import pathlib -from typing import Literal, NamedTuple +from 
collections.abc import Callable +from typing import Any, Literal, NamedTuple import altair +import numpy import pandas import panel import sqlalchemy from bokeh.resources import INLINE +altair.data_transformers.enable("default", max_rows=None) panel.extension("vega") @@ -53,121 +57,222 @@ def load_test_source() -> None: print(f"Discovered {len(source)} tests") -def align_to_baseline(df: pandas.DataFrame, baseline: str) -> pandas.DataFrame | None: - """Add columns +def calc_ab_confidence_intervals( + df: pandas.DataFrame, field_name: str, A: str, B: str +) -> pandas.DataFrame: + """Calculate p(B / A - 1) > x and p(B / A - 1) < -x for discrete x, where A and B + are runtimes, for all tests in df. + + Algorithm + --------- + https://towardsdatascience.com/a-practical-guide-to-a-b-tests-in-python-66666f5c3b02 + + Returns + ------- + DataFrame: + + fullname + Test name with category, e.g. benchmarks/test_foo.py::test_123[1] + fullname_no_category + Test name without category, e.g. test_foo.py::test_123[1] + x + Confidence interval [-0.8, 0.8]. Note that element 0 will be repeated. + xlabel + "<-{p*100}% | x < 0 + ">{p*100}% | x > 0 + p + p(B/A-1) < x | x < 0 + p(B/A-1) > x | x > 0 + color + 0 if p=1 and x < 0 + 0.5 if p=0 + 1 if p=1 and x > 0 + plus all shades in between + """ - - duration_baseline - - average_memory_baseline - - peak_memory_baseline - - duration_delta (A/B - 1) - - average_memory_delta (A/B - 1) - - peak_memory_delta (A/B - 1) + def bootstrap_mean(df_i: pandas.DataFrame) -> pandas.DataFrame: + boot = df_i[field_name].sample(frac=10_000, replace=True).to_frame() + boot["i"] = pandas.RangeIndex(boot.shape[0]) // df_i.shape[0] + out = boot.groupby("i").mean().reset_index()[[field_name]] + assert out.shape == (10_000, 1) + out.index.name = "bootstrap_run" + return out + + # DataFrame with 20,000 rows per test exactly, with columns + # [fullname, fullname_no_category, runtime, bootstrap_run, {field_name}] + bootstrapped = ( + df.groupby(["fullname", "fullname_no_category", "runtime"]) + .apply(bootstrap_mean) + .reset_index() + ) - Baseline values are from the matching rows given the same test name and the baseline - runtime. Note that this means that df is expected to have exactly 1 test in the - baseline runtime for each test in every other runtime. - """ - df_baseline = df[df["runtime"] == baseline] - - if df_baseline.empty: - # Typically a misspelling. However, this can legitimately happen in CI if all - # three jobs of the baseline runtime failed early. 
- print( - f"Baseline runtime {baseline!r} not found; valid choices are:", - ", ".join(df["runtime"].unique()), + # DataFrame with 10,000 rows per test exactly, with columns + # [fullname, fullname_no_category, bootstrap_run, {A}, {B}, diff] + pivot = bootstrapped.pivot( + ["fullname", "fullname_no_category", "bootstrap_run"], + "runtime", + field_name, + ).reset_index() + pivot["diff"] = pivot[B] / pivot[A] - 1 + + def confidence( + df_i: pandas.DataFrame, + x: numpy.ndarray, + op: Literal["<", ">"], + cmp: Callable[[Any, Any], bool], + color_factor: float, + ) -> pandas.DataFrame: + xlabel = [f"{op}{xi * 100:.0f}%" for xi in x] + p = (cmp(df_i["diff"].values.reshape([-1, 1]), x)).sum(axis=0) / df_i.shape[0] + color = color_factor * p / 2 + 0.5 + return pandas.DataFrame({"x": x, "xlabel": xlabel, "p": p, "color": color}) + + pivot_groups = pivot.groupby(["fullname", "fullname_no_category"])[["diff"]] + x_neg = numpy.linspace(-0.8, 0, 17) + x_pos = numpy.linspace(0, 0.8, 17) + conf_neg, conf_pos = [ + # DataFrame with 1 row per element of x_neg/x_pos and columns + # [fullname, fullname_no_category, x, xlabel, p, color] + ( + pivot_groups.apply(confidence, p, op, cmp, color_factor) + .reset_index() + .drop("level_2", axis=1) ) - return None - - baseline_names = df_baseline["fullname"].unique() - all_names = df["fullname"].unique() - - assert len(baseline_names) == df_baseline.shape[0] - if len(baseline_names) < len(all_names): - # This will happen in CI if one or two out of three jobs of the baseline failed. - # Note that df contains the latest run only. It means that tests on all runtimes - # (including historical ones) should be from the coiled-runtime git tip, so - # adding or removing tests should not cause a mismatch. - print( - f"Baseline runtime {baseline!r} is missing some tests:", - ", ".join(set(all_names) - set(baseline_names)), + for (p, op, cmp, color_factor) in ( + (x_neg, "<", operator.lt, -1), + (x_pos, ">", operator.gt, 1), ) - return None - - columns = [spec.field_name for spec in SPECS] - df_baseline = ( - df_baseline.set_index("fullname") - .loc[df["fullname"], columns] - .rename(columns={k: k + "_baseline" for k in columns}) - ) - df_baseline.index = df.index - df = pandas.concat([df, df_baseline], axis=1) - for column in columns: - df[column + "_delta"] = (df[column] / df[column + "_baseline"] - 1) * 100 - return df + ] + return pandas.concat([conf_neg, conf_pos], axis=0) def make_barchart( df: pandas.DataFrame, spec: ChartSpec, title: str, - baseline: str | None, -) -> altair.Chart | None: +) -> tuple[altair.Chart | None, int]: """Make a single Altair barchart for a given test or runtime""" df = df.dropna(subset=[spec.field_name, "start"]) if not len(df): # Some tests do not have average_memory or peak_memory measures, only runtime - return None + return None, 0 - fields = [ - spec.field_name, - "fullname", - "fullname_no_category", - "dask_version", - "distributed_version", - "runtime", + df = df[ + [ + spec.field_name, + "fullname", + "fullname_no_category", + "dask_version", + "distributed_version", + "runtime", + ] ] - height = max(df.shape[0] * 20 + 50, 90) tooltip = [ altair.Tooltip("fullname:N", title="Test"), + altair.Tooltip("runtime:N", title="Runtime"), altair.Tooltip("dask_version:N", title="Dask"), altair.Tooltip("distributed_version:N", title="Distributed"), - altair.Tooltip(f"{spec.field_name}:Q", title=f"{spec.field_desc} {spec.unit}"), + altair.Tooltip(f"count({spec.field_name}):N", title="Number of runs"), + 
altair.Tooltip(f"stdev({spec.field_name}):Q", title=f"std dev {spec.unit}"), + altair.Tooltip(f"min({spec.field_name}):Q", title=f"min {spec.unit}"), + altair.Tooltip(f"median({spec.field_name}):Q", title=f"median {spec.unit}"), + altair.Tooltip(f"mean({spec.field_name}):Q", title=f"mean {spec.unit}"), + altair.Tooltip(f"max({spec.field_name}):Q", title=f"max {spec.unit}"), ] by_test = len(df["fullname"].unique()) == 1 if by_test: df = df.sort_values("runtime", key=runtime_sort_key_pd) y = altair.Y("runtime", title="Runtime", sort=None) + n_bars = df["runtime"].unique().size else: y = altair.Y("fullname_no_category", title="Test name") + n_bars = df["fullname_no_category"].unique().size - if baseline: - fields += [ - f"{spec.field_name}_delta", - f"{spec.field_name}_baseline", - ] - x = altair.X( - f"{spec.field_name}_delta", - title=f"{spec.field_desc} (delta % from {baseline})", - ) - tooltip += [ - altair.Tooltip( - f"{spec.field_name}_baseline:Q", title=f"{baseline} {spec.unit}" + height = max(n_bars * 20 + 50, 90) + + bars = ( + altair.Chart(width=800, height=height) + .mark_bar() + .encode( + x=altair.X( + f"median({spec.field_name}):Q", title=f"{spec.field_desc} {spec.unit}" ), - altair.Tooltip(f"{spec.field_name}_delta:Q", title="Delta %"), + y=y, + tooltip=tooltip, + ) + ) + ticks = ( + altair.Chart() + .mark_tick(color="black") + .encode(x=f"mean({spec.field_name})", y=y) + ) + error_bars = ( + altair.Chart().mark_errorbar(extent="stdev").encode(x=spec.field_name, y=y) + ) + chart = ( + altair.layer(bars, ticks, error_bars, data=df) + .properties(title=title) + .configure(autosize="fit") + ) + + return chart, height + + +def make_ab_confidence_map( + df: pandas.DataFrame, + spec: ChartSpec, + title: str, + baseline: str, +) -> tuple[altair.Chart | None, int]: + """Make a single Altair heatmap of p(B/A - 1) confidence intervals, where B is the + examined runtime and A is the baseline, for all tests for a given measure. 
+ """ + df = df.dropna(subset=[spec.field_name, "start"]) + if not len(df): + # Some tests do not have average_memory or peak_memory measures, only runtime + return None, 0 + + df = df[ + [ + spec.field_name, + "fullname", + "fullname_no_category", + "runtime", ] - else: - x = altair.X(spec.field_name, title=f"{spec.field_desc} {spec.unit}") + ] + runtimes = df["runtime"].unique() + A = baseline + B = next(r for r in runtimes if r != baseline) + conf = calc_ab_confidence_intervals(df, spec.field_name, A, B) - return ( - altair.Chart(df[fields], width=800, height=height) - .mark_bar() - .encode(x=x, y=y, tooltip=tooltip) + n_bars = df["fullname_no_category"].unique().size + height = max(n_bars * 20 + 50, 90) + + chart = ( + altair.Chart(conf, width=800, height=height) + .mark_rect() + .encode( + x=altair.X("xlabel:O", title="confidence threshold (B/A - 1)", sort=None), + y=altair.Y("fullname_no_category:O", title="Test"), + color=altair.Color( + "color:Q", + scale=altair.Scale(scheme="redblue", domain=[0, 1], reverse=True), + legend=None, + ), + tooltip=[ + altair.Tooltip("fullname:O", title="Test Name"), + altair.Tooltip("xlabel:O", title="Confidence threshold"), + altair.Tooltip("p:Q", format=".2p", title="p(B/A-1) exceeds threshold"), + ], + ) .properties(title=title) .configure(autosize="fit") ) + return chart, height + def make_timeseries( df: pandas.DataFrame, spec: ChartSpec, title: str @@ -229,7 +334,7 @@ def make_timeseries( def make_test_report( df: pandas.DataFrame, - kind: Literal["barchart" | "timeseries"], + kind: Literal["barchart" | "timeseries" | "A/B"], title: str, sourcename: str | None = None, baseline: str | None = None, @@ -240,17 +345,19 @@ def make_test_report( if kind == "timeseries": assert not baseline chart = make_timeseries(df, spec, title) + height = 384 + elif kind == "barchart": + assert not baseline + chart, height = make_barchart(df, spec, title) + elif kind == "A/B": + assert baseline + chart, height = make_ab_confidence_map(df, spec, title, baseline=baseline) else: - chart = make_barchart(df, spec, title, baseline) + raise ValueError(kind) # pragma: nocover if not chart: continue tabs.append((spec.field_desc, chart)) - if kind == "timeseries": - height = 384 - else: - height = max(df.shape[0] * 20 + 50, 90) - if sourcename in source: code = panel.pane.Markdown( f"```python\n{source[sourcename]}\n```", @@ -281,10 +388,8 @@ def make_timeseries_html_report( categories = sorted(df[df.runtime == runtime].category.unique()) tabs = [] for category in categories: - df_by_test = ( - df[(df.runtime == runtime) & (df.category == category)] - .sort_values("sourcename") - .groupby("sourcename") + df_by_test = df[(df.runtime == runtime) & (df.category == category)].groupby( + "sourcename" ) panes = [ make_test_report( @@ -302,29 +407,22 @@ def make_timeseries_html_report( doc.save(out_fname, title=runtime, resources=INLINE) -def make_ab_html_report( +def make_barchart_html_report( df: pandas.DataFrame, output_dir: pathlib.Path, by_test: bool, - baseline: str | None, ) -> None: - """Generate HTML report for the latest CI run, comparing all runtimes (e.g. - coiled-upstream-py3.9) against a baseline runtime + """Generate HTML report containing bar charts showing statistical information + (mean, median, etc). Create one tab for each test category (e.g. benchmarks, runtime, stability), one graph for each runtime and one bar for each test OR one graph for each test and one bar for each runtime, and one graph tab for each measure (wall clock, average memory, peak memory). 
- - If a baseline runtime is defined, all measures are expressed relative to the - baseline; otherwise they're expressed in absolute terms. """ out_fname = str( output_dir.joinpath( - "AB_by_" - + ("test" if by_test else "runtime") - + (f"_vs_{baseline}" if baseline else "") - + ".html" + "barcharts_by_" + ("test" if by_test else "runtime") + ".html" ) ) print(f"Generating {out_fname}") @@ -333,36 +431,25 @@ def make_ab_html_report( tabs = [] for category in categories: if by_test: - df_by_test = ( - df[df.category == category] - .sort_values(["sourcename", "fullname"]) - .groupby(["sourcename", "fullname"]) - ) + df_by_test = df[df.category == category].groupby(["sourcename", "fullname"]) panes = [ make_test_report( df_by_test.get_group((sourcename, fullname)), kind="barchart", title=fullname, sourcename=sourcename, - baseline=baseline, ) for sourcename, fullname in df_by_test.groups ] else: - df_by_runtime = ( - df[df.category == category] - .sort_values("runtime", key=runtime_sort_key_pd) - .groupby("runtime") - ) + df_by_runtime = df[df.category == category].groupby("runtime") panes = [ make_test_report( df_by_runtime.get_group(runtime), kind="barchart", title=runtime, - baseline=baseline, ) for runtime in sorted(df_by_runtime.groups, key=runtime_sort_key) - if runtime != baseline ] flex = panel.FlexBox(*panes, align_items="start", justify_content="start") tabs.append((category.title(), flex)) @@ -370,11 +457,69 @@ def make_ab_html_report( doc.save( out_fname, - title="A/B by " - + ("test" if by_test else "runtime") - + (f" vs. {baseline}" if baseline else ""), + title="Bar charts by " + ("test" if by_test else "runtime"), + resources=INLINE, + ) + + +def make_ab_html_report( + df: pandas.DataFrame, + output_dir: pathlib.Path, + baseline: str, +) -> bool: + """Generate HTML report containing heat maps for confidence intervals relative to + a baseline runtime, e.g. p(B/A-1) > 10% + + Create one tab for each test category (e.g. benchmarks, runtime, stability), one + graph for each runtime, and one graph tab for each measure (wall clock, average + memory, peak memory). + + Returns + ------- + True if the report was generated; False otherwise + """ + out_fname = str(output_dir.joinpath(f"AB_vs_{baseline}.html")) + print(f"Generating {out_fname}") + + categories = sorted(df.category.unique()) + tabs = [] + for category in categories: + df_by_runtime = df[df.category == category].groupby("runtime") + if baseline not in df_by_runtime.groups: + # Typically a misspelling. However, this can legitimately happen in CI if + # all three jobs of the baseline runtime failed early. + print( + f"Baseline runtime {baseline!r} not found; valid choices are:", + ", ".join(df["runtime"].unique()), + ) + return False + + panes = [ + make_test_report( + pandas.concat( + [ + df_by_runtime.get_group(runtime), + df_by_runtime.get_group(baseline), + ], + axis=0, + ), + kind="A/B", + title=runtime, + baseline=baseline, + ) + for runtime in sorted(df_by_runtime.groups, key=runtime_sort_key) + if runtime != baseline + ] + flex = panel.FlexBox(*panes, align_items="start", justify_content="start") + tabs.append((category.title(), flex)) + doc = panel.Tabs(*tabs, margin=12) + + doc.save( + out_fname, + title="A/B confidence intervals vs. 
" + baseline, resources=INLINE, ) + return True def make_index_html_report( @@ -385,12 +530,12 @@ def make_index_html_report( index_txt += "### Historical timeseries\n" for runtime in runtimes: index_txt += f"- [{runtime}](./{runtime}.html)\n" - index_txt += "\n\n### A/B tests\n" - index_txt += "- [by test](./AB_by_test.html)\n" - index_txt += "- [by runtime](./AB_by_runtime.html)\n" + index_txt += "\n\n### Statistical analysis\n" + index_txt += "- [Bar charts, by test](./barcharts_by_test.html)\n" + index_txt += "- [Bar charts, by runtime](./barcharts_by_runtime.html)\n" for baseline in baselines: index_txt += ( - f"- [by runtime vs. {baseline}](./AB_by_runtime_vs_{baseline}.html)\n" + f"- [A/B confidence intervals vs. {baseline}](./AB_vs_{baseline}.html)\n" ) index = panel.pane.Markdown(index_txt, width=800) @@ -503,24 +648,17 @@ def main() -> None: for runtime in runtimes: make_timeseries_html_report(df, output_dir, runtime) - # Select only the latest run for each runtime. This may pick up historical runs (up - # to 6h old) if they have not been rerun in the current pull/PR. - # TODO This is fragile. Keep the latest and historical databases separate, or record - # the coiled-runtime git hash and use it to filter? - max_end = df.sort_values("end").groupby(["runtime", "category"]).tail(1) - max_end = max_end[max_end["end"] > max_end["end"].max() - pandas.Timedelta("6h")] - session_ids = max_end["session_id"].unique() - latest_run = df[df["session_id"].isin(session_ids)] - - make_ab_html_report(latest_run, output_dir, by_test=True, baseline=None) - make_ab_html_report(latest_run, output_dir, by_test=False, baseline=None) + # Do not use data that is more than a week old in statistical analysis + df_recent = df[df["end"] > df["end"].max() - pandas.Timedelta("7d")] + + make_barchart_html_report(df_recent, output_dir, by_test=True) + make_barchart_html_report(df_recent, output_dir, by_test=False) + baselines = [] for baseline in args.baseline: - df_baseline = align_to_baseline(latest_run, baseline) - if df_baseline is None: - continue - baselines.append(baseline) - make_ab_html_report(df_baseline, output_dir, by_test=False, baseline=baseline) + has_baseline = make_ab_html_report(df_recent, output_dir, baseline) + if has_baseline: + baselines.append(baseline) make_index_html_report(output_dir, runtimes, baselines) diff --git a/tests/benchmarks/test_parquet.py b/tests/benchmarks/test_parquet.py index feeaa7666b..69b8c03858 100644 --- a/tests/benchmarks/test_parquet.py +++ b/tests/benchmarks/test_parquet.py @@ -15,13 +15,14 @@ @pytest.fixture(scope="module") -def parquet_cluster(): +def parquet_cluster(dask_env_variables): with Cluster( f"parquet-{uuid.uuid4().hex[:8]}", n_workers=N_WORKERS, worker_vm_types=["m5.xlarge"], scheduler_vm_types=["m5.xlarge"], package_sync=True, + environ=dask_env_variables, ) as cluster: yield cluster diff --git a/tests/benchmarks/test_work_stealing.py b/tests/benchmarks/test_work_stealing.py index 05759ed2ad..2facfd43d0 100644 --- a/tests/benchmarks/test_work_stealing.py +++ b/tests/benchmarks/test_work_stealing.py @@ -23,7 +23,7 @@ def test_trivial_workload_should_not_cause_work_stealing(small_client): reason="https://github.com/dask/distributed/issues/6624", ) def test_work_stealing_on_scaling_up( - test_name_uuid, upload_cluster_dump, benchmark_all + test_name_uuid, upload_cluster_dump, benchmark_all, dask_env_variables ): with Cluster( name=test_name_uuid, @@ -32,6 +32,7 @@ def test_work_stealing_on_scaling_up( scheduler_vm_types=["t3.xlarge"], 
wait_for_workers=True, package_sync=True, + environ=dask_env_variables, ) as cluster: with Client(cluster) as client: # FIXME https://github.com/coiled/platform/issues/103 @@ -79,7 +80,7 @@ def clog(n): def test_work_stealing_on_straggling_worker( - test_name_uuid, upload_cluster_dump, benchmark_all + test_name_uuid, upload_cluster_dump, benchmark_all, dask_env_variables ): with Cluster( name=test_name_uuid, @@ -87,6 +88,7 @@ def test_work_stealing_on_straggling_worker( worker_vm_types=["t3.medium"], scheduler_vm_types=["t3.xlarge"], wait_for_workers=True, + environ=dask_env_variables, ) as cluster: with Client(cluster) as client: # FIXME https://github.com/coiled/platform/issues/103 diff --git a/tests/stability/test_deadlock.py b/tests/stability/test_deadlock.py index 8c29d658e2..6ee4825cbb 100644 --- a/tests/stability/test_deadlock.py +++ b/tests/stability/test_deadlock.py @@ -13,7 +13,7 @@ @pytest.mark.skip( reason="Skip until https://github.com/dask/distributed/pull/6637 is merged" ) -def test_repeated_merge_spill(upload_cluster_dump, benchmark_all): +def test_repeated_merge_spill(upload_cluster_dump, benchmark_all, dask_env_variables): with Cluster( name=f"test_deadlock-{uuid.uuid4().hex}", n_workers=20, worker_vm_types=["t3.large"], scheduler_vm_types=["t3.xlarge"], wait_for_workers=True, package_sync=True, + environ=dask_env_variables, ) as cluster: with Client(cluster) as client: with upload_cluster_dump(client, cluster), benchmark_all(client): diff --git a/tests/stability/test_spill.py b/tests/stability/test_spill.py index 04bedc4b33..2e9f191d03 100644 --- a/tests/stability/test_spill.py +++ b/tests/stability/test_spill.py @@ -4,10 +4,11 @@ import pytest from coiled import Cluster from dask.distributed import Client, wait +from toolz import merge @pytest.fixture(scope="module") -def spill_cluster(): +def spill_cluster(dask_env_variables): with Cluster( f"spill-{uuid.uuid4().hex[:8]}", n_workers=5, @@ -16,17 +17,20 @@ worker_vm_types=["t3.large"], scheduler_vm_types=["t3.xlarge"], wait_for_workers=True, - environ={ - # Note: We set allowed-failures to ensure that no tasks are not retried - # upon ungraceful shutdown behavior during adaptive scaling - # but we receive a KilledWorker() instead. - "DASK_DISTRIBUTED__SCHEDULER__ALLOWED_FAILURES": "0", - # We need to limit the number of connections to avoid getting `oom-killed`. - # See https://github.com/coiled/coiled-runtime/pull/229#discussion_r946807049 - # for a longer discussion - "DASK_DISTRIBUTED__WORKER__CONNECTIONS__INCOMING": "1", - "DASK_DISTRIBUTED__WORKER__CONNECTIONS__OUTGOING": "1", - }, + environ=merge( + dask_env_variables, + { + # Note: We set allowed-failures to ensure that tasks are not retried + # upon ungraceful shutdown behavior during adaptive scaling, but that we + # receive a KilledWorker() instead. + "DASK_DISTRIBUTED__SCHEDULER__ALLOWED_FAILURES": "0", + # We need to limit the number of connections to avoid getting + # `oom-killed`. For a longer discussion, see + # https://github.com/coiled/coiled-runtime/pull/229#discussion_r946807049 + "DASK_DISTRIBUTED__WORKER__CONNECTIONS__INCOMING": "1", + "DASK_DISTRIBUTED__WORKER__CONNECTIONS__OUTGOING": "1", + }, + ), ) as cluster: yield cluster
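Downstream of the workflow change, the `dask_env_variables` fixture is what carries those `DASK_*` variables onto the Coiled clusters. A minimal sketch of the flow, assuming the job environment already contains a variable such as `DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING=False` (the variable name is only an example):

```python
# Illustrative sketch of how the A/B dask config reaches a cluster. Assumes the
# workflow already appended DASK_* variables to $GITHUB_ENV; the specific
# variable used below is an example, not something this PR sets.
import os

from toolz import merge

# Same filter as the session-scoped dask_env_variables fixture in conftest.py
dask_env_variables = {k: v for k, v in os.environ.items() if k.startswith("DASK_")}

# Fixtures with extra requirements layer their own settings on top, as
# spill_cluster does; later arguments to merge() win on key collisions.
environ = merge(
    dask_env_variables,
    {"DASK_DISTRIBUTED__SCHEDULER__ALLOWED_FAILURES": "0"},
)

# Passing environ=... to coiled.Cluster sets these variables on the scheduler
# and workers, where dask reads them back as nested config values.
print(environ)
```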