coiled · crusaderky · Dec 14, 2022 · Nov 30, 2022 · Dec 1, 2022 · Dec 1, 2022
diff --git a/.github/workflows/ab_tests.yml b/.github/workflows/ab_tests.yml
@@ -52,12 +52,7 @@ jobs:
     timeout-minutes: 120
     strategy:
       fail-fast: false
-      # AWS implements limiters to how many EC2 instances you can spawn in parallel *on
-      # the same AWS account*. If such limit is reached, jobs will randomly fail when
-      # trying to create the Coiled clusters, and restarting failed jobs won't fix the
-      # problem. Additionally, there are problems with Coiled itself triggered by
-      # limitations that are never actually reached with real paying users.
-      max-parallel: 5
+      max-parallel: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).max_parallel }}
       matrix:
         os: [ubuntu-latest]
         python-version: ["3.9"]

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -102,7 +102,7 @@ jobs:
           DB_NAME: ${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db
           BENCHMARK: true
           CLUSTER_DUMP: always
-        run: bash ci/scripts/run_tests.sh ${{ matrix.pytest_args }}
+        run: bash ci/scripts/run_tests.sh -n 4 --dist loadscope ${{ matrix.pytest_args }}
 
       - name: Dump coiled.Cluster kwargs
         run: cat cluster_kwargs.merged.yaml

diff --git a/AB_environments/README.md b/AB_environments/README.md
@@ -112,9 +112,15 @@ automatically create a verbatim copy of AB_baseline and then compare the two in
 tests. Set it to false to save some money if you are already confident that the 'repeat'
 setting is high enough.
 
-Finally, the files offers a `categories` list. These are the subdirectories of `tests/`
+The file offers a `categories` list. These are the subdirectories of `tests/`
 which you wish to run.
 
+Finally, the `max_parallel` setting lets you tweak maximum test parallelism, both in
+github actions and in pytest-xdist. Reducing parallelism is useful when testing on very
+large clusters (e.g. to avoid having 20 clusters with 1000 workers each at the same
+time).
+
+
 ### 5. (optional) Tweak tests
 Nothing prevents you from changing the tests themselves.
 
@@ -154,6 +160,9 @@ categories:
   - runtime
   - benchmarks
   - stability
+max_parallel:
+  ci_jobs: 5
+  pytest_workers_per_job: 4
 ```
 
 ### 6. Run CI

diff --git a/AB_environments/config.yaml b/AB_environments/config.yaml
@@ -14,3 +14,16 @@ categories:
   - benchmarks
   # - runtime
   # - stability
+
+# AWS implements limiters to how many EC2 instances you can spawn in parallel on the
+# same AWS account. If such limit is reached, tests will randomly fail when trying to
+# create the Coiled clusters, and restarting failed jobs won't fix the problem.
+# Additionally, there are problems with Coiled itself triggered by limitations that are
+# never actually reached with real paying users.
+max_parallel:
+  # Number of parallel A/B test jobs per branch.
+  ci_jobs: 5
+  # Number of parallel test_*.py modules per A/B test job.
+  # Each module typically spawns one Coiled cluster at a time.
+  # Set to 1 to disable pytest-xdist.
+  pytest_workers_per_job: 4
diff --git a/ci/scripts/discover_ab_environments.py b/ci/scripts/discover_ab_environments.py
@@ -12,9 +12,19 @@ class JSONOutput(TypedDict):
     run_AB: bool
     repeat: list[int]
     runtime: list[str]
+    max_parallel: int
     pytest_args: list[str]
 
 
+DO_NOT_RUN: JSONOutput = {
+    "run_AB": False,
+    "repeat": [],
+    "runtime": [],
+    "max_parallel": 1,
+    "pytest_args": [],
+}
+
+
 def build_json() -> JSONOutput:
     with open("AB_environments/config.yaml") as fh:
         cfg = yaml.safe_load(fh)
@@ -26,7 +36,7 @@ def build_json() -> JSONOutput:
             raise ValueError("fNot a valid test category: {category}")
 
     if not cfg["repeat"] or not cfg["categories"]:
-        return {"run_AB": False, "repeat": [], "runtime": [], "pytest_args": []}
+        return DO_NOT_RUN
 
     runtimes = []
     for conda_fname in sorted(glob.glob("AB_environments/AB_*.conda.yaml")):
@@ -37,7 +47,7 @@ def build_json() -> JSONOutput:
         runtimes.append(env_name)
 
     if not runtimes:
-        return {"run_AB": False, "repeat": [], "runtime": [], "pytest_args": []}
+        return DO_NOT_RUN
 
     if "AB_baseline" not in runtimes:
         # If any A/B environments are defined, AB_baseline is required
@@ -46,11 +56,15 @@ def build_json() -> JSONOutput:
     if cfg["test_null_hypothesis"]:
         runtimes += ["AB_null_hypothesis"]
 
+    n = cfg["max_parallel"]["pytest_workers_per_job"]
+    xdist_args = f"-n {n} --dist loadscope " if n > 1 else ""
+
     return {
         "run_AB": True,
         "repeat": list(range(1, cfg["repeat"] + 1)),
         "runtime": runtimes,
-        "pytest_args": [" ".join(f"tests/{c}" for c in cfg["categories"])],
+        "max_parallel": cfg["max_parallel"]["ci_jobs"],
+        "pytest_args": [xdist_args + " ".join(f"tests/{c}" for c in cfg["categories"])],
     }
 
 

diff --git a/ci/scripts/run_tests.sh b/ci/scripts/run_tests.sh
@@ -16,4 +16,4 @@ then
   EXTRA_OPTIONS="$EXTRA_OPTIONS --benchmark"
 fi
 
-python -m pytest -n 4 --dist loadscope $EXTRA_OPTIONS $@
+python -m pytest $EXTRA_OPTIONS $@