diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 81b6095712..7a5c5e4755 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -57,16 +57,19 @@ jobs:
           if [[ "$COMMIT" == *"test-upstream"* || ${{ github.event_name }} == "schedule" ]]
           then
             export TEST_UPSTREAM="true"
+            export AB_BASELINE="coiled-upstream-py3.9 coiled-0.1.0-py3.9"
           else
             export TEST_UPSTREAM="false"
+            export AB_BASELINE="coiled-0.1.0-py3.9"
           fi

           # Put TEST_UPSTREAM into $GITHUB_ENV so it can be used in subsequent workflow steps
           echo $TEST_UPSTREAM
           echo TEST_UPSTREAM=$TEST_UPSTREAM >> $GITHUB_ENV

-          # Put TEST_UPSTREAM into a file so it can be downloaded in subsequent workflow jobs
+          # Put env variables into files so they can be downloaded in subsequent workflow jobs
           echo $TEST_UPSTREAM > test_upstream.txt
+          echo $AB_BASELINE > ab_baseline.txt

       - name: Build Coiled Software Environment
         env:
@@ -104,6 +107,7 @@ jobs:
             latest.yaml
             software_name.txt
             test_upstream.txt
+            ab_baseline.txt

   runtime:
     name: Runtime - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }}
@@ -457,9 +461,14 @@ jobs:
           python-version: "3.9"
           environment-file: ci/environment-dashboard.yml

+      - name: Download software environment assets
+        uses: actions/download-artifact@v3
+        with:
+          name: software-environment-py3.9
+
       - name: Generate dashboards
         run: |
-          python dashboard.py
+          python dashboard.py -d benchmark.db -o static -b $(cat ab_baseline.txt)

       - name: Deploy 🚀
         uses: JamesIves/github-pages-deploy-action@4.1.7
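The baseline names written to ab_baseline.txt above feed straight into the new dashboard.py CLI: the shell substitutes $(cat ab_baseline.txt) and the --baseline/-b option (defined with nargs="+" in parse_args() further down in this diff) collects every whitespace-separated name. A minimal sketch, assuming the upstream case:

```python
import argparse

# Same flags as dashboard.py's parse_args(), trimmed to the options used by CI.
parser = argparse.ArgumentParser()
parser.add_argument("--db-file", "-d")
parser.add_argument("--output-dir", "-o", default="build/html")
parser.add_argument("--baseline", "-b", nargs="+", default=[])

# After word splitting, the upstream case passes two baseline runtimes:
args = parser.parse_args(
    "-d benchmark.db -o static -b coiled-upstream-py3.9 coiled-0.1.0-py3.9".split()
)
assert args.baseline == ["coiled-upstream-py3.9", "coiled-0.1.0-py3.9"]
```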
""" - Make a single timeseries altair chart for a given test. + for fname in glob.iglob("tests/**/test_*.py", recursive=True): + mod = importlib.import_module(fname.replace("/", ".")[: -len(".py")]) + tests = [a for a in dir(mod) if a.startswith("test_")] + for test in tests: + if (func := getattr(mod, test, None)) and callable(func): + # FIXME missing decorators, namely @pytest.mark.parametrize + source[fname[len("tests/") :] + "::" + test] = inspect.getsource(func) - originalname: str - The name of the test without any fixture or other modifications. - df: pandas.DataFrame - A dataframe with the test data in it. +def make_barchart( + df: pandas.DataFrame, spec: altair.ChartSpec, title: str +) -> altair.Chart | None: + """Make a single Altair barchart for a given test.""" + df = df.dropna(subset=[spec.field, "start"]) + if not len(df): + # Some tests do not have average_memory or peak_memory measures, only runtime + return None - spec: ChartSpec - Data for how to render the timeseries - """ + df = df.sort_values("runtime", key=runtime_sort_key_pd) + df = df.assign(**{spec.field: df[spec.field] / spec.scale}) + df = df[ + [ + spec.field, + "name", + "dask_version", + "distributed_version", + "runtime", + ] + ] + # Altair will re-sort alphabetically, unless it's explicitly asked to sort by + # something else. Could not find an option to NOT sort. + df["order"] = pandas.RangeIndex(0, df.shape[0]) + height = df.shape[0] * 20 + 50 + return ( + altair.Chart(df, width=800, height=height) + .mark_bar() + .encode( + x=altair.X(spec.field, title=spec.label), + y=altair.Y("runtime", sort=altair.EncodingSortField(field="order")), + tooltip=[ + altair.Tooltip("dask_version:N", title="Dask"), + altair.Tooltip("distributed_version:N", title="Distributed"), + altair.Tooltip(f"{spec.field}:Q", title=spec.label), + ], + ) + .properties(title=title) + .configure(autosize="fit") + ) + + +def make_timeseries( + df: pandas.DataFrame, spec: altair.ChartSpec, title: str +) -> altair.Chart | None: + """Make a single Altair timeseries chart for a given test""" df = df.dropna(subset=[spec.field, "start"]) if not len(df): + # Some tests do not have average_memory or peak_memory measures, only runtime return None + df = df.assign(**{spec.field: df[spec.field] / spec.scale}) df = df.fillna({"ci_run_url": "https://github.com/coiled/coiled-runtime"}) - path = df.path.iloc[0] kwargs = {} # Reduce the size of the altair spec df = df[ @@ -73,6 +96,7 @@ def make_timeseries(originalname, df, spec) -> altair.Chart | None: "call_outcome", "coiled_runtime_version", "dask_version", + "distributed_version", ] ] if len(df.name.unique()) > 1: @@ -96,28 +120,24 @@ def make_timeseries(originalname, df, spec) -> altair.Chart | None: altair.Tooltip("call_outcome:N", title="Test Outcome"), altair.Tooltip("coiled_runtime_version:N", title="Coiled Runtime"), altair.Tooltip("dask_version:N", title="Dask"), + altair.Tooltip("distributed_version:N", title="Distributed"), altair.Tooltip(f"{spec.field}:Q", title=spec.label), altair.Tooltip("ci_run_url:N", title="CI Run URL"), ], **kwargs, ) - .properties(title=f"{path}::{originalname}") + .properties(title=title) .configure(autosize="fit") ) -def make_test_report(group_keys, df): - """ - Make a tab panel for a single test. - - originalname: str - The name of the test without any fixture or other modifications. - - df: pandas.DataFrame - A dataframe with the test data in it. 
- """ - path, originalname = group_keys - +def make_test_report( + df: pandas.DataFrame, + kind: Literal["barchart" | "timeseries"], + title: str, + sourcename: str, +) -> panel.Tabs: + """Make a tab panel for a single test""" ChartSpec = collections.namedtuple("ChartSpec", ["field", "scale", "label"]) specs = [ ChartSpec("duration", 1, "Wall Clock (s)"), @@ -126,33 +146,223 @@ def make_test_report(group_keys, df): ] tabs = [] for s in specs: - chart = make_timeseries(originalname, df, s) + if kind == "timeseries": + chart = make_timeseries(df, s, title) + else: + chart = make_barchart(df, s, title) if not chart: continue tabs.append((s.label, chart)) - sourcename = path + "::" + originalname + if kind == "timeseries": + height = 384 + else: + height = df.shape[0] * 20 + 50 + if sourcename in source: code = panel.pane.Markdown( f"```python\n{source[sourcename]}\n```", width=800, - height=384, + height=height, style={"overflow": "auto"}, ) tabs.append(("Source", code)) + else: + print("Source code not found for", sourcename) return panel.Tabs(*tabs, margin=12, width=800) -if __name__ == "__main__": - DB_NAME = ( - sys.argv[1] if len(sys.argv) > 1 else os.environ.get("DB_NAME", "benchmark.db") +def make_timeseries_html_report( + df: pandas.DataFrame, output_dir: pathlib.Path, runtime: str +) -> None: + """Generate HTML report for one runtime (e.g. coiled-upstream-py3.9), showing + evolution of measures (wall clock, average memory, peak memory) over historical CI + runs. + + Create one tab for each test category (e.g. benchmarks, runtime, stability), + one graph for each test, + and one graph tab for each measure (wall clock, average memory, peak memory). + """ + out_fname = str(output_dir.joinpath(runtime + ".html")) + print(f"Generating {out_fname}") + categories = sorted(df[df.runtime == runtime].category.unique()) + tabs = [] + for category in categories: + df_by_test = ( + df[(df.runtime == runtime) & (df.category == category)] + .sort_values("sourcename") + .groupby("sourcename") + ) + panes = [ + make_test_report( + df_by_test.get_group(sourcename), + kind="timeseries", + title=sourcename, + sourcename=sourcename, + ) + for sourcename in df_by_test.groups + ] + flex = panel.FlexBox(*panes, align_items="start", justify_content="start") + tabs.append((category.title(), flex)) + doc = panel.Tabs(*tabs, margin=12) + + doc.save(out_fname, title=runtime, resources=INLINE) + + +def make_ab_html_report( + df: pandas.DataFrame, + output_dir: pathlib.Path, + by_test: bool, + baseline: str | None, +) -> None: + """Generate HTML report for the latest CI run, comparing all runtimes (e.g. + coiled-upstream-py3.9) against a baseline runtime + + Create one tab for each test category (e.g. benchmarks, runtime, stability), + one graph for each runtime and one bar for each test + OR one graph for each test and one bar for each runtime, + and one graph tab for each measure (wall clock, average memory, peak memory). + + If a baseline runtime is defined, all measures are expressed relative to the + baseline; otherwise they're expressed in absolute terms. 
+ """ + out_fname = str( + output_dir.joinpath( + "AB_by_" + + ("test" if by_test else "runtime") + + (f"_vs_{baseline}" if baseline else "") + + ".html" + ) ) - static = pathlib.Path("static") - static.mkdir(exist_ok=True) + print(f"Generating {out_fname}") - engine = sqlalchemy.create_engine(f"sqlite:///{DB_NAME}") + categories = sorted(df.category.unique()) + tabs = [] + for category in categories: + if by_test: + df_by_test = ( + df[df.category == category] + .sort_values(["sourcename", "fullname"]) + .groupby(["sourcename", "fullname"]) + ) + panes = [ + make_test_report( + df_by_test.get_group((sourcename, fullname)), + kind="barchart", + title=fullname, + sourcename=sourcename, + ) + for sourcename, fullname in df_by_test.groups + ] + else: + return # WIP + flex = panel.FlexBox(*panes, align_items="start", justify_content="start") + tabs.append((category.title(), flex)) + doc = panel.Tabs(*tabs, margin=12) + + doc.save( + out_fname, + title="A/B by " + + ("test" if by_test else "runtime") + + (f" vs. {baseline}" if baseline else ""), + resources=INLINE, + ) + + +def make_index_html_report( + output_dir: pathlib.Path, runtimes: list[str], baselines: list[str] +) -> None: + """Generate index.html""" + index_txt = """# Coiled Runtime Benchmarks\n""" + index_txt += "### Historical timeseries\n" + for runtime in runtimes: + index_txt += f"- [{runtime}](./{runtime}.html)\n" + index_txt += "\n\n### A/B tests\n" + for baseline in baselines: + index_txt += ( + f"- [by runtime vs. {baseline}](./AB_by_runtime_vs_{baseline}.html)\n" + ) + index_txt += "- [by test](./AB_by_test.html)\n" + + index = panel.pane.Markdown(index_txt, width=800) + out_fname = str(output_dir.joinpath("index.html")) + print(f"Generating {out_fname}") + index.save( + out_fname, + title="Coiled Runtime Benchmarks", + resources=INLINE, + ) + + +def runtime_sort_key(runtime: str) -> tuple: + """Runtimes are in the format coiled--py + e.g. 
+
+
+def runtime_sort_key_pd(s: pandas.Series) -> pandas.Series:
+    return pandas.Series([runtime_sort_key(v) for v in s], index=s.index)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Generate a static HTML report comparing metrics from the runs"
+    )
+    parser.add_argument(
+        "--db-file",
+        "-d",
+        help="Path to SQLite database file containing the metrics",
+    )
+    parser.add_argument(
+        "--output-dir",
+        "-o",
+        help="Output directory",
+        default="build/html",
+    )
+    parser.add_argument(
+        "--baseline",
+        "-b",
+        nargs="+",
+        default=[],
+        help="Baseline runtime(s) for A/B comparison",
+    )
+    parser.add_argument(
+        "--pickle",
+        action="store_true",
+        help="Dump raw dataframe to pickle file",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = parse_args()
+    output_dir = pathlib.Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    load_test_source()
+    print(f"Discovered {len(source)} tests")
+
+    # Load SQLite database into a pandas DataFrame
+    engine = sqlalchemy.create_engine(f"sqlite:///{args.db_file}")
     df = pandas.read_sql(
-        "select * from test_run where platform = 'linux' and call_outcome in ('passed', 'failed')",
+        "select * from test_run where platform = 'linux' "
+        "and call_outcome in ('passed', 'failed')",
         engine,
     )
     df = df.assign(
@@ -164,34 +374,35 @@ def make_test_report(group_keys, df):
         ),
         category=df.path.str.split("/", n=1).str[0],
     )
+    df["start"] = pandas.to_datetime(df["start"])
+    df["end"] = pandas.to_datetime(df["end"])
+    df["sourcename"] = df["path"].str.cat(df["originalname"], "::")
+    df["fullname"] = df["path"].str.cat(df["name"], "::")
+    df = df.set_index("id")
+
+    if args.pickle:
+        out_fname = str(output_dir.joinpath("records.pickle"))
+        print(f"Generating {out_fname}")
+        df.to_pickle(out_fname)

-    runtimes = list(df.runtime.unique())
+    # Generate HTML pages
+    runtimes = sorted(df.runtime.unique(), key=runtime_sort_key)
     for runtime in runtimes:
-        print(f"Generating dashboard for {runtime}")
-        categories = df[df.runtime == runtime].category.unique()
-        tabs = []
-        for category in categories:
-            by_test = (
-                df[(df.runtime == runtime) & (df.category == category)]
-                .sort_values(["path", "originalname"])
-                .groupby(["path", "originalname"])
-            )
-            panes = [
-                make_test_report(test_name, by_test.get_group(test_name))
-                for test_name in by_test.groups
-            ]
-            flex = panel.FlexBox(*panes, align_items="start", justify_content="start")
-            tabs.append((category.title(), flex))
-        doc = panel.Tabs(*tabs, margin=12)
+        make_timeseries_html_report(df, output_dir, runtime)

-        doc.save(
-            str(static.joinpath(runtime + ".html")), title=runtime, resources=INLINE
-        )
-    index = """# Coiled Runtime Benchmarks\n\n"""
-    index += "\n\n".join([f"[{r}](./{r}.html)" for r in reversed(sorted(runtimes))])
-    index = panel.pane.Markdown(index, width=800)
-    index.save(
-        str(static.joinpath("index.html")),
-        title="Coiled Runtime Benchmarks",
-        resources=INLINE,
-    )
+    # Select only the latest run for each runtime. This may pick up historical runs (up
+    # to 2d old) if they have not been rerun in the current pull/PR.
+    max_end = df.sort_values("end").groupby(["runtime", "category"]).tail(1)
+    max_end = max_end[max_end["end"] > max_end["end"].max() - pandas.Timedelta("2d")]
+    session_ids = max_end["session_id"].unique()
+    latest_run = df[df["session_id"].isin(session_ids)]
+
+    for baseline in args.baseline:
+        make_ab_html_report(latest_run, output_dir, by_test=False, baseline=baseline)
+    make_ab_html_report(latest_run, output_dir, by_test=True, baseline=None)
+
+    make_index_html_report(output_dir, runtimes, args.baseline)
+
+
+if __name__ == "__main__":
+    main()
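For the 2-day cutoff in main(), a self-contained sketch on toy data (hypothetical session ids) showing how the newest session per runtime/category is kept while stale runtimes drop out:

```python
import pandas

# Toy frame: coiled-0.1.0 ran twice recently, coiled-0.0.4 has not run for weeks.
df = pandas.DataFrame(
    {
        "runtime": ["coiled-0.1.0-py3.9", "coiled-0.1.0-py3.9", "coiled-0.0.4-py3.9"],
        "category": ["benchmarks", "benchmarks", "benchmarks"],
        "session_id": ["a", "b", "c"],
        "end": pandas.to_datetime(["2022-08-20", "2022-08-30", "2022-08-01"]),
    }
)

# Newest row per (runtime, category) ...
max_end = df.sort_values("end").groupby(["runtime", "category"]).tail(1)
# ... kept only if it is at most 2 days older than the newest row overall
max_end = max_end[max_end["end"] > max_end["end"].max() - pandas.Timedelta("2d")]
latest_run = df[df["session_id"].isin(max_end["session_id"].unique())]

assert sorted(latest_run["session_id"]) == ["b"]  # the stale coiled-0.0.4 run is gone
```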
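The by_test=False branch of make_ab_html_report is still WIP, while its docstring says measures should be expressed relative to the baseline runtime. One possible normalization consistent with that docstring (a sketch assuming one row per runtime and fullname, e.g. the latest_run frame from main(); not the patch's implementation):

```python
import pandas


def normalize_to_baseline(
    df: pandas.DataFrame, field: str, baseline: str
) -> pandas.DataFrame:
    """Divide each test's measure by the same test's measure on the baseline runtime."""
    # Baseline value per test, keyed by the "fullname" column built in main()
    base = df[df["runtime"] == baseline].set_index("fullname")[field]
    out = df.join(base.rename(f"baseline_{field}"), on="fullname")
    out[field] = out[field] / out[f"baseline_{field}"]
    return out
```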