From 80a68f17efc5dab48700695d5bf79f611641db9a Mon Sep 17 00:00:00 2001 From: Ian Rose Date: Wed, 31 Aug 2022 17:57:20 -0700 Subject: [PATCH 1/4] WIP barchart for comparing different commits rather than a timeseries. --- dashboard.py | 73 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/dashboard.py b/dashboard.py index 42b03d91ff..c5ae569b11 100644 --- a/dashboard.py +++ b/dashboard.py @@ -7,6 +7,7 @@ import os import pathlib import sys +from typing import Literal import altair import pandas @@ -43,6 +44,63 @@ def get_test_source(): source = get_test_source() +def make_barchart(originalname, df, spec) -> altair.Chart | None: + """ + Make a single timeseries altair chart for a given test. + + originalname: str + The name of the test without any fixture or other modifications. + + df: pandas.DataFrame + A dataframe with the test data in it. + + spec: ChartSpec + Data for how to render the timeseries + """ + df = df.dropna(subset=[spec.field, "start"]) + if not len(df): + return None + df = df.assign(**{spec.field: df[spec.field] / spec.scale}) + path = df.path.iloc[0] + kwargs = {} + # Reduce the size of the altair spec + x = max( + ["dask_version", "distributed_version", "session_id"], + key=lambda k: len(df[k].unique()), + ) + df = df[ + [ + spec.field, + "name", + "call_outcome", + "coiled_runtime_version", + "dask_version", + "distributed_version", + "session_id", + ] + ] + if len(df.name.unique()) > 1: + kwargs["column"] = altair.Color("name:N") + return ( + altair.Chart(df, width=800, height=512) + .mark_bar() + .encode( + x=altair.X(f"{x}:N"), + y=altair.Y(f"{spec.field}:Q", title=spec.label), + color=altair.Color(f"{x}:N"), + tooltip=[ + altair.Tooltip("name:N", title="Test Name"), + altair.Tooltip("dask_version:N", title="Dask"), + altair.Tooltip("distributed_version:N", title="Dask"), + altair.Tooltip(f"{spec.field}:Q", title=spec.label), + ], + **kwargs, + ) + 
.properties(title=f"{path}::{originalname}") + .configure(autosize="fit") + ) + + def make_timeseries(originalname, df, spec) -> altair.Chart | None: """ Make a single timeseries altair chart for a given test. @@ -106,7 +164,9 @@ def make_timeseries(originalname, df, spec) -> altair.Chart | None: ) -def make_test_report(group_keys, df): +def make_test_report( + group_keys, df, kind: Literal["bar"] | Literal["timeseries"] = "timeseries" +): """ Make a tab panel for a single test. @@ -126,17 +186,21 @@ def make_test_report(group_keys, df): ] tabs = [] for s in specs: - chart = make_timeseries(originalname, df, s) + if kind == "timeseries": + chart = make_timeseries(originalname, df, s) + else: + chart = make_barchart(originalname, df, s) if not chart: continue tabs.append((s.label, chart)) sourcename = path + "::" + originalname + height = 384 if kind == "timeseries" else 640 if sourcename in source: code = panel.pane.Markdown( f"```python\n{source[sourcename]}\n```", width=800, - height=384, + height=height, style={"overflow": "auto"}, ) tabs.append(("Source", code)) @@ -147,6 +211,7 @@ def make_test_report(group_keys, df): DB_NAME = ( sys.argv[1] if len(sys.argv) > 1 else os.environ.get("DB_NAME", "benchmark.db") ) + KIND = "barchart" static = pathlib.Path("static") static.mkdir(exist_ok=True) @@ -177,7 +242,7 @@ def make_test_report(group_keys, df): .groupby(["path", "originalname"]) ) panes = [ - make_test_report(test_name, by_test.get_group(test_name)) + make_test_report(test_name, by_test.get_group(test_name), KIND) for test_name in by_test.groups ] flex = panel.FlexBox(*panes, align_items="start", justify_content="start") From b03c3f1be0c67672112b0858ec5fd21e6cf60110 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Fri, 2 Sep 2022 14:59:38 +0100 Subject: [PATCH 2/4] Add PyCharm files to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index c3d511b0c9..272b49eb25 100644 --- a/.gitignore +++ b/.gitignore @@ 
-120,6 +120,9 @@ venv.bak/ # Rope project settings .ropeproject +# PyCharm project settings +.idea + # mkdocs documentation /site From 689f39ab9132a39d120b57892b06f121079a6973 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Tue, 6 Sep 2022 11:25:01 +0100 Subject: [PATCH 3/4] WIP --- .github/workflows/tests.yml | 13 +- ci/environment-dashboard.yml | 1 + dashboard.py | 389 ++++++++++++++++++++++++----------- 3 files changed, 277 insertions(+), 126 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 81b6095712..7a5c5e4755 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -57,16 +57,19 @@ jobs: if [[ "$COMMIT" == *"test-upstream"* || ${{ github.event_name }} == "schedule" ]] then export TEST_UPSTREAM="true" + export AB_BASELINE="coiled-upstream-py3.9 coiled-0.1.0-py3.9" else export TEST_UPSTREAM="false" + export AB_BASELINE="coiled-0.1.0-py3.9" fi # Put TEST_UPSTREAM into $GITHUB_ENV so it can be used in subsequent workflow steps echo $TEST_UPSTREAM echo TEST_UPSTREAM=$TEST_UPSTREAM >> $GITHUB_ENV - # Put TEST_UPSTREAM into a file so it can be downloaded in subsequent workflow jobs + # Put env variables into files so they can be downloaded in subsequent workflow jobs echo $TEST_UPSTREAM > test_upstream.txt + echo $AB_BASELINE > ab_baseline.txt - name: Build Coiled Software Environment env: @@ -104,6 +107,7 @@ jobs: latest.yaml software_name.txt test_upstream.txt + ab_baseline.txt runtime: name: Runtime - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} @@ -457,9 +461,14 @@ jobs: python-version: "3.9" environment-file: ci/environment-dashboard.yml + - name: Download software environment assets + uses: actions/download-artifact@v3 + with: + name: software-environment-py3.9 + - name: Generate dashboards run: | - python dashboard.py + python dashboard.py -d benchmark.db -o static -b $(cat ab_baseline.txt) - name: Deploy 🚀 uses: 
JamesIves/github-pages-deploy-action@4.1.7 diff --git a/ci/environment-dashboard.yml b/ci/environment-dashboard.yml index bf298d61e6..4ebdbd3ca1 100644 --- a/ci/environment-dashboard.yml +++ b/ci/environment-dashboard.yml @@ -11,6 +11,7 @@ dependencies: - dask - dask-ml - distributed + - xarray - xgboost - pandas - tabulate diff --git a/dashboard.py b/dashboard.py index c5ae569b11..1a3acdc832 100644 --- a/dashboard.py +++ b/dashboard.py @@ -1,12 +1,11 @@ from __future__ import annotations +import argparse import collections import glob import importlib import inspect -import os import pathlib -import sys from typing import Literal import altair @@ -18,108 +17,73 @@ panel.extension("vega") -def get_test_source(): - """ - Crawl the tests directory and try to grab code for each test on a best-effort - basis. This relies on the tests being importable from this script, so the - environment should be similar enough that that is possible. - """ - source: dict[str, str] = {} - files = glob.glob("tests/**/test_*.py", recursive=True) - for f in files: - try: - # Fragile! - mod = importlib.import_module(f.replace("/", ".")[: -len(".py")]) - tests = [a for a in dir(mod) if a.startswith("test_")] - for test in tests: - if fn := getattr(mod, test, None): - if not callable(fn): - continue - source[f[len("tests/") :] + "::" + test] = inspect.getsource(fn) - except BaseException: # Some pytest exceptions inherit directly from BaseException - pass - return source - - -source = get_test_source() - - -def make_barchart(originalname, df, spec) -> altair.Chart | None: - """ - Make a single timeseries altair chart for a given test. +source: dict[str, str] = {} - originalname: str - The name of the test without any fixture or other modifications. - df: pandas.DataFrame - A dataframe with the test data in it. - - spec: ChartSpec - Data for how to render the timeseries +def load_test_source() -> None: + """Crawl the tests directory and try to grab code for each test. 
This relies on the + tests being importable from this script. """ + for fname in glob.iglob("tests/**/test_*.py", recursive=True): + mod = importlib.import_module(fname.replace("/", ".")[: -len(".py")]) + tests = [a for a in dir(mod) if a.startswith("test_")] + for test in tests: + if (func := getattr(mod, test, None)) and callable(func): + # FIXME missing decorators, namely @pytest.mark.parametrize + source[fname[len("tests/") :] + "::" + test] = inspect.getsource(func) + + +def make_barchart( + df: pandas.DataFrame, spec: altair.ChartSpec, title: str +) -> altair.Chart | None: + """Make a single Altair barchart for a given test.""" df = df.dropna(subset=[spec.field, "start"]) if not len(df): + # Some tests do not have average_memory or peak_memory measures, only runtime return None + + df = df.sort_values("runtime", key=runtime_sort_key_pd) df = df.assign(**{spec.field: df[spec.field] / spec.scale}) - path = df.path.iloc[0] - kwargs = {} - # Reduce the size of the altair spec - x = max( - ["dask_version", "distributed_version", "session_id"], - key=lambda k: len(df[k].unique()), - ) df = df[ [ spec.field, "name", - "call_outcome", - "coiled_runtime_version", "dask_version", "distributed_version", - "session_id", + "runtime", ] ] - if len(df.name.unique()) > 1: - kwargs["column"] = altair.Color("name:N") + # Altair will re-sort alphabetically, unless it's explicitly asked to sort by + # something else. Could not find an option to NOT sort. 
+ df["order"] = pandas.RangeIndex(0, df.shape[0]) return ( altair.Chart(df, width=800, height=512) .mark_bar() .encode( - x=altair.X(f"{x}:N"), - y=altair.Y(f"{spec.field}:Q", title=spec.label), - color=altair.Color(f"{x}:N"), + x=altair.X(spec.field, title=spec.label), + y=altair.Y("runtime", sort=altair.EncodingSortField(field="order")), tooltip=[ - altair.Tooltip("name:N", title="Test Name"), altair.Tooltip("dask_version:N", title="Dask"), - altair.Tooltip("distributed_version:N", title="Dask"), + altair.Tooltip("distributed_version:N", title="Distributed"), altair.Tooltip(f"{spec.field}:Q", title=spec.label), ], - **kwargs, ) - .properties(title=f"{path}::{originalname}") + .properties(title=title) .configure(autosize="fit") ) -def make_timeseries(originalname, df, spec) -> altair.Chart | None: - """ - Make a single timeseries altair chart for a given test. - - originalname: str - The name of the test without any fixture or other modifications. - - df: pandas.DataFrame - A dataframe with the test data in it. 
- - spec: ChartSpec - Data for how to render the timeseries - """ +def make_timeseries( + df: pandas.DataFrame, spec: altair.ChartSpec, title: str +) -> altair.Chart | None: + """Make a single Altair timeseries chart for a given test""" df = df.dropna(subset=[spec.field, "start"]) if not len(df): + # Some tests do not have average_memory or peak_memory measures, only runtime return None + df = df.assign(**{spec.field: df[spec.field] / spec.scale}) df = df.fillna({"ci_run_url": "https://github.com/coiled/coiled-runtime"}) - path = df.path.iloc[0] kwargs = {} # Reduce the size of the altair spec df = df[ @@ -131,6 +95,7 @@ def make_timeseries(originalname, df, spec) -> altair.Chart | None: "call_outcome", "coiled_runtime_version", "dask_version", + "distributed_version", ] ] if len(df.name.unique()) > 1: @@ -154,30 +119,24 @@ def make_timeseries(originalname, df, spec) -> altair.Chart | None: altair.Tooltip("call_outcome:N", title="Test Outcome"), altair.Tooltip("coiled_runtime_version:N", title="Coiled Runtime"), altair.Tooltip("dask_version:N", title="Dask"), + altair.Tooltip("distributed_version:N", title="Distributed"), altair.Tooltip(f"{spec.field}:Q", title=spec.label), altair.Tooltip("ci_run_url:N", title="CI Run URL"), ], **kwargs, ) - .properties(title=f"{path}::{originalname}") + .properties(title=title) .configure(autosize="fit") ) def make_test_report( - group_keys, df, kind: Literal["bar"] | Literal["timeseries"] = "timeseries" -): - """ - Make a tab panel for a single test. - - originalname: str - The name of the test without any fixture or other modifications. - - df: pandas.DataFrame - A dataframe with the test data in it. 
- """ - path, originalname = group_keys - + df: pandas.DataFrame, + kind: Literal["barchart" | "timeseries"], + title: str, + sourcename: str, +) -> panel.Tabs: + """Make a tab panel for a single test""" ChartSpec = collections.namedtuple("ChartSpec", ["field", "scale", "label"]) specs = [ ChartSpec("duration", 1, "Wall Clock (s)"), @@ -187,14 +146,13 @@ def make_test_report( tabs = [] for s in specs: if kind == "timeseries": - chart = make_timeseries(originalname, df, s) + chart = make_timeseries(df, s, title) else: - chart = make_barchart(originalname, df, s) + chart = make_barchart(df, s, title) if not chart: continue tabs.append((s.label, chart)) - sourcename = path + "::" + originalname height = 384 if kind == "timeseries" else 640 if sourcename in source: code = panel.pane.Markdown( @@ -204,20 +162,202 @@ def make_test_report( style={"overflow": "auto"}, ) tabs.append(("Source", code)) + else: + print("Source code not found for", sourcename) return panel.Tabs(*tabs, margin=12, width=800) -if __name__ == "__main__": - DB_NAME = ( - sys.argv[1] if len(sys.argv) > 1 else os.environ.get("DB_NAME", "benchmark.db") +def make_timeseries_html_report( + df: pandas.DataFrame, output_dir: pathlib.Path, runtime: str +) -> None: + """Generate HTML report for one runtime (e.g. coiled-upstream-py3.9), showing + evolution of measures (wall clock, average memory, peak memory) over historical CI + runs. + + Create one tab for each test category (e.g. benchmarks, runtime, stability), + one graph for each test, + and one graph tab for each measure (wall clock, average memory, peak memory). 
+ """ + out_fname = str(output_dir.joinpath(runtime + ".html")) + print(f"Generating {out_fname}") + categories = sorted(df[df.runtime == runtime].category.unique()) + tabs = [] + for category in categories: + df_by_test = ( + df[(df.runtime == runtime) & (df.category == category)] + .sort_values("sourcename") + .groupby("sourcename") + ) + panes = [ + make_test_report( + df_by_test.get_group(sourcename), + kind="timeseries", + title=sourcename, + sourcename=sourcename, + ) + for sourcename in df_by_test.groups + ] + flex = panel.FlexBox(*panes, align_items="start", justify_content="start") + tabs.append((category.title(), flex)) + doc = panel.Tabs(*tabs, margin=12) + + doc.save(out_fname, title=runtime, resources=INLINE) + + +def make_ab_html_report( + df: pandas.DataFrame, + output_dir: pathlib.Path, + by_test: bool, + baseline: str | None, +) -> None: + """Generate HTML report for the latest CI run, comparing all runtimes (e.g. + coiled-upstream-py3.9) against a baseline runtime + + Create one tab for each test category (e.g. benchmarks, runtime, stability), + one graph for each runtime and one bar for each test + OR one graph for each test and one bar for each runtime, + and one graph tab for each measure (wall clock, average memory, peak memory). + + If a baseline runtime is defined, all measures are expressed relative to the + baseline; otherwise they're expressed in absolute terms. 
+ """ + out_fname = str( + output_dir.joinpath( + "AB_by_" + + ("test" if by_test else "runtime") + + (f"_vs_{baseline}" if baseline else "") + + ".html" + ) ) - KIND = "barchart" - static = pathlib.Path("static") - static.mkdir(exist_ok=True) + print(f"Generating {out_fname}") - engine = sqlalchemy.create_engine(f"sqlite:///{DB_NAME}") + categories = sorted(df.category.unique()) + tabs = [] + for category in categories: + if by_test: + df_by_test = ( + df[df.category == category] + .sort_values(["sourcename", "fullname"]) + .groupby(["sourcename", "fullname"]) + ) + panes = [ + make_test_report( + df_by_test.get_group((sourcename, fullname)), + kind="barchart", + title=fullname, + sourcename=sourcename, + ) + for sourcename, fullname in df_by_test.groups + ] + else: + return # WIP + flex = panel.FlexBox(*panes, align_items="start", justify_content="start") + tabs.append((category.title(), flex)) + doc = panel.Tabs(*tabs, margin=12) + + doc.save( + out_fname, + title="A/B by " + + ("test" if by_test else "runtime") + + (f" vs. {baseline}" if baseline else ""), + resources=INLINE, + ) + + +def make_index_html_report( + output_dir: pathlib.Path, runtimes: list[str], baselines: list[str] +) -> None: + """Generate index.html""" + index_txt = """# Coiled Runtime Benchmarks\n""" + index_txt += "### Historical timeseries\n" + for runtime in runtimes: + index_txt += f"- [{runtime}](./{runtime}.html)\n" + index_txt += "\n\n### A/B tests\n" + for baseline in baselines: + index_txt += ( + f"- [by runtime vs. {baseline}](./AB_by_runtime_vs_{baseline}.html)\n" + ) + index_txt += "- [by test](./AB_by_test.html)\n" + + index = panel.pane.Markdown(index_txt, width=800) + out_fname = str(output_dir.joinpath("index.html")) + print(f"Generating {out_fname}") + index.save( + out_fname, + title="Coiled Runtime Benchmarks", + resources=INLINE, + ) + + +def runtime_sort_key(runtime: str) -> tuple: + """Runtimes are in the format coiled--py + e.g. 
coiled-latest-py3.8 + + Sort them by version descending and by python version ascending + """ + t = runtime.split("-") + assert len(t) == 3 + assert t[0] == "coiled" + # upstream > latest > 0.1.0 > 0.0.4 + if t[1] == "upstream": + coiled_version = [-2] + elif t[1] == "latest": + coiled_version = [-1] + else: + coiled_version = [0] + [-int(v) for v in t[1].split(".")] + + assert t[2][:2] == "py" + py_version = [int(v) for v in t[2][2:].split(".")] + return coiled_version, py_version + + +def runtime_sort_key_pd(s: pandas.Series) -> pandas.Series: + return pandas.Series([runtime_sort_key(v) for v in s], index=s.index) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate a static HTML report comparing metrics from the runs" + ) + parser.add_argument( + "--db-file", + "-d", + help="Path to SQLite database file containing the metrics", + ) + parser.add_argument( + "--output-dir", + "-o", + help="Output directory", + default="build/html", + ) + parser.add_argument( + "--baseline", + "-b", + nargs="+", + default=[], + help="Baseline runtime(s) for A/B comparison", + ) + parser.add_argument( + "--pickle", + action="store_true", + help="Dump raw dataframe to pickle file", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + output_dir = pathlib.Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + load_test_source() + print(f"Discovered {len(source)} tests") + + # Load SQLite database into a pandas DataFrame + engine = sqlalchemy.create_engine(f"sqlite:///{args.db_file}") df = pandas.read_sql( - "select * from test_run where platform = 'linux' and call_outcome in ('passed', 'failed')", + "select * from test_run where platform = 'linux' " + "and call_outcome in ('passed', 'failed')", engine, ) df = df.assign( @@ -229,34 +369,35 @@ def make_test_report( ), category=df.path.str.split("/", n=1).str[0], ) + df["start"] = pandas.to_datetime(df["start"]) + df["end"] = 
pandas.to_datetime(df["end"]) + df["sourcename"] = df["path"].str.cat(df["originalname"], "::") + df["fullname"] = df["path"].str.cat(df["name"], "::") + df = df.set_index("id") + + if args.pickle: + out_fname = str(output_dir.joinpath("records.pickle")) + print(f"Generating {out_fname}") + df.to_pickle(out_fname) - runtimes = list(df.runtime.unique()) + # Generate HTML pages + runtimes = sorted(df.runtime.unique(), key=runtime_sort_key) for runtime in runtimes: - print(f"Generating dashboard for {runtime}") - categories = df[df.runtime == runtime].category.unique() - tabs = [] - for category in categories: - by_test = ( - df[(df.runtime == runtime) & (df.category == category)] - .sort_values(["path", "originalname"]) - .groupby(["path", "originalname"]) - ) - panes = [ - make_test_report(test_name, by_test.get_group(test_name), KIND) - for test_name in by_test.groups - ] - flex = panel.FlexBox(*panes, align_items="start", justify_content="start") - tabs.append((category.title(), flex)) - doc = panel.Tabs(*tabs, margin=12) + make_timeseries_html_report(df, output_dir, runtime) - doc.save( - str(static.joinpath(runtime + ".html")), title=runtime, resources=INLINE - ) - index = """# Coiled Runtime Benchmarks\n\n""" - index += "\n\n".join([f"[{r}](./{r}.html)" for r in reversed(sorted(runtimes))]) - index = panel.pane.Markdown(index, width=800) - index.save( - str(static.joinpath("index.html")), - title="Coiled Runtime Benchmarks", - resources=INLINE, - ) + # Select only the latest run for each runtime. This may pick up historical runs (up + # to 2d old) if they have not been rerun in the current pull/PR. 
+ max_end = df.sort_values("end").groupby(["runtime", "category"]).tail(1) + max_end = max_end[max_end["end"] > max_end["end"].max() - pandas.Timedelta("2d")] + session_ids = max_end["session_id"].unique() + latest_run = df[df["session_id"].isin(session_ids)] + + for baseline in args.baseline: + make_ab_html_report(latest_run, output_dir, by_test=False, baseline=baseline) + make_ab_html_report(latest_run, output_dir, by_test=True, baseline=None) + + make_index_html_report(output_dir, runtimes, args.baseline) + + +if __name__ == "__main__": + main() From 273a0c0f0b3e34928d27106e0bc7ce01fe2b399b Mon Sep 17 00:00:00 2001 From: crusaderky Date: Tue, 6 Sep 2022 12:10:15 +0100 Subject: [PATCH 4/4] dynamic barchart height --- dashboard.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dashboard.py b/dashboard.py index 1a3acdc832..988de74ec5 100644 --- a/dashboard.py +++ b/dashboard.py @@ -56,8 +56,9 @@ def make_barchart( # Altair will re-sort alphabetically, unless it's explicitly asked to sort by # something else. Could not find an option to NOT sort. df["order"] = pandas.RangeIndex(0, df.shape[0]) + height = df.shape[0] * 20 + 50 return ( - altair.Chart(df, width=800, height=512) + altair.Chart(df, width=800, height=height) .mark_bar() .encode( x=altair.X(spec.field, title=spec.label), @@ -153,7 +154,11 @@ def make_test_report( continue tabs.append((s.label, chart)) - height = 384 if kind == "timeseries" else 640 + if kind == "timeseries": + height = 384 + else: + height = df.shape[0] * 20 + 50 + if sourcename in source: code = panel.pane.Markdown( f"```python\n{source[sourcename]}\n```",