-
Notifications
You must be signed in to change notification settings - Fork 1
🔔 added health check alerts #905
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,7 +9,8 @@ | |
| import contextlib | ||
| import logging | ||
| import psutil | ||
|
|
||
| import os | ||
| import typing | ||
|
|
||
| from .pynvml import ( | ||
| nvmlDeviceGetComputeRunningProcesses, | ||
|
|
@@ -158,6 +159,8 @@ def to_dict(self) -> dict[str, float]: | |
| _metrics: dict[str, float] = { | ||
| f"{RESOURCES_METRIC_PREFIX}/cpu.usage.percentage": self.cpu_percent, | ||
| f"{RESOURCES_METRIC_PREFIX}/cpu.usage.memory": self.cpu_memory, | ||
| f"{RESOURCES_METRIC_PREFIX}/memory.virtual.available.percentage": self.memory_available_percent, | ||
| f"{RESOURCES_METRIC_PREFIX}/disk.available.percentage": self.disk_available_percent, | ||
| } | ||
|
|
||
| for i, gpu in enumerate(self.gpus or []): | ||
|
|
@@ -177,3 +180,11 @@ def gpu_percent(self) -> float: | |
| @property | ||
| def gpu_memory(self) -> float: | ||
| return sum(m[1] for m in self.gpus or []) / (len(self.gpus or []) or 1) | ||
|
|
||
| @property | ||
| def memory_available_percent(self) -> float: | ||
| return 100 - typing.cast("float", psutil.virtual_memory().percent) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Have you tested that these work on Windows? |
||
|
|
||
| @property | ||
| def disk_available_percent(self) -> float: | ||
| return 100 - psutil.disk_usage(os.getcwd()).percent | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -499,6 +499,27 @@ def _dispatch_callback( | |
|
|
||
| return _dispatch_callback | ||
|
|
||
| def _define_system_health_alerts(self, terminate_on_alert: bool) -> None: | ||
| """Define system health resource metric alerts.""" | ||
| _ = self.create_metric_threshold_alert( | ||
| name="low_available_virtual_memory", | ||
| metric=f"{RESOURCES_METRIC_PREFIX}/memory.virtual.available.percentage", | ||
| threshold=5, | ||
| aggregation="at least one", | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What does this do? |
||
| window=2, | ||
| rule="is below", | ||
| trigger_abort=terminate_on_alert, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you also add an email notification option? |
||
| ) | ||
| _ = self.create_metric_threshold_alert( | ||
| name="low_disk_space", | ||
| metric=f"{RESOURCES_METRIC_PREFIX}/disk.available.percentage", | ||
| threshold=5, | ||
| aggregation="at least one", | ||
| window=2, | ||
| rule="is below", | ||
| trigger_abort=terminate_on_alert, | ||
| ) | ||
|
|
||
| def _start(self) -> bool: | ||
| """Start a run | ||
|
|
||
|
|
@@ -627,6 +648,7 @@ def init( | |
| retention_period: str | None = None, | ||
| timeout: int | None = 180, | ||
| visibility: typing.Literal["public", "tenant"] | list[str] | None = None, | ||
| terminate_on_low_system_health: bool = True, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would personally default this to false. |
||
| no_color: bool = False, | ||
| record_shell_vars: set[str] | None = None, | ||
| ) -> bool: | ||
|
|
@@ -664,6 +686,10 @@ def init( | |
| * public - run viewable to all. | ||
| * tenant - run viewable to all within the current tenant. | ||
| * A list of usernames with which to share this run | ||
| terminate_on_low_system_health : bool, optional | ||
| whether to terminate this run if the resource metrics are | ||
| registering unhealthy values, e.g. very low available memory | ||
| default is True | ||
| no_color : bool, optional | ||
| disable terminal colors. Default False. | ||
| record_shell_vars : list[str] | None, | ||
|
|
@@ -774,6 +800,8 @@ def init( | |
| if self._status == "running": | ||
| self._start() | ||
|
|
||
| self._define_system_health_alerts(terminate_on_low_system_health) | ||
|
|
||
| if self._user_config.run.mode == "online": | ||
| click.secho( | ||
| f"[simvue] Run {self.name} created", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,13 +19,13 @@ | |
| import random | ||
| import datetime | ||
| import simvue | ||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You need to add unit tests:
|
||
| from simvue.api.objects import Alert, Metrics | ||
| from simvue.api.objects.grids import GridMetrics | ||
| from simvue.exception import ObjectNotFoundError, SimvueRunError | ||
| from simvue.sender import Sender | ||
| import simvue.run as sv_run | ||
| import simvue.client as sv_cl | ||
| import simvue.config.user as sv_cfg | ||
|
|
||
| from simvue.api.objects import Run as RunObject | ||
|
|
||
|
|
@@ -1052,6 +1052,7 @@ def test_update_tags_offline( | |
|
|
||
|
|
||
| @pytest.mark.run | ||
| @pytest.mark.online | ||
| @pytest.mark.parametrize("object_type", ("DataFrame", "ndarray")) | ||
| def test_save_object( | ||
| create_plain_run: tuple[sv_run.Run, dict], object_type: str | ||
|
|
@@ -1074,6 +1075,7 @@ def test_save_object( | |
|
|
||
|
|
||
| @pytest.mark.run | ||
| @pytest.mark.online | ||
| def test_add_alerts() -> None: | ||
| _uuid = f"{uuid.uuid4()}".split("-")[0] | ||
|
|
||
|
|
@@ -1259,6 +1261,7 @@ def test_add_alerts_offline(monkeypatch) -> None: | |
|
|
||
|
|
||
| @pytest.mark.run | ||
| @pytest.mark.online | ||
| def test_log_alert() -> None: | ||
| _uuid = f"{uuid.uuid4()}".split("-")[0] | ||
|
|
||
|
|
@@ -1309,6 +1312,7 @@ def test_log_alert() -> None: | |
|
|
||
|
|
||
| @pytest.mark.run | ||
| @pytest.mark.online | ||
| def test_abort_on_alert_process(mocker: pytest_mock.MockerFixture) -> None: | ||
| def testing_exit(status: int) -> None: | ||
| raise SystemExit(status) | ||
|
|
@@ -1362,6 +1366,7 @@ def abort_callback(abort_run=trigger) -> None: | |
|
|
||
|
|
||
| @pytest.mark.run | ||
| @pytest.mark.online | ||
| def test_abort_on_alert_python( | ||
| speedy_heartbeat, create_plain_run: tuple[sv_run.Run, dict], mocker: pytest_mock.MockerFixture | ||
| ) -> None: | ||
|
|
@@ -1382,6 +1387,7 @@ def test_abort_on_alert_python( | |
|
|
||
|
|
||
| @pytest.mark.run | ||
| @pytest.mark.online | ||
| def test_abort_on_alert_raise( | ||
| create_plain_run: tuple[sv_run.Run, dict] | ||
| ) -> None: | ||
|
|
@@ -1406,6 +1412,7 @@ def test_abort_on_alert_raise( | |
|
|
||
|
|
||
| @pytest.mark.run | ||
| @pytest.mark.online | ||
| def test_kill_all_processes(create_plain_run: tuple[sv_run.Run, dict]) -> None: | ||
| run, _ = create_plain_run | ||
| run.config(system_metrics_interval=1) | ||
|
|
@@ -1421,6 +1428,7 @@ def test_kill_all_processes(create_plain_run: tuple[sv_run.Run, dict]) -> None: | |
|
|
||
|
|
||
| @pytest.mark.run | ||
| @pytest.mark.online | ||
| def test_run_created_with_no_timeout() -> None: | ||
| _uuid = f"{uuid.uuid4()}".split("-")[0] | ||
| with simvue.Run() as run: | ||
|
|
@@ -1443,6 +1451,7 @@ def test_run_created_with_no_timeout() -> None: | |
|
|
||
| @pytest.mark.parametrize("mode", ("online", "offline"), ids=("online", "offline")) | ||
| @pytest.mark.run | ||
| @pytest.mark.online | ||
| def test_reconnect_functionality(mode, monkeypatch: pytest.MonkeyPatch) -> None: | ||
| temp_d: tempfile.TemporaryDirectory | None = None | ||
| _uuid = f"{uuid.uuid4()}".split("-")[0] | ||
|
|
@@ -1486,6 +1495,7 @@ def test_reconnect_functionality(mode, monkeypatch: pytest.MonkeyPatch) -> None: | |
|
|
||
|
|
||
| @pytest.mark.run | ||
| @pytest.mark.online | ||
| def test_env_var_metadata() -> None: | ||
| # Add some environment variables to glob | ||
| _recorded_env = { | ||
|
|
@@ -1506,6 +1516,7 @@ def test_env_var_metadata() -> None: | |
| assert all(key in _recorded_meta.get("shell") for key in _recorded_env) | ||
|
|
||
| @pytest.mark.run | ||
| @pytest.mark.online | ||
| def test_reconnect_with_process() -> None: | ||
| _uuid = f"{uuid.uuid4()}".split("-")[0] | ||
| with simvue.Run() as run: | ||
|
|
@@ -1537,6 +1548,8 @@ def test_reconnect_with_process() -> None: | |
| @pytest.mark.parametrize( | ||
| "environment", ("python_conda", "python_poetry", "python_uv", "julia", "rust", "nodejs") | ||
| ) | ||
| @pytest.mark.run | ||
| @pytest.mark.online | ||
| def test_run_environment_metadata(environment: str, mocker: pytest_mock.MockerFixture) -> None: | ||
| """Tests that the environment information is compatible with the server.""" | ||
| from simvue.config.user import SimvueConfiguration | ||
|
|
@@ -1558,3 +1571,4 @@ def test_run_environment_metadata(environment: str, mocker: pytest_mock.MockerFi | |
| ) | ||
| run.update_metadata(env_func(_target_dir)) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Need to rethink how these things are named, the resources metrics page now looks very confusing: