From 5602edde4572865601322c2d66b758f8a47d0d52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20Zar=C4=99bski?=
Date: Fri, 9 Jan 2026 13:40:42 +0000
Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=94=94=20added=20health=20check=20ale?=
 =?UTF-8?q?rts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 simvue/metrics.py                  | 13 ++++++++++++-
 simvue/run.py                      | 28 ++++++++++++++++++++++++++++
 simvue/system.py                   | 17 +++++++++++++++++
 tests/functional/test_run_class.py | 18 ++++++++++++++++--
 4 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/simvue/metrics.py b/simvue/metrics.py
index 2914b351..105f58bb 100644
--- a/simvue/metrics.py
+++ b/simvue/metrics.py
@@ -9,7 +9,8 @@
 import contextlib
 import logging
 import psutil
-
+import os
+import typing
 
 from .pynvml import (
     nvmlDeviceGetComputeRunningProcesses,
@@ -158,6 +159,8 @@ def to_dict(self) -> dict[str, float]:
         _metrics: dict[str, float] = {
             f"{RESOURCES_METRIC_PREFIX}/cpu.usage.percentage": self.cpu_percent,
             f"{RESOURCES_METRIC_PREFIX}/cpu.usage.memory": self.cpu_memory,
+            f"{RESOURCES_METRIC_PREFIX}/memory.virtual.available.percentage": self.memory_available_percent,
+            f"{RESOURCES_METRIC_PREFIX}/disk.available.percentage": self.disk_available_percent,
         }
 
         for i, gpu in enumerate(self.gpus or []):
@@ -177,3 +180,11 @@ def gpu_percent(self) -> float:
     @property
     def gpu_memory(self) -> float:
         return sum(m[1] for m in self.gpus or []) / (len(self.gpus or []) or 1)
+
+    @property
+    def memory_available_percent(self) -> float:
+        return 100 - typing.cast("float", psutil.virtual_memory().percent)
+
+    @property
+    def disk_available_percent(self) -> float:
+        return 100 - psutil.disk_usage(os.getcwd()).percent
diff --git a/simvue/run.py b/simvue/run.py
index 7e3e96fb..6d513abe 100644
--- a/simvue/run.py
+++ b/simvue/run.py
@@ -499,6 +499,27 @@ def _dispatch_callback(
 
         return _dispatch_callback
 
+    def _define_system_health_alerts(self, terminate_on_alert: bool) -> None:
+        """Define system health resource metric alerts."""
+        _ = self.create_metric_threshold_alert(
+            name="low_available_virtual_memory",
+            metric=f"{RESOURCES_METRIC_PREFIX}/memory.virtual.available.percentage",
+            threshold=5,
+            aggregation="at least one",
+            window=2,
+            rule="is below",
+            trigger_abort=terminate_on_alert,
+        )
+        _ = self.create_metric_threshold_alert(
+            name="low_disk_space",
+            metric=f"{RESOURCES_METRIC_PREFIX}/disk.available.percentage",
+            threshold=5,
+            aggregation="at least one",
+            window=2,
+            rule="is below",
+            trigger_abort=terminate_on_alert,
+        )
+
     def _start(self) -> bool:
         """Start a run
 
@@ -627,6 +648,7 @@ def init(
         retention_period: str | None = None,
         timeout: int | None = 180,
         visibility: typing.Literal["public", "tenant"] | list[str] | None = None,
+        terminate_on_low_system_health: bool = True,
         no_color: bool = False,
         record_shell_vars: set[str] | None = None,
     ) -> bool:
@@ -664,6 +686,10 @@ def init(
                 * public - run viewable to all.
                 * tenant - run viewable to all within the current tenant.
                 * A list of usernames with which to share this run
+        terminate_on_low_system_health : bool, optional
+            whether to terminate this run if the resource metrics are
+            registering unhealthy values, e.g. very low available memory
+            default is True
         no_color : bool, optional
             disable terminal colors. Default False.
         record_shell_vars : list[str] | None,
@@ -774,6 +800,8 @@ def init(
         if self._status == "running":
             self._start()
 
+        self._define_system_health_alerts(terminate_on_low_system_health)
+
         if self._user_config.run.mode == "online":
             click.secho(
                 f"[simvue] Run {self.name} created",
diff --git a/simvue/system.py b/simvue/system.py
index 84ce016b..5a3dded4 100644
--- a/simvue/system.py
+++ b/simvue/system.py
@@ -1,3 +1,10 @@
+"""
+System Information
+==================
+
+Retrieve and assemble information on the current system.
+"""
+
 import os
 import platform
 import socket
@@ -5,6 +12,7 @@
 import shutil
 import sys
 import contextlib
+import psutil
 import typing
 
 
@@ -60,6 +68,14 @@ def get_gpu_info():
     return _gpu_info
 
 
+def get_memory_info() -> dict[str, int]:
+    """Get total available memory in GB."""
+    return {
+        "virtual": typing.cast("int", psutil.virtual_memory().total) // 1024**3,
+        "swap": psutil.swap_memory().total // 1024**3,
+    }
+
+
 def get_system() -> dict[str, typing.Any]:
     """
     Get system details
@@ -76,6 +92,7 @@ def get_system() -> dict[str, typing.Any]:
     system["platform"]["system"] = platform.system()
     system["platform"]["release"] = platform.release()
     system["platform"]["version"] = platform.version()
+    system["memory"] = {k: f"{v}GB" for k, v in get_memory_info().items()}
     system["cpu"] = {}
     system["cpu"]["arch"] = cpu[1]
     system["cpu"]["processor"] = cpu[0]
diff --git a/tests/functional/test_run_class.py b/tests/functional/test_run_class.py
index 0a990629..3c0e7322 100644
--- a/tests/functional/test_run_class.py
+++ b/tests/functional/test_run_class.py
@@ -18,14 +18,14 @@
 import concurrent.futures
 import random
 import datetime
+import json
 import simvue
-from simvue.api.objects import Alert, Metrics
+from simvue.api.objects import Alert, Metrics, Folder
 from simvue.api.objects.grids import GridMetrics
 from simvue.exception import ObjectNotFoundError, SimvueRunError
 from simvue.sender import Sender
 import simvue.run as sv_run
 import simvue.client as sv_cl
-import simvue.config.user as sv_cfg
 from simvue.api.objects import Run as RunObject
 
 
@@ -1052,6 +1052,7 @@ def test_update_tags_offline(
 
 
 @pytest.mark.run
+@pytest.mark.online
 @pytest.mark.parametrize("object_type", ("DataFrame", "ndarray"))
 def test_save_object(
     create_plain_run: tuple[sv_run.Run, dict], object_type: str
@@ -1074,6 +1075,7 @@ def test_save_object(
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_add_alerts() -> None:
     _uuid = f"{uuid.uuid4()}".split("-")[0]
 
@@ -1259,6 +1261,7 @@ def test_add_alerts_offline(monkeypatch) -> None:
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_log_alert() -> None:
     _uuid = f"{uuid.uuid4()}".split("-")[0]
 
@@ -1309,6 +1312,7 @@ def test_log_alert() -> None:
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_abort_on_alert_process(mocker: pytest_mock.MockerFixture) -> None:
     def testing_exit(status: int) -> None:
         raise SystemExit(status)
@@ -1362,6 +1366,7 @@ def abort_callback(abort_run=trigger) -> None:
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_abort_on_alert_python(
     speedy_heartbeat, create_plain_run: tuple[sv_run.Run, dict], mocker: pytest_mock.MockerFixture
 ) -> None:
@@ -1382,6 +1387,7 @@ def test_abort_on_alert_python(
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_abort_on_alert_raise(
     create_plain_run: tuple[sv_run.Run, dict]
 ) -> None:
@@ -1406,6 +1412,7 @@ def test_abort_on_alert_raise(
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_kill_all_processes(create_plain_run: tuple[sv_run.Run, dict]) -> None:
     run, _ = create_plain_run
     run.config(system_metrics_interval=1)
@@ -1421,6 +1428,7 @@ def test_kill_all_processes(create_plain_run: tuple[sv_run.Run, dict]) -> None:
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_run_created_with_no_timeout() -> None:
     _uuid = f"{uuid.uuid4()}".split("-")[0]
     with simvue.Run() as run:
@@ -1443,6 +1451,7 @@ def test_run_created_with_no_timeout() -> None:
 
 @pytest.mark.parametrize("mode", ("online", "offline"), ids=("online", "offline"))
 @pytest.mark.run
+@pytest.mark.online
 def test_reconnect_functionality(mode, monkeypatch: pytest.MonkeyPatch) -> None:
     temp_d: tempfile.TemporaryDirectory | None = None
     _uuid = f"{uuid.uuid4()}".split("-")[0]
@@ -1486,6 +1495,7 @@ def test_reconnect_functionality(mode, monkeypatch: pytest.MonkeyPatch) -> None:
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_env_var_metadata() -> None:
     # Add some environment variables to glob
     _recorded_env = {
@@ -1506,6 +1516,7 @@ def test_env_var_metadata() -> None:
     assert all(key in _recorded_meta.get("shell") for key in _recorded_env)
 
 @pytest.mark.run
+@pytest.mark.online
 def test_reconnect_with_process() -> None:
     _uuid = f"{uuid.uuid4()}".split("-")[0]
     with simvue.Run() as run:
@@ -1537,6 +1548,8 @@ def test_reconnect_with_process() -> None:
 @pytest.mark.parametrize(
     "environment", ("python_conda", "python_poetry", "python_uv", "julia", "rust", "nodejs")
 )
+@pytest.mark.run
+@pytest.mark.online
 def test_run_environment_metadata(environment: str, mocker: pytest_mock.MockerFixture) -> None:
     """Tests that the environment information is compatible with the server."""
     from simvue.config.user import SimvueConfiguration
@@ -1558,3 +1571,4 @@ def test_run_environment_metadata(environment: str, mocker: pytest_mock.MockerFi
         )
 
         run.update_metadata(env_func(_target_dir))
+
From fb3bf3f6db0f54eb07f8e77de5bbff35e627522e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20Zar=C4=99bski?=
Date: Fri, 9 Jan 2026 13:48:09 +0000
Subject: [PATCH 2/2] Addressed review comments by GH CQ

---
 tests/functional/test_run_class.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/functional/test_run_class.py b/tests/functional/test_run_class.py
index 3c0e7322..9b91d248 100644
--- a/tests/functional/test_run_class.py
+++ b/tests/functional/test_run_class.py
@@ -18,9 +18,9 @@
 import concurrent.futures
 import random
 import datetime
-import json
 import simvue
-from simvue.api.objects import Alert, Metrics, Folder
+
+from simvue.api.objects import Alert, Metrics
 from simvue.api.objects.grids import GridMetrics
 from simvue.exception import ObjectNotFoundError, SimvueRunError
 from simvue.sender import Sender