From 5602edde4572865601322c2d66b758f8a47d0d52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20Zar=C4=99bski?= <kristian.zarebski@ukaea.uk>
Date: Fri, 9 Jan 2026 13:40:42 +0000
Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=94=94=20added=20health=20check=20ale?=
 =?UTF-8?q?rts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 simvue/metrics.py                  | 13 ++++++++++++-
 simvue/run.py                      | 28 ++++++++++++++++++++++++++++
 simvue/system.py                   | 17 +++++++++++++++++
 tests/functional/test_run_class.py | 18 ++++++++++++++++--
 4 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/simvue/metrics.py b/simvue/metrics.py
index 2914b351..105f58bb 100644
--- a/simvue/metrics.py
+++ b/simvue/metrics.py
@@ -9,7 +9,8 @@
 import contextlib
 import logging
 import psutil
-
+import os
+import typing
 
 from .pynvml import (
     nvmlDeviceGetComputeRunningProcesses,
@@ -158,6 +159,8 @@ def to_dict(self) -> dict[str, float]:
         _metrics: dict[str, float] = {
             f"{RESOURCES_METRIC_PREFIX}/cpu.usage.percentage": self.cpu_percent,
             f"{RESOURCES_METRIC_PREFIX}/cpu.usage.memory": self.cpu_memory,
+            f"{RESOURCES_METRIC_PREFIX}/memory.virtual.available.percentage": self.memory_available_percent,
+            f"{RESOURCES_METRIC_PREFIX}/disk.available.percentage": self.disk_available_percent,
         }
 
         for i, gpu in enumerate(self.gpus or []):
@@ -177,3 +180,11 @@ def gpu_percent(self) -> float:
     @property
     def gpu_memory(self) -> float:
         return sum(m[1] for m in self.gpus or []) / (len(self.gpus or []) or 1)
+
+    @property
+    def memory_available_percent(self) -> float:  # sampled from psutil on every access
+        return 100 - typing.cast("float", psutil.virtual_memory().percent)
+
+    @property
+    def disk_available_percent(self) -> float:  # disk usage is queried for os.getcwd()
+        return 100 - psutil.disk_usage(os.getcwd()).percent
diff --git a/simvue/run.py b/simvue/run.py
index 7e3e96fb..6d513abe 100644
--- a/simvue/run.py
+++ b/simvue/run.py
@@ -499,6 +499,27 @@ def _dispatch_callback(
 
         return _dispatch_callback
 
+    def _define_system_health_alerts(self, terminate_on_alert: bool) -> None:
+        """Create threshold alerts for low available memory and low disk space."""
+        _ = self.create_metric_threshold_alert(  # return value intentionally unused
+            name="low_available_virtual_memory",
+            metric=f"{RESOURCES_METRIC_PREFIX}/memory.virtual.available.percentage",
+            threshold=5,  # compared against a percentage-valued metric
+            aggregation="at least one",
+            window=2,  # NOTE(review): window units are not visible here, confirm
+            rule="is below",
+            trigger_abort=terminate_on_alert,  # set from init(terminate_on_low_system_health)
+        )
+        _ = self.create_metric_threshold_alert(  # return value intentionally unused
+            name="low_disk_space",
+            metric=f"{RESOURCES_METRIC_PREFIX}/disk.available.percentage",
+            threshold=5,  # compared against a percentage-valued metric
+            aggregation="at least one",
+            window=2,  # NOTE(review): window units are not visible here, confirm
+            rule="is below",
+            trigger_abort=terminate_on_alert,  # set from init(terminate_on_low_system_health)
+        )
+
     def _start(self) -> bool:
         """Start a run
 
@@ -627,6 +648,7 @@ def init(
         retention_period: str | None = None,
         timeout: int | None = 180,
         visibility: typing.Literal["public", "tenant"] | list[str] | None = None,
+        terminate_on_low_system_health: bool = True,
         no_color: bool = False,
         record_shell_vars: set[str] | None = None,
     ) -> bool:
@@ -664,6 +686,10 @@ def init(
                 * public - run viewable to all.
                 * tenant - run viewable to all within the current tenant.
                 * A list of usernames with which to share this run
+        terminate_on_low_system_health : bool, optional
+            whether to terminate this run if the resource metrics register
+            unhealthy values, e.g. very low available memory or disk space.
+            Default is True.
         no_color : bool, optional
             disable terminal colors. Default False.
         record_shell_vars : list[str] | None,
@@ -774,6 +800,8 @@ def init(
         if self._status == "running":
             self._start()
 
+        self._define_system_health_alerts(terminate_on_low_system_health)
+
         if self._user_config.run.mode == "online":
             click.secho(
                 f"[simvue] Run {self.name} created",
diff --git a/simvue/system.py b/simvue/system.py
index 84ce016b..5a3dded4 100644
--- a/simvue/system.py
+++ b/simvue/system.py
@@ -1,3 +1,10 @@
+"""
+System Information
+==================
+
+Retrieve and assemble information on the current system.
+"""
+
 import os
 import platform
 import socket
@@ -5,6 +12,7 @@
 import shutil
 import sys
 import contextlib
+import psutil
 import typing
 
 
@@ -60,6 +68,14 @@ def get_gpu_info():
     return _gpu_info
 
 
+def get_memory_info() -> dict[str, int]:
+    """Get total virtual and swap memory sizes, floored to whole GiB (1024**3 bytes)."""
+    return {
+        "virtual": typing.cast("int", psutil.virtual_memory().total) // 1024**3,
+        "swap": psutil.swap_memory().total // 1024**3,
+    }
+
+
 def get_system() -> dict[str, typing.Any]:
     """
     Get system details
@@ -76,6 +92,7 @@ def get_system() -> dict[str, typing.Any]:
     system["platform"]["system"] = platform.system()
     system["platform"]["release"] = platform.release()
     system["platform"]["version"] = platform.version()
+    system["memory"] = {k: f"{v}GB" for k, v in get_memory_info().items()}
     system["cpu"] = {}
     system["cpu"]["arch"] = cpu[1]
     system["cpu"]["processor"] = cpu[0]
diff --git a/tests/functional/test_run_class.py b/tests/functional/test_run_class.py
index 0a990629..3c0e7322 100644
--- a/tests/functional/test_run_class.py
+++ b/tests/functional/test_run_class.py
@@ -18,14 +18,14 @@
 import concurrent.futures
 import random
 import datetime
+import json
 import simvue
-from simvue.api.objects import Alert, Metrics
+from simvue.api.objects import Alert, Metrics, Folder
 from simvue.api.objects.grids import GridMetrics
 from simvue.exception import ObjectNotFoundError, SimvueRunError
 from simvue.sender import Sender
 import simvue.run as sv_run
 import simvue.client as sv_cl
-import simvue.config.user as sv_cfg
 
 from simvue.api.objects import Run as RunObject
 
@@ -1052,6 +1052,7 @@ def test_update_tags_offline(
 
 
 @pytest.mark.run
+@pytest.mark.online
 @pytest.mark.parametrize("object_type", ("DataFrame", "ndarray"))
 def test_save_object(
     create_plain_run: tuple[sv_run.Run, dict], object_type: str
@@ -1074,6 +1075,7 @@ def test_save_object(
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_add_alerts() -> None:
     _uuid = f"{uuid.uuid4()}".split("-")[0]
 
@@ -1259,6 +1261,7 @@ def test_add_alerts_offline(monkeypatch) -> None:
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_log_alert() -> None:
     _uuid = f"{uuid.uuid4()}".split("-")[0]
 
@@ -1309,6 +1312,7 @@ def test_log_alert() -> None:
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_abort_on_alert_process(mocker: pytest_mock.MockerFixture) -> None:
     def testing_exit(status: int) -> None:
         raise SystemExit(status)
@@ -1362,6 +1366,7 @@ def abort_callback(abort_run=trigger) -> None:
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_abort_on_alert_python(
     speedy_heartbeat, create_plain_run: tuple[sv_run.Run, dict], mocker: pytest_mock.MockerFixture
 ) -> None:
@@ -1382,6 +1387,7 @@ def test_abort_on_alert_python(
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_abort_on_alert_raise(
     create_plain_run: tuple[sv_run.Run, dict]
 ) -> None:
@@ -1406,6 +1412,7 @@ def test_abort_on_alert_raise(
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_kill_all_processes(create_plain_run: tuple[sv_run.Run, dict]) -> None:
     run, _ = create_plain_run
     run.config(system_metrics_interval=1)
@@ -1421,6 +1428,7 @@ def test_kill_all_processes(create_plain_run: tuple[sv_run.Run, dict]) -> None:
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_run_created_with_no_timeout() -> None:
     _uuid = f"{uuid.uuid4()}".split("-")[0]
     with simvue.Run() as run:
@@ -1443,6 +1451,7 @@ def test_run_created_with_no_timeout() -> None:
 
 @pytest.mark.parametrize("mode", ("online", "offline"), ids=("online", "offline"))
 @pytest.mark.run
+@pytest.mark.online
 def test_reconnect_functionality(mode, monkeypatch: pytest.MonkeyPatch) -> None:
     temp_d: tempfile.TemporaryDirectory | None = None
     _uuid = f"{uuid.uuid4()}".split("-")[0]
@@ -1486,6 +1495,7 @@ def test_reconnect_functionality(mode, monkeypatch: pytest.MonkeyPatch) -> None:
 
 
 @pytest.mark.run
+@pytest.mark.online
 def test_env_var_metadata() -> None:
     # Add some environment variables to glob
     _recorded_env = {
@@ -1506,6 +1516,7 @@ def test_env_var_metadata() -> None:
     assert all(key in _recorded_meta.get("shell") for key in _recorded_env)
 
 @pytest.mark.run
+@pytest.mark.online
 def test_reconnect_with_process() -> None:
     _uuid = f"{uuid.uuid4()}".split("-")[0]
     with simvue.Run() as run:
@@ -1537,6 +1548,8 @@ def test_reconnect_with_process() -> None:
 @pytest.mark.parametrize(
     "environment", ("python_conda", "python_poetry", "python_uv", "julia", "rust", "nodejs")
 )
+@pytest.mark.run
+@pytest.mark.online
 def test_run_environment_metadata(environment: str, mocker: pytest_mock.MockerFixture) -> None:
     """Tests that the environment information is compatible with the server."""
     from simvue.config.user import SimvueConfiguration
@@ -1558,3 +1571,4 @@ def test_run_environment_metadata(environment: str, mocker: pytest_mock.MockerFi
         )
         run.update_metadata(env_func(_target_dir))
 
+

From fb3bf3f6db0f54eb07f8e77de5bbff35e627522e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20Zar=C4=99bski?= <kristian.zarebski@ukaea.uk>
Date: Fri, 9 Jan 2026 13:48:09 +0000
Subject: [PATCH 2/2] Removed unused test imports flagged by GH CQ review

---
 tests/functional/test_run_class.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/functional/test_run_class.py b/tests/functional/test_run_class.py
index 3c0e7322..9b91d248 100644
--- a/tests/functional/test_run_class.py
+++ b/tests/functional/test_run_class.py
@@ -18,9 +18,9 @@
 import concurrent.futures
 import random
 import datetime
-import json
 import simvue
-from simvue.api.objects import Alert, Metrics, Folder
+
+from simvue.api.objects import Alert, Metrics
 from simvue.api.objects.grids import GridMetrics
 from simvue.exception import ObjectNotFoundError, SimvueRunError
 from simvue.sender import Sender