Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion simvue/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import contextlib
import logging
import psutil

import os
import typing

from .pynvml import (
nvmlDeviceGetComputeRunningProcesses,
Expand Down Expand Up @@ -158,6 +159,8 @@ def to_dict(self) -> dict[str, float]:
_metrics: dict[str, float] = {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to rethink how these metrics are named; the resources metrics page now looks very confusing:

Image

f"{RESOURCES_METRIC_PREFIX}/cpu.usage.percentage": self.cpu_percent,
f"{RESOURCES_METRIC_PREFIX}/cpu.usage.memory": self.cpu_memory,
f"{RESOURCES_METRIC_PREFIX}/memory.virtual.available.percentage": self.memory_available_percent,
f"{RESOURCES_METRIC_PREFIX}/disk.available.percentage": self.disk_available_percent,
}

for i, gpu in enumerate(self.gpus or []):
Expand All @@ -177,3 +180,11 @@ def gpu_percent(self) -> float:
@property
def gpu_memory(self) -> float:
    """Mean of the second element of every entry in ``self.gpus``.

    Returns
    -------
    float
        the sum of ``entry[1]`` over all entries divided by the number
        of entries; ``0.0`` when ``self.gpus`` is empty or ``None``.
    """
    _devices = self.gpus or []
    _total = sum(_entry[1] for _entry in _devices)
    # Fall back to a divisor of 1 so an empty device list yields 0
    # rather than raising ZeroDivisionError.
    _count = len(_devices) or 1
    return _total / _count

@property
def memory_available_percent(self) -> float:
    """Complement of the virtual memory usage percentage.

    Returns
    -------
    float
        100 minus the ``percent`` field reported by
        ``psutil.virtual_memory()``.
    """
    # The cast only informs the type checker; it has no runtime effect.
    _used_percent = typing.cast("float", psutil.virtual_memory().percent)
    return 100 - _used_percent
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you tested that these work on Windows?


@property
def disk_available_percent(self) -> float:
    """Complement of the disk usage percentage for the working directory.

    Returns
    -------
    float
        100 minus the ``percent`` field reported by
        ``psutil.disk_usage`` for the current working directory.
    """
    _working_dir = os.getcwd()
    _usage = psutil.disk_usage(_working_dir)
    return 100 - _usage.percent
28 changes: 28 additions & 0 deletions simvue/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,27 @@ def _dispatch_callback(

return _dispatch_callback

def _define_system_health_alerts(self, terminate_on_alert: bool) -> None:
"""Define system health resource metric alerts."""
_ = self.create_metric_threshold_alert(
name="low_available_virtual_memory",
metric=f"{RESOURCES_METRIC_PREFIX}/memory.virtual.available.percentage",
threshold=5,
aggregation="at least one",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this do?

window=2,
rule="is below",
trigger_abort=terminate_on_alert,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we also add an email notification option?

)
_ = self.create_metric_threshold_alert(
name="low_disk_space",
metric=f"{RESOURCES_METRIC_PREFIX}/disk.available.percentage",
threshold=5,
aggregation="at least one",
window=2,
rule="is below",
trigger_abort=terminate_on_alert,
)

def _start(self) -> bool:
"""Start a run

Expand Down Expand Up @@ -627,6 +648,7 @@ def init(
retention_period: str | None = None,
timeout: int | None = 180,
visibility: typing.Literal["public", "tenant"] | list[str] | None = None,
terminate_on_low_system_health: bool = True,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally, I would default this to `False`.

no_color: bool = False,
record_shell_vars: set[str] | None = None,
) -> bool:
Expand Down Expand Up @@ -664,6 +686,10 @@ def init(
* public - run viewable to all.
* tenant - run viewable to all within the current tenant.
* A list of usernames with which to share this run
terminate_on_low_system_health : bool, optional
whether to terminate this run if the resource metrics are
registering unhealthy values, e.g. very low available memory
default is True
no_color : bool, optional
disable terminal colors. Default False.
record_shell_vars : list[str] | None,
Expand Down Expand Up @@ -774,6 +800,8 @@ def init(
if self._status == "running":
self._start()

self._define_system_health_alerts(terminate_on_low_system_health)

if self._user_config.run.mode == "online":
click.secho(
f"[simvue] Run {self.name} created",
Expand Down
17 changes: 17 additions & 0 deletions simvue/system.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
"""
System Information
==================

Retrieve and assemble information on the current system.
"""

import os
import platform
import socket
import subprocess
import shutil
import sys
import contextlib
import psutil
import typing


Expand Down Expand Up @@ -60,6 +68,14 @@ def get_gpu_info():
return _gpu_info


def get_memory_info() -> dict[str, int]:
    """Report total virtual and swap memory sizes in whole gigabytes.

    Returns
    -------
    dict[str, int]
        ``virtual`` and ``swap`` entries holding the ``total`` field of
        ``psutil.virtual_memory()`` and ``psutil.swap_memory()``
        respectively, each floor-divided by ``1024**3``.
    """
    _bytes_per_unit: int = 1024**3
    # The cast only informs the type checker; it has no runtime effect.
    _virtual_total = typing.cast("int", psutil.virtual_memory().total)
    _swap_total = psutil.swap_memory().total
    return {
        "virtual": _virtual_total // _bytes_per_unit,
        "swap": _swap_total // _bytes_per_unit,
    }


def get_system() -> dict[str, typing.Any]:
"""
Get system details
Expand All @@ -76,6 +92,7 @@ def get_system() -> dict[str, typing.Any]:
system["platform"]["system"] = platform.system()
system["platform"]["release"] = platform.release()
system["platform"]["version"] = platform.version()
system["memory"] = {k: f"{v}GB" for k, v in get_memory_info().items()}
system["cpu"] = {}
system["cpu"]["arch"] = cpu[1]
system["cpu"]["processor"] = cpu[0]
Expand Down
16 changes: 15 additions & 1 deletion tests/functional/test_run_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@
import random
import datetime
import simvue

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to add unit tests:

  • Check the metrics appear automatically
  • Check that when you add a process that spikes the RAM usage, or create a large tempfile, the available RAM / memory metrics change appropriately
  • Check the options for the alert (terminate, and email if you decide to add that) are added to the alert correctly (i.e. get the alert definition back once it's created and check that it matches)

from simvue.api.objects import Alert, Metrics
from simvue.api.objects.grids import GridMetrics
from simvue.exception import ObjectNotFoundError, SimvueRunError
from simvue.sender import Sender
import simvue.run as sv_run
import simvue.client as sv_cl
import simvue.config.user as sv_cfg

from simvue.api.objects import Run as RunObject

Expand Down Expand Up @@ -1052,6 +1052,7 @@ def test_update_tags_offline(


@pytest.mark.run
@pytest.mark.online
@pytest.mark.parametrize("object_type", ("DataFrame", "ndarray"))
def test_save_object(
create_plain_run: tuple[sv_run.Run, dict], object_type: str
Expand All @@ -1074,6 +1075,7 @@ def test_save_object(


@pytest.mark.run
@pytest.mark.online
def test_add_alerts() -> None:
_uuid = f"{uuid.uuid4()}".split("-")[0]

Expand Down Expand Up @@ -1259,6 +1261,7 @@ def test_add_alerts_offline(monkeypatch) -> None:


@pytest.mark.run
@pytest.mark.online
def test_log_alert() -> None:
_uuid = f"{uuid.uuid4()}".split("-")[0]

Expand Down Expand Up @@ -1309,6 +1312,7 @@ def test_log_alert() -> None:


@pytest.mark.run
@pytest.mark.online
def test_abort_on_alert_process(mocker: pytest_mock.MockerFixture) -> None:
def testing_exit(status: int) -> None:
raise SystemExit(status)
Expand Down Expand Up @@ -1362,6 +1366,7 @@ def abort_callback(abort_run=trigger) -> None:


@pytest.mark.run
@pytest.mark.online
def test_abort_on_alert_python(
speedy_heartbeat, create_plain_run: tuple[sv_run.Run, dict], mocker: pytest_mock.MockerFixture
) -> None:
Expand All @@ -1382,6 +1387,7 @@ def test_abort_on_alert_python(


@pytest.mark.run
@pytest.mark.online
def test_abort_on_alert_raise(
create_plain_run: tuple[sv_run.Run, dict]
) -> None:
Expand All @@ -1406,6 +1412,7 @@ def test_abort_on_alert_raise(


@pytest.mark.run
@pytest.mark.online
def test_kill_all_processes(create_plain_run: tuple[sv_run.Run, dict]) -> None:
run, _ = create_plain_run
run.config(system_metrics_interval=1)
Expand All @@ -1421,6 +1428,7 @@ def test_kill_all_processes(create_plain_run: tuple[sv_run.Run, dict]) -> None:


@pytest.mark.run
@pytest.mark.online
def test_run_created_with_no_timeout() -> None:
_uuid = f"{uuid.uuid4()}".split("-")[0]
with simvue.Run() as run:
Expand All @@ -1443,6 +1451,7 @@ def test_run_created_with_no_timeout() -> None:

@pytest.mark.parametrize("mode", ("online", "offline"), ids=("online", "offline"))
@pytest.mark.run
@pytest.mark.online
def test_reconnect_functionality(mode, monkeypatch: pytest.MonkeyPatch) -> None:
temp_d: tempfile.TemporaryDirectory | None = None
_uuid = f"{uuid.uuid4()}".split("-")[0]
Expand Down Expand Up @@ -1486,6 +1495,7 @@ def test_reconnect_functionality(mode, monkeypatch: pytest.MonkeyPatch) -> None:


@pytest.mark.run
@pytest.mark.online
def test_env_var_metadata() -> None:
# Add some environment variables to glob
_recorded_env = {
Expand All @@ -1506,6 +1516,7 @@ def test_env_var_metadata() -> None:
assert all(key in _recorded_meta.get("shell") for key in _recorded_env)

@pytest.mark.run
@pytest.mark.online
def test_reconnect_with_process() -> None:
_uuid = f"{uuid.uuid4()}".split("-")[0]
with simvue.Run() as run:
Expand Down Expand Up @@ -1537,6 +1548,8 @@ def test_reconnect_with_process() -> None:
@pytest.mark.parametrize(
"environment", ("python_conda", "python_poetry", "python_uv", "julia", "rust", "nodejs")
)
@pytest.mark.run
@pytest.mark.online
def test_run_environment_metadata(environment: str, mocker: pytest_mock.MockerFixture) -> None:
"""Tests that the environment information is compatible with the server."""
from simvue.config.user import SimvueConfiguration
Expand All @@ -1558,3 +1571,4 @@ def test_run_environment_metadata(environment: str, mocker: pytest_mock.MockerFi
)
run.update_metadata(env_func(_target_dir))


Loading