From 38e0581153236349431d4486a96bbeda3e8c5ff8 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Tue, 19 Aug 2025 11:22:16 -0700 Subject: [PATCH 01/28] Update package_info.py (#322) Signed-off-by: Pablo Garay --- nemo_run/package_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_run/package_info.py b/nemo_run/package_info.py index 141bafa3..50fd2aec 100644 --- a/nemo_run/package_info.py +++ b/nemo_run/package_info.py @@ -13,7 +13,7 @@ # limitations under the License. from packaging.version import Version -__version__ = "0.6.0rc0.dev0" +__version__ = "0.7.0rc0.dev0" MAJOR = Version(__version__).major MINOR = Version(__version__).minor From 8196011b9ad5a1af33590cb6de2f582018a6f423 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Wed, 20 Aug 2025 07:56:38 -0700 Subject: [PATCH 02/28] Add ray head start timeout (#324) Signed-off-by: Hemil Desai --- nemo_run/run/ray/templates/ray.sub.j2 | 12 +++++++++++- .../execution/artifacts/expected_ray_cluster.sub | 12 +++++++++++- .../execution/artifacts/expected_ray_cluster_ssh.sub | 12 +++++++++++- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/nemo_run/run/ray/templates/ray.sub.j2 b/nemo_run/run/ray/templates/ray.sub.j2 index cb66d3aa..025aa393 100644 --- a/nemo_run/run/ray/templates/ray.sub.j2 +++ b/nemo_run/run/ray/templates/ray.sub.j2 @@ -47,6 +47,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster} # Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/ RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-} +# Timeout in seconds for Ray head node to start (default 10 minutes) +RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600} + # Directory setup export CLUSTER_DIR={{ cluster_dir }} mkdir -p $CLUSTER_DIR @@ -208,9 +211,16 @@ EOF srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" & # Wait for the head node container to start and for Ray to be ready +elapsed_time=0 
while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w $head_node ray status --address $ip_head 2>/dev/null); do - echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready..." + if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then + echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..." + touch $LOG_DIR/ENDED + exit 1 + fi + echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)" sleep 2 + elapsed_time=$((elapsed_time + 2)) done NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES)) diff --git a/test/core/execution/artifacts/expected_ray_cluster.sub b/test/core/execution/artifacts/expected_ray_cluster.sub index ceaea0cc..7bcb8af0 100644 --- a/test/core/execution/artifacts/expected_ray_cluster.sub +++ b/test/core/execution/artifacts/expected_ray_cluster.sub @@ -49,6 +49,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster} # Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/ RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-} +# Timeout in seconds for Ray head node to start (default 10 minutes) +RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600} + # Directory setup export CLUSTER_DIR=/tmp/test_jobs/test-ray-cluster mkdir -p $CLUSTER_DIR @@ -202,9 +205,16 @@ EOF srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" & # Wait for the head node container to start and for 
Ray to be ready +elapsed_time=0 while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w $head_node ray status --address $ip_head 2>/dev/null); do - echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready..." + if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then + echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..." + touch $LOG_DIR/ENDED + exit 1 + fi + echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)" sleep 2 + elapsed_time=$((elapsed_time + 2)) done NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES)) diff --git a/test/core/execution/artifacts/expected_ray_cluster_ssh.sub b/test/core/execution/artifacts/expected_ray_cluster_ssh.sub index 983b62fc..5d16afdd 100644 --- a/test/core/execution/artifacts/expected_ray_cluster_ssh.sub +++ b/test/core/execution/artifacts/expected_ray_cluster_ssh.sub @@ -50,6 +50,9 @@ RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster} # Number seconds to sync logs from /tmp/ray/session_*/logs to $LOG_DIR/ray/ RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-} +# Timeout in seconds for Ray head node to start (default 10 minutes) +RAY_HEAD_START_TIMEOUT=${RAY_HEAD_START_TIMEOUT:-600} + # Directory setup export CLUSTER_DIR=/lustre/fsw/projects/research/jobs/multi-node-training mkdir -p $CLUSTER_DIR @@ -207,9 +210,16 @@ EOF srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts 
/data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" & # Wait for the head node container to start and for Ray to be ready +elapsed_time=0 while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD && srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w $head_node ray status --address $ip_head 2>/dev/null); do - echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready..." + if [[ $elapsed_time -ge $RAY_HEAD_START_TIMEOUT ]]; then + echo "[ERROR][$(date)] Ray head node failed to start within $RAY_HEAD_START_TIMEOUT seconds. Exiting..." + touch $LOG_DIR/ENDED + exit 1 + fi + echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... 
($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)" sleep 2 + elapsed_time=$((elapsed_time + 2)) done NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES)) From 8199175d091c97e803cc7daa56a3ca37d499c1ea Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 21 Aug 2025 13:18:02 -0500 Subject: [PATCH 03/28] Remove ray deprecated dashboard-grpc-port arg (#325) * Remove ray deprecated dashboard-grpo-port arg Signed-off-by: Charlie Truong * Fix nemo run ray cluster tests Signed-off-by: Charlie Truong * Remove DASHBOARD_GRPC_PORT Signed-off-by: Charlie Truong --------- Signed-off-by: Charlie Truong --- nemo_run/run/ray/templates/ray.sub.j2 | 2 -- test/core/execution/artifacts/expected_ray_cluster.sub | 2 -- test/core/execution/artifacts/expected_ray_cluster_ssh.sub | 2 -- 3 files changed, 6 deletions(-) diff --git a/nemo_run/run/ray/templates/ray.sub.j2 b/nemo_run/run/ray/templates/ray.sub.j2 index 025aa393..292bff0b 100644 --- a/nemo_run/run/ray/templates/ray.sub.j2 +++ b/nemo_run/run/ray/templates/ray.sub.j2 @@ -31,7 +31,6 @@ METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009} PORT=${PORT:-6379} RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001} #REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ?? 
-DASHBOARD_GRPC_PORT=${DASHBOARD_GRPC_PORT:-52367} DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365} @@ -183,7 +182,6 @@ ray start --head \ --node-ip-address="$head_node_ip" \ --port=${PORT} \ --ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \ - --dashboard-grpc-port=${DASHBOARD_GRPC_PORT} \ --dashboard-port=${DASHBOARD_PORT} \ \ --node-manager-port=${NODE_MANAGER_PORT} \ diff --git a/test/core/execution/artifacts/expected_ray_cluster.sub b/test/core/execution/artifacts/expected_ray_cluster.sub index 7bcb8af0..bdefc288 100644 --- a/test/core/execution/artifacts/expected_ray_cluster.sub +++ b/test/core/execution/artifacts/expected_ray_cluster.sub @@ -33,7 +33,6 @@ METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009} PORT=${PORT:-6379} RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001} #REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ?? -DASHBOARD_GRPC_PORT=${DASHBOARD_GRPC_PORT:-52367} DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365} @@ -177,7 +176,6 @@ ray start --head \ --node-ip-address="$head_node_ip" \ --port=${PORT} \ --ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \ - --dashboard-grpc-port=${DASHBOARD_GRPC_PORT} \ --dashboard-port=${DASHBOARD_PORT} \ \ --node-manager-port=${NODE_MANAGER_PORT} \ diff --git a/test/core/execution/artifacts/expected_ray_cluster_ssh.sub b/test/core/execution/artifacts/expected_ray_cluster_ssh.sub index 5d16afdd..948cd6ea 100644 --- a/test/core/execution/artifacts/expected_ray_cluster_ssh.sub +++ b/test/core/execution/artifacts/expected_ray_cluster_ssh.sub @@ -34,7 +34,6 @@ METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009} PORT=${PORT:-6379} RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001} #REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ?? 
-DASHBOARD_GRPC_PORT=${DASHBOARD_GRPC_PORT:-52367} DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365} @@ -182,7 +181,6 @@ ray start --head \ --node-ip-address="$head_node_ip" \ --port=${PORT} \ --ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \ - --dashboard-grpc-port=${DASHBOARD_GRPC_PORT} \ --dashboard-port=${DASHBOARD_PORT} \ \ --node-manager-port=${NODE_MANAGER_PORT} \ From f00ef04c3ba52a1281e4aae70761b4c88be5483a Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 21 Aug 2025 17:14:26 -0500 Subject: [PATCH 04/28] Update community-bot to add issues to shared project (#321) Signed-off-by: Charlie Truong --- .github/workflows/community-bot.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/community-bot.yml b/.github/workflows/community-bot.yml index bf314b7e..c651945a 100644 --- a/.github/workflows/community-bot.yml +++ b/.github/workflows/community-bot.yml @@ -8,6 +8,8 @@ on: jobs: community-bot: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_community_bot.yml@v0.49.1 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_community_bot.yml@v0.54.0 + with: + community_project_id: ${{ vars.COMMUNITY_PROJECT_ID }} secrets: GH_TOKEN: ${{ secrets.PAT }} From 19f7cab7f989d420a8d46e4d0e3d71db2eed1318 Mon Sep 17 00:00:00 2001 From: Prekshi Vyas <34834085+prekshivyas@users.noreply.github.com> Date: Fri, 22 Aug 2025 08:49:44 -0700 Subject: [PATCH 05/28] add a grace for Jobs that may start in Unknown (#291) * add a grace for Jobs that may start in Unknown Signed-off-by: Prekshi Vyas * add a grace for Jobs that may start in Unknown Signed-off-by: Prekshi Vyas * add a grace for Jobs that may start in Unknown Signed-off-by: Prekshi Vyas * fix linting Signed-off-by: Prekshi Vyas * make the handling of Unknown job status better by polling Signed-off-by: prekshivyas --------- Signed-off-by: Prekshi Vyas Signed-off-by: prekshivyas Co-authored-by: 
prekshivyas --- nemo_run/core/execution/lepton.py | 41 +++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/nemo_run/core/execution/lepton.py b/nemo_run/core/execution/lepton.py index e3aec777..b22bb87c 100644 --- a/nemo_run/core/execution/lepton.py +++ b/nemo_run/core/execution/lepton.py @@ -83,7 +83,13 @@ def copy_directory_data_command(self, local_dir_path: str, dest_path: str) -> Li full_command = ["sh", "-c", cmd] return full_command - def move_data(self, sleep: float = 10, timeout: int = 600, poll_interval: int = 5) -> None: + def move_data( + self, + sleep: float = 10, + timeout: int = 600, + poll_interval: int = 5, + unknowns_grace_period: int = 60, + ) -> None: """ Moves job directory into remote storage and deletes the workload after completion. """ @@ -122,20 +128,39 @@ def move_data(self, sleep: float = 10, timeout: int = 600, poll_interval: int = job_id = response.metadata.id_ start_time = time.time() - count = 0 while True: if time.time() - start_time > timeout: raise TimeoutError(f"Job {job_id} did not complete within {timeout} seconds.") + current_job = client.job.get(job_id) current_job_status = current_job.status.state - if count > 0 and current_job_status in [ - LeptonJobState.Completed, - LeptonJobState.Failed, - LeptonJobState.Unknown, - ]: + if ( + current_job_status == LeptonJobState.Completed + or current_job_status == LeptonJobState.Failed + ): break - count += 1 + elif current_job_status == LeptonJobState.Unknown: + logging.warning( + f"Job {job_id} entered Unknown state, checking for up to {unknowns_grace_period} seconds every 2 seconds..." 
+ ) + unknown_start_time = time.time() + recovered = False + while time.time() - unknown_start_time < unknowns_grace_period: + time.sleep(2) + current_job = client.job.get(job_id) + current_job_status = current_job.status.state + if current_job_status != LeptonJobState.Unknown: + logging.info( + f"Job {job_id} recovered from Unknown state to {current_job_status}" + ) + recovered = True + break + if not recovered: + logging.error( + f"Job {job_id} has been in Unknown state for more than {unknowns_grace_period} seconds" + ) + break time.sleep(poll_interval) if current_job_status != LeptonJobState.Completed: From c4c83ab36954e36972b0203663f68c7428852b83 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Tue, 26 Aug 2025 09:21:31 -0700 Subject: [PATCH 06/28] Add image pull secrets param for lepton (#330) * add image pull secrets for lepton Signed-off-by: Pablo Garay * update format Signed-off-by: Pablo Garay --------- Signed-off-by: Pablo Garay --- nemo_run/core/execution/lepton.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo_run/core/execution/lepton.py b/nemo_run/core/execution/lepton.py index b22bb87c..b1d10ce6 100644 --- a/nemo_run/core/execution/lepton.py +++ b/nemo_run/core/execution/lepton.py @@ -53,6 +53,9 @@ class LeptonExecutor(Executor): node_group: str = "" mounts: list[dict[str, Any]] = field(default_factory=list) lepton_job_dir: str = field(init=False, default="") + image_pull_secrets: list[str] = field( + default_factory=list + ) # Image pull secrets for container registry authentication custom_spec: dict[str, Any] = field(default_factory=dict) pre_launch_commands: list[str] = field(default_factory=list) # Custom commands before launch @@ -249,7 +252,7 @@ def create_lepton_job(self, name: str): max_job_failure_retry=None, envs=envs, mounts=[Mount(**mount) for mount in self.mounts], - image_pull_secrets=[], + image_pull_secrets=self.image_pull_secrets, ttl_seconds_after_finished=None, intra_job_communication=True, 
privileged=False, From 087facfe3d959aa339d29ea7f64493e0392754cb Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 28 Aug 2025 10:47:57 -0500 Subject: [PATCH 07/28] Bump community-bot to 0.54.4 (#332) Signed-off-by: Charlie Truong --- .github/workflows/community-bot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/community-bot.yml b/.github/workflows/community-bot.yml index c651945a..fa004e28 100644 --- a/.github/workflows/community-bot.yml +++ b/.github/workflows/community-bot.yml @@ -8,7 +8,7 @@ on: jobs: community-bot: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_community_bot.yml@v0.54.0 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_community_bot.yml@v0.54.4 with: community_project_id: ${{ vars.COMMUNITY_PROJECT_ID }} secrets: From 178ab3cd64285ae98b1efe528e7bb924b0b3b6cb Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Wed, 3 Sep 2025 10:42:38 -0500 Subject: [PATCH 08/28] Add broken links check in docs (#333) --- .github/workflows/build-docs.yml | 26 ++++++++++++++++++++++++++ docs/source/conf.py | 6 ++++++ docs/source/guides/management.md | 2 +- docs/source/index.rst | 12 ++++++------ 4 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/build-docs.yml diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml new file mode 100644 index 00000000..2f20739b --- /dev/null +++ b/.github/workflows/build-docs.yml @@ -0,0 +1,26 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +name: Build docs + +on: + pull_request: + types: [opened, synchronize, reopened, labeled, unlabeled] + workflow_call: + +jobs: + build-docs: + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0 + with: + docs-directory: docs/source diff --git a/docs/source/conf.py b/docs/source/conf.py index b3f747be..9609288f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -103,3 +103,9 @@ ], } html_extra_path = ["project.json", "versions1.json"] + +# Github links are now getting rate limited from the Github Actions +linkcheck_ignore = [ + ".*github\\.com.*", + ".*githubusercontent\\.com.*", +] diff --git a/docs/source/guides/management.md b/docs/source/guides/management.md index a2e2c147..30e7af6b 100644 --- a/docs/source/guides/management.md +++ b/docs/source/guides/management.md @@ -135,4 +135,4 @@ nemorun experiment cancel experiment_with_scripts_1720556256 0 This information is specific to each experiment on how to manage it. -See [this notebook](../../../examples/hello-world/hello_experiments.ipynb) for more details and a playable experience. +See [this notebook](https://github.com/NVIDIA-NeMo/Run/blob/main/examples/hello-world/hello_experiments.ipynb) for more details and a playable experience. diff --git a/docs/source/index.rst b/docs/source/index.rst index 714e7823..4329df4f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -8,9 +8,9 @@ NeMo-Run Documentation NeMo-Run is a powerful tool designed to streamline the configuration, execution and management of Machine Learning experiments across various computing environments. NeMo Run has three core responsibilities: -1. `Configuration <./guides/configuration.html>`_ -2. `Execution <./guides/execution.html>`_ -3. `Management <./guides/management.html>`_ +1. :doc:`Configuration ` +2. :doc:`Execution ` +3. :doc:`Management ` Please click into each link to learn more. 
This is also the typical order Nemo Run users will follow to setup and launch experiments. @@ -61,6 +61,6 @@ The ``hello_world`` tutorial series provides a comprehensive introduction to NeM You can find the tutorial series below: -1. `Part 1: Hello World <../../../examples/hello-world/hello_world.ipynb>`_ -2. `Part 2: Hello Experiments <../../../examples/hello-world/hello_experiments.ipynb>`_ -3. `Part 3: Hello Scripts <../../../examples/hello-world/hello_scripts.py>`_ +1. `Part 1: Hello World `_ +2. `Part 2: Hello Experiments `_ +3. `Part 3: Hello Scripts `_ From 1adb499121464912dde25754e2f86bee8d93ce82 Mon Sep 17 00:00:00 2001 From: Robert Clark Date: Wed, 10 Sep 2025 15:10:15 -0500 Subject: [PATCH 09/28] Add node reservations for LeptonExecutor (#336) Allow users to specify an existing node reservation with the LeptonExecutor to be able to run on dedicated resources. Signed-off-by: Robert Clark --- docs/source/guides/execution.md | 6 ++ nemo_run/core/execution/lepton.py | 14 +++- test/core/execution/test_lepton.py | 118 +++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+), 2 deletions(-) diff --git a/docs/source/guides/execution.md b/docs/source/guides/execution.md index 1eb8d82e..c6e54fa9 100644 --- a/docs/source/guides/execution.md +++ b/docs/source/guides/execution.md @@ -295,6 +295,12 @@ def your_lepton_executor(nodes: int, gpus_per_node: int, container_image: str): mounts=[{"path": storage_path, "mount_path": mount_path}], # Optional: Add custom environment variables or PyTorch specs if needed env_vars=common_envs(), + # Optional: Specify a node reservation to schedule jobs with + # node_reservation="my-node-reservation", + # Optional: Specify commands to run at container launch prior to the job starting + # pre_launch_commands=["nvidia-smi"], + # Optional: Specify image pull secrets for authenticating with container registries + # image_pull_secrets=["my-image-pull-secret"], # packager=run.GitArchivePackager() # Choose appropriate packager ) 
return executor diff --git a/nemo_run/core/execution/lepton.py b/nemo_run/core/execution/lepton.py index b1d10ce6..61a70b43 100644 --- a/nemo_run/core/execution/lepton.py +++ b/nemo_run/core/execution/lepton.py @@ -20,7 +20,12 @@ LeptonContainer, Mount, ) -from leptonai.api.v1.types.job import LeptonJob, LeptonJobState, LeptonJobUserSpec +from leptonai.api.v1.types.job import ( + LeptonJob, + LeptonJobState, + LeptonJobUserSpec, + ReservationConfig, +) from leptonai.api.v1.types.replica import Replica from nemo_run.config import get_nemorun_home @@ -51,6 +56,7 @@ class LeptonExecutor(Executor): shared_memory_size: int = 65536 resource_shape: str = "" node_group: str = "" + node_reservation: str = "" mounts: list[dict[str, Any]] = field(default_factory=list) lepton_job_dir: str = field(init=False, default="") image_pull_secrets: list[str] = field( @@ -260,8 +266,12 @@ def create_lepton_job(self, name: str): log=None, queue_config=None, stopped=None, - reservation_config=None, ) + + if self.node_reservation: + job_spec.reservation_config = ReservationConfig(reservation_id=self.node_reservation) + job_spec.reservation_config.reservation_id = self.node_reservation + job = LeptonJob(spec=job_spec, metadata=Metadata(id=name)) created_job = client.job.create(job) diff --git a/test/core/execution/test_lepton.py b/test/core/execution/test_lepton.py index 0ce503f0..7fdc08cc 100644 --- a/test/core/execution/test_lepton.py +++ b/test/core/execution/test_lepton.py @@ -59,6 +59,42 @@ def test_init(self): assert executor.nemo_run_dir == "/workspace/nemo_run" assert executor.mounts == [{"path": "/workspace", "mount_path": "/workspace"}] + def test_init_with_node_reservation(self): + """Test initialization with node_reservation parameter.""" + executor = LeptonExecutor( + resource_shape="gpu.8xh100-80gb", + node_group="my-node-group", + container_image="test-image", + nodes=2, + gpus_per_node=8, + nemo_run_dir="/workspace/nemo_run", + mounts=[{"path": "/workspace", "mount_path": 
"/workspace"}], + node_reservation="my-reservation-id", + ) + + assert executor.node_reservation == "my-reservation-id" + + def test_init_with_empty_node_reservation(self): + """Test initialization with empty node_reservation string.""" + executor = LeptonExecutor( + container_image="test-image", + nemo_run_dir="/test/path", + mounts=[{"path": "/test", "mount_path": "/test"}], + node_reservation="", + ) + + assert executor.node_reservation == "" + + def test_init_without_node_reservation(self): + """Test initialization without node_reservation parameter (default behavior).""" + executor = LeptonExecutor( + container_image="test-image", + nemo_run_dir="/test/path", + mounts=[{"path": "/test", "mount_path": "/test"}], + ) + + assert executor.node_reservation == "" + @patch("nemo_run.core.execution.lepton.APIClient") def test_stop_job(self, mock_APIClient): mock_instance = MagicMock() @@ -344,6 +380,88 @@ def test_create_lepton_job(self, mock_APIClient_class): mock_client.job.create.assert_called_once() + @patch("nemo_run.core.execution.lepton.APIClient") + def test_create_lepton_job_with_reservation_config(self, mock_APIClient_class): + """Test create_lepton_job creates ReservationConfig when node_reservation is set.""" + mock_client = mock_APIClient_class.return_value + mock_client.job.create.return_value = LeptonJob(metadata=Metadata(id="my-lepton-job")) + node_group = SimpleNamespace(metadata=SimpleNamespace(id_="123456")) + + mock_client.nodegroup.list_all.return_value = [] + valid_node_ids = ["node-id-1", "node-id-2"] + + executor = LeptonExecutor( + container_image="test-image", + nemo_run_dir="/test/path", + node_group="123456", + mounts=[{"path": "/test", "mount_path": "/test"}], + node_reservation="my-reservation-id", + ) + executor._valid_node_ids = MagicMock(return_value=valid_node_ids) + executor._node_group_id = MagicMock(return_value=node_group) + + executor.create_lepton_job("my-lepton-job") + + # Verify that job.create was called with the correct 
ReservationConfig + mock_client.job.create.assert_called_once() + created_job = mock_client.job.create.call_args[0][0] + assert created_job.spec.reservation_config is not None + assert created_job.spec.reservation_config.reservation_id == "my-reservation-id" + + @patch("nemo_run.core.execution.lepton.APIClient") + def test_create_lepton_job_without_reservation_config(self, mock_APIClient_class): + """Test create_lepton_job creates no ReservationConfig when node_reservation is not set.""" + mock_client = mock_APIClient_class.return_value + mock_client.job.create.return_value = LeptonJob(metadata=Metadata(id="my-lepton-job")) + node_group = SimpleNamespace(metadata=SimpleNamespace(id_="123456")) + + mock_client.nodegroup.list_all.return_value = [] + valid_node_ids = ["node-id-1", "node-id-2"] + + executor = LeptonExecutor( + container_image="test-image", + nemo_run_dir="/test/path", + node_group="123456", + mounts=[{"path": "/test", "mount_path": "/test"}], + # No node_reservation set + ) + executor._valid_node_ids = MagicMock(return_value=valid_node_ids) + executor._node_group_id = MagicMock(return_value=node_group) + + executor.create_lepton_job("my-lepton-job") + + # Verify that job.create was called with no ReservationConfig + mock_client.job.create.assert_called_once() + created_job = mock_client.job.create.call_args[0][0] + assert created_job.spec.reservation_config is None + + @patch("nemo_run.core.execution.lepton.APIClient") + def test_create_lepton_job_with_empty_reservation_config(self, mock_APIClient_class): + """Test create_lepton_job creates no ReservationConfig when node_reservation is empty string.""" + mock_client = mock_APIClient_class.return_value + mock_client.job.create.return_value = LeptonJob(metadata=Metadata(id="my-lepton-job")) + node_group = SimpleNamespace(metadata=SimpleNamespace(id_="123456")) + + mock_client.nodegroup.list_all.return_value = [] + valid_node_ids = ["node-id-1", "node-id-2"] + + executor = LeptonExecutor( + 
container_image="test-image", + nemo_run_dir="/test/path", + node_group="123456", + mounts=[{"path": "/test", "mount_path": "/test"}], + node_reservation="", # Empty string + ) + executor._valid_node_ids = MagicMock(return_value=valid_node_ids) + executor._node_group_id = MagicMock(return_value=node_group) + + executor.create_lepton_job("my-lepton-job") + + # Verify that job.create was called with no ReservationConfig + mock_client.job.create.assert_called_once() + created_job = mock_client.job.create.call_args[0][0] + assert created_job.spec.reservation_config is None + def test_nnodes(self): executor = LeptonExecutor( container_image="nvcr.io/nvidia/test:latest", From 2caf5d7dae208b2827ba97aef35b765bedb708f1 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 11 Sep 2025 21:08:31 -0700 Subject: [PATCH 10/28] fix nodes -> num_nodes (#338) Signed-off-by: Romil Bhardwaj --- docs/source/guides/execution.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/guides/execution.md b/docs/source/guides/execution.md index c6e54fa9..cd7c49d6 100644 --- a/docs/source/guides/execution.md +++ b/docs/source/guides/execution.md @@ -205,8 +205,8 @@ def your_skypilot_executor(nodes: int, devices: int, container_image: str): return SkypilotExecutor( gpus="RTX5880-ADA-GENERATION", gpus_per_node=devices, - nodes = nodes - env_vars=common_envs() + num_nodes=nodes, + env_vars=common_envs(), container_image=container_image, cloud="kubernetes", # Optional to reuse Skypilot cluster From c16a5728a55741003026f441acf83a6d9e19ad5d Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 11 Sep 2025 21:15:28 -0700 Subject: [PATCH 11/28] Add retry_until_up (#340) Signed-off-by: Romil Bhardwaj --- nemo_run/core/execution/skypilot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_run/core/execution/skypilot.py b/nemo_run/core/execution/skypilot.py index d3823c43..3f161dec 100644 --- a/nemo_run/core/execution/skypilot.py +++ 
b/nemo_run/core/execution/skypilot.py @@ -107,6 +107,7 @@ class SkypilotExecutor(Executor): cluster_config_overrides: Optional[dict[str, Any]] = None infra: Optional[str] = None network_tier: Optional[str] = None + retry_until_up: bool = False packager: Packager = field(default_factory=lambda: GitArchivePackager()) # type: ignore # noqa: F821 def __post_init__(self): @@ -410,7 +411,7 @@ def launch( idle_minutes_to_autostop=self.idle_minutes_to_autostop, down=self.autodown, fast=True, - # retry_until_up=retry_until_up, + retry_until_up=self.retry_until_up, # clone_disk_from=clone_disk_from, ) ) From afcd979ba1e8a5716561fd0c8dcba220f42e1bcf Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 11 Sep 2025 21:16:10 -0700 Subject: [PATCH 12/28] Support SkyPilot Storage configurations in `file_mounts` for automatic cloud sync (#335) * fix: support for SkyPilot Storage configurations in file_mounts - Modified SkypilotExecutor to handle both string paths and dict configs in file_mounts - Dictionary configs are automatically converted to sky.Storage objects - Enables automatic cloud storage mounting (GCS, S3, etc.) for outputs This change allows users to specify cloud storage backends directly in file_mounts, enabling automatic synchronization of training outputs to cloud storage without manual rsync operations. 
Signed-off-by: Andy Lee * refactor: Separate storage_mounts from file_mounts for cleaner API Signed-off-by: Andy Lee * test: Add unit tests for storage_mounts functionality - Test storage_mounts parameter initialization - Test to_task() method with storage_mounts configurations - Test combined file_mounts and storage_mounts usage - Verify Storage.from_yaml_config() integration - Ensure backward compatibility when storage_mounts is None Signed-off-by: Andy Lee * fix tests Signed-off-by: Hemil Desai --------- Signed-off-by: Andy Lee Signed-off-by: Hemil Desai Co-authored-by: Hemil Desai --- nemo_run/core/execution/skypilot.py | 31 +++++- test/core/execution/test_skypilot.py | 157 +++++++++++++++++++++++++++ 2 files changed, 187 insertions(+), 1 deletion(-) diff --git a/nemo_run/core/execution/skypilot.py b/nemo_run/core/execution/skypilot.py index 3f161dec..49d5dd57 100644 --- a/nemo_run/core/execution/skypilot.py +++ b/nemo_run/core/execution/skypilot.py @@ -65,7 +65,22 @@ class SkypilotExecutor(Executor): network_tier="best", cluster_name="nemo_tester", file_mounts={ - "nemo_run.whl": "nemo_run.whl" + "nemo_run.whl": "nemo_run.whl", + "/workspace/code": "/local/path/to/code", + }, + storage_mounts={ + "/workspace/outputs": { + "name": "my-training-outputs", + "store": "gcs", # or "s3", "azure", etc. 
+ "mode": "MOUNT", + "persistent": True, + }, + "/workspace/checkpoints": { + "name": "model-checkpoints", + "store": "s3", + "mode": "MOUNT", + "persistent": True, + } }, setup=\"\"\" conda deactivate @@ -99,6 +114,7 @@ class SkypilotExecutor(Executor): disk_tier: Optional[Union[str, list[str]]] = None ports: Optional[tuple[str]] = None file_mounts: Optional[dict[str, str]] = None + storage_mounts: Optional[dict[str, dict[str, Any]]] = None # Can be str or dict configs cluster_name: Optional[str] = None setup: Optional[str] = None autodown: bool = False @@ -372,9 +388,22 @@ def to_task( envs=self.env_vars, num_nodes=self.num_nodes, ) + # Handle regular file mounts file_mounts = self.file_mounts or {} file_mounts["/nemo_run"] = self.job_dir task.set_file_mounts(file_mounts) + + # Handle storage mounts separately + if self.storage_mounts: + from sky.data import Storage + + storage_objects = {} + for mount_path, config in self.storage_mounts.items(): + # Create Storage object from config dict + storage_obj = Storage.from_yaml_config(config) + storage_objects[mount_path] = storage_obj + task.set_storage_mounts(storage_objects) + task.set_resources(self.to_resources()) if env_vars: diff --git a/test/core/execution/test_skypilot.py b/test/core/execution/test_skypilot.py index fe975049..5d35c39c 100644 --- a/test/core/execution/test_skypilot.py +++ b/test/core/execution/test_skypilot.py @@ -561,3 +561,160 @@ def test_to_task(self, mock_task, mock_skypilot_imports, executor): # Verify the returned task is our mock assert result == mock_task_instance + + @patch("sky.task.Task") + def test_to_task_with_storage_mounts(self, mock_task, mock_skypilot_imports): + # Create a mock task instance + mock_task_instance = MagicMock() + mock_task.return_value = mock_task_instance + mock_task_instance.set_file_mounts = MagicMock() + mock_task_instance.set_storage_mounts = MagicMock() + mock_task_instance.set_resources = MagicMock() + + # Mock sky.data.Storage + mock_storage_class = 
MagicMock() + mock_storage_obj = MagicMock() + mock_storage_class.from_yaml_config.return_value = mock_storage_obj + + executor = SkypilotExecutor( + container_image="test:latest", + storage_mounts={ + "/workspace/outputs": { + "name": "my-outputs", + "store": "gcs", + "mode": "MOUNT", + "persistent": True, + } + }, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + executor.job_dir = tmp_dir + + with patch.object(SkypilotExecutor, "to_resources") as mock_to_resources: + mock_to_resources.return_value = MagicMock() + + with patch("sky.data.Storage", mock_storage_class): + executor.to_task("test_task") + + # Verify Storage.from_yaml_config was called with the config + mock_storage_class.from_yaml_config.assert_called_once_with( + { + "name": "my-outputs", + "store": "gcs", + "mode": "MOUNT", + "persistent": True, + } + ) + + # Verify set_storage_mounts was called with Storage objects + mock_task_instance.set_storage_mounts.assert_called_once() + storage_mounts_call = mock_task_instance.set_storage_mounts.call_args[0][0] + assert "/workspace/outputs" in storage_mounts_call + assert storage_mounts_call["/workspace/outputs"] == mock_storage_obj + + @patch("sky.task.Task") + def test_to_task_with_both_file_and_storage_mounts(self, mock_task, mock_skypilot_imports): + # Create a mock task instance + mock_task_instance = MagicMock() + mock_task.return_value = mock_task_instance + mock_task_instance.set_file_mounts = MagicMock() + mock_task_instance.set_storage_mounts = MagicMock() + mock_task_instance.set_resources = MagicMock() + + # Mock sky.data.Storage + mock_storage_class = MagicMock() + mock_storage_obj = MagicMock() + mock_storage_class.from_yaml_config.return_value = mock_storage_obj + + executor = SkypilotExecutor( + container_image="test:latest", + file_mounts={ + "/workspace/code": "/local/path/to/code", + }, + storage_mounts={ + "/workspace/outputs": { + "name": "my-outputs", + "store": "s3", + "mode": "MOUNT", + }, + "/workspace/checkpoints": { + 
"name": "my-checkpoints", + "store": "gcs", + "mode": "MOUNT_CACHED", + }, + }, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + executor.job_dir = tmp_dir + + with patch.object(SkypilotExecutor, "to_resources") as mock_to_resources: + mock_to_resources.return_value = MagicMock() + + with patch("sky.data.Storage", mock_storage_class): + executor.to_task("test_task") + + # Verify file_mounts includes both user files and nemo_run + file_mounts_call = mock_task_instance.set_file_mounts.call_args[0][0] + assert "/workspace/code" in file_mounts_call + assert file_mounts_call["/workspace/code"] == "/local/path/to/code" + assert "/nemo_run" in file_mounts_call + assert file_mounts_call["/nemo_run"] == tmp_dir + + # Verify Storage.from_yaml_config was called for both storage mounts + assert mock_storage_class.from_yaml_config.call_count == 2 + + # Verify set_storage_mounts was called with both Storage objects + mock_task_instance.set_storage_mounts.assert_called_once() + storage_mounts_call = mock_task_instance.set_storage_mounts.call_args[0][0] + assert "/workspace/outputs" in storage_mounts_call + assert "/workspace/checkpoints" in storage_mounts_call + assert len(storage_mounts_call) == 2 + + @patch("sky.task.Task") + def test_to_task_without_storage_mounts(self, mock_task, mock_skypilot_imports): + # Test that set_storage_mounts is not called when storage_mounts is None + mock_task_instance = MagicMock() + mock_task.return_value = mock_task_instance + mock_task_instance.set_file_mounts = MagicMock() + mock_task_instance.set_storage_mounts = MagicMock() + mock_task_instance.set_resources = MagicMock() + + executor = SkypilotExecutor( + container_image="test:latest", + file_mounts={"/workspace/code": "/local/path"}, + storage_mounts=None, # Explicitly set to None + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + executor.job_dir = tmp_dir + + with patch.object(SkypilotExecutor, "to_resources") as mock_to_resources: + mock_to_resources.return_value = 
MagicMock() + + executor.to_task("test_task") + + # Verify set_storage_mounts was NOT called + mock_task_instance.set_storage_mounts.assert_not_called() + + # Verify file_mounts still works + mock_task_instance.set_file_mounts.assert_called_once() + + def test_init_with_storage_mounts(self, mock_skypilot_imports): + # Test initialization with storage_mounts parameter + executor = SkypilotExecutor( + container_image="test:latest", + storage_mounts={ + "/workspace/data": { + "name": "training-data", + "store": "s3", + "mode": "MOUNT", + } + }, + ) + + assert executor.storage_mounts is not None + assert "/workspace/data" in executor.storage_mounts + assert executor.storage_mounts["/workspace/data"]["name"] == "training-data" + assert executor.storage_mounts["/workspace/data"]["store"] == "s3" From 3ec63b951a3cf3733358f3ed2a55e87bf466d263 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 19 Sep 2025 11:11:17 -0700 Subject: [PATCH 13/28] Backward compatibility for SkyPilot 0.10.3+ (#339) Signed-off-by: Romil Bhardwaj --- nemo_run/core/execution/skypilot.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/nemo_run/core/execution/skypilot.py b/nemo_run/core/execution/skypilot.py index 49d5dd57..4320f074 100644 --- a/nemo_run/core/execution/skypilot.py +++ b/nemo_run/core/execution/skypilot.py @@ -419,11 +419,17 @@ def launch( dryrun: bool = False, ) -> tuple[Optional[int], Optional["backends.ResourceHandle"]]: from sky import backends, launch, stream_and_get - from sky.utils import common_utils + + # Backward compatibility for SkyPilot 0.10.3+ + # dump_yaml_str moved from sky.utils.common_utils to yaml_utils + try: + from sky.utils import yaml_utils + except ImportError: + from sky.utils import common_utils as yaml_utils task_yml = os.path.join(self.job_dir, "skypilot_task.yml") with open(task_yml, "w+") as f: - f.write(common_utils.dump_yaml_str(task.to_yaml_config())) + f.write(yaml_utils.dump_yaml_str(task.to_yaml_config())) backend = 
backends.CloudVmRayBackend() if num_nodes: From f5970c59529949593b845476348da16c8235289b Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Mon, 29 Sep 2025 08:37:50 -0700 Subject: [PATCH 14/28] Update cherry-pick workflow to use version 0.63.0 (#344) Signed-off-by: Pablo Garay --- .github/workflows/cherry-pick-release-commit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 8b21bd3f..32f06df5 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -7,7 +7,7 @@ on: jobs: cherry-pick: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cherry_pick.yml@v0.22.7 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cherry_pick.yml@v0.63.0 secrets: PAT: ${{ secrets.PAT }} SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} From 6bc43190cfd84fe8ccfd05d0543f6194782aa21d Mon Sep 17 00:00:00 2001 From: Rahim Dharssi Date: Tue, 30 Sep 2025 09:12:13 -0700 Subject: [PATCH 15/28] Create SkypilotJobsExecutor to allow running managed jobs (#343) * Create SkypilotJobsExecutor to allow running managed jobs with Skypilot API Signed-off-by: Rahim Dharssi * Remove unnecessary comments Signed-off-by: Rahim Dharssi * fix lints Signed-off-by: Rahim Dharssi * Add comment for suppressing import error Signed-off-by: Rahim Dharssi * Write unit tests for _save_job_dir and _get_job_dirs Signed-off-by: Rahim Dharssi * Fix lints Signed-off-by: Rahim Dharssi --------- Signed-off-by: Rahim Dharssi --- nemo_run/__init__.py | 2 + nemo_run/core/execution/skypilot_jobs.py | 422 ++++++++++++++ nemo_run/run/experiment.py | 3 + nemo_run/run/torchx_backend/schedulers/api.py | 3 + .../schedulers/skypilot_jobs.py | 240 ++++++++ pyproject.toml | 1 + test/core/execution/test_skypilot_jobs.py | 514 ++++++++++++++++++ .../schedulers/test_skypilot_jobs.py | 229 ++++++++ 8 files changed, 1414 insertions(+) create mode 
100644 nemo_run/core/execution/skypilot_jobs.py create mode 100644 nemo_run/run/torchx_backend/schedulers/skypilot_jobs.py create mode 100644 test/core/execution/test_skypilot_jobs.py create mode 100644 test/run/torchx_backend/schedulers/test_skypilot_jobs.py diff --git a/nemo_run/__init__.py b/nemo_run/__init__.py index 07755c2c..04f56916 100644 --- a/nemo_run/__init__.py +++ b/nemo_run/__init__.py @@ -29,6 +29,7 @@ from nemo_run.core.execution.local import LocalExecutor from nemo_run.core.execution.skypilot import SkypilotExecutor from nemo_run.core.execution.slurm import SlurmExecutor +from nemo_run.core.execution.skypilot_jobs import SkypilotJobsExecutor from nemo_run.core.packaging import GitArchivePackager, HybridPackager, Packager, PatternPackager from nemo_run.core.tunnel.client import LocalTunnel, SSHTunnel from nemo_run.devspace.base import DevSpace @@ -68,6 +69,7 @@ "run", "Script", "SkypilotExecutor", + "SkypilotJobsExecutor", "SlurmExecutor", "SSHTunnel", "Torchrun", diff --git a/nemo_run/core/execution/skypilot_jobs.py b/nemo_run/core/execution/skypilot_jobs.py new file mode 100644 index 00000000..ddb9854b --- /dev/null +++ b/nemo_run/core/execution/skypilot_jobs.py @@ -0,0 +1,422 @@ +import logging +import os +import subprocess +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional, Type, Union + +from invoke.context import Context + +from nemo_run.config import RUNDIR_NAME +from nemo_run.core.execution.base import ( + Executor, + ExecutorMacros, +) +from nemo_run.core.packaging.base import Packager +from nemo_run.core.packaging.git import GitArchivePackager + +_SKYPILOT_AVAILABLE: bool = False +try: + import sky + import sky.task as skyt + from sky import backends + + _SKYPILOT_AVAILABLE = True +except ImportError: + # suppress import error so we don't crash if skypilot is not installed. 
+ pass + +logger = logging.getLogger(__name__) + + +@dataclass(kw_only=True) +class SkypilotJobsExecutor(Executor): + """ + Dataclass to configure a Skypilot Jobs Executor. + + This executor launches managed jobs and requires the `Skypilot API Server `. + + Some familiarity with `Skypilot `_ is necessary. + In order to use this executor, you need to install NeMo Run + with either `skypilot` (for only Kubernetes) or `skypilot-all` (for all clouds) optional features. + + Example: + + .. code-block:: python + + skypilot = SkypilotJobsExecutor( + gpus="A10G", + gpus_per_node=devices, + container_image="nvcr.io/nvidia/nemo:dev", + infra="k8s/my-context", + network_tier="best", + cluster_name="nemo_tester", + file_mounts={ + "nemo_run.whl": "nemo_run.whl", + "/workspace/code": "/local/path/to/code", + }, + storage_mounts={ + "/workspace/outputs": { + "name": "my-training-outputs", + "store": "gcs", # or "s3", "azure", etc. + "mode": "MOUNT", + "persistent": True, + }, + "/workspace/checkpoints": { + "name": "model-checkpoints", + "store": "s3", + "mode": "MOUNT", + "persistent": True, + } + }, + setup=\"\"\" + conda deactivate + nvidia-smi + ls -al ./ + pip install nemo_run.whl + cd /opt/NeMo && git pull origin main && pip install . 
+ \"\"\", + ) + + """ + + HEAD_NODE_IP_VAR = "head_node_ip" + NPROC_PER_NODE_VAR = "SKYPILOT_NUM_GPUS_PER_NODE" + NUM_NODES_VAR = "num_nodes" + NODE_RANK_VAR = "SKYPILOT_NODE_RANK" + HET_GROUP_HOST_VAR = "het_group_host" + + container_image: Optional[str] = None + cloud: Optional[Union[str, list[str]]] = None + region: Optional[Union[str, list[str]]] = None + zone: Optional[Union[str, list[str]]] = None + gpus: Optional[Union[str, list[str]]] = None + gpus_per_node: Optional[int] = None + cpus: Optional[Union[int | float, list[int | float]]] = None + memory: Optional[Union[int | float, list[int | float]]] = None + instance_type: Optional[Union[str, list[str]]] = None + num_nodes: int = 1 + use_spot: Optional[Union[bool, list[bool]]] = None + disk_size: Optional[Union[int, list[int]]] = None + disk_tier: Optional[Union[str, list[str]]] = None + ports: Optional[tuple[str]] = None + file_mounts: Optional[dict[str, str]] = None + storage_mounts: Optional[dict[str, dict[str, Any]]] = None + cluster_name: Optional[str] = None + setup: Optional[str] = None + autodown: bool = False + idle_minutes_to_autostop: Optional[int] = None + torchrun_nproc_per_node: Optional[int] = None + cluster_config_overrides: Optional[dict[str, Any]] = None + infra: Optional[str] = None + network_tier: Optional[str] = None + retry_until_up: bool = False + packager: Packager = field(default_factory=GitArchivePackager) # type: ignore # noqa: F821 + + def __post_init__(self): + assert _SKYPILOT_AVAILABLE, ( + 'Skypilot is not installed. Please install it using `pip install "nemo_run[skypilot]"`.' + ) + assert isinstance(self.packager, GitArchivePackager), ( + "Only GitArchivePackager is currently supported for SkypilotExecutor." + ) + if self.infra is not None: + assert self.cloud is None, "Cannot specify both `infra` and `cloud` parameters." + assert self.region is None, "Cannot specify both `infra` and `region` parameters." 
+ assert self.zone is None, "Cannot specify both `infra` and `zone` parameters." + logger.info( + "`cloud` is deprecated and will be removed in a future version. Use `infra` instead." + ) + + @classmethod + def parse_app(cls: Type["SkypilotJobsExecutor"], app_id: str) -> tuple[str, str, int]: + app = app_id.split("___") + cluster, task, job_id = app[0], app[1], app[2] + assert cluster and task and job_id, f"Invalid app id for Skypilot: {app_id}" + return cluster, task, int(job_id) + + def to_resources(self) -> Union[set["sky.Resources"], set["sky.Resources"]]: + from sky.resources import Resources + + resources_cfg = {} + accelerators = None + if self.gpus: + if not self.gpus_per_node: + self.gpus_per_node = 1 + else: + assert isinstance(self.gpus_per_node, int) + + gpus = [self.gpus] if isinstance(self.gpus, str) else self.gpus + + accelerators = {} + for gpu in gpus: + accelerators[gpu] = self.gpus_per_node + + resources_cfg["accelerators"] = accelerators + + if self.container_image: + resources_cfg["image_id"] = self.container_image + + any_of = [] + + def parse_attr(attr: str): + if getattr(self, attr, None) is not None: + value = getattr(self, attr) + if isinstance(value, list): + for i, val in enumerate(value): + if len(any_of) < i + 1: + any_of.append({}) + + if isinstance(val, str) and val.lower() == "none": + any_of[i][attr] = None + else: + any_of[i][attr] = val + else: + if isinstance(value, str) and value.lower() == "none": + resources_cfg[attr] = None + else: + resources_cfg[attr] = value + + attrs = [ + "cloud", + "region", + "zone", + "cpus", + "memory", + "instance_type", + "use_spot", + "infra", + "network_tier", + "image_id", + "disk_size", + "disk_tier", + "ports", + ] + + for attr in attrs: + parse_attr(attr) + + resources_cfg["any_of"] = any_of + if self.cluster_config_overrides: + resources_cfg["_cluster_config_overrides"] = self.cluster_config_overrides + + resources = Resources.from_yaml_config(resources_cfg) + + return resources # type: 
ignore + + @classmethod + def status(cls: Type["SkypilotJobsExecutor"], app_id: str) -> Optional[dict]: + from sky import stream_and_get + import sky.exceptions as sky_exceptions + import sky.jobs.client.sdk as sky_jobs + + _, _, job_id = cls.parse_app(app_id) + + try: + job_details: List[Dict[str, Any]] = stream_and_get( + sky_jobs.queue(refresh=True, all_users=True, job_ids=[job_id]), + )[0] + except sky_exceptions.ClusterNotUpError: + return None + + return job_details + + @classmethod + def cancel(cls: Type["SkypilotJobsExecutor"], app_id: str): + from sky.jobs.client.sdk import cancel + + _, _, job_id = cls.parse_app(app_id=app_id) + job_details = cls.status(app_id=app_id) + if not job_details: + return + + cancel(job_ids=[job_id]) + + @classmethod + def logs(cls: Type["SkypilotJobsExecutor"], app_id: str, fallback_path: Optional[str]): + import sky.jobs.client.sdk as sky_jobs + + _, _, job_id = cls.parse_app(app_id) + job_details = cls.status(app_id) + + is_terminal = False + if job_details and job_details["status"]: + is_terminal = job_details["status"].is_terminal() + elif not job_details: + is_terminal = True + if fallback_path and is_terminal: + log_path = os.path.expanduser(os.path.join(fallback_path, "run.log")) + if os.path.isfile(log_path): + with open(os.path.expanduser(os.path.join(fallback_path, "run.log"))) as f: + for line in f: + print(line, end="", flush=True) + + return + + sky_jobs.tail_logs(job_id=job_id) + + @property + def workdir(self) -> str: + return os.path.join(f"{self.job_dir}", "workdir") + + def package_configs(self, *cfgs: tuple[str, str]) -> list[str]: + filenames = [] + basepath = os.path.join(self.job_dir, "configs") + for name, cfg in cfgs: + filename = os.path.join(basepath, name) + os.makedirs(os.path.dirname(filename), exist_ok=True) + with open(filename, "w") as f: + f.write(cfg) + + filenames.append( + os.path.join( + "/", + RUNDIR_NAME, + "configs", + name, + ) + ) + + return filenames + + def assign( + self, + exp_id: 
str, + exp_dir: str, + task_id: str, + task_dir: str, + ): + self.job_name = task_id + self.experiment_dir = exp_dir + self.job_dir = os.path.join(exp_dir, task_dir) + self.experiment_id = exp_id + + def package(self, packager: Packager, job_name: str): + assert self.experiment_id, "Executor not assigned to an experiment." + if isinstance(packager, GitArchivePackager): + output = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + check=True, + stdout=subprocess.PIPE, + ) + path = output.stdout.splitlines()[0].decode() + base_path = Path(path).absolute() + else: + base_path = Path(os.getcwd()).absolute() + + local_pkg = packager.package(base_path, self.job_dir, job_name) + local_code_extraction_path = os.path.join(self.job_dir, "code") + ctx = Context() + ctx.run(f"mkdir -p {local_code_extraction_path}") + + if self.get_launcher().nsys_profile: + remote_nsys_extraction_path = os.path.join( + self.job_dir, self.get_launcher().nsys_folder + ) + ctx.run(f"mkdir -p {remote_nsys_extraction_path}") + if local_pkg: + ctx.run( + f"tar -xvzf {local_pkg} -C {local_code_extraction_path} --ignore-zeros", hide=True + ) + + def nnodes(self) -> int: + return self.num_nodes + + def nproc_per_node(self) -> int: + if self.torchrun_nproc_per_node: + return self.torchrun_nproc_per_node + + return self.gpus_per_node or 1 + + def macro_values(self) -> Optional[ExecutorMacros]: + return ExecutorMacros( + head_node_ip_var=self.HEAD_NODE_IP_VAR, + nproc_per_node_var=self.NPROC_PER_NODE_VAR, + num_nodes_var=self.NUM_NODES_VAR, + node_rank_var=self.NODE_RANK_VAR, + het_group_host_var=self.HET_GROUP_HOST_VAR, + ) + + def to_task( + self, + name: str, + cmd: Optional[list[str]] = None, + env_vars: Optional[dict[str, str]] = None, + ) -> "skyt.Task": + from sky.task import Task + + run_cmd = None + if cmd: + run_cmd = f""" +conda deactivate + +num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l` +echo "num_nodes=$num_nodes" + +head_node_ip=`echo "$SKYPILOT_NODE_IPS" | head -n1` +echo 
"head_node_ip=$head_node_ip" + +cd /nemo_run/code + +{" ".join(cmd)} +""" + task = Task( + name=name, + setup=self.setup if self.setup else "", + run=run_cmd, + envs=self.env_vars, + num_nodes=self.num_nodes, + ) + # Handle regular file mounts + file_mounts = self.file_mounts or {} + file_mounts["/nemo_run"] = self.job_dir + task.set_file_mounts(file_mounts) + + # Handle storage mounts separately + if self.storage_mounts: + from sky.data import Storage + + storage_objects = {} + for mount_path, config in self.storage_mounts.items(): + # Create Storage object from config dict + storage_obj = Storage.from_yaml_config(config) + storage_objects[mount_path] = storage_obj + task.set_storage_mounts(storage_objects) + + task.set_resources(self.to_resources()) + + if env_vars: + task.update_envs(env_vars) + + return task + + def launch( + self, + task: "skyt.Task", + num_nodes: Optional[int] = None, + ) -> tuple[Optional[int], Optional["backends.ResourceHandle"]]: + from sky import stream_and_get + from sky.jobs.client.sdk import launch + + if num_nodes: + task.num_nodes = num_nodes + + job_id, handle = stream_and_get(launch(task)) + + return job_id, handle + + def cleanup(self, handle: str): + import sky.jobs.client.sdk as sky_jobs + + _, _, path_str = handle.partition("://") + path = path_str.split("/") + app_id = path[1] + + _, _, job_id = self.parse_app(app_id) + sky_jobs.download_logs( + name=None, + job_id=job_id, + refresh=True, + controller=True, + local_dir=self.job_dir, + ) diff --git a/nemo_run/run/experiment.py b/nemo_run/run/experiment.py index 67c2f50f..2e681d8c 100644 --- a/nemo_run/run/experiment.py +++ b/nemo_run/run/experiment.py @@ -56,6 +56,7 @@ from nemo_run.core.execution.local import LocalExecutor from nemo_run.core.execution.skypilot import SkypilotExecutor from nemo_run.core.execution.slurm import SlurmExecutor +from nemo_run.core.execution.skypilot_jobs import SkypilotJobsExecutor from nemo_run.core.frontend.console.api import CONSOLE, 
configure_logging, deconfigure_logging from nemo_run.core.serialization.zlib_json import ZlibJSONSerializer from nemo_run.core.tunnel.client import SSHTunnel, Tunnel @@ -201,6 +202,7 @@ class Experiment(ConfigurableMixin): SlurmExecutor, LocalExecutor, SkypilotExecutor, + SkypilotJobsExecutor, DockerExecutor, DGXCloudExecutor, LeptonExecutor, @@ -208,6 +210,7 @@ class Experiment(ConfigurableMixin): _DETACH_SUPPORTED_EXECUTORS = ( SlurmExecutor, SkypilotExecutor, + SkypilotJobsExecutor, DGXCloudExecutor, LeptonExecutor, ) diff --git a/nemo_run/run/torchx_backend/schedulers/api.py b/nemo_run/run/torchx_backend/schedulers/api.py index 5ade157d..a33ee20a 100644 --- a/nemo_run/run/torchx_backend/schedulers/api.py +++ b/nemo_run/run/torchx_backend/schedulers/api.py @@ -24,10 +24,12 @@ from nemo_run.core.execution.local import LocalExecutor from nemo_run.core.execution.skypilot import SkypilotExecutor from nemo_run.core.execution.slurm import SlurmExecutor +from nemo_run.core.execution.skypilot_jobs import SkypilotJobsExecutor EXECUTOR_MAPPING: dict[Type[Executor], str] = { SlurmExecutor: "slurm_tunnel", SkypilotExecutor: "skypilot", + SkypilotJobsExecutor: "skypilot_jobs", LocalExecutor: "local_persistent", DockerExecutor: "docker_persistent", DGXCloudExecutor: "dgx_cloud", @@ -37,6 +39,7 @@ REVERSE_EXECUTOR_MAPPING: dict[str, Type[Executor]] = { "slurm_tunnel": SlurmExecutor, "skypilot": SkypilotExecutor, + "skypilot_jobs": SkypilotJobsExecutor, "local_persistent": LocalExecutor, "docker_persistent": DockerExecutor, "dgx_cloud": DGXCloudExecutor, diff --git a/nemo_run/run/torchx_backend/schedulers/skypilot_jobs.py b/nemo_run/run/torchx_backend/schedulers/skypilot_jobs.py new file mode 100644 index 00000000..f812d251 --- /dev/null +++ b/nemo_run/run/torchx_backend/schedulers/skypilot_jobs.py @@ -0,0 +1,240 @@ +import json +import os +import shutil +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Optional + +from 
torchx.schedulers.api import ( + AppDryRunInfo, + DescribeAppResponse, + ListAppResponse, + Scheduler, +) +from torchx.specs import ( + AppDef, + AppState, + ReplicaStatus, + Role, + RoleStatus, + runopts, +) + +from nemo_run.config import get_nemorun_home +from nemo_run.core.execution.base import Executor +from nemo_run.core.execution.skypilot import _SKYPILOT_AVAILABLE +from nemo_run.core.execution.skypilot_jobs import SkypilotJobsExecutor +from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin + +try: + import fcntl + + FCNTL_AVAILABLE = True +except ModuleNotFoundError: + fcntl = None + FCNTL_AVAILABLE = False + +SKYPILOT_STATES = {} +try: + import sky.task as skyt + from sky.jobs.state import ManagedJobStatus + + SKYPILOT_STATES: dict[ManagedJobStatus, AppState] = { + ManagedJobStatus.PENDING: AppState.PENDING, + ManagedJobStatus.DEPRECATED_SUBMITTED: AppState.SUBMITTED, + ManagedJobStatus.STARTING: AppState.SUBMITTED, + ManagedJobStatus.RUNNING: AppState.RUNNING, + ManagedJobStatus.RECOVERING: AppState.RUNNING, + ManagedJobStatus.CANCELLING: AppState.CANCELLED, + ManagedJobStatus.SUCCEEDED: AppState.SUCCEEDED, + ManagedJobStatus.CANCELLED: AppState.CANCELLED, + ManagedJobStatus.FAILED: AppState.FAILED, + ManagedJobStatus.FAILED_SETUP: AppState.FAILED, + ManagedJobStatus.FAILED_PRECHECKS: AppState.FAILED, + ManagedJobStatus.FAILED_NO_RESOURCE: AppState.FAILED, + ManagedJobStatus.FAILED_CONTROLLER: AppState.FAILED, + } +except ImportError: + # suppress import error so we don't crash if skypilot is not installed. + pass + +SKYPILOT_JOB_DIRS = os.path.join(get_nemorun_home(), ".skypilot_jobs.json") + + +@dataclass +class SkypilotJobsRequest: + task: "skyt.Task" + executor: SkypilotJobsExecutor + + +class SkypilotJobsScheduler(SchedulerMixin, Scheduler[dict[str, str]]): # type: ignore + def __init__(self, session_name: str) -> None: + super().__init__("skypilot_jobs", session_name) + assert _SKYPILOT_AVAILABLE, ( + 'Skypilot is not installed. 
Please install it using `pip install "nemo_run[skypilot]"`' + ) + + def _run_opts(self) -> runopts: + opts = runopts() + opts.add( + "job_dir", + type_=str, + help="""The directory to place the job code and outputs. The + directory must not exist and will be created. + """, + ) + return opts + + def schedule(self, dryrun_info: AppDryRunInfo[SkypilotJobsRequest]) -> str: + req = dryrun_info.request + task = req.task + + executor = req.executor + executor.package(executor.packager, job_name=executor.job_name) + job_id, handle = executor.launch(task) + assert job_id and handle, ( + f"Failed scheduling run on Skypilot. Job id: {job_id}, Handle: {handle}" + ) + app_id = f"{handle.get_cluster_name()}___{task.name}___{job_id}" + task_details = SkypilotJobsExecutor.status(app_id=app_id) + if task_details: + _save_job_dir( + app_id, + job_status=task_details["status"].value, + ) + + return app_id + + def _submit_dryrun( # type: ignore + self, app: AppDef, cfg: Executor + ) -> AppDryRunInfo[SkypilotJobsRequest]: + from sky.utils import common_utils + + assert isinstance(cfg, SkypilotJobsExecutor), ( + f"{cfg.__class__} not supported for skypilot jobs scheduler." + ) + executor = cfg + + assert len(app.roles) == 1, "Only 1 role supported for Skypilot jobs executor." 
+ role = app.roles[0] + values = executor.macro_values() + if values: + role = values.apply(role) + + cmd = [role.entrypoint] + role.args + task = cfg.to_task(name=role.name, cmd=cmd, env_vars=role.env) + + req = SkypilotJobsRequest(task=task, executor=cfg) + return AppDryRunInfo(req, lambda req: common_utils.dump_yaml_str(req.task.to_yaml_config())) + + def _validate(self, app: AppDef, scheduler: str) -> None: + # Skip validation step for skypilot + pass + + def describe(self, app_id: str) -> Optional[DescribeAppResponse]: + from sky.jobs.state import ManagedJobStatus + + _, task_name, _ = SkypilotJobsExecutor.parse_app(app_id=app_id) + task_details = SkypilotJobsExecutor.status(app_id=app_id) + + roles = [Role(name=task_name, image="", num_replicas=1)] + roles_statuses = [ + RoleStatus( + task_name, + replicas=[ + ReplicaStatus( + id=0, + role=task_name, + state=AppState.SUBMITTED, + hostname="skypilot-api", + ) + ], + ) + ] + + if not task_details: + past_apps = _get_job_dirs() + if app_id in past_apps and "job_status" in past_apps[app_id]: + job_status = ManagedJobStatus[past_apps[app_id]["job_status"]] + app_state = SKYPILOT_STATES[job_status] + roles_statuses[0].replicas[0].state = app_state + return DescribeAppResponse( + app_id=app_id, + roles=roles, + roles_statuses=roles_statuses, + state=app_state, + msg="", + ) + else: + return None + else: + app_state = SKYPILOT_STATES[task_details["status"]] + _save_job_dir( + app_id, + job_status=task_details["status"].value, + ) + roles_statuses[0].replicas[0].state = app_state + return DescribeAppResponse( + app_id=app_id, + roles=roles, + roles_statuses=roles_statuses, + state=app_state, + msg="", + ) + + def _cancel_existing(self, app_id: str) -> None: + SkypilotJobsExecutor.cancel(app_id=app_id) + + def list(self) -> list[ListAppResponse]: + pass + + +def create_scheduler(session_name: str, **kwargs: Any) -> SkypilotJobsScheduler: + return SkypilotJobsScheduler( + session_name=session_name, + ) + + +def 
_save_job_dir(app_id: str, job_status: str) -> None: + original_apps = {} + if not os.path.isfile(SKYPILOT_JOB_DIRS): + os.makedirs(os.path.dirname(SKYPILOT_JOB_DIRS), exist_ok=True) + Path(SKYPILOT_JOB_DIRS).touch() + + with open(SKYPILOT_JOB_DIRS, "r+") as f: + if FCNTL_AVAILABLE: + assert fcntl + fcntl.flock(f, fcntl.LOCK_EX) + + try: + try: + original_apps = json.load(f) + except Exception: + original_apps = {} + + app = { + "job_status": job_status, + } + original_apps[app_id] = app + + with tempfile.NamedTemporaryFile(mode="w+") as fp: + json.dump(original_apps, fp) + fp.flush() + + shutil.copy(fp.name, SKYPILOT_JOB_DIRS) + fp.close() + finally: + if FCNTL_AVAILABLE: + assert fcntl + fcntl.flock(f, fcntl.LOCK_UN) + + +def _get_job_dirs() -> dict[str, dict[str, str]]: + try: + with open(SKYPILOT_JOB_DIRS, "r") as f: + apps: dict[str, dict[str, str]] = json.load(f) + except FileNotFoundError: + return {} + + return apps diff --git a/pyproject.toml b/pyproject.toml index 56bfbdc5..05887f0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ local_persistent = "nemo_run.run.torchx_backend.schedulers.local:create_schedule docker_persistent = "nemo_run.run.torchx_backend.schedulers.docker:create_scheduler" dgx_cloud = "nemo_run.run.torchx_backend.schedulers.dgxcloud:create_scheduler" lepton = "nemo_run.run.torchx_backend.schedulers.lepton:create_scheduler" +skypilot_jobs = "nemo_run.run.torchx_backend.schedulers.skypilot_jobs:create_scheduler" [project.optional-dependencies] skypilot = [ diff --git a/test/core/execution/test_skypilot_jobs.py b/test/core/execution/test_skypilot_jobs.py new file mode 100644 index 00000000..f1fecb64 --- /dev/null +++ b/test/core/execution/test_skypilot_jobs.py @@ -0,0 +1,514 @@ +import os +import tempfile +from unittest.mock import MagicMock, mock_open, patch + +import pytest + +from nemo_run.core.execution.skypilot_jobs import SkypilotJobsExecutor +from nemo_run.core.packaging.git import GitArchivePackager + + 
+@pytest.fixture +def mock_skypilot_imports(): + class MockClusterNotUpError(Exception): + pass + + sky_mock = MagicMock() + sky_task_mock = MagicMock() + backends_mock = MagicMock() + sky_jobs_mock = MagicMock() + job_status_mock = MagicMock() + job_status_mock.RUNNING = "RUNNING" + job_status_mock.SUCCEEDED = "SUCCEEDED" + job_status_mock.FAILED = "FAILED" + job_status_mock.is_terminal = MagicMock() + + sky_exceptions_mock = MagicMock() + sky_exceptions_mock.ClusterNotUpError = MockClusterNotUpError + + modules = { + "sky": sky_mock, + "sky.task": sky_task_mock, + "sky.backends": backends_mock, + "sky.jobs.client.sdk": sky_jobs_mock, + "sky.resources": MagicMock(), + "sky.exceptions": sky_exceptions_mock, + } + + with patch.dict("sys.modules", modules): + with patch("nemo_run.core.execution.skypilot_jobs._SKYPILOT_AVAILABLE", True): + yield ( + sky_mock, + sky_task_mock, + backends_mock, + sky_jobs_mock, + sky_exceptions_mock, + job_status_mock, + ) + + +class TestSkypilotJobsExecutor: + @pytest.fixture + def executor(self, mock_skypilot_imports): + return SkypilotJobsExecutor( + container_image="nvcr.io/nvidia/nemo:latest", + cloud="kubernetes", + cluster_name="test-cluster", + gpus="A100", + gpus_per_node=8, + num_nodes=2, + use_spot=True, + file_mounts={ + "test_file": "/path/to/test_file", + }, + setup="pip install -r requirements.txt", + ) + + def test_init(self, mock_skypilot_imports): + executor = SkypilotJobsExecutor( + container_image="nvcr.io/nvidia/nemo:latest", + cloud="kubernetes", + cluster_name="test-cluster", + gpus="A100", + gpus_per_node=8, + ) + + assert executor.container_image == "nvcr.io/nvidia/nemo:latest" + assert executor.cloud == "kubernetes" + assert executor.cluster_name == "test-cluster" + assert executor.gpus == "A100" + assert executor.gpus_per_node == 8 + assert executor.num_nodes == 1 + assert isinstance(executor.packager, GitArchivePackager) + + def test_init_missing_skypilot(self): + with 
patch("nemo_run.core.execution.skypilot_jobs._SKYPILOT_AVAILABLE", False): + with pytest.raises(AssertionError, match="Skypilot is not installed"): + SkypilotJobsExecutor( + container_image="nvcr.io/nvidia/nemo:latest", + cloud="kubernetes", + ) + + def test_init_non_git_packager(self, mock_skypilot_imports): + non_git_packager = MagicMock() + + with pytest.raises(AssertionError, match="Only GitArchivePackager is currently supported"): + SkypilotJobsExecutor( + container_image="nvcr.io/nvidia/nemo:latest", + cloud="kubernetes", + packager=non_git_packager, + ) + + def test_init_with_infra_and_cloud_fails(self, mock_skypilot_imports): + with pytest.raises( + AssertionError, match="Cannot specify both `infra` and `cloud` parameters." + ): + SkypilotJobsExecutor( + infra="my-infra", + cloud="aws", + ) + + def test_init_with_infra_and_region_fails(self, mock_skypilot_imports): + with pytest.raises( + AssertionError, match="Cannot specify both `infra` and `region` parameters." + ): + SkypilotJobsExecutor( + infra="my-infra", + region="us-west-2", + ) + + def test_init_with_infra_and_zone_fails(self, mock_skypilot_imports): + with pytest.raises( + AssertionError, match="Cannot specify both `infra` and `zone` parameters." 
+ ): + SkypilotJobsExecutor( + infra="my-infra", + zone="us-west-2a", + ) + + def test_parse_app(self, mock_skypilot_imports): + # Note: SkypilotJobsExecutor uses 3 components instead of 4 + app_id = "cluster-name___task-name___123" + cluster, task, job_id = SkypilotJobsExecutor.parse_app(app_id) + + assert cluster == "cluster-name" + assert task == "task-name" + assert job_id == 123 + + def test_parse_app_invalid(self, mock_skypilot_imports): + # The implementation raises IndexError when the app_id format is invalid + with pytest.raises(IndexError): + SkypilotJobsExecutor.parse_app("invalid_app_id") + + # Test with a partially valid app_id + with pytest.raises(IndexError): + SkypilotJobsExecutor.parse_app("cluster___task") + + @patch("sky.resources.Resources") + def test_to_resources_with_gpu(self, mock_resources, mock_skypilot_imports, executor): + executor.to_resources() + + mock_resources.from_yaml_config.assert_called_once() + config = mock_resources.from_yaml_config.call_args[0][0] + assert "accelerators" in config + assert config["accelerators"] == {"A100": 8} + + @patch("sky.resources.Resources") + def test_to_resources_with_container(self, mock_resources, mock_skypilot_imports): + executor = SkypilotJobsExecutor( + container_image="nvcr.io/nvidia/nemo:latest", + cloud="kubernetes", + ) + + executor.to_resources() + + mock_resources.from_yaml_config.assert_called_once() + config = mock_resources.from_yaml_config.call_args[0][0] + assert config["image_id"] == "nvcr.io/nvidia/nemo:latest" + + @patch("sky.resources.Resources") + def test_to_resources_with_list_values(self, mock_resources, mock_skypilot_imports): + executor = SkypilotJobsExecutor( + cloud=["aws", "azure"], + region=["us-west-2", "eastus"], + cpus=[16, 8], + memory=[64, 32], + ) + + executor.to_resources() + + mock_resources.from_yaml_config.assert_called_once() + config = mock_resources.from_yaml_config.call_args[0][0] + assert len(config["any_of"]) == 2 + assert config["any_of"][0]["cloud"] == 
"aws" + assert config["any_of"][0]["region"] == "us-west-2" + assert config["any_of"][0]["cpus"] == 16 + assert config["any_of"][0]["memory"] == 64 + assert config["any_of"][1]["cloud"] == "azure" + assert config["any_of"][1]["region"] == "eastus" + assert config["any_of"][1]["cpus"] == 8 + assert config["any_of"][1]["memory"] == 32 + + @patch("sky.resources.Resources") + def test_to_resources_with_none_string(self, mock_resources, mock_skypilot_imports): + executor = SkypilotJobsExecutor( + cloud="none", + region=["us-west-2", "none"], + ) + + executor.to_resources() + + mock_resources.from_yaml_config.assert_called_once() + config = mock_resources.from_yaml_config.call_args[0][0] + assert config["cloud"] is None + assert config["any_of"][1]["region"] is None + + @patch("sky.resources.Resources") + def test_to_resources_with_infra_and_network_tier(self, mock_resources, mock_skypilot_imports): + executor = SkypilotJobsExecutor(infra="k8s/my-context", network_tier="best") + + executor.to_resources() + + mock_resources.from_yaml_config.assert_called_once() + + config = mock_resources.from_yaml_config.call_args[0][0] + assert config["infra"] == "k8s/my-context" + assert config["network_tier"] == "best" + + @patch("sky.stream_and_get") + @patch("sky.jobs.client.sdk.queue") + @patch("nemo_run.core.execution.skypilot_jobs.SkypilotJobsExecutor.parse_app") + def test_status_success(self, mock_parse_app, mock_queue, mock_stream_and_get): + mock_job_details = {"job_id": 123, "status": "RUNNING"} + mock_stream_and_get.return_value = [mock_job_details] + mock_parse_app.return_value = ("cluster-name", "task-name", 123) + + details = SkypilotJobsExecutor.status("cluster-name___task-name___123") + + assert details == mock_job_details + mock_stream_and_get.assert_called_once() + mock_queue.assert_called_once_with(refresh=True, all_users=True, job_ids=[123]) + + @patch("sky.stream_and_get") + @patch("sky.jobs.client.sdk.queue") + 
@patch("nemo_run.core.execution.skypilot_jobs.SkypilotJobsExecutor.parse_app") + def test_status_cluster_not_up(self, mock_parse_app, mock_queue, mock_stream_and_get): + class MockClusterNotUpError(Exception): + pass + + mock_stream_and_get.side_effect = MockClusterNotUpError("Cluster not up") + mock_parse_app.return_value = ("cluster-name", "task-name", 123) + + with patch("sky.exceptions.ClusterNotUpError", MockClusterNotUpError): + job_details = SkypilotJobsExecutor.status("cluster-name___task-name___123") + assert job_details is None + + @patch("sky.jobs.client.sdk.tail_logs") + @patch("nemo_run.core.execution.skypilot_jobs.SkypilotJobsExecutor.status") + @patch("nemo_run.core.execution.skypilot_jobs.SkypilotJobsExecutor.parse_app") + def test_logs_running_job(self, mock_parse_app, mock_status, mock_tail_logs): + mock_parse_app.return_value = ("cluster-name", "task-name", 123) + mock_job_status = MagicMock() + mock_job_status.is_terminal.return_value = False + mock_status.return_value = {"job_id": 123, "status": mock_job_status} + + SkypilotJobsExecutor.logs("cluster-name___task-name___123", "/path/to/logs") + + mock_tail_logs.assert_called_once_with(job_id=123) + + @patch("nemo_run.core.execution.skypilot_jobs.SkypilotJobsExecutor.status") + @patch("nemo_run.core.execution.skypilot_jobs.SkypilotJobsExecutor.parse_app") + @patch("builtins.open", new_callable=mock_open, read_data="Test log content") + @patch("os.path.isfile") + @patch("builtins.print") + def test_logs_terminal_job_fallback( + self, mock_print, mock_isfile, mock_open, mock_parse_app, mock_status + ): + mock_parse_app.return_value = ("cluster-name", "task-name", 123) + mock_job_status = MagicMock() + mock_job_status.is_terminal.return_value = True + mock_status.return_value = {"job_id": 123, "status": mock_job_status} + mock_isfile.return_value = True + + SkypilotJobsExecutor.logs("cluster-name___task-name___123", "/path/to/logs") + + mock_open.assert_called_once() + 
mock_print.assert_called_with("Test log content", end="", flush=True) + + @patch("sky.jobs.client.sdk.cancel") + @patch("nemo_run.core.execution.skypilot_jobs.SkypilotJobsExecutor.status") + @patch("nemo_run.core.execution.skypilot_jobs.SkypilotJobsExecutor.parse_app") + def test_cancel(self, mock_parse_app, mock_status, mock_cancel): + mock_parse_app.return_value = ("cluster-name", "task-name", 123) + mock_status.return_value = {"job_id": 123, "status": "RUNNING"} + + SkypilotJobsExecutor.cancel("cluster-name___task-name___123") + + mock_cancel.assert_called_once_with(job_ids=[123]) + + @patch("sky.jobs.client.sdk.cancel") + @patch("nemo_run.core.execution.skypilot_jobs.SkypilotJobsExecutor.status") + @patch("nemo_run.core.execution.skypilot_jobs.SkypilotJobsExecutor.parse_app") + def test_cancel_no_job(self, mock_parse_app, mock_status, mock_cancel): + mock_parse_app.return_value = ("cluster-name", "task-name", 123) + mock_status.return_value = None + + SkypilotJobsExecutor.cancel("cluster-name___task-name___123") + + mock_cancel.assert_not_called() + + @patch("sky.stream_and_get") + @patch("sky.jobs.client.sdk.launch") + def test_launch(self, mock_launch, mock_stream_and_get, executor): + mock_handle = MagicMock() + mock_launch.return_value = MagicMock() + mock_stream_and_get.return_value = (123, mock_handle) + + job_id, handle = executor.launch(MagicMock()) + + assert job_id == 123 + assert handle is mock_handle + + def test_workdir(self, executor): + executor.job_dir = "/path/to/job" + assert executor.workdir == "/path/to/job/workdir" + + @patch("os.path.exists") + def test_package_configs(self, mock_exists, executor): + with tempfile.TemporaryDirectory() as tmp_dir: + executor.job_dir = tmp_dir + mock_exists.return_value = True + configs = executor.package_configs( + ("config1.yaml", "content1"), ("config2.yaml", "content2") + ) + + assert len(configs) == 2 + assert configs[0].endswith("config1.yaml") + assert configs[1].endswith("config2.yaml") + + def 
test_assign(self, executor): + with tempfile.TemporaryDirectory() as tmp_dir: + executor.assign( + exp_id="test_exp", + exp_dir=tmp_dir, + task_id="test_task", + task_dir="test_task_dir", + ) + + assert executor.experiment_id == "test_exp" + assert executor.experiment_dir == tmp_dir + assert executor.job_dir == os.path.join(tmp_dir, "test_task_dir") + assert executor.job_name == "test_task" + + def test_nnodes(self, executor): + assert executor.nnodes() == 2 + + default_executor = SkypilotJobsExecutor(container_image="test:latest") + assert default_executor.nnodes() == 1 + + def test_nproc_per_node(self, executor): + assert executor.nproc_per_node() == 8 + + executor.torchrun_nproc_per_node = 4 + assert executor.nproc_per_node() == 4 + + def test_macro_values(self, executor): + macro_values = executor.macro_values() + + assert macro_values is not None + assert macro_values.head_node_ip_var == "head_node_ip" + assert macro_values.nproc_per_node_var == "SKYPILOT_NUM_GPUS_PER_NODE" + assert macro_values.num_nodes_var == "num_nodes" + assert macro_values.node_rank_var == "SKYPILOT_NODE_RANK" + assert macro_values.het_group_host_var == "het_group_host" + + @patch("sky.task.Task") + def test_to_task(self, mock_task, mock_skypilot_imports, executor): + mock_task_instance = MagicMock() + mock_task.return_value = mock_task_instance + + with tempfile.TemporaryDirectory() as tmp_dir: + executor.job_dir = tmp_dir + executor.file_mounts = {"test_file": "/path/to/test_file"} + + result = executor.to_task( + "test_task", ["python", "train.py"], {"TEST_VAR": "test_value"} + ) + + mock_task.assert_called_once() + assert mock_task.call_args[1]["name"] == "test_task" + assert mock_task.call_args[1]["num_nodes"] == 2 + mock_task_instance.set_file_mounts.assert_called_once() + mock_task_instance.set_resources.assert_called_once() + mock_task_instance.update_envs.assert_called_once_with({"TEST_VAR": "test_value"}) + assert result == mock_task_instance + + @patch("sky.task.Task") + def 
test_to_task_with_storage_mounts(self, mock_task, mock_skypilot_imports): + # Create a mock task instance + mock_task_instance = MagicMock() + mock_task.return_value = mock_task_instance + mock_task_instance.set_file_mounts = MagicMock() + mock_task_instance.set_storage_mounts = MagicMock() + mock_task_instance.set_resources = MagicMock() + + # Mock sky.data.Storage + mock_storage_class = MagicMock() + mock_storage_obj = MagicMock() + mock_storage_class.from_yaml_config.return_value = mock_storage_obj + + executor = SkypilotJobsExecutor( + container_image="test:latest", + storage_mounts={ + "/workspace/outputs": { + "name": "my-outputs", + "store": "gcs", + "mode": "MOUNT", + "persistent": True, + } + }, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + executor.job_dir = tmp_dir + + with patch("sky.data.Storage", mock_storage_class): + executor.to_task("test_task") + + # Verify Storage.from_yaml_config was called with the config + mock_storage_class.from_yaml_config.assert_called_once_with( + { + "name": "my-outputs", + "store": "gcs", + "mode": "MOUNT", + "persistent": True, + } + ) + + # Verify set_storage_mounts was called with Storage objects + mock_task_instance.set_storage_mounts.assert_called_once() + storage_mounts_call = mock_task_instance.set_storage_mounts.call_args[0][0] + assert "/workspace/outputs" in storage_mounts_call + assert storage_mounts_call["/workspace/outputs"] == mock_storage_obj + + @patch("sky.task.Task") + def test_to_task_with_both_file_and_storage_mounts(self, mock_task, mock_skypilot_imports): + # Create a mock task instance + mock_task_instance = MagicMock() + mock_task.return_value = mock_task_instance + mock_task_instance.set_file_mounts = MagicMock() + mock_task_instance.set_storage_mounts = MagicMock() + mock_task_instance.set_resources = MagicMock() + + # Mock sky.data.Storage + mock_storage_class = MagicMock() + mock_storage_obj = MagicMock() + mock_storage_class.from_yaml_config.return_value = mock_storage_obj + + 
executor = SkypilotJobsExecutor( + container_image="test:latest", + file_mounts={ + "/workspace/code": "/local/path/to/code", + }, + storage_mounts={ + "/workspace/outputs": { + "name": "my-outputs", + "store": "s3", + "mode": "MOUNT", + }, + "/workspace/checkpoints": { + "name": "my-checkpoints", + "store": "gcs", + "mode": "MOUNT_CACHED", + }, + }, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + executor.job_dir = tmp_dir + + with patch("sky.data.Storage", mock_storage_class): + executor.to_task("test_task") + + # Verify file_mounts includes both user files and nemo_run + file_mounts_call = mock_task_instance.set_file_mounts.call_args[0][0] + assert "/workspace/code" in file_mounts_call + assert file_mounts_call["/workspace/code"] == "/local/path/to/code" + assert "/nemo_run" in file_mounts_call + assert file_mounts_call["/nemo_run"] == tmp_dir + + # Verify Storage.from_yaml_config was called for both storage mounts + assert mock_storage_class.from_yaml_config.call_count == 2 + + # Verify set_storage_mounts was called with both Storage objects + mock_task_instance.set_storage_mounts.assert_called_once() + storage_mounts_call = mock_task_instance.set_storage_mounts.call_args[0][0] + assert "/workspace/outputs" in storage_mounts_call + assert "/workspace/checkpoints" in storage_mounts_call + assert len(storage_mounts_call) == 2 + + @patch("sky.task.Task") + def test_to_task_without_storage_mounts(self, mock_task, mock_skypilot_imports): + # Test that set_storage_mounts is not called when storage_mounts is None + mock_task_instance = MagicMock() + mock_task.return_value = mock_task_instance + mock_task_instance.set_file_mounts = MagicMock() + mock_task_instance.set_storage_mounts = MagicMock() + mock_task_instance.set_resources = MagicMock() + + executor = SkypilotJobsExecutor( + container_image="test:latest", + file_mounts={"/workspace/code": "/local/path"}, + storage_mounts=None, # Explicitly set to None + ) + + with tempfile.TemporaryDirectory() as 
tmp_dir: + executor.job_dir = tmp_dir + + executor.to_task("test_task") + + # Verify set_storage_mounts was NOT called + mock_task_instance.set_storage_mounts.assert_not_called() + + # Verify file_mounts still works + mock_task_instance.set_file_mounts.assert_called_once() diff --git a/test/run/torchx_backend/schedulers/test_skypilot_jobs.py b/test/run/torchx_backend/schedulers/test_skypilot_jobs.py new file mode 100644 index 00000000..9c8b1c6d --- /dev/null +++ b/test/run/torchx_backend/schedulers/test_skypilot_jobs.py @@ -0,0 +1,229 @@ +import json +import os +import tempfile +from unittest import mock + +import pytest +from torchx.schedulers.api import AppDryRunInfo +from torchx.specs import AppDef, Role + +from nemo_run.core.execution.skypilot_jobs import SkypilotJobsExecutor +from nemo_run.run.torchx_backend.schedulers.skypilot_jobs import ( + SkypilotJobsScheduler, + _get_job_dirs, + _save_job_dir, + create_scheduler, +) + + +@pytest.fixture +def mock_app_def(): + return AppDef(name="test_app", roles=[Role(name="test_role", image="")]) + + +@pytest.fixture +def skypilot_jobs_executor(): + return SkypilotJobsExecutor( + job_dir=tempfile.mkdtemp(), + gpus="V100", + gpus_per_node=1, + cloud="aws", + ) + + +@pytest.fixture +def skypilot_jobs_scheduler(): + return create_scheduler(session_name="test_session") + + +def test_create_scheduler(): + scheduler = create_scheduler(session_name="test_session") + assert isinstance(scheduler, SkypilotJobsScheduler) + assert scheduler.session_name == "test_session" + + +def test_skypilot_jobs_scheduler_methods(skypilot_jobs_scheduler): + assert hasattr(skypilot_jobs_scheduler, "_submit_dryrun") + assert hasattr(skypilot_jobs_scheduler, "schedule") + assert hasattr(skypilot_jobs_scheduler, "describe") + assert hasattr(skypilot_jobs_scheduler, "_validate") + + +def test_submit_dryrun(skypilot_jobs_scheduler, mock_app_def, skypilot_jobs_executor): + with mock.patch.object(SkypilotJobsExecutor, "package") as mock_package: + 
mock_package.return_value = None + + dryrun_info = skypilot_jobs_scheduler._submit_dryrun(mock_app_def, skypilot_jobs_executor) + assert isinstance(dryrun_info, AppDryRunInfo) + assert dryrun_info.request is not None + + +def test_schedule(skypilot_jobs_scheduler, mock_app_def, skypilot_jobs_executor): + class MockHandle: + def get_cluster_name(self): + return "test_cluster_name" + + with ( + mock.patch.object(SkypilotJobsExecutor, "package") as mock_package, + mock.patch.object(SkypilotJobsExecutor, "launch") as mock_launch, + mock.patch.object(SkypilotJobsExecutor, "status") as mock_status, + ): + mock_package.return_value = None + mock_launch.return_value = (123, MockHandle()) + mock_status.return_value = None + + skypilot_jobs_executor.job_name = "test_job" + skypilot_jobs_executor.experiment_id = "test_session" + + dryrun_info = skypilot_jobs_scheduler._submit_dryrun(mock_app_def, skypilot_jobs_executor) + app_id = skypilot_jobs_scheduler.schedule(dryrun_info) + + # Note: SkypilotJobsExecutor uses 3-component app_id format (no experiment_id prefix) + assert app_id == "test_cluster_name___test_role___123" + mock_package.assert_called_once() + mock_launch.assert_called_once() + + +def test_cancel_existing(skypilot_jobs_scheduler): + with mock.patch.object(SkypilotJobsExecutor, "cancel") as mock_cancel: + skypilot_jobs_scheduler._cancel_existing("test_cluster_name___test_role___123") + mock_cancel.assert_called_once_with(app_id="test_cluster_name___test_role___123") + + +def test_describe_no_status(skypilot_jobs_scheduler): + with ( + mock.patch.object(SkypilotJobsExecutor, "status") as mock_status, + mock.patch( + "nemo_run.run.torchx_backend.schedulers.skypilot_jobs._get_job_dirs" + ) as mock_get_job_dirs, + ): + mock_status.return_value = None + mock_get_job_dirs.return_value = {} + + result = skypilot_jobs_scheduler.describe("test_cluster___test_role___123") + assert result is None + + +def test_describe_with_status(skypilot_jobs_scheduler): + from 
sky.jobs.state import ManagedJobStatus + + task_details = {"status": ManagedJobStatus.RUNNING, "job_id": 123} + + with ( + mock.patch.object(SkypilotJobsExecutor, "status") as mock_status, + mock.patch( + "nemo_run.run.torchx_backend.schedulers.skypilot_jobs._save_job_dir" + ) as mock_save, + ): + mock_status.return_value = task_details + + result = skypilot_jobs_scheduler.describe("test_cluster___test_role___123") + + assert result is not None + assert result.app_id == "test_cluster___test_role___123" + assert len(result.roles) == 1 + assert result.roles[0].name == "test_role" + mock_save.assert_called_once() + + +def test_describe_with_past_jobs(skypilot_jobs_scheduler): + past_apps = {"test_cluster___test_role___123": {"job_status": "SUCCEEDED"}} + + with ( + mock.patch.object(SkypilotJobsExecutor, "status") as mock_status, + mock.patch( + "nemo_run.run.torchx_backend.schedulers.skypilot_jobs._get_job_dirs" + ) as mock_get_job_dirs, + ): + mock_status.return_value = None + mock_get_job_dirs.return_value = past_apps + + result = skypilot_jobs_scheduler.describe("test_cluster___test_role___123") + + assert result is not None + assert result.app_id == "test_cluster___test_role___123" + # The state should be mapped from SUCCEEDED status + from torchx.specs import AppState + + assert result.state == AppState.SUCCEEDED + + +def test_save_job_dir_new_file(): + """Test _save_job_dir when the job file doesn't exist.""" + with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as f: + temp_path = f.name + os.unlink(temp_path) # Remove file to test creation + + try: + with mock.patch( + "nemo_run.run.torchx_backend.schedulers.skypilot_jobs.SKYPILOT_JOB_DIRS", temp_path + ): + _save_job_dir("test_app_id", "RUNNING") + + # Verify the file was created and contains expected data + assert os.path.exists(temp_path) + with open(temp_path, "r") as f: + data = json.load(f) + + assert "test_app_id" in data + assert data["test_app_id"]["job_status"] == "RUNNING" + 
finally: + if os.path.exists(temp_path): + os.unlink(temp_path) + + +def test_save_job_dir_existing_file(): + """Test _save_job_dir when the job file already exists with data.""" + with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as f: + temp_path = f.name + json.dump({"existing_app": {"job_status": "SUCCEEDED"}}, f) + + try: + with mock.patch( + "nemo_run.run.torchx_backend.schedulers.skypilot_jobs.SKYPILOT_JOB_DIRS", temp_path + ): + _save_job_dir("new_app_id", "PENDING") + + # Verify both old and new data exist + with open(temp_path, "r") as f: + data = json.load(f) + + assert "existing_app" in data + assert data["existing_app"]["job_status"] == "SUCCEEDED" + assert "new_app_id" in data + assert data["new_app_id"]["job_status"] == "PENDING" + finally: + if os.path.exists(temp_path): + os.unlink(temp_path) + + +def test_get_job_dirs_existing_file(): + """Test _get_job_dirs with an existing file containing data.""" + test_data = { + "app1": {"job_status": "RUNNING"}, + "app2": {"job_status": "SUCCEEDED"}, + } + with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as f: + temp_path = f.name + json.dump(test_data, f) + + try: + with mock.patch( + "nemo_run.run.torchx_backend.schedulers.skypilot_jobs.SKYPILOT_JOB_DIRS", temp_path + ): + result = _get_job_dirs() + assert result == test_data + finally: + if os.path.exists(temp_path): + os.unlink(temp_path) + + +def test_get_job_dirs_file_not_found(): + """Test _get_job_dirs when the file doesn't exist.""" + non_existent_path = "/tmp/definitely_does_not_exist_12345.json" + + with mock.patch( + "nemo_run.run.torchx_backend.schedulers.skypilot_jobs.SKYPILOT_JOB_DIRS", non_existent_path + ): + result = _get_job_dirs() + assert result == {} From f104fe6f9313b188e56f1d48f0dbed5b5486ea28 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Fri, 3 Oct 2025 15:45:40 -0700 Subject: [PATCH 16/28] Refactor tar packaging logic to work for submodule and extra repo (#347) * Refactor 
tar packaging logic for improved performance and simplicity Signed-off-by: smajumdar * Clarify tar repacking logic to avoid issues with concatenating tar files Signed-off-by: smajumdar * Remove redundant test for concatenating tar files on Linux Signed-off-by: smajumdar * spell check fix Signed-off-by: Hemil Desai --------- Signed-off-by: smajumdar Signed-off-by: Hemil Desai Co-authored-by: Hemil Desai --- nemo_run/core/packaging/git.py | 25 ++++++++------------ nemo_run/run/experiment.py | 4 ++-- test/core/packaging/test_git.py | 41 --------------------------------- 3 files changed, 11 insertions(+), 59 deletions(-) diff --git a/nemo_run/core/packaging/git.py b/nemo_run/core/packaging/git.py index ae5eb047..79bb3974 100644 --- a/nemo_run/core/packaging/git.py +++ b/nemo_run/core/packaging/git.py @@ -88,22 +88,15 @@ def _concatenate_tar_files( quoted_files = [shlex.quote(f) for f in files_to_concatenate] quoted_output_file = shlex.quote(output_file) - if os.uname().sysname == "Linux": - # Start from the first archive then append the rest, to avoid self-append issues - first_file, *rest_files = quoted_files - ctx.run(f"cp {first_file} {quoted_output_file}") - if rest_files: - ctx.run(f"tar Af {quoted_output_file} {' '.join(rest_files)}") - # Remove all input fragments - ctx.run(f"rm {' '.join(quoted_files)}") - else: - # Extract all fragments and repack once (faster than iterative extract/append) - temp_dir = f"temp_extract_{uuid.uuid4()}" - ctx.run(f"mkdir -p {temp_dir}") - for file in quoted_files: - ctx.run(f"tar xf {file} -C {temp_dir}") - ctx.run(f"tar cf {quoted_output_file} -C {temp_dir} .") - ctx.run(f"rm -r {temp_dir} {' '.join(quoted_files)}") + # Extract all fragments and repack once (faster than iterative extract/append) + # Note: Avoid using tar Af based solution as it does not properly concatenate + # tar files for additional filepaths and submodules. 
+ temp_dir = f"temp_extract_{uuid.uuid4()}" + ctx.run(f"mkdir -p {temp_dir}") + for file in quoted_files: + ctx.run(f"tar xf {file} -C {temp_dir}") + ctx.run(f"tar cf {quoted_output_file} -C {temp_dir} .") + ctx.run(f"rm -r {temp_dir} {' '.join(quoted_files)}") def package(self, path: Path, job_dir: str, name: str) -> str: output_file = os.path.join(job_dir, f"{name}.tar.gz") diff --git a/nemo_run/run/experiment.py b/nemo_run/run/experiment.py index 2e681d8c..9e6a6c88 100644 --- a/nemo_run/run/experiment.py +++ b/nemo_run/run/experiment.py @@ -55,8 +55,8 @@ from nemo_run.core.execution.lepton import LeptonExecutor from nemo_run.core.execution.local import LocalExecutor from nemo_run.core.execution.skypilot import SkypilotExecutor -from nemo_run.core.execution.slurm import SlurmExecutor from nemo_run.core.execution.skypilot_jobs import SkypilotJobsExecutor +from nemo_run.core.execution.slurm import SlurmExecutor from nemo_run.core.frontend.console.api import CONSOLE, configure_logging, deconfigure_logging from nemo_run.core.serialization.zlib_json import ZlibJSONSerializer from nemo_run.core.tunnel.client import SSHTunnel, Tunnel @@ -639,7 +639,7 @@ def run( If sequential=True, all tasks will be run one after the other. The order is based on the order in which they were added. - Parallel mode only works if all exectuors in the experiment support it. + Parallel mode only works if all executors in the experiment support it. Currently, all executors support parallel mode. 
In sequential mode, if all executor supports dependencies, then all tasks will be scheduled at once diff --git a/test/core/packaging/test_git.py b/test/core/packaging/test_git.py index 4ccfa46c..d750c83f 100644 --- a/test/core/packaging/test_git.py +++ b/test/core/packaging/test_git.py @@ -463,47 +463,6 @@ def test_concatenate_tar_files_non_linux_integration(tmp_path, monkeypatch): assert names == ["./fileA.txt", "./fileB.txt"] -def test_concatenate_tar_files_linux_emits_expected_commands(monkeypatch, tmp_path): - # Simulate Linux branch; use a dummy Context that records commands instead of executing - monkeypatch.setattr(os, "uname", lambda: SimpleNamespace(sysname="Linux")) - - class DummyContext: - def __init__(self): - self.commands: list[str] = [] - - def run(self, cmd: str, **_kwargs): - self.commands.append(cmd) - - # Support ctx.cd(...) context manager API - def cd(self, _path: Path): - class _CD: - def __enter__(self_nonlocal): - return self - - def __exit__(self_nonlocal, exc_type, exc, tb): - return False - - return _CD() - - # Fake inputs (do not need to exist since we don't execute) - tar1 = str(tmp_path / "one.tar") - tar2 = str(tmp_path / "two.tar") - tar3 = str(tmp_path / "three.tar") - out_tar = str(tmp_path / "out.tar") - - ctx = DummyContext() - packager = GitArchivePackager() - packager._concatenate_tar_files(ctx, out_tar, [tar1, tar2, tar3]) - - # Expected sequence: cp first -> tar Af rest -> rm all inputs - assert len(ctx.commands) == 3 - assert ctx.commands[0] == f"cp {shlex.quote(tar1)} {shlex.quote(out_tar)}" - assert ( - ctx.commands[1] == f"tar Af {shlex.quote(out_tar)} {shlex.quote(tar2)} {shlex.quote(tar3)}" - ) - assert ctx.commands[2] == f"rm {shlex.quote(tar1)} {shlex.quote(tar2)} {shlex.quote(tar3)}" - - @patch("nemo_run.core.packaging.git.Context", MockContext) def test_include_pattern_length_mismatch_raises(packager, temp_repo): # Mismatch between include_pattern and include_pattern_relative_path should raise From 
0c6880919fab172ce9beb503592799cf0e28b043 Mon Sep 17 00:00:00 2001 From: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> Date: Tue, 7 Oct 2025 14:45:52 -0500 Subject: [PATCH 17/28] Documentation Restructurting (#350) * Fixing documentation layout Signed-off-by: Andrew Schilling * documentation.md Signed-off-by: Andrew Schilling * Removing live-server Signed-off-by: Andrew Schilling * Correctin .vscode Signed-off-by: Andrew Schilling --------- Signed-off-by: Andrew Schilling --- docs/{source => }/conf.py | 24 +++++-- docs/documentation.md | 47 +++++++++++++ docs/{source => }/faqs.md | 0 docs/{source => }/guides/cli.md | 21 +++--- docs/{source => }/guides/configuration.md | 5 +- docs/{source => }/guides/execution.md | 20 ++++-- docs/{source => }/guides/index.md | 12 ++-- docs/{source => }/guides/management.md | 6 +- docs/{source => }/guides/ray.md | 0 docs/{source => }/guides/why-use-nemo-run.md | 0 docs/{source/index.rst => index.md} | 74 +++++++++++++------- docs/{source => }/project.json | 0 docs/{source => }/versions1.json | 2 +- pyproject.toml | 2 + uv.lock | 60 +++++++++++++--- 15 files changed, 206 insertions(+), 67 deletions(-) rename docs/{source => }/conf.py (79%) create mode 100644 docs/documentation.md rename docs/{source => }/faqs.md (100%) rename docs/{source => }/guides/cli.md (98%) rename docs/{source => }/guides/configuration.md (98%) rename docs/{source => }/guides/execution.md (92%) rename docs/{source => }/guides/index.md (88%) rename docs/{source => }/guides/management.md (95%) rename docs/{source => }/guides/ray.md (100%) rename docs/{source => }/guides/why-use-nemo-run.md (100%) rename docs/{source/index.rst => index.md} (55%) rename docs/{source => }/project.json (100%) rename docs/{source => }/versions1.json (56%) diff --git a/docs/source/conf.py b/docs/conf.py similarity index 79% rename from docs/source/conf.py rename to docs/conf.py index 9609288f..f222af11 100644 --- a/docs/source/conf.py +++ b/docs/conf.py @@ -33,10 
+33,26 @@ "sphinx.ext.githubpages", "sphinx.ext.napoleon", "sphinxcontrib.mermaid", + "sphinx_copybutton", + "sphinx_new_tab_link", ] templates_path = ["_templates"] -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "documentation.md"] + +# -- Options for MyST Parser (Markdown) --------------------------------------- +# MyST Parser settings +myst_enable_extensions = [ + "dollarmath", # Enables dollar math for inline math + "amsmath", # Enables LaTeX math for display mode + "colon_fence", # Enables code blocks using ::: delimiters instead of ``` + "deflist", # Supports definition lists with term: definition format + "fieldlist", # Enables field lists for metadata like :author: Name + "tasklist", # Adds support for GitHub-style task lists with [ ] and [x] +] +myst_heading_anchors = 5 # Generates anchor links for headings up to level 5 +myst_fence_as_directive = ["mermaid"] + python_maximum_signature_line_length = 88 # Autoapi settings @@ -44,7 +60,7 @@ autoapi_keep_files = False autoapi_add_toctree_entry = False autoapi_type = "python" -autoapi_dirs = ["../../nemo_run"] +autoapi_dirs = ["../nemo_run"] autoapi_file_pattern = "*.py" autoapi_root = "api" autoapi_options = [ @@ -58,10 +74,6 @@ # Autodoc settings autodoc_typehints = "signature" -# MyST settings -myst_heading_anchors = 3 -myst_fence_as_directive = ["mermaid"] - # Napoleon settings napoleon_google_docstring = True napoleon_numpy_docstring = True diff --git a/docs/documentation.md b/docs/documentation.md new file mode 100644 index 00000000..99399ea9 --- /dev/null +++ b/docs/documentation.md @@ -0,0 +1,47 @@ +# Documentation Development + +- [Documentation Development](#documentation-development) + - [Build the Documentation](#build-the-documentation) + - [Live Building](#live-building) + + +## Build the Documentation + +The following sections describe how to set up and build the NeMo Run documentation.
+ +Switch to the documentation source folder and generate HTML output. + +```sh +cd docs/ +uv run --group docs sphinx-build . _build/html +``` + +* The resulting HTML files are generated in a `_build/html` folder that is created under the project `docs/` folder. +* The generated python API docs are placed in `apidocs` under the `docs/` folder. + +## Checking for Broken Links + +To check for broken http links in the docs, run this command: + +```sh +cd docs/ +uv run --group docs sphinx-build --builder linkcheck . _build/linkcheck +``` + +It will output a JSON file at `_build/linkcheck/output.json` with links it found while building the +docs. Records will have a status of `broken` if the link is not reachable. The `docs/conf.py` file is +configured to ignore github links because the CI test will often experience rate limit errors. +Comment out the `linkcheck_ignore` variable there to check all the links. + +## Live Building + +When writing documentation, it can be helpful to serve the documentation and have it update live while you edit. + +To do so, run: + +```sh +cd docs/ +uv run --group docs sphinx-autobuild . _build/html --port 12345 --host 0.0.0.0 +``` + +Open a web browser and go to `http://${HOST_WHERE_SPHINX_COMMAND_RUN}:12345` to view the output. diff --git a/docs/source/faqs.md b/docs/faqs.md similarity index 100% rename from docs/source/faqs.md rename to docs/faqs.md diff --git a/docs/source/guides/cli.md b/docs/guides/cli.md similarity index 98% rename from docs/source/guides/cli.md rename to docs/guides/cli.md index 052a61ca..226b7d5e 100644 --- a/docs/source/guides/cli.md +++ b/docs/guides/cli.md @@ -2,7 +2,7 @@ NeMo Run CLI is a Python-based command-line tool designed to efficiently configure and execute machine learning experiments. It provides a type-safe, Python-centric alternative to argparse and Hydra, streamlining workflows from prototyping to scaling across diverse environments. -## 1. 
Introduction +## Introduction NeMo Run CLI simplifies experiment management by leveraging Python's capabilities: @@ -65,7 +65,7 @@ def train(): - **Typer**: General-purpose CLIs with good documentation that don't require nested configuration - **argparse**: Simple scripts with minimal configuration needs and standard library requirements -## 2. Core Concepts +## Core Concepts - **Entrypoints**: Python functions decorated with `@run.cli.entrypoint` serving as primary CLI commands. - **Factories**: Functions decorated with `@run.cli.factory` that configure complex objects (e.g., models, optimizers). @@ -73,7 +73,7 @@ def train(): - **Experiments**: Groups of tasks executed sequentially or concurrently. - **RunContext**: Manages execution settings, including executor configurations. -## 3. Getting Started +## Getting Started ### Example 1: Basic Entrypoint @@ -116,7 +116,7 @@ Output: Unknown argument 'epocks'. Did you mean 'epochs'? ``` -## 4. Advanced Configuration +## Advanced Configuration ### Nested Configurations with Dataclasses @@ -213,25 +213,25 @@ File contents: │ 2 target = "main.train_model" │ 3 batch_size = 32 │ 4 epochs = 10 -│ 5 +│ 5 │ 6 [model] │ 7 target = "main.Model" │ 8 activation = "relu" │ 9 hidden_size = 256 │ 10 num_layers = 5 -│ 11 +│ 11 │ 12 [optimizer] │ 13 target = "main.Optimizer" │ 14 betas = [ 0.9, 0.999,] │ 15 learning_rate = 0.001 │ 16 weight_decay = 1e-5 -│ 17 +│ 17 ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ Export complete. Skipping execution. ``` -## 5. Executors +## Executors Executors determine where your code runs, such as local environments, Docker containers, or Slurm clusters. 
@@ -283,7 +283,7 @@ def slurm_cluster() -> run.Executor: job_dir=BASE_DIR, container_image="nvcr.io/nvidia/nemo:dev", container_mounts=[ - f"/home/{USER}:/home/{USER}", + f"/home/{USER}:/home/{USER}", "/lustre:/lustre", ], time="4:00:00", @@ -298,7 +298,7 @@ Execute lazily: python script.py --lazy model=alexnet epochs=5 run.executor=slurm_cluster run.executor.nodes=2 ``` -## 6. Advanced CLI Features +## Advanced CLI Features ### Dry Runs and Help Messages @@ -393,4 +393,3 @@ The help output clearly shows: 5. Registered factory functions for each complex argument type This makes it easy for users to discover what factory functions they can use to configure complex arguments like `model` and `optimizer`, along with information about where these factories are defined (module name and line number). - diff --git a/docs/source/guides/configuration.md b/docs/guides/configuration.md similarity index 98% rename from docs/source/guides/configuration.md rename to docs/guides/configuration.md index 7d7b91b0..134f5708 100644 --- a/docs/source/guides/configuration.md +++ b/docs/guides/configuration.md @@ -113,7 +113,10 @@ In our context, this is equivalent to: _target_: nemo.collections.llm.gpt.model.llama.Llama3Config8B seq_length: 16384 ``` -> Note: we've used the [Hydra instantiation](https://hydra.cc/docs/advanced/instantiate_objects/overview/) syntax here. + +```{note} +We've used the [Hydra instantiation](https://hydra.cc/docs/advanced/instantiate_objects/overview/) syntax here. +``` Python operations are performed on the config rather than directly on the class. For example: diff --git a/docs/source/guides/execution.md b/docs/guides/execution.md similarity index 92% rename from docs/source/guides/execution.md rename to docs/guides/execution.md index cd7c49d6..7f34893d 100644 --- a/docs/source/guides/execution.md +++ b/docs/guides/execution.md @@ -14,9 +14,13 @@ A tuple of task and executor form an execution unit. 
A key goal of NeMo-Run is t Once an execution unit is created, the next step is to run it. The `run.run` function executes a single task, whereas `run.Experiment` offers more fine-grained control to define complex experiments. `run.run` wraps `run.Experiment` with a single task. `run.Experiment` is an API to launch and manage multiple tasks all using pure Python. The `run.Experiment` takes care of storing the run metadata, launching it on the specified cluster, and syncing the logs, etc. Additionally, `run.Experiment` also provides management tools to easily inspect and reproduce past experiments. The `run.Experiment` is inspired from [xmanager](https://github.com/google-deepmind/xmanager/tree/main) and uses [TorchX](https://pytorch.org/torchx/latest/) under the hood to handle execution. -> **_NOTE:_** NeMo-Run assumes familiarity with Docker and uses a docker image as the environment for remote execution. This means you must provide a Docker image that includes all necessary dependencies and configurations when using a remote executor. +```{note} +NeMo-Run assumes familiarity with Docker and uses a docker image as the environment for remote execution. This means you must provide a Docker image that includes all necessary dependencies and configurations when using a remote executor. +``` -> **_NOTE:_** All the experiment metadata is stored under `NEMORUN_HOME` env var on the machine where you launch the experiments. By default, the value for `NEMORUN_HOME` value is `~/.run`. Be sure to change this according to your needs. +```{note} +All the experiment metadata is stored under `NEMORUN_HOME` env var on the machine where you launch the experiments. By default, the value for `NEMORUN_HOME` value is `~/.run`. Be sure to change this according to your needs. +``` ## Executors Executors are dataclasses that configure your remote executor and set up the packaging of your code. 
All supported executors inherit from the base class `run.Executor`, but have configuration parameters specific to their execution environment. There is an initial cost to understanding the specifics of your executor and setting it up, but this effort is easily amortized over time. @@ -29,7 +33,9 @@ We support the following `launchers`: - `torchrun` or `run.Torchrun`: This will launch the task using `torchrun`. See the `Torchrun` class for configuration options. You can use it using `executor.launcher = "torchrun"` or `executor.launcher = Torchrun(...)`. - `ft` or `run.core.execution.FaultTolerance`: This will launch the task using NVIDIA's fault tolerant launcher. See the `FaultTolerance` class for configuration options. You can use it using `executor.launcher = "ft"` or `executor.launcher = FaultTolerance(...)`. -> **_NOTE:_** Launcher may not work very well with `run.Script`. Please report any issues at https://github.com/NVIDIA-NeMo/Run/issues. +```{attention} +Launcher may not work very well with `run.Script`. Please report any issues at [https://github.com/NVIDIA-NeMo/Run/issues](https://github.com/NVIDIA-NeMo/Run/issues). +``` ### Packagers @@ -65,7 +71,9 @@ Your working directory at the time of execution will look like: ``` If you're executing a Python function, this working directory will automatically be included in your Python path. -> **_NOTE:_** git archive doesn't package uncommitted changes. In the future, we may add support for including uncommitted changes while honoring `.gitignore`. +```{note} +Git archive doesn't package uncommitted changes. In the future, we may add support for including uncommitted changes while honoring `.gitignore`. +``` `run.PatternPackager` is a packager that uses a pattern to package your code. It is useful for packaging code that is not under version control. 
For example, if you have a directory structure like this: ``` @@ -228,7 +236,9 @@ As demonstrated in the examples, defining executors in Python offers great flexi The `DGXCloudExecutor` integrates with a DGX Cloud cluster's Run:ai API to launch distributed jobs. It uses REST API calls to authenticate, identify the target project and cluster, and submit the job specification. -> **_WARNING:_** Currently, the `DGXCloudExecutor` is only supported when launching experiments *from* a pod running on the DGX Cloud cluster itself. Furthermore, this launching pod must have access to a Persistent Volume Claim (PVC) where the experiment/job directories will be created, and this same PVC must also be configured to be mounted by the job being launched. +```{warning} +Currently, the `DGXCloudExecutor` is only supported when launching experiments *from* a pod running on the DGX Cloud cluster itself. Furthermore, this launching pod must have access to a Persistent Volume Claim (PVC) where the experiment/job directories will be created, and this same PVC must also be configured to be mounted by the job being launched. +``` Here's an example configuration: diff --git a/docs/source/guides/index.md b/docs/guides/index.md similarity index 88% rename from docs/source/guides/index.md rename to docs/guides/index.md index 912d7ce8..f8d5fcf1 100644 --- a/docs/source/guides/index.md +++ b/docs/guides/index.md @@ -1,7 +1,7 @@ -Guides -================= +# Guides -```{toctree} + +:::{toctree} :maxdepth: 2 :hidden: @@ -11,7 +11,7 @@ execution management ray cli -``` +::: Welcome to the NeMo-Run guides! This section provides comprehensive documentation on how to use NeMo-Run effectively for your machine learning experiments. @@ -36,7 +36,7 @@ For more advanced usage: NeMo-Run is built around three core responsibilities: 1. **Configuration** - Define your ML experiments using a flexible, Pythonic configuration system. -2. 
**Execution** - Run your experiments seamlessly across local machines, Slurm clusters, cloud providers, and more. -3. **Management** - Track, reproduce, and organize your experiments with built-in experiment management. +1. **Execution** - Run your experiments seamlessly across local machines, Slurm clusters, cloud providers, and more. +1. **Management** - Track, reproduce, and organize your experiments with built-in experiment management. Each guide dives deep into these concepts with practical examples and best practices. Choose a guide above to get started! diff --git a/docs/source/guides/management.md b/docs/guides/management.md similarity index 95% rename from docs/source/guides/management.md rename to docs/guides/management.md index 30e7af6b..66a4f25e 100644 --- a/docs/source/guides/management.md +++ b/docs/guides/management.md @@ -12,7 +12,9 @@ exp = Experiment("My Experiment") When executed, it will automatically generate a unique experiment ID for you, which represents one unique run of the experiment. -> [!NOTE] > `Experiment` is a context manager and `Experiment.add` and `Experiment.run` methods can currently only be used after entering the context manager. +```{note} +`Experiment` is a context manager and `Experiment.add` and `Experiment.run` methods can currently only be used after entering the context manager. +``` ## Add Tasks @@ -73,7 +75,7 @@ You can check the status of an experiment using the `status` method: exp.status() ``` -This method will display information about the status of each task in the experiment. The following is a sample output from the status of experiment in [hello_scripts.py](../../../examples/hello-world/hello_scripts.py): +This method will display information about the status of each task in the experiment. 
The following is a sample output from the status of experiment in [hello_scripts.py](../../examples/hello-world/hello_scripts.py): ```bash Experiment Status for experiment_with_scripts_1730761155 diff --git a/docs/source/guides/ray.md b/docs/guides/ray.md similarity index 100% rename from docs/source/guides/ray.md rename to docs/guides/ray.md diff --git a/docs/source/guides/why-use-nemo-run.md b/docs/guides/why-use-nemo-run.md similarity index 100% rename from docs/source/guides/why-use-nemo-run.md rename to docs/guides/why-use-nemo-run.md diff --git a/docs/source/index.rst b/docs/index.md similarity index 55% rename from docs/source/index.rst rename to docs/index.md index 4329df4f..db5ca539 100644 --- a/docs/source/index.rst +++ b/docs/index.md @@ -1,56 +1,52 @@ -.. NeMo-Run documentation master file, created by - sphinx-quickstart on Thu Jul 25 17:57:46 2024. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. +# NeMo-Run Documentation -NeMo-Run Documentation -====================== NeMo-Run is a powerful tool designed to streamline the configuration, execution and management of Machine Learning experiments across various computing environments. NeMo Run has three core responsibilities: -1. :doc:`Configuration ` -2. :doc:`Execution ` -3. :doc:`Management ` +1. [Configuration](guides/configuration.md) +1. [Execution](guides/execution.md) +1. [Management](guides/management.md) Please click into each link to learn more. This is also the typical order Nemo Run users will follow to setup and launch experiments. -.. 
toctree:: - :maxdepth: 1 +## Install the Project - guides/index - API Reference - faqs - -Install the Project -------------------- To install the project, use the following command: -``pip install git+https://github.com/NVIDIA-NeMo/Run.git`` +```bash +pip install git+https://github.com/NVIDIA-NeMo/Run.git +``` To install Skypilot with optional features, use one of the following commands: - To install Skypilot with Kubernetes support: - ``pip install git+https://github.com/NVIDIA-NeMo/Run.git[skypilot]`` + ```bash + pip install git+https://github.com/NVIDIA-NeMo/Run.git[skypilot] + ``` - To install Skypilot with support for all cloud platforms: - ``pip install git+https://github.com/NVIDIA-NeMo/Run.git[skypilot-all]`` + ```bash + pip install git+https://github.com/NVIDIA-NeMo/Run.git[skypilot-all] + ``` You can also manually install Skypilot from https://skypilot.readthedocs.io/en/latest/getting-started/installation.html If using DGX Cloud Lepton, use the following command to install the Lepton CLI: -``pip install leptonai`` +```bash +pip install leptonai +``` To authenticate with the DGX Cloud Lepton cluster, navigate to the **Settings > Tokens** page in the DGX Cloud Lepton UI and copy the ``lep login`` command shown on the page and run it in the terminal. -Make sure you have `pip` installed and configured properly. +Make sure you have ``pip`` installed and configured properly. + +## Tutorials -Tutorials ---------- The ``hello_world`` tutorial series provides a comprehensive introduction to NeMo-Run, demonstrating its capabilities through a simple example. The tutorial covers: @@ -61,6 +57,30 @@ The ``hello_world`` tutorial series provides a comprehensive introduction to NeM You can find the tutorial series below: -1. `Part 1: Hello World `_ -2. `Part 2: Hello Experiments `_ -3. `Part 3: Hello Scripts `_ +1. [Part 1: Hello World](https://github.com/NVIDIA-NeMo/Run/blob/main/examples/hello-world/hello_world.ipynb) +1.
[Part 2: Hello Experiments](https://github.com/NVIDIA-NeMo/Run/blob/main/examples/hello-world/hello_experiments.ipynb) +1. [Part 3: Hello Scripts](https://github.com/NVIDIA-NeMo/Run/blob/main/examples/hello-world/hello_scripts.py) + + +:::{toctree} +:hidden: +Home +::: + +:::{toctree} +:caption: Get Started +:maxdepth: 2 +:hidden: + +guides/index + +::: + +:::{toctree} +:hidden: +:caption: Reference +:maxdepth: 2 + +faqs +API Reference +::: diff --git a/docs/source/project.json b/docs/project.json similarity index 100% rename from docs/source/project.json rename to docs/project.json diff --git a/docs/source/versions1.json b/docs/versions1.json similarity index 56% rename from docs/source/versions1.json rename to docs/versions1.json index 604af762..c987bebc 100644 --- a/docs/source/versions1.json +++ b/docs/versions1.json @@ -2,6 +2,6 @@ { "preferred": true, "version": "0.1.0", - "url": "../0.1.0" + "url": "http://docs.nvidia.com/nemo/run/0.1.0" } ] diff --git a/pyproject.toml b/pyproject.toml index 05887f0f..34e636b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,8 @@ docs = [ "sphinx-autoapi>=3.0.0", "nvidia-sphinx-theme", "sphinxcontrib-mermaid", + "sphinx-copybutton>=0.5.2", + "sphinx-new-tab-link>=0.8.0", ] [build-system] diff --git a/uv.lock b/uv.lock index 363caffc..06869599 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13' and sys_platform == 'darwin' and extra != 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all' and extra != 'group-8-nemo-run-docs'", @@ -250,10 +250,10 @@ name = "anyio" version = "4.8.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11' or (extra == 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or 
(extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs')" }, { name = "idna" }, { name = "sniffio" }, - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a3/73/199a98fc2dae33535d6b8e8e6ec01f8c1d76c9adb096c6b7d64823038cde/anyio-4.8.0.tar.gz", hash = "sha256:1d9fe889df5212298c0c0723fa20479d1b94883a2df44bd3897aa91083316f7a", size = 181126, upload-time = "2025-01-05T13:13:11.095Z" } wheels = [ @@ -2338,7 +2338,7 @@ name = "cryptography" version = "42.0.8" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, + { name = "cffi", marker = "platform_python_implementation != 'PyPy' or (extra == 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/93/a7/1498799a2ea06148463a9a2c10ab2f6a921a74fb19e231b27dc412a748e2/cryptography-42.0.8.tar.gz", hash = "sha256:8d09d05439ce7baa8e9e95b07ec5b6c886f548deb7e0f69ef25f64b3bce842f2", size = 671250, upload-time = "2024-06-04T19:55:08.609Z" } wheels = [ @@ -2474,7 +2474,7 @@ name = "docker" version = "7.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "pywin32", marker = "sys_platform == 'win32' or (extra == 
'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs')" }, { name = "requests" }, { name = "urllib3" }, ] @@ -4449,6 +4449,8 @@ docs = [ { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs')" }, { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs')" }, { name = "sphinx-autoapi" }, + { name = "sphinx-copybutton" }, + { name = "sphinx-new-tab-link" }, { name = "sphinxcontrib-mermaid" }, ] lint = [ @@ -4497,6 +4499,8 @@ docs = [ { name = "nvidia-sphinx-theme" }, { name = "sphinx", specifier = ">=7" }, { name = "sphinx-autoapi", specifier = ">=3.0.0" }, + { name = "sphinx-copybutton", specifier = ">=0.5.2" }, + { name = "sphinx-new-tab-link", specifier = ">=0.8.0" }, { name = "sphinxcontrib-mermaid" }, ] lint = [{ name = "ruff", specifier = ">=0.4.4" }] @@ -6100,7 +6104,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "attrs" }, { name = "rpds-py" }, - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-8-nemo-run-skypilot' and extra == 
'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/2f/db/98b5c277be99dd18bfd91dd04e1b759cad18d1a338188c936e92f921c7e2/referencing-0.36.2.tar.gz", hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa", size = 74744, upload-time = "2025-01-25T08:48:16.138Z" } wheels = [ @@ -6168,7 +6172,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "markdown-it-py" }, { name = "pygments" }, - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ab/3a/0316b28d0761c6734d6bc14e770d85506c986c85ffb239e688eeaab2c2bc/rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098", size = 223149, upload-time = "2024-11-01T16:43:57.873Z" } wheels = [ @@ -6764,6 +6768,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/58/17/0eda9dc80fcaf257222b506844207e71b5d59567c41bbdcca2a72da119b9/sphinx_autoapi-3.6.0-py3-none-any.whl", hash = "sha256:f3b66714493cab140b0e896d33ce7137654a16ac1edb6563edcbd47bf975f711", size = 35281, upload-time = "2025-02-18T01:50:52.789Z" }, ] +[[package]] +name = "sphinx-copybutton" +version = "0.5.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'group-8-nemo-run-docs') or (extra == 
'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs')" }, + { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/2b/a964715e7f5295f77509e59309959f4125122d648f86b4fe7d70ca1d882c/sphinx-copybutton-0.5.2.tar.gz", hash = "sha256:4cf17c82fb9646d1bc9ca92ac280813a3b605d8c421225fd9913154103ee1fbd", size = 23039, upload-time = "2023-04-14T08:10:22.998Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/48/1ea60e74949eecb12cdd6ac43987f9fd331156388dcc2319b45e2ebb81bf/sphinx_copybutton-0.5.2-py3-none-any.whl", hash = "sha256:fb543fd386d917746c9a2c50360c7905b605726b9355cd26e9974857afeae06e", size = 13343, upload-time = "2023-04-14T08:10:20.844Z" }, +] + +[[package]] +name = "sphinx-new-tab-link" +version = "0.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs')" }, + { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 
'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs')" }, + { name = "sphinxcontrib-extdevhelper-kasane" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/de/f62360114d605d1c7c5fba060c76a3521655d388e3fa03747d31b9452a69/sphinx_new_tab_link-0.8.0.tar.gz", hash = "sha256:6c757d99f559224a04142c3971c8baa6ac90aca905f15b129d57eeca0ece9582", size = 6637, upload-time = "2025-04-01T14:01:18.88Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/c7/b35261707bc72ce2dff1c4b66a391542be4ba80faded83b6e8e00fb14af9/sphinx_new_tab_link-0.8.0-py3-none-any.whl", hash = "sha256:c74b873d6c8a1ec089015dc414a75f6908e87f66ce4ab8d9f2c7268f13afc593", size = 5622, upload-time = "2025-04-01T14:01:17.091Z" }, +] + [[package]] name = "sphinxcontrib-applehelp" version = "2.0.0" @@ -6782,6 +6813,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530, upload-time = "2024-07-29T01:09:21.945Z" }, ] +[[package]] +name = "sphinxcontrib-extdevhelper-kasane" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs')" }, + { name = "sphinx", version = "8.2.3", source = { registry = 
"https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d9/74/d5b2650ca859664400603d0db08b6bfce507a9606284137ab0d7bcec4e02/sphinxcontrib-extdevhelper-kasane-0.2.0.tar.gz", hash = "sha256:4dc7b00327f33c7b421c27122b40278eeaca43f24601b572cee5616d31b206a9", size = 4496, upload-time = "2024-03-16T07:23:03.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/91/41a07c91e2adee3463b443cf924778f5a2d92a1166f3f7208959d8b1fabf/sphinxcontrib_extdevhelper_kasane-0.2.0-py3-none-any.whl", hash = "sha256:20f94e3b209cddec24596234458ea3887e7a7ad45b54a4d0a5bf169ff45a38f1", size = 3918, upload-time = "2024-03-16T07:23:01.026Z" }, +] + [[package]] name = "sphinxcontrib-htmlhelp" version = "2.1.0" @@ -7311,7 +7355,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, { name = "h11" }, - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-8-nemo-run-skypilot' and extra == 'extra-8-nemo-run-skypilot-all') or (extra == 'extra-8-nemo-run-skypilot' and extra == 'group-8-nemo-run-docs') or (extra == 'extra-8-nemo-run-skypilot-all' and extra == 'group-8-nemo-run-docs')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/4b/4d/938bd85e5bf2edeec766267a5015ad969730bb91e31b44021dfe8b22df6c/uvicorn-0.34.0.tar.gz", hash = "sha256:404051050cd7e905de2c9a7e61790943440b3416f49cb409f965d9dcd0fa73e9", size = 76568, upload-time = "2024-12-15T13:33:30.42Z" } wheels = [ From a70fa2022aa153aecdba8dd82ebebab744763f68 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 8 Oct 2025 17:07:29 +0200 Subject: [PATCH 18/28] remove custom dir (#351) --- .github/workflows/build-docs.yml | 2 -- docs/guides/execution.md | 18 +++++++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index 2f20739b..a8eb4b59 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -22,5 +22,3 @@ on: jobs: build-docs: uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0 - with: - docs-directory: docs/source diff --git a/docs/guides/execution.md b/docs/guides/execution.md index 7f34893d..6f2c0063 100644 --- a/docs/guides/execution.md +++ b/docs/guides/execution.md @@ -3,6 +3,7 @@ After configuring NeMo-Run, the next step is to execute it. Nemo-Run decouples configuration from execution, allowing you to configure a function or task once and then execute it across multiple environments. With Nemo-Run, you can choose to execute a single task or multiple tasks simultaneously on different remote clusters, managing them under an experiment. This brings us to the core building blocks for execution: `run.Executor` and `run.Experiment`. Each execution of a single configured task requires an executor. Nemo-Run provides `run.Executor`, which are APIs to configure your remote executor and set up the packaging of your code. Currently we support: + - `run.LocalExecutor` - `run.DockerExecutor` - `run.SlurmExecutor` with an optional `SSHTunnel` for executing on Slurm clusters from your local machine @@ -12,7 +13,7 @@ Each execution of a single configured task requires an executor. Nemo-Run provid A tuple of task and executor form an execution unit. A key goal of NeMo-Run is to allow you to mix and match tasks and executors to arbitrarily define execution units. Once an execution unit is created, the next step is to run it. 
The `run.run` function executes a single task, whereas `run.Experiment` offers more fine-grained control to define complex experiments. `run.run` wraps `run.Experiment` with a single task. `run.Experiment` is an API to launch and manage multiple tasks all using pure Python. -The `run.Experiment` takes care of storing the run metadata, launching it on the specified cluster, and syncing the logs, etc. Additionally, `run.Experiment` also provides management tools to easily inspect and reproduce past experiments. The `run.Experiment` is inspired from [xmanager](https://github.com/google-deepmind/xmanager/tree/main) and uses [TorchX](https://pytorch.org/torchx/latest/) under the hood to handle execution. +The `run.Experiment` takes care of storing the run metadata, launching it on the specified cluster, and syncing the logs, etc. Additionally, `run.Experiment` also provides management tools to easily inspect and reproduce past experiments. The `run.Experiment` is inspired from [xmanager](https://github.com/google-deepmind/xmanager/tree/main) and uses [TorchX](https://meta-pytorch.org/torchx/latest/) under the hood to handle execution. ```{note} NeMo-Run assumes familiarity with Docker and uses a docker image as the environment for remote execution. This means you must provide a Docker image that includes all necessary dependencies and configurations when using a remote executor. @@ -23,12 +24,15 @@ All the experiment metadata is stored under `NEMORUN_HOME` env var on the machin ``` ## Executors + Executors are dataclasses that configure your remote executor and set up the packaging of your code. All supported executors inherit from the base class `run.Executor`, but have configuration parameters specific to their execution environment. There is an initial cost to understanding the specifics of your executor and setting it up, but this effort is easily amortized over time. Each `run.Executor` has the two attributes: `packager` and `launcher`. 
The `packager` specifies how to package the code for execution, while the `launcher` determines which tool to use for launching the task. ### Launchers + We support the following `launchers`: + - `default` or `None`: This will directly launch your task without using any special launchers. Set `executor.launcher = None` (which is the default value) if you don't want to use a specific launcher. - `torchrun` or `run.Torchrun`: This will launch the task using `torchrun`. See the `Torchrun` class for configuration options. You can use it using `executor.launcher = "torchrun"` or `executor.launcher = Torchrun(...)`. - `ft` or `run.core.execution.FaultTolerance`: This will launch the task using NVIDIA's fault tolerant launcher. See the `FaultTolerance` class for configuration options. You can use it using `executor.launcher = "ft"` or `executor.launcher = FaultTolerance(...)`. @@ -54,21 +58,26 @@ The packager support matrix is described below: `run.GitArchivePackager` uses `git archive` to package your code. Refer to the API reference for `run.GitArchivePackager` to see the exact mechanics of packaging using `git archive`. At a high level, it works in the following way: + 1. base_path = `git rev-parse --show-toplevel`. 2. Optionally define a subpath as `base_path/GitArchivePackager.subpath` by setting `subpath` attribute on `GitArchivePackager`. 3. `cd base_path && git archive --format=tar.gz --output={output_file} {GitArchivePackager.subpath}:{subpath}` This extracted tar file becomes the working directory for your job. As an example, given the following directory structure with `subpath="src"`: + ``` - docs - src - your_library - tests ``` + Your working directory at the time of execution will look like: + ``` - your_library ``` + If you're executing a Python function, this working directory will automatically be included in your Python path. ```{note} @@ -76,6 +85,7 @@ Git archive doesn't package uncommitted changes. 
In the future, we may add suppo ``` `run.PatternPackager` is a packager that uses a pattern to package your code. It is useful for packaging code that is not under version control. For example, if you have a directory structure like this: + ``` - docs - src @@ -94,6 +104,7 @@ cd {relative_path} && find {relative_include_pattern} -type f Each sub-packager in the `sub_packagers` dictionary is assigned a key, which becomes the directory name under which its contents are placed in the final archive. If `extract_at_root` is set to `True`, all contents are placed directly in the root of the archive, potentially overwriting files if names conflict. Example: + ```python import nemo_run as run import os @@ -108,9 +119,11 @@ hybrid_packager = run.HybridPackager( # Usage with an executor: # executor.packager = hybrid_packager ``` + This would create an archive where the contents of `src` are under a `code/` directory and matched `configs/*.yaml` files are under a `configs/` directory. ### Defining Executors + Next, We'll describe details on setting up each of the executors below. #### LocalExecutor @@ -145,6 +158,7 @@ run.DockerExecutor( The SlurmExecutor enables launching the configured task on a Slurm Cluster with Pyxis. Additionally, you can configure a `run.SSHTunnel`, which enables you to execute tasks on the Slurm cluster from your local machine while NeMo-Run manages the SSH connection for you. This setup supports use cases such as launching the same task on multiple Slurm clusters. 
Below is an example of configuring a Slurm Executor + ```python def your_slurm_executor(nodes: int = 1, container_image: str = DEFAULT_IMAGE): # SSH Tunnel @@ -205,9 +219,11 @@ The `dependency_type` parameter specifies the type of dependency relationship: This functionality enables you to create complex workflows with proper orchestration between different tasks, such as starting a training job only after data preparation is complete, or running an evaluation only after training finishes successfully. #### SkypilotExecutor + This executor is used to configure [Skypilot](https://skypilot.readthedocs.io/en/latest/docs/index.html). Make sure Skypilot is installed using `pip install "nemo_run[skypilot]"` and atleast one cloud is configured using `sky check`. Here's an example of the `SkypilotExecutor` for Kubernetes: + ```python def your_skypilot_executor(nodes: int, devices: int, container_image: str): return SkypilotExecutor( From b83593982d93c9faee05c45cf39c0896a018ad7e Mon Sep 17 00:00:00 2001 From: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> Date: Wed, 8 Oct 2025 10:24:31 -0500 Subject: [PATCH 19/28] Bumping to 0.5.0 (#352) Signed-off-by: Andrew Schilling --- docs/conf.py | 2 +- docs/project.json | 2 +- docs/versions1.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index f222af11..4261aaf5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,7 +14,7 @@ project = "NeMo-Run" copyright = "2025, NVIDIA" author = "NVIDIA" -release = "0.1.0" +release = "0.5.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/project.json b/docs/project.json index b14a96fb..6c78e9bb 100644 --- a/docs/project.json +++ b/docs/project.json @@ -1 +1 @@ -{"name": "NeMo-Run", "version": "0.1.0"} +{"name": "NeMo-Run", "version": "0.5.0"} diff --git a/docs/versions1.json 
b/docs/versions1.json index c987bebc..643e185c 100644 --- a/docs/versions1.json +++ b/docs/versions1.json @@ -1,7 +1,7 @@ [ { "preferred": true, - "version": "0.1.0", - "url": "http://docs.nvidia.com/nemo/run/0.1.0" + "version": "0.5.0", + "url": "http://docs.nvidia.com/nemo/run/0.5.0" } ] From b63919f2383f367a3d772e1a0d7adb203f8797c6 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Wed, 8 Oct 2025 21:53:11 -0700 Subject: [PATCH 20/28] Update release notes header in changelog build (#355) --- .github/workflows/changelog-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/changelog-build.yml b/.github/workflows/changelog-build.yml index 26bd3c3f..3607a7da 100644 --- a/.github/workflows/changelog-build.yml +++ b/.github/workflows/changelog-build.yml @@ -49,7 +49,7 @@ jobs: RELEASE_VERSION=${RELEASE_BRANCH#r} CHANGELOG=$(echo "$CHANGELOG" | sed '/^[[:blank:]]*#/s/#/###/') - RELEASE_NOTES="## NVIDIA Neural Modules $RELEASE_VERSION + RELEASE_NOTES="## NVIDIA Nemo Run $RELEASE_VERSION ### Detailed Changelogs: From 7d6a5598bd35e76c3a84955e04c909e10f330ee9 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Wed, 8 Oct 2025 22:00:07 -0700 Subject: [PATCH 21/28] add changelog-config (#356) --- .../workflows/config/changelog-config.json | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 .github/workflows/config/changelog-config.json diff --git a/.github/workflows/config/changelog-config.json b/.github/workflows/config/changelog-config.json new file mode 100644 index 00000000..a845104f --- /dev/null +++ b/.github/workflows/config/changelog-config.json @@ -0,0 +1,118 @@ +{ + "categories": [ + { + "title": "## Executors\n\n
Changelog", + "labels": ["executor", "local", "slurm", "dgxcloud", "lepton", "skypilot", "docker"], + "exclude_labels": ["ignore"] + }, + { + "title": "
\n\n## Ray Integration\n\n
Changelog", + "labels": ["ray", "kuberay", "ray-slurm"], + "exclude_labels": ["ignore"] + }, + { + "title": "
\n\n## CLI & Configuration\n\n
Changelog", + "labels": ["cli", "config", "parsing"], + "exclude_labels": ["ignore"] + }, + { + "title": "
\n\n## Experiment & Job Management\n\n
Changelog", + "labels": ["experiment", "job", "task"], + "exclude_labels": ["ignore"] + }, + { + "title": "
\n\n## Packaging & Deployment\n\n
Changelog", + "labels": ["packaging", "deployment"], + "exclude_labels": ["ignore"] + }, + { + "title": "
\n\n## Documentation\n\n
Changelog", + "labels": ["docs", "documentation"], + "exclude_labels": ["ignore"] + }, + { + "title": "
\n\n## CI/CD\n\n
Changelog", + "labels": ["ci", "github-actions", "workflow"], + "exclude_labels": ["ignore"] + }, + { + "title": "
\n\n## Bug Fixes\n\n
Changelog", + "labels": ["bug", "bugfix", "fix"], + "exclude_labels": ["ignore"] + } + ], + "ignore_labels": [ + "ignore", + "skip-changelog" + ], + "sort": "ASC", + "template": "\n${{CHANGELOG}}
\n\n## Others\n\n
Changelog\n\n${{UNCATEGORIZED}}\n
\n", + "pr_template": "- ${{TITLE}} [#${{NUMBER}}](${{URL}})", + "empty_template": "- No changes in this release", + "label_extractor": [ + { + "pattern": "(.*executor.*)|(.*local.*)|(.*slurm.*)|(.*dgxcloud.*)|(.*lepton.*)|(.*skypilot.*)|(.*docker.*)", + "target": "executor", + "flags": "gimu", + "on_property": ["title", "body"] + }, + { + "pattern": "(.*ray.*)|(.*kuberay.*)", + "target": "ray", + "flags": "gimu", + "on_property": ["title", "body"] + }, + { + "pattern": "(.*cli.*)|(.*command.*)|(.*parse.*)|(.*argument.*)", + "target": "cli", + "flags": "gimu", + "on_property": ["title", "body"] + }, + { + "pattern": "(.*experiment.*)|(.*job.*)|(.*task.*)", + "target": "experiment", + "flags": "gimu", + "on_property": ["title", "body"] + }, + { + "pattern": "(.*packaging.*)|(.*package.*)|(.*deploy.*)|(.*archive.*)|(.*mount.*)", + "target": "packaging", + "flags": "gimu", + "on_property": ["title", "body"] + }, + { + "pattern": "(.*doc.*)|(.*readme.*)|(.*guide.*)|(.*tutorial.*)", + "target": "docs", + "flags": "gimu", + "on_property": ["title", "body"] + }, + { + "pattern": "(.*\\bci\\b.*)|(.*github.*)|(.*workflow.*)|(.*action.*)", + "target": "ci", + "flags": "gimu", + "on_property": ["title", "body"] + }, + { + "pattern": "(.*\\[bug.*)|(.*\\bfix\\b.*)|(.*bugfix.*)|(.*patch.*)", + "target": "bug", + "flags": "gimu", + "on_property": ["title", "body"] + } + ], + "duplicate_filter": { + "pattern": ".+", + "on_property": "title", + "method": "match" + }, + "transformers": [ + ], + "max_tags_to_fetch": 100, + "max_pull_requests": 500, + "max_back_track_time_days": 365, + "exclude_merge_branches": [ + ], + "tag_resolver": { + "method": "semver" + } +} + From de0412595c86f9399a7db4cb233061fdb29fe0ac Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Wed, 8 Oct 2025 22:23:31 -0700 Subject: [PATCH 22/28] Changelog 0.6.0 (#357) --- .../workflows/config/changelog-config.json | 18 +++--- CHANGELOG.md | 57 +++++++++++++++++++ 2 files changed, 66 insertions(+), 9 deletions(-) diff 
--git a/.github/workflows/config/changelog-config.json b/.github/workflows/config/changelog-config.json index a845104f..0b78b096 100644 --- a/.github/workflows/config/changelog-config.json +++ b/.github/workflows/config/changelog-config.json @@ -1,42 +1,42 @@ { "categories": [ { - "title": "## Executors\n\n
Changelog", + "title": "## Executors\n\n", "labels": ["executor", "local", "slurm", "dgxcloud", "lepton", "skypilot", "docker"], "exclude_labels": ["ignore"] }, { - "title": "
\n\n## Ray Integration\n\n
Changelog", + "title": "\n## Ray Integration\n\n", "labels": ["ray", "kuberay", "ray-slurm"], "exclude_labels": ["ignore"] }, { - "title": "
\n\n## CLI & Configuration\n\n
Changelog", + "title": "\n## CLI & Configuration\n\n", "labels": ["cli", "config", "parsing"], "exclude_labels": ["ignore"] }, { - "title": "
\n\n## Experiment & Job Management\n\n
Changelog", + "title": "\n## Experiment & Job Management\n\n", "labels": ["experiment", "job", "task"], "exclude_labels": ["ignore"] }, { - "title": "
\n\n## Packaging & Deployment\n\n
Changelog", + "title": "\n## Packaging & Deployment\n\n", "labels": ["packaging", "deployment"], "exclude_labels": ["ignore"] }, { - "title": "
\n\n## Documentation\n\n
Changelog", + "title": "\n## Documentation\n\n", "labels": ["docs", "documentation"], "exclude_labels": ["ignore"] }, { - "title": "
\n\n## CI/CD\n\n
Changelog", + "title": "\n## CI/CD\n\n", "labels": ["ci", "github-actions", "workflow"], "exclude_labels": ["ignore"] }, { - "title": "
\n\n## Bug Fixes\n\n
Changelog", + "title": "\n## Bug Fixes\n\n", "labels": ["bug", "bugfix", "fix"], "exclude_labels": ["ignore"] } @@ -46,7 +46,7 @@ "skip-changelog" ], "sort": "ASC", - "template": "\n${{CHANGELOG}}
\n\n## Others\n\n
Changelog\n\n${{UNCATEGORIZED}}\n
\n", + "template": "\n${{CHANGELOG}}\n## Others\n\n${{UNCATEGORIZED}}\n", "pr_template": "- ${{TITLE}} [#${{NUMBER}}](${{URL}})", "empty_template": "- No changes in this release", "label_extractor": [ diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d389dd0..5c18eca5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,63 @@ # Changelog +## NVIDIA Nemo Run 0.6.0 + +### Detailed Changelogs: + +## Executors + +- Added Pre-Launch Commands Support to LeptonExecutor [#312](https://github.com/NVIDIA-NeMo/Run/pull/312) +- Remove breaking torchrun config for single-node runs [#292](https://github.com/NVIDIA-NeMo/Run/pull/292) +- Upgrade skypilot to v0.10.0, introduce network_tier [#297](https://github.com/NVIDIA-NeMo/Run/pull/297) +- Fixes for multi-node execution with torchrun + LocalExecutor [#251](https://github.com/NVIDIA-NeMo/Run/pull/251) +- Add option to specify --container-env for srun [#293](https://github.com/NVIDIA-NeMo/Run/pull/293) +- Fix skypilot archive mount bug [#288](https://github.com/NVIDIA-NeMo/Run/pull/288) +- finetune on dgxcloud with nemo-run and deploy on bedrock example [#286](https://github.com/NVIDIA-NeMo/Run/pull/286) + +## Ray Integration + +- Add nsys patch in ray sub template [#318](https://github.com/NVIDIA-NeMo/Run/pull/318) +- Add logs dir to container mount for ray slurm [#287](https://github.com/NVIDIA-NeMo/Run/pull/287) +- Allow customizing folder for SlurmRayRequest [#281](https://github.com/NVIDIA-NeMo/Run/pull/281) + +## CLI & Configuration + +## Experiment & Job Management + +- Use thread pool for status, run methods inside experiment + other fixes [#295](https://github.com/NVIDIA-NeMo/Run/pull/295) + +## Packaging & Deployment + +- Correctly append tar files for packaging [#317](https://github.com/NVIDIA-NeMo/Run/pull/317) + +## Documentation + +- Create CHANGELOG.md [#314](https://github.com/NVIDIA-NeMo/Run/pull/314) +- docs: Fixing doc build issue [#290](https://github.com/NVIDIA-NeMo/Run/pull/290) +- fix docs tutorial links 
and add intro to guides/index.md [#285](https://github.com/NVIDIA-NeMo/Run/pull/285) +- README [#277](https://github.com/NVIDIA-NeMo/Run/pull/277) + +## CI/CD + +- changelog workflow [#315](https://github.com/NVIDIA-NeMo/Run/pull/315) +- Update release.yml [#306](https://github.com/NVIDIA-NeMo/Run/pull/306) +- ci(fix): Use GITHUB_TOKEN for community bot [#302](https://github.com/NVIDIA-NeMo/Run/pull/302) +- ci: Add community-bot [#300](https://github.com/NVIDIA-NeMo/Run/pull/300) + +## Bug Fixes + +- [Bugfix] Adding a check for name length [#273](https://github.com/NVIDIA-NeMo/Run/pull/273) +- misc fixes [#280](https://github.com/NVIDIA-NeMo/Run/pull/280) +- adding fix for lowercase and name length k8s requirements [#274](https://github.com/NVIDIA-NeMo/Run/pull/274) + +## Others + +- Specify nodes for gpu metrics collection and split data to each rank [#320](https://github.com/NVIDIA-NeMo/Run/pull/320) +- Apply '_enable_goodbye_message' check to both goodbye messages. [#319](https://github.com/NVIDIA-NeMo/Run/pull/319) +- Update refs [#278](https://github.com/NVIDIA-NeMo/Run/pull/278) +- chore: Bump to version 0.6.0rc0.dev0 [#272](https://github.com/NVIDIA-NeMo/Run/pull/272) + ## NVIDIA Nemo Run 0.5.0 From 39dcff2e014ee839209510929478dfcb3e356903 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Thu, 9 Oct 2025 08:09:20 -0700 Subject: [PATCH 23/28] spelling (#359) --- nemo_run/run/experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_run/run/experiment.py b/nemo_run/run/experiment.py index 9e6a6c88..460f04f6 100644 --- a/nemo_run/run/experiment.py +++ b/nemo_run/run/experiment.py @@ -645,7 +645,7 @@ def run( In sequential mode, if all executor supports dependencies, then all tasks will be scheduled at once by specifying the correct dependencies to each task. Otherwise, the experiment.run call will block and each task that is scheduled will be executed sequentially. 
- In this particular case, we cannot guarantee the state of the exeperiment if the process exits in the middle. + In this particular case, we cannot guarantee the state of the experiment if the process exits in the middle. Currently, only the slurm executor supports dependencies. From 927e3df0cdcfba68b96d9224caf8e12617181f21 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Thu, 9 Oct 2025 08:30:43 -0700 Subject: [PATCH 24/28] spelling (#359) Signed-off-by: Pablo Garay From 8ca8f7952a597f944985f1f1368a7acb9aa3a6c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 15 Oct 2025 20:40:18 +0200 Subject: [PATCH 25/28] fix: exit code docker runs (#365) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: Emit exit-code of docker runs Signed-off-by: oliver könig * fix test Signed-off-by: oliver könig * fixes Signed-off-by: oliver könig * refactor Signed-off-by: oliver könig * cleanup Signed-off-by: oliver könig * add scheduler test Signed-off-by: oliver könig * more scheduler tests Signed-off-by: oliver könig * test executor Signed-off-by: oliver könig * formatting Signed-off-by: oliver könig --------- Signed-off-by: oliver könig --- nemo_run/core/execution/docker.py | 7 +- .../run/torchx_backend/schedulers/docker.py | 41 +++++--- test/core/execution/test_docker.py | 27 +++++- .../torchx_backend/schedulers/test_docker.py | 95 +++++++++++++++++++ 4 files changed, 152 insertions(+), 18 deletions(-) diff --git a/nemo_run/core/execution/docker.py b/nemo_run/core/execution/docker.py index 4de7433d..ec060823 100644 --- a/nemo_run/core/execution/docker.py +++ b/nemo_run/core/execution/docker.py @@ -276,9 +276,12 @@ def run(self, client: "DockerClient", id: str) -> "Container": container_kwargs.update(self.executor.additional_kwargs) assert self.executor.experiment_id - tee_cmd = f" 2>&1 | tee -a /{RUNDIR_NAME}/log_{self.name}.out" + tee_cmd = f" 2>&1 | tee -a /{RUNDIR_NAME}/log_{self.name}.out; " + save_status_cmd 
= r"export EXIT_CODE=${PIPESTATUS[0]}; " + save_status_cmd += f'printf \'{{\\"id\\": \\"{id}\\", \\"exit_code\\": \\"%s\\"}}\\n\' "$EXIT_CODE" > /{RUNDIR_NAME}/status_{self.name}.out; exit $EXIT_CODE;' + command = " ".join(self.command) - command = f'bash -c "{command}{tee_cmd}"' + command = f'bash -c "{command}{tee_cmd}{save_status_cmd}"' ensure_network(client=client, network=self.executor.network) return client.containers.run( diff --git a/nemo_run/run/torchx_backend/schedulers/docker.py b/nemo_run/run/torchx_backend/schedulers/docker.py index e3124a02..4f68920c 100644 --- a/nemo_run/run/torchx_backend/schedulers/docker.py +++ b/nemo_run/run/torchx_backend/schedulers/docker.py @@ -14,6 +14,7 @@ # limitations under the License. import glob +import json import logging import os from datetime import datetime @@ -157,17 +158,35 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]: roles[role].num_replicas += 1 c = container.get_container(client=self._docker_client, id=app_id) - _state = self._get_app_state(c) if c else AppState.SUCCEEDED - - roles_statuses[role].replicas.append( - ReplicaStatus( - id=0, - role=role, - state=_state, - hostname=container.name, + _state = self._get_app_state(c) if c is not None else None + + if _state is not None: + roles_statuses[role].replicas.append( + ReplicaStatus( + id=0, + role=role, + state=_state, + hostname=container.name, + ) ) - ) - states.append(_state) + states.append(_state) + else: + status_file = os.path.join(req.executor.job_dir, f"status_{role}.out") + if os.path.exists(status_file): + with open(status_file, "r") as f: + status = json.load(f) + roles_statuses[role].replicas.append( + ReplicaStatus( + id=0, + role=role, + state=int(status["exit_code"]), + hostname=container.name, + ) + ) + state = ( + AppState.FAILED if int(status["exit_code"]) != 0 else AppState.SUCCEEDED + ) + states.append(state) state = AppState.UNKNOWN if any(is_terminal(state) for state in states): @@ -175,7 +194,7 @@ def 
describe(self, app_id: str) -> Optional[DescribeAppResponse]: state = AppState.SUCCEEDED else: state = AppState.FAILED - else: + elif len(states) > 0: state = next(state for state in states if not is_terminal(state)) return DescribeAppResponse( diff --git a/test/core/execution/test_docker.py b/test/core/execution/test_docker.py index 33ae19da..52766ce9 100644 --- a/test/core/execution/test_docker.py +++ b/test/core/execution/test_docker.py @@ -285,15 +285,22 @@ def test_init(self): assert container.executor == executor assert container.extra_env == {"EXTRA": "value"} - @patch("nemo_run.core.execution.docker.DockerContainer.run") - def test_run(self, mock_run, mock_docker_client, mock_container): + @patch("docker.DockerClient") + @patch("nemo_run.core.execution.docker.ensure_network") + def test_run( + self, + mock_client, + mock_ensure_network, + mock_docker_client, + mock_container, + ): """Test run method of DockerContainer.""" executor = DockerExecutor( container_image="test:latest", runtime="nvidia", num_gpus=2, shm_size="8g", - ulimits=["memlock:unlimited:unlimited"], + ulimits=["memlock:0:123"], ipc_mode="host", privileged=True, volumes=["/host:/container"], @@ -308,11 +315,21 @@ def test_run(self, mock_run, mock_docker_client, mock_container): extra_env={"EXTRA": "value"}, ) - mock_run.return_value = mock_container + mock_ensure_network.return_value = None + + def mocked_run(*args, **kwargs): + detach = kwargs.pop("detach", None) + remove = kwargs.pop("remove", None) + assert detach is True + assert remove is True + + mock_client.containers.run = mocked_run + + container.run(mock_client, "job123") # Instead of actually calling run which would fail with the "unlimited" value, # we'll check that the container is properly set up - assert container.executor.ulimits == ["memlock:unlimited:unlimited"] + assert container.executor.ulimits == ["memlock:0:123"] assert container.extra_env == {"EXTRA": "value"} assert container.executor.experiment_id == "exp123" diff 
--git a/test/run/torchx_backend/schedulers/test_docker.py b/test/run/torchx_backend/schedulers/test_docker.py index 3055d967..551d8a60 100644 --- a/test/run/torchx_backend/schedulers/test_docker.py +++ b/test/run/torchx_backend/schedulers/test_docker.py @@ -13,12 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +import os import tempfile from unittest import mock import pytest from torchx.schedulers.api import AppDryRunInfo from torchx.specs import AppDef, Role +from torchx.specs.api import AppState from nemo_run.core.execution.docker import DockerExecutor from nemo_run.run.torchx_backend.schedulers.docker import ( @@ -121,6 +124,98 @@ def test_describe(docker_scheduler, docker_executor): response = docker_scheduler.describe("test_session___test_role___test_container_id") assert response is not None assert response.app_id == "test_session___test_role___test_container_id" + assert "UNKNOWN" in str(response.state) + assert len(response.roles) == 1 + + +def test_describe_running(docker_scheduler, docker_executor): + with ( + mock.patch.object(DockerJobRequest, "load") as mock_load, + mock.patch.object(DockerContainer, "get_container") as mock_get_container, + mock.patch.object(PersistentDockerScheduler, "_get_app_state") as mock_get_app_state, + ): + container = DockerContainer( + name="test_role", + command=["test"], + executor=docker_executor, + extra_env={}, + ) + mock_load.return_value = DockerJobRequest( + id="test_session___test_role___test_container_id", + executor=docker_executor, + containers=[container], + ) + mock_get_container.return_value = container + mock_get_app_state.return_value = AppState.RUNNING + + response = docker_scheduler.describe("test_session___test_role___test_container_id") + assert response is not None + assert response.app_id == "test_session___test_role___test_container_id" + assert "RUNNING" in str(response.state) + assert len(response.roles) == 1 + + +def 
test_describe_failed(docker_scheduler, docker_executor): + with ( + mock.patch.object(DockerJobRequest, "load") as mock_load, + mock.patch.object(DockerContainer, "get_container") as mock_get_container, + mock.patch.object(PersistentDockerScheduler, "_get_app_state") as mock_get_app_state, + ): + container = DockerContainer( + name="test_role", + command=["test"], + executor=docker_executor, + extra_env={}, + ) + req = DockerJobRequest( + id="test_session___test_role___test_container_id", + executor=docker_executor, + containers=[container], + ) + mock_load.return_value = req + mock_get_container.return_value = container + mock_get_app_state.return_value = None + status_file = os.path.join(req.executor.job_dir, f"status_{req.containers[0].name}.out") + + with open(status_file, "w") as f: + f.write(json.dumps({"exit_code": 1})) + + response = docker_scheduler.describe(req.id) + assert response is not None + assert response.app_id == req.id + assert "FAILED" in str(response.state) + assert len(response.roles) == 1 + + +@pytest.mark.xfail +def test_describe_failure_not_detected(docker_scheduler, docker_executor): + with ( + mock.patch.object(DockerJobRequest, "load") as mock_load, + mock.patch.object(DockerContainer, "get_container") as mock_get_container, + mock.patch.object(PersistentDockerScheduler, "_get_app_state") as mock_get_app_state, + ): + container = DockerContainer( + name="test_role", + command=["test"], + executor=docker_executor, + extra_env={}, + ) + req = DockerJobRequest( + id="test_session___test_role___test_container_id", + executor=docker_executor, + containers=[container], + ) + mock_load.return_value = req + mock_get_container.return_value = container + mock_get_app_state.return_value = None + status_file = os.path.join(req.executor.job_dir, f"status_{req.containers[0].name}.out") + + with open(status_file, "w") as f: + f.write(json.dumps({"exit_code": 1})) + + response = docker_scheduler.describe(req.id) + assert response is not None + assert 
response.app_id == req.id assert "SUCCEEDED" in str(response.state) assert len(response.roles) == 1 From 01a9a8ba360f7b2908728ad0516e0ad9d936966d Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Thu, 16 Oct 2025 23:51:17 -0700 Subject: [PATCH 26/28] new changelog-build (#367) Signed-off-by: Pablo Garay --- .github/workflows/changelog-build.yml | 56 +++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/.github/workflows/changelog-build.yml b/.github/workflows/changelog-build.yml index 3607a7da..f69b92e7 100644 --- a/.github/workflows/changelog-build.yml +++ b/.github/workflows/changelog-build.yml @@ -11,6 +11,11 @@ on: description: Release branch to build changelog on (e.g. `r2.1.0`) type: string required: true + changelog-main-content: + description: Custom changelog content to include before detailed changelogs + type: string + required: false + default: '' jobs: changelog: @@ -33,8 +38,8 @@ jobs: # fromTag: Auto resolved from historical tag order (previous tag compared to current tag) # toTag: Current tag reference configuration: ".github/workflows/config/changelog-config.json" - owner: "NVIDIA" - repo: "NeMo" + owner: ${{ github.repository_owner }} + repo: ${{ github.event.repository.name }} ignorePreReleases: "false" failOnError: "false" fromTag: ${{ inputs.last-release-tag }} @@ -44,12 +49,24 @@ jobs: env: RELEASE_BRANCH: ${{ inputs.release-branch }} CHANGELOG: ${{ steps.github_tag.outputs.changelog }} + MAIN_CONTENT: ${{ inputs.changelog-main-content }} shell: bash -x -e -u -o pipefail {0} run: | RELEASE_VERSION=${RELEASE_BRANCH#r} CHANGELOG=$(echo "$CHANGELOG" | sed '/^[[:blank:]]*#/s/#/###/') - RELEASE_NOTES="## NVIDIA Nemo Run $RELEASE_VERSION + # Build release notes starting with version header + RELEASE_NOTES="## NVIDIA Nemo Run $RELEASE_VERSION" + + # Add custom content if provided + if [ -n "$MAIN_CONTENT" ]; then + RELEASE_NOTES="$RELEASE_NOTES + + $MAIN_CONTENT" + fi + + # Add detailed changelogs section + 
RELEASE_NOTES="$RELEASE_NOTES ### Detailed Changelogs: @@ -62,6 +79,38 @@ jobs: - name: Inspect new changelog file run: cat CHANGELOG.md + - name: Create or update label + uses: actions/github-script@v6 + with: + script: | + const labelName = '${{ inputs.release-branch }}'; + const labelColor = '0366d6'; // Blue color + const labelDescription = `Release ${labelName}`; + + try { + // Try to get the label + await github.rest.issues.getLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + name: labelName + }); + console.log(`Label '${labelName}' already exists`); + } catch (error) { + if (error.status === 404) { + // Label doesn't exist, create it + await github.rest.issues.createLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + name: labelName, + color: labelColor, + description: labelDescription + }); + console.log(`Created label '${labelName}'`); + } else { + throw error; + } + } + - name: Create Pull Request uses: peter-evans/create-pull-request@v7 with: @@ -71,3 +120,4 @@ jobs: sign-commits: true base: main branch: bot/chore/update-changelog-into-${{ inputs.release-branch }} + labels: ${{ inputs.release-branch }} From dc86aeaf2b49fb2d9146f94cd7eeafa62892dd7e Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Wed, 26 Nov 2025 17:22:02 -0600 Subject: [PATCH 27/28] beep boop: Update changelog (#396) (#397) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: NeMo Bot Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- CHANGELOG.md | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c18eca5..e6db9256 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,88 @@ # Changelog +## NVIDIA Nemo Run 0.7.0 + +### Detailed Changelogs: + + +#### Executors + + + +- Add image pull secrets param for lepton [#330](https://github.com/NVIDIA-NeMo/Run/pull/330) +- Add node 
reservations for LeptonExecutor [#336](https://github.com/NVIDIA-NeMo/Run/pull/336) +- [SkyPilot] Fix nodes -> num_nodes for SkyPilotExecutor in docs [#338](https://github.com/NVIDIA-NeMo/Run/pull/338) +- [SkyPilot] Add retry_until_up as an optional arg to SkyPilot Executor [#340](https://github.com/NVIDIA-NeMo/Run/pull/340) +- Support SkyPilot Storage configurations in `file_mounts` for automatic cloud sync [#335](https://github.com/NVIDIA-NeMo/Run/pull/335) +- [SkyPilot] Update YAML dump imports + backward compatibility for SkyPilot <=0.10.3 [#339](https://github.com/NVIDIA-NeMo/Run/pull/339) +- Create SkypilotJobsExecutor to allow running managed jobs [#343](https://github.com/NVIDIA-NeMo/Run/pull/343) +- fix: exit code docker runs [#365](https://github.com/NVIDIA-NeMo/Run/pull/365) + + +#### Ray Integration + + + +- Add ray head start timeout [#324](https://github.com/NVIDIA-NeMo/Run/pull/324) +- Remove ray deprecated dashboard-grpc-port arg [#325](https://github.com/NVIDIA-NeMo/Run/pull/325) + + +#### Experiment & Job Management + + + +- add a grace for Jobs that may start in Unknown [#291](https://github.com/NVIDIA-NeMo/Run/pull/291) +- Create SkypilotJobsExecutor to allow running managed jobs [#343](https://github.com/NVIDIA-NeMo/Run/pull/343) + + +#### Packaging & Deployment + + + +- Support SkyPilot Storage configurations in `file_mounts` for automatic cloud sync [#335](https://github.com/NVIDIA-NeMo/Run/pull/335) +- Refactor tar packaging logic to work for submodule and extra repo [#347](https://github.com/NVIDIA-NeMo/Run/pull/347) + + +#### Documentation + + + +- Add broken links check in docs [#333](https://github.com/NVIDIA-NeMo/Run/pull/333) +- [SkyPilot] Fix nodes -> num_nodes for SkyPilotExecutor in docs [#338](https://github.com/NVIDIA-NeMo/Run/pull/338) +- Documentation Restructurting [#350](https://github.com/NVIDIA-NeMo/Run/pull/350) +- Fix spelling in docstring [#359](https://github.com/NVIDIA-NeMo/Run/pull/359) +- fix: exit code docker runs 
[#365](https://github.com/NVIDIA-NeMo/Run/pull/365) + + +#### CI/CD + + + +- Update cherry-pick workflow to use version 0.63.0 [#344](https://github.com/NVIDIA-NeMo/Run/pull/344) +- fix: exit code docker runs [#365](https://github.com/NVIDIA-NeMo/Run/pull/365) + + +#### Bug Fixes + + + +- [SkyPilot] Fix nodes -> num_nodes for SkyPilotExecutor in docs [#338](https://github.com/NVIDIA-NeMo/Run/pull/338) +- Fix spelling in docstring [#359](https://github.com/NVIDIA-NeMo/Run/pull/359) +- fix: exit code docker runs [#365](https://github.com/NVIDIA-NeMo/Run/pull/365) + + +#### Others + +- chore: Bump to version 0.7.0rc0.dev0 [#322](https://github.com/NVIDIA-NeMo/Run/pull/322) +- Update community-bot to add community issues to shared project [#321](https://github.com/NVIDIA-NeMo/Run/pull/321) +- Bump community-bot to 0.54.4 [#332](https://github.com/NVIDIA-NeMo/Run/pull/332) +- remove custom dir [#351](https://github.com/NVIDIA-NeMo/Run/pull/351) +- Bumping to 0.5.0 [#352](https://github.com/NVIDIA-NeMo/Run/pull/352) +- Update release notes header in changelog build [#355](https://github.com/NVIDIA-NeMo/Run/pull/355) +- add changelog-config [#356](https://github.com/NVIDIA-NeMo/Run/pull/356) +- Changelog 0.6.0 [#357](https://github.com/NVIDIA-NeMo/Run/pull/357) +- feat: new changelog-build [#367](https://github.com/NVIDIA-NeMo/Run/pull/367) ## NVIDIA Nemo Run 0.6.0 ### Detailed Changelogs: From 0ba0b24745f7f6836333b0822a33b15edff3596f Mon Sep 17 00:00:00 2001 From: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Date: Wed, 3 Dec 2025 23:25:22 +0000 Subject: [PATCH 28/28] =?UTF-8?q?[=F0=9F=A4=96]:=20Howdy=20folks,=20let's?= =?UTF-8?q?=20bump=20=20to=20`Version(=5F=5Fversion=5F=5F).major.Version(?= =?UTF-8?q?=5F=5Fversion=5F=5F).minor.Version(=5F=5Fversion=5F=5F).microrc?= =?UTF-8?q?1.dev0`=20!?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com> --- nemo_run/package_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_run/package_info.py b/nemo_run/package_info.py index 50fd2aec..23184860 100644 --- a/nemo_run/package_info.py +++ b/nemo_run/package_info.py @@ -23,7 +23,7 @@ else: PRE_RELEASE = "" -DEV = Version(__version__).dev +DEV = 'dev0' __package_name__ = "nemo_run" __contact_names__ = "NVIDIA"