From a79f0739b4a7261d1e741d00a8ddd3b505d3c629 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 16 Oct 2025 15:13:43 -0400 Subject: [PATCH 01/29] [Isolated] Add Subnet Prioritization test for isolated regions --- tests/integration-tests/configs/isolated_regions.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/integration-tests/configs/isolated_regions.yaml b/tests/integration-tests/configs/isolated_regions.yaml index 0013b30679..5a640f80eb 100644 --- a/tests/integration-tests/configs/isolated_regions.yaml +++ b/tests/integration-tests/configs/isolated_regions.yaml @@ -234,6 +234,12 @@ test-suites: # oss: {{ OSS }} # schedulers: {{ SCHEDULERS }} networking: + test_cluster_networking.py::test_cluster_with_subnet_prioritization: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} test_cluster_networking.py::test_cluster_in_private_subnet: dimensions: - regions: {{ REGIONS }} From 7397a6a5f7ea0920ebe040d501525e7d53136e7a Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 16 Oct 2025 15:16:08 -0400 Subject: [PATCH 02/29] [Isolated] Add Ultraserver test for isolated regions --- tests/integration-tests/configs/isolated_regions.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/integration-tests/configs/isolated_regions.yaml b/tests/integration-tests/configs/isolated_regions.yaml index 5a640f80eb..df98c45b15 100644 --- a/tests/integration-tests/configs/isolated_regions.yaml +++ b/tests/integration-tests/configs/isolated_regions.yaml @@ -613,6 +613,13 @@ test-suites: instances: {{ INSTANCES }} oss: {{ OSS }} schedulers: {{ SCHEDULERS }} + ultraserver: + test_gb200.py::test_gb200: + dimensions: + - regions: {{ REGIONS }} + instances: ["g4dn.2xlarge"] + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} # These tests cannot be executed in US isolated regions # because the feature Custom Resource is not supported in these regions. # custom_resource: From c0015aa1550588a49adefeafa83d323b359daefb Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 16 Oct 2025 15:21:48 -0400 Subject: [PATCH 03/29] [Isolated] Add Ultraserver test for isolated regions * Using HN instance type for x86 for queue_without_imex as it mainly being run in isolated regions. 
For commercial partition we focus on ARM --- .../integration-tests/tests/ultraserver/test_gb200.py | 10 +++++++++- .../test_gb200/test_gb200/91_nvidia_imex_prolog.sh | 2 +- .../test_gb200/test_gb200/pcluster.config.final.yaml | 4 ++++ .../test_gb200/test_gb200/pcluster.config.update.yaml | 4 ++++ .../test_gb200/test_gb200/pcluster.config.yaml | 4 ++++ 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/ultraserver/test_gb200.py b/tests/integration-tests/tests/ultraserver/test_gb200.py index 1baec1d8e4..08372d9a03 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200.py +++ b/tests/integration-tests/tests/ultraserver/test_gb200.py @@ -368,6 +368,7 @@ def test_gb200( instance, scheduler, os, + architecture, scheduler_commands_factory, request, ): @@ -402,7 +403,11 @@ def test_gb200( """ capacity_reservation_id = None max_queue_size = 2 - headnode_instance_type = "c7g.4xlarge" + headnode_instance_type = "" + if architecture == "x86_64": + headnode_instance_type = "c5.xlarge" + else: + headnode_instance_type = "c7g.4xlarge" if instance == "p6e-gb200.36xlarge": ultraserver_reservations_ids = get_capacity_reservation_id(request, instance, region, max_queue_size, os) if ultraserver_reservations_ids: @@ -451,6 +456,7 @@ def test_gb200( compute_resource_without_imex=compute_resource_without_imex, capacity_block_reservation_id=capacity_block_reservation_id, headnode_instance_type=headnode_instance_type, + architecture=architecture, ) slots_per_instance = fetch_instance_slots(region, instance, multithreading_disabled=True) suppress_validators = ["type:UltraserverCapacityBlockSizeValidator"] @@ -497,6 +503,7 @@ def test_gb200( queue_without_imex=queue_without_imex, compute_resource_without_imex=compute_resource_without_imex, headnode_instance_type=headnode_instance_type, + architecture=architecture, ) cluster.stop() @@ -544,6 +551,7 @@ def test_gb200( queue_without_imex=queue_without_imex, compute_resource_without_imex=compute_resource_without_imex, headnode_instance_type=headnode_instance_type, + architecture=architecture, ) cluster.stop() diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh index 4d5118dc55..794778891d 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh +++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh @@ -22,7 +22,7 @@ SCONTROL_CMD="/opt/slurm/bin/scontrol" IMEX_START_TIMEOUT=60 IMEX_STOP_TIMEOUT=15 #TODO In production, specify p6e-gb200, only. We added g5g only for testing purposes. 
-ALLOWED_INSTANCE_TYPES="^(p6e-gb200|g5g)" +ALLOWED_INSTANCE_TYPES="^(p6e-gb200|g5g|g4d)" IMEX_SERVICE="nvidia-imex" IMEX_NODES_CONFIG="/etc/nvidia-imex/nodes_config.cfg" diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.final.yaml b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.final.yaml index 14e767d42c..9654620b09 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.final.yaml +++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.final.yaml @@ -53,9 +53,13 @@ Scheduling: ComputeResources: - Name: {{ compute_resource_without_imex }} Instances: + {% if architecture=="x86_64" %} + - InstanceType: {{ headnode_instance_type }} + {% else %} - InstanceType: m6g.xlarge - InstanceType: t4g.xlarge - InstanceType: c6gn.xlarge + {% endif %} MinCount: {{ min_queue_size_without_imex }} MaxCount: 1 Networking: diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.update.yaml b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.update.yaml index 08cbabc7a6..33d35e542c 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.update.yaml +++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.update.yaml @@ -51,9 +51,13 @@ Scheduling: ComputeResources: - Name: {{ compute_resource_without_imex }} Instances: + {% if architecture=="x86_64" %} + - InstanceType: {{ headnode_instance_type }} + {% else %} - InstanceType: m6g.xlarge - InstanceType: t4g.xlarge - InstanceType: c6gn.xlarge + {% endif %} MinCount: {{ min_queue_size_without_imex }} MaxCount: 1 Networking: diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.yaml b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.yaml index 02d9333a58..164f565db3 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.yaml +++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.yaml @@ -48,9 +48,13 @@ Scheduling: ComputeResources: - Name: {{ compute_resource_without_imex }} Instances: + {% if architecture=="x86_64" %} + - InstanceType: {{ headnode_instance_type }} + {% else %} - InstanceType: m6g.xlarge - InstanceType: t4g.xlarge - InstanceType: c6gn.xlarge + {% endif %} MinCount: {{ min_queue_size_without_imex }} MaxCount: 1 Networking: From a132717af9df95e496289c8d30ad7165c0d92bf0 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 16 Oct 2025 15:46:45 -0400 Subject: [PATCH 04/29] [Isolated] Add Ultraserver test for isolated regions * Using HN instance type for x86 for queue_without_imex as it mainly being run in isolated regions. 
For commercial partition we focus on ARM * Skipping test for ALinux 2 --- tests/integration-tests/tests/ultraserver/test_gb200.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration-tests/tests/ultraserver/test_gb200.py b/tests/integration-tests/tests/ultraserver/test_gb200.py index 08372d9a03..fe752d1856 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200.py +++ b/tests/integration-tests/tests/ultraserver/test_gb200.py @@ -404,6 +404,9 @@ def test_gb200( capacity_reservation_id = None max_queue_size = 2 headnode_instance_type = "" + if os == "alinux2": + pytest.skip(f"Skipping the test as nvidia-imex is not installed in {os}") + if architecture == "x86_64": headnode_instance_type = "c5.xlarge" else: From 07ed7dcdbc7e64c1c257a9ad2b18c73f71b4aa57 Mon Sep 17 00:00:00 2001 From: hgreebe <141743196+hgreebe@users.noreply.github.com> Date: Mon, 10 Nov 2025 11:42:54 -0500 Subject: [PATCH 05/29] Add GetFunction and GetPolicy permissions to the build image cleanup role (#7087) * Add GetFunction and GetPolicy permissions to the build image cleanup role * Increase pcluster build image cleanup role revision number * Update CHANGELOG * Add unit test for actions in CleanupRole policy * Fix linter errors --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27cd7ba5be..cbb50caaa5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,11 @@ CHANGELOG - Add GetFunction and GetPolicy permissions to PClusterBuildImageCleanupRole to prevent AccessDenied errors during build image stack deletion. - Fix validation error messages when `DevSettings` is null or `DevSettings/InstanceTypesData` is missing required fields. +**BUG FIXES** +- Reduce EFA installation time for Ubuntu by ~20 minutes by only holding kernel packages for the installed kernel. +- Add GetFunction and GetPolicy permissions to PClusterBuildImageCleanupRole to prevent AccessDenied errors during build image stack deletion. + + 3.14.0 ------ From afe15ff18f783fc949ddb5951800835c8e2bd71f Mon Sep 17 00:00:00 2001 From: hanwenli Date: Tue, 18 Nov 2025 13:17:08 -0800 Subject: [PATCH 06/29] [Test][Isolated Regions] Restrict tests running in us-iso-east-1 to use AZs "usie1-az1", "usie1-az2" FSx Lustre Scratch_2 is not supported in other AZs --- tests/integration-tests/conftest_networking.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration-tests/conftest_networking.py b/tests/integration-tests/conftest_networking.py index 70f3add6e1..f7a1aaaa78 100644 --- a/tests/integration-tests/conftest_networking.py +++ b/tests/integration-tests/conftest_networking.py @@ -54,6 +54,7 @@ "cn-north-1": ["cnn1-az1", "cnn1-az2"], # Should only consider supported AZs "us-isob-east-1": ["usibe1-az2", "usibe1-az3"], + "us-iso-east-1": ["usie1-az1", "usie1-az2"], } # used to map a ZoneId to the corresponding region From 482a29f660c8f1e82c0d017384fa2d67d2839390 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Tue, 18 Nov 2025 09:13:06 -0500 Subject: [PATCH 07/29] [Validation] Fix bug that caused unhelpful validation errors in case DevSettings is null, DevSettings/InstanceTypesData is not a valid JSON or DevSettings/InstanceTypesData is missing required fields. 
With this change we replaced the validation error message "'NoneType' object has no attribute 'get'", with a more actionable "Could not determine network cards for instance type ${instanceType}" --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cbb50caaa5..5fca0d07f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,7 +48,7 @@ CHANGELOG **BUG FIXES** - Reduce EFA installation time for Ubuntu by ~20 minutes by only holding kernel packages for the installed kernel. - Add GetFunction and GetPolicy permissions to PClusterBuildImageCleanupRole to prevent AccessDenied errors during build image stack deletion. - +- Fix validation error messages when `DevSettings` is null or `DevSettings/InstanceTypesData` is missing required fields. 3.14.0 ------ From d5cab0c20ea6f8e11bdffdfe44481c64b88ebc32 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 20 Nov 2025 15:17:53 -0500 Subject: [PATCH 08/29] [Test] Using new naming convention for filtering Official AL2 DLAMI https://docs.aws.amazon.com/dlami/latest/devguide/aws-deep-learning-multiframework-ami-amazon-linux-2.html --- tests/integration-tests/tests/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index 2138811779..70b1a0b2e6 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -58,7 +58,7 @@ # Remarkable AMIs are latest deep learning base AMI and FPGA developer AMI without pcluster infrastructure OS_TO_REMARKABLE_AMI_NAME_OWNER_MAP = { - "alinux2": {"name": "Deep Learning Base AMI (Amazon Linux 2)*", "owners": ["amazon"]}, + "alinux2": {"name": "Deep Learning OSS Nvidia Driver AMI (Amazon Linux 2)*", "owners": ["amazon"]}, "ubuntu2404": { "name": "ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-*-server-*", "owners": ["099720109477"], From 3cadb3459181cce08e80701d7dca04fba9c58697 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 20 Nov 2025 16:30:22 -0500 Subject: [PATCH 09/29] [Test] Using architecture specific naming convestion for DLAMI https://docs.aws.amazon.com/dlami/latest/devguide/aws-deep-learning-multiframework-ami-amazon-linux-2.html --- tests/integration-tests/tests/common/utils.py | 37 ++++++++++++++++--- .../tests/createami/test_createami.py | 10 +++-- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index 70b1a0b2e6..5d949b0154 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -58,7 +58,26 @@ # Remarkable AMIs are latest deep learning base AMI and FPGA developer AMI without pcluster infrastructure OS_TO_REMARKABLE_AMI_NAME_OWNER_MAP = { - "alinux2": {"name": "Deep Learning OSS Nvidia Driver AMI (Amazon Linux 2)*", "owners": ["amazon"]}, + # Using a patched DLAMI which has uninstalled openssl11-devel, openssl11-libs and openssl11-pkcs + # so that it will not conflict with pcluster build image. 
+ "alinux2": { + "name": "Deep Learning OSS Nvidia Driver AMI (Amazon Linux 2) Version 83.9 for ParallelCluster*", + "owners": ["amazon"], + }, + "alinux2023": { + "name": { + "x86_64": "Deep Learning OSS Nvidia Driver AMI (Amazon Linux 2)*", + "arm64": "Deep Learning ARM64 Base OSS Nvidia Driver GPU AMI (Amazon Linux 2023)*", + }, + "owners": ["amazon"], + }, + "ubuntu2204": { + "name": { + "x86_64": "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*", + "arm64": "Deep Learning ARM64 Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*", + }, + "owners": ["amazon"], + }, "ubuntu2404": { "name": "ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-*-server-*", "owners": ["099720109477"], @@ -128,7 +147,7 @@ def retrieve_latest_ami( if ami_type == "pcluster": ami_name = "aws-parallelcluster-{version}-{ami_name}".format( version=get_installed_parallelcluster_version(), - ami_name=_get_ami_for_os(ami_type, os).get("name"), + ami_name=_get_ami_for_os(ami_type, os, architecture).get("name"), ) if ( request @@ -141,14 +160,14 @@ def retrieve_latest_ami( # Then retrieve public pcluster AMIs additional_filters.append({"Name": "is-public", "Values": ["true"]}) else: - ami_name = _get_ami_for_os(ami_type, os).get("name") + ami_name = _get_ami_for_os(ami_type, os, architecture).get("name") logging.info("Parent image name %s" % ami_name) paginator = boto3.client("ec2", region_name=region).get_paginator("describe_images") page_iterator = paginator.paginate( Filters=[{"Name": "name", "Values": [ami_name]}, {"Name": "architecture", "Values": [architecture]}] + additional_filters, - Owners=_get_ami_for_os(ami_type, os).get("owners"), - IncludeDeprecated=_get_ami_for_os(ami_type, os).get("includeDeprecated", False), + Owners=_get_ami_for_os(ami_type, os, architecture).get("owners"), + IncludeDeprecated=_get_ami_for_os(ami_type, os, architecture).get("includeDeprecated", False), ) images = [] for page in page_iterator: @@ -166,13 +185,19 @@ def retrieve_latest_ami( raise -def _get_ami_for_os(ami_type, os): +def _get_ami_for_os(ami_type, os, architecture="x86_64"): ami_dict = AMI_TYPE_DICT.get(ami_type) if not ami_dict: raise Exception(f"'{ami_type}' not found in the dict 'AMI_TYPE_DICT'") os_ami = ami_dict.get(os) if not os_ami: raise Exception(f"'{os}' not found in the '{ami_type}' mapping referenced in the 'AMI_TYPE_DICT'") + + # Get correct AMI names as per architecture + if isinstance(os_ami.get("name"), dict): + name = os_ami["name"].get(architecture) + return {"name": name, "owners": os_ami["owners"]} + return os_ami diff --git a/tests/integration-tests/tests/createami/test_createami.py b/tests/integration-tests/tests/createami/test_createami.py index 7e4570ca92..e70cc440ee 100644 --- a/tests/integration-tests/tests/createami/test_createami.py +++ b/tests/integration-tests/tests/createami/test_createami.py @@ -80,8 +80,8 @@ def test_invalid_config( # Test Suppression of a validator - # Get base AMI -- remarkable AMIs are not available for ARM and ubuntu2204, alinux2023 yet - if os not in ["ubuntu2204", "alinux2023"]: + # Get base AMI -- remarkable AL2 AMIs are failing because of conflicts between openssl-devel packages + if os not in ["alinux2"]: base_ami = retrieve_latest_ami(region, os, ami_type="remarkable", architecture=architecture) else: base_ami = retrieve_latest_ami(region, os, architecture=architecture) @@ -143,8 +143,12 @@ def test_build_image( # Get base AMI if os in ["alinux2", "ubuntu2004"]: + # Using patched DLAMI AMI so it does not conflict with pcluster build image. 
+ allow_private_ami = True if os == "alinux2" else False # Test Deep Learning AMIs - base_ami = retrieve_latest_ami(region, os, ami_type="remarkable", architecture=architecture) + base_ami = retrieve_latest_ami( + region, os, ami_type="remarkable", architecture=architecture, allow_private_ami=allow_private_ami + ) enable_nvidia = False # Deep learning AMIs have Nvidia pre-installed elif "rhel" in os or "ubuntu" in os or os == "rocky8": # Test AMIs from first stage build. Because RHEL/Rocky and Ubuntu have specific requirement of kernel versions. From 6273a917a279b6b83f9220edec17d28b6c9f1f7a Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 25 Nov 2025 11:37:35 -0500 Subject: [PATCH 10/29] [Test] Using AL2023 for creating DLAMI AMIs * Using self for Owner Filter for patched AL2 DLAMI --- tests/integration-tests/tests/common/utils.py | 3 ++- tests/integration-tests/tests/createami/test_createami.py | 8 ++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index 5d949b0154..f2834f959b 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -62,7 +62,8 @@ # so that it will not conflict with pcluster build image. "alinux2": { "name": "Deep Learning OSS Nvidia Driver AMI (Amazon Linux 2) Version 83.9 for ParallelCluster*", - "owners": ["amazon"], + # If you are running in your personal account, then you must have this patched AMI + "owners": ["self"], }, "alinux2023": { "name": { diff --git a/tests/integration-tests/tests/createami/test_createami.py b/tests/integration-tests/tests/createami/test_createami.py index e70cc440ee..e5e84534e2 100644 --- a/tests/integration-tests/tests/createami/test_createami.py +++ b/tests/integration-tests/tests/createami/test_createami.py @@ -142,13 +142,9 @@ def test_build_image( enable_nvidia = False # Get base AMI - if os in ["alinux2", "ubuntu2004"]: - # Using patched DLAMI AMI so it does not conflict with pcluster build image. - allow_private_ami = True if os == "alinux2" else False + if os in ["alinux2023", "ubuntu2004"]: # Test Deep Learning AMIs - base_ami = retrieve_latest_ami( - region, os, ami_type="remarkable", architecture=architecture, allow_private_ami=allow_private_ami - ) + base_ami = retrieve_latest_ami(region, os, ami_type="remarkable", architecture=architecture) enable_nvidia = False # Deep learning AMIs have Nvidia pre-installed elif "rhel" in os or "ubuntu" in os or os == "rocky8": # Test AMIs from first stage build. Because RHEL/Rocky and Ubuntu have specific requirement of kernel versions. 
From a2b67fd2311cf3fa84c7d186c72d89edddc1378f Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 26 Nov 2025 11:43:13 -0500 Subject: [PATCH 11/29] [Test] Correcting the DLAMI filter name --- tests/integration-tests/tests/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index f2834f959b..96fa07e32f 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -67,7 +67,7 @@ }, "alinux2023": { "name": { - "x86_64": "Deep Learning OSS Nvidia Driver AMI (Amazon Linux 2)*", + "x86_64": "Deep Learning OSS Nvidia Driver AMI (Amazon Linux 2023)*", "arm64": "Deep Learning ARM64 Base OSS Nvidia Driver GPU AMI (Amazon Linux 2023)*", }, "owners": ["amazon"], From 7414d972137aa502c9b62f99ac8638a76b4e6357 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 26 Nov 2025 15:44:01 -0500 Subject: [PATCH 12/29] [Test] Use Ubuntu22 AMI for DLAMI as AL2023 is not available in all regions --- tests/integration-tests/tests/createami/test_createami.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/createami/test_createami.py b/tests/integration-tests/tests/createami/test_createami.py index e5e84534e2..f634e4bd27 100644 --- a/tests/integration-tests/tests/createami/test_createami.py +++ b/tests/integration-tests/tests/createami/test_createami.py @@ -142,7 +142,7 @@ def test_build_image( enable_nvidia = False # Get base AMI - if os in ["alinux2023", "ubuntu2004"]: + if os in ["ubuntu2204"]: # Test Deep Learning AMIs base_ami = retrieve_latest_ami(region, os, ami_type="remarkable", architecture=architecture) enable_nvidia = False # Deep learning AMIs have Nvidia pre-installed From 24809738b2ed521ef0184863ee67cca4e45a2979 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Mon, 1 Dec 2025 13:18:32 -0800 Subject: [PATCH 13/29] [integ-tests] In OSU test, use the same instance type for head node and compute nodes Prior to this commit, there were intermittent failures of insufficient capacity of the head node instance type. Although the head node is not in the placement group, which is used by the integration tests framework to make the capacity reservation, this commit "hopes" that the AZ has adequate capacity of the instance type. In other words, if the integration test framework can make a capacity reservation of 35 instances in a placement group in the AZ, this commit hopes there will be an additional 1 instance anywhere in the AZ. 
--- tests/integration-tests/tests/performance_tests/test_osu.py | 6 ------ .../test_osu/test_osu/pcluster.config.yaml | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/integration-tests/tests/performance_tests/test_osu.py b/tests/integration-tests/tests/performance_tests/test_osu.py index 783e719273..285e59672f 100644 --- a/tests/integration-tests/tests/performance_tests/test_osu.py +++ b/tests/integration-tests/tests/performance_tests/test_osu.py @@ -54,11 +54,6 @@ def test_osu( f"Only these instances are supported: {OSU_BENCHMARKS_INSTANCES}" ) - if architecture == "x86_64": - head_node_instance = "c5.18xlarge" - else: - head_node_instance = "c6g.16xlarge" - max_queue_size = 32 capacity_type = "ONDEMAND" capacity_reservation_id = None @@ -79,7 +74,6 @@ def test_osu( slots_per_instance = fetch_instance_slots(region, instance, multithreading_disabled=True) cluster_config = pcluster_config_reader( - head_node_instance=head_node_instance, max_queue_size=max_queue_size, capacity_type=capacity_type, capacity_reservation_id=capacity_reservation_id, diff --git a/tests/integration-tests/tests/performance_tests/test_osu/test_osu/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_osu/test_osu/pcluster.config.yaml index b42253da24..d4d03c2078 100644 --- a/tests/integration-tests/tests/performance_tests/test_osu/test_osu/pcluster.config.yaml +++ b/tests/integration-tests/tests/performance_tests/test_osu/test_osu/pcluster.config.yaml @@ -1,7 +1,7 @@ Image: Os: {{ os }} HeadNode: - InstanceType: {{ head_node_instance }} + InstanceType: {{ instance }} Networking: SubnetId: {{ public_subnet_id }} Ssh: From d216991ac1bb71f7addfa83bd8b26df497ddf44c Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 26 Nov 2025 18:21:13 -0500 Subject: [PATCH 14/29] [Test] Use t3.medium for spot test instances --- tests/integration-tests/tests/update/test_update.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index a674d8a0b5..53b5db8c70 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -83,10 +83,10 @@ def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory, cluster ]: bucket.upload_file(str(test_datadir / script), f"scripts/{script}") - spot_instance_types = ["t3.small", "t3.medium"] + spot_instance_types = ["t3.medium"] try: - boto3.client("ec2").describe_instance_types(InstanceTypes=["t3a.small"]) - spot_instance_types.extend(["t3a.small", "t3a.medium"]) + boto3.client("ec2").describe_instance_types(InstanceTypes=["t3a.medium"]) + spot_instance_types.extend(["t3a.medium"]) except Exception: pass From 019e9ebe75afeb0ac0b6731093199b9b5b1c458d Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 2 Dec 2025 13:26:43 -0500 Subject: [PATCH 15/29] [Test] Correcting the Al2023 DLAMI filter name --- tests/integration-tests/tests/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index 96fa07e32f..792b3a1063 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -67,7 +67,7 @@ }, "alinux2023": { "name": { - "x86_64": "Deep Learning OSS Nvidia Driver AMI (Amazon Linux 2023)*", + "x86_64": "Deep Learning Base OSS Nvidia Driver GPU AMI (Amazon Linux 2023)*", 
"arm64": "Deep Learning ARM64 Base OSS Nvidia Driver GPU AMI (Amazon Linux 2023)*", }, "owners": ["amazon"], From c7fed62effbe32911fed3846592a6906edfcc920 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Tue, 2 Dec 2025 10:26:01 -0800 Subject: [PATCH 16/29] [integ-tests] In EFA test, use the same instance type for head node and compute nodes when possible This commit uses the same instance type for both compute node and head node when the instance type is available in open capacity pool. When the instance type is p4, p5, p6, or hpc which require capacity reservations or capacity blocks, the head node still uses a different instance type as the logic before this commit. Prior to this commit, there were intermittent failures of insufficient capacity of the head node instance type. Although the head node is not in the placement group, which is used by the integration tests framework to make the capacity reservation, this commit "hopes" that the AZ has adequate capacity of the instance type. In other words, if the integration test framework can make a capacity reservation of 35 instances in a placement group in the AZ, this commit hopes there will be an additional 1 instance anywhere in the AZ. --- tests/integration-tests/tests/efa/test_efa.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/efa/test_efa.py b/tests/integration-tests/tests/efa/test_efa.py index d6962207ed..8afb932431 100644 --- a/tests/integration-tests/tests/efa/test_efa.py +++ b/tests/integration-tests/tests/efa/test_efa.py @@ -51,10 +51,15 @@ def test_efa( Grouped all tests in a single function so that cluster can be reused for all of them. """ - if architecture == "x86_64": - head_node_instance = "c5.18xlarge" + if instance.startswith("p") or instance.startswith("hpc"): + if architecture == "x86_64": + head_node_instance = "c5.18xlarge" + else: + head_node_instance = "c6g.16xlarge" else: - head_node_instance = "c6g.16xlarge" + # Use the same instance type for both compute node and head node + # when the instance type is available in open capacity pool + head_node_instance = instance max_queue_size = 2 p6_b200_capacity_reservation_id = None if instance == "p6-b200.48xlarge": From e090f174aded1b68ce4ecb72cfc1c325c6dde9d6 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Fri, 21 Nov 2025 10:42:12 -0800 Subject: [PATCH 17/29] [Test][Isolated Region] Reduce memory usage in test_slurm_memory_based_scheduling The changed line is running on c5.large, which has 4GB of physical memory, when running with 3GB of program, it failed in isolated region on RHEL8. Slightly reduce the memory usage to 2.8GB to make the test pass. ParallelCluster test shouldn't check too strictly about memory usage because it is managed by operating systems. 
--- tests/integration-tests/tests/schedulers/test_slurm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index 483ec9d8bb..6ea6008be4 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -2594,7 +2594,7 @@ def _test_slurm_behavior_when_updating_schedulable_memory_with_already_running_j submit_command_args={ "nodes": 1, "slots": 1, - "command": "srun ./a.out 3000000000 390", + "command": "srun ./a.out 2000000000 390", "other_options": "-w queue1-st-ondemand1-i1-1", "raise_on_error": False, } From ca907b69c2ee32e9441172280f4a165cd8160c22 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 2 Dec 2025 18:43:16 -0500 Subject: [PATCH 18/29] [TEST] Use latest RHel8 AMI for testing --- tests/integration-tests/tests/common/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index 792b3a1063..4ba334bbd3 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -84,9 +84,7 @@ "owners": ["099720109477"], }, # Simple redhat8 to be able to build in remarkable test - # FIXME: when fixed upstream, unpin the timestamp introduced because the `kernel-devel` package was missing for - # the kernel released in 20231127 RHEL 8.8 AMI - "rhel8": {"name": "RHEL-8.8*_HVM-202309*", "owners": RHEL_OWNERS}, + "rhel8": {"name": "RHEL-8.8*_HVM-*", "owners": RHEL_OWNERS}, "rocky8": {"name": "Rocky-8-EC2-Base-8.10*", "owners": ["792107900819"]}, # TODO add china and govcloud accounts "rhel8.9": {"name": "RHEL-8.9*_HVM-*", "owners": RHEL_OWNERS}, "rocky8.9": {"name": "Rocky-8-EC2-Base-8.9*", "owners": ["792107900819"]}, # TODO add china and govcloud accounts From 8be6739d85006d782b6ff67b3a07675c7d75900f Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 3 Dec 2025 13:34:23 -0500 Subject: [PATCH 19/29] [TEST] Remove OpenFoam for Daily integration test --- tests/integration-tests/configs/develop.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/integration-tests/configs/develop.yaml b/tests/integration-tests/configs/develop.yaml index 98f8cdc65b..e219f5200a 100644 --- a/tests/integration-tests/configs/develop.yaml +++ b/tests/integration-tests/configs/develop.yaml @@ -865,12 +865,6 @@ test-suites: instances: [ "c5n.18xlarge" ] oss: [{{ NO_ROCKY_OS_X86_0 }}] # ParallelCluster does not release official Rocky images. Skip the test. schedulers: [ "slurm" ] - test_openfoam.py::test_openfoam: - dimensions: - - regions: [ {{ c5n_18xlarge_CAPACITY_RESERVATION_35_INSTANCES_2_HOURS_YESPG_NO_ROCKY_OS_X86_0 }} ] - instances: [ "c5n.18xlarge" ] - oss: [{{ NO_ROCKY_OS_X86_0 }}] # ParallelCluster does not release official Rocky images. Skip the test. 
- schedulers: [ "slurm" ] test_startup_time.py::test_startup_time: dimensions: - regions: [ "us-east-1" ] From b6568e0b3575d71ac7af5d119a933cccb84a4ba6 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 3 Dec 2025 17:50:46 -0500 Subject: [PATCH 20/29] [OSU] Update test_osu performance test to use in_place_update_on_fleet_enabled --- .../integration-tests/tests/performance_tests/test_osu.py | 7 ++++++- .../test_osu/test_osu/pcluster.config.yaml | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/performance_tests/test_osu.py b/tests/integration-tests/tests/performance_tests/test_osu.py index 285e59672f..de255c2370 100644 --- a/tests/integration-tests/tests/performance_tests/test_osu.py +++ b/tests/integration-tests/tests/performance_tests/test_osu.py @@ -33,11 +33,12 @@ @pytest.mark.usefixtures("serial_execution_by_instance") -@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize("in_place_update_on_fleet_enabled", ["true", "false"]) def test_osu( os, region, scheduler, + in_place_update_on_fleet_enabled, instance, pcluster_config_reader, clusters_factory, @@ -59,6 +60,9 @@ def test_osu( capacity_reservation_id = None placement_group_enabled = True + extra_chef_attributes = '{{"cluster": {{"in_place_update_on_fleet_enabled": "{}"}}}}'.format( + in_place_update_on_fleet_enabled + ) if instance in ["p6-b200.48xlarge", "p5en.48xlarge"]: max_queue_size = 2 capacity_type = "CAPACITY_BLOCK" @@ -78,6 +82,7 @@ def test_osu( capacity_type=capacity_type, capacity_reservation_id=capacity_reservation_id, placement_group_enabled=placement_group_enabled, + extra_chef_attributes=extra_chef_attributes, ) cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) diff --git a/tests/integration-tests/tests/performance_tests/test_osu/test_osu/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_osu/test_osu/pcluster.config.yaml index d4d03c2078..511630d55b 100644 --- a/tests/integration-tests/tests/performance_tests/test_osu/test_osu/pcluster.config.yaml +++ b/tests/integration-tests/tests/performance_tests/test_osu/test_osu/pcluster.config.yaml @@ -38,3 +38,6 @@ SharedStorage: - MountDir: /shared Name: name1 StorageType: Ebs +DevSettings: + Cookbook: + ExtraChefAttributes: '{{ extra_chef_attributes }}' From f371e9e01dc2bccfd59b9ec657362e280001dcac Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Wed, 3 Dec 2025 23:41:14 -0500 Subject: [PATCH 21/29] [StarCCM] Update test_starccm performance test to use in_place_update_on_fleet_enabled --- .../tests/performance_tests/test_starccm.py | 13 +++++++++---- .../test_starccm/test_starccm/pcluster.config.yaml | 3 +++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index 837e0f870a..7516a61a94 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -58,12 +58,13 @@ def calculate_observed_value(result, remote_command_executor, scheduler_commands @pytest.mark.usefixtures("serial_execution_by_instance") -@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize("in_place_update_on_fleet_enabled", ["true", "false"]) def test_starccm( vpc_stack, instance, os, region, + in_place_update_on_fleet_enabled, scheduler, pcluster_config_reader, clusters_factory, @@ -77,10 +78,15 @@ def test_starccm( s3 = 
boto3.client("s3") s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") + extra_chef_attributes = '{{"cluster": {{"in_place_update_on_fleet_enabled": "{}"}}}}'.format( + in_place_update_on_fleet_enabled + ) + cluster_config = pcluster_config_reader( bucket_name=bucket_name, install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, number_of_nodes=max(number_of_nodes), + extra_chef_attributes=extra_chef_attributes, ) cluster = clusters_factory(cluster_config) logging.info("Cluster Created") @@ -109,9 +115,8 @@ def test_starccm( instance_slots = fetch_instance_slots(region, instance, multithreading_disabled=True) run_command = f'sbatch --ntasks={num_of_nodes * instance_slots} starccm.slurm.sh "{podkey}" "{licpath}"' multiple_runs = [] - # Run at least four times up to whatever parallelism allows to maximize usage of available nodes - # Running the test multiple times reduces and get the average value improves stability of the result. - number_of_runs = max(parallelism, 4) + # Run at least twice up to whatever parallelism allows to maximize usage of available nodes + number_of_runs = max(parallelism, 2) for _ in range(number_of_runs): multiple_runs.append(remote_command_executor.run_remote_command(run_command)) for run in multiple_runs: diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml index 6bf23c1cd7..57bcba628d 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml +++ b/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml @@ -62,3 +62,6 @@ SharedStorage: DeploymentType: PERSISTENT_1 PerUnitStorageThroughput: 100 StorageType: SSD +DevSettings: + Cookbook: + ExtraChefAttributes: '{{ extra_chef_attributes }}' \ No newline at end of file From 21589d894c54a330d0ab52a11da5851dbab0b774 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 4 Dec 2025 13:11:22 -0500 Subject: [PATCH 22/29] [PERF] Skipping the performance tests when in_place_update_on_fleet_enabled is true --- .../tests/performance_tests/test_osu.py | 16 +++++++++++++--- .../tests/performance_tests/test_starccm.py | 14 +++++++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/tests/integration-tests/tests/performance_tests/test_osu.py b/tests/integration-tests/tests/performance_tests/test_osu.py index de255c2370..ad0baa6f72 100644 --- a/tests/integration-tests/tests/performance_tests/test_osu.py +++ b/tests/integration-tests/tests/performance_tests/test_osu.py @@ -9,6 +9,7 @@ # or in the "LICENSE.txt" file accompanying this file. # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. +import json import logging import re @@ -49,6 +50,11 @@ def test_osu( scheduler_commands_factory, request, ): + if in_place_update_on_fleet_enabled == "true": + message = f"Skipping the test as we want to compare performance when cfn-hup is disabled" + logging.warn(message) + pytest.skip(message) + if instance not in OSU_BENCHMARKS_INSTANCES: raise Exception( f"OSU benchmarks can't be run on instance {instance}. 
" @@ -60,9 +66,13 @@ def test_osu( capacity_reservation_id = None placement_group_enabled = True - extra_chef_attributes = '{{"cluster": {{"in_place_update_on_fleet_enabled": "{}"}}}}'.format( - in_place_update_on_fleet_enabled - ) + chef_attributes_dict = { + "cluster": { + "in_place_update_on_fleet_enabled": in_place_update_on_fleet_enabled + } + } + extra_chef_attributes = json.dumps(chef_attributes_dict) + if instance in ["p6-b200.48xlarge", "p5en.48xlarge"]: max_queue_size = 2 capacity_type = "CAPACITY_BLOCK" diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index 7516a61a94..d85309d1f9 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -72,15 +72,23 @@ def test_starccm( scheduler_commands_factory, s3_bucket_factory, ): + if in_place_update_on_fleet_enabled == "true": + message = f"Skipping the test as we want to compare performance when cfn-hup is disabled" + logging.warn(message) + pytest.skip(message) + number_of_nodes = [8, 16, 32] # Create S3 bucket for custom actions scripts bucket_name = s3_bucket_factory() s3 = boto3.client("s3") s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") - extra_chef_attributes = '{{"cluster": {{"in_place_update_on_fleet_enabled": "{}"}}}}'.format( - in_place_update_on_fleet_enabled - ) + chef_attributes_dict = { + "cluster": { + "in_place_update_on_fleet_enabled": in_place_update_on_fleet_enabled + } + } + extra_chef_attributes = json.dumps(chef_attributes_dict) cluster_config = pcluster_config_reader( bucket_name=bucket_name, From 7aa63080970044d5f7932f09a5afc8fa32129c90 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 5 Dec 2025 15:20:45 -0500 Subject: [PATCH 23/29] [Test] Increase HeadNodeBootstrapTimeout to 40 mins for Managed FSX integration tests as FSX take 20+ mins if throttled --- .../test_fsx_lustre_configuration_options/pcluster.config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre_configuration_options/pcluster.config.yaml b/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre_configuration_options/pcluster.config.yaml index 682b942012..d0449f1996 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre_configuration_options/pcluster.config.yaml +++ b/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre_configuration_options/pcluster.config.yaml @@ -1,3 +1,6 @@ +DevSettings: + Timeouts: + HeadNodeBootstrapTimeout: 2400 # Increasing Timeout as FSX takes 20+ minutes to create if throttled Image: Os: {{ os }} HeadNode: From 04a059e91bccaf0f93a02faeed60adff410781d4 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 11 Dec 2025 15:28:20 -0500 Subject: [PATCH 24/29] Fix code check failures in performance tests --- .../integration-tests/tests/performance_tests/test_osu.py | 8 ++------ .../tests/performance_tests/test_starccm.py | 8 ++------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/tests/integration-tests/tests/performance_tests/test_osu.py b/tests/integration-tests/tests/performance_tests/test_osu.py index ad0baa6f72..d7d712f917 100644 --- a/tests/integration-tests/tests/performance_tests/test_osu.py +++ b/tests/integration-tests/tests/performance_tests/test_osu.py @@ -51,7 +51,7 @@ def test_osu( request, ): if 
in_place_update_on_fleet_enabled == "true": - message = f"Skipping the test as we want to compare performance when cfn-hup is disabled" + message = "Skipping the test as we want to compare performance when cfn-hup is disabled" logging.warn(message) pytest.skip(message) @@ -66,11 +66,7 @@ def test_osu( capacity_reservation_id = None placement_group_enabled = True - chef_attributes_dict = { - "cluster": { - "in_place_update_on_fleet_enabled": in_place_update_on_fleet_enabled - } - } + chef_attributes_dict = {"cluster": {"in_place_update_on_fleet_enabled": in_place_update_on_fleet_enabled}} extra_chef_attributes = json.dumps(chef_attributes_dict) if instance in ["p6-b200.48xlarge", "p5en.48xlarge"]: diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index d85309d1f9..fd77405be6 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -73,7 +73,7 @@ def test_starccm( s3_bucket_factory, ): if in_place_update_on_fleet_enabled == "true": - message = f"Skipping the test as we want to compare performance when cfn-hup is disabled" + message = "Skipping the test as we want to compare performance when cfn-hup is disabled" logging.warn(message) pytest.skip(message) @@ -83,11 +83,7 @@ def test_starccm( s3 = boto3.client("s3") s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") - chef_attributes_dict = { - "cluster": { - "in_place_update_on_fleet_enabled": in_place_update_on_fleet_enabled - } - } + chef_attributes_dict = {"cluster": {"in_place_update_on_fleet_enabled": in_place_update_on_fleet_enabled}} extra_chef_attributes = json.dumps(chef_attributes_dict) cluster_config = pcluster_config_reader( From cd61adabec3fcf49e259c596a5751c9a59cb70ee Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 16 Dec 2025 17:51:45 -0500 Subject: [PATCH 25/29] [Build image] Adding kernel-devel-matched to the OS in which it exists * the package dose not exist in Amazon Linux and Rocky/Rhel 8 AMI's; we avoid errors like Error: Unable to find a match: kernel6.12-devel-matched-6.12.58-82.121.amzn2023.aarch64 --- cli/src/pcluster/resources/imagebuilder/parallelcluster.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/pcluster/resources/imagebuilder/parallelcluster.yaml b/cli/src/pcluster/resources/imagebuilder/parallelcluster.yaml index b64296b2b9..eb25b13bda 100644 --- a/cli/src/pcluster/resources/imagebuilder/parallelcluster.yaml +++ b/cli/src/pcluster/resources/imagebuilder/parallelcluster.yaml @@ -108,7 +108,7 @@ phases: PACKAGE_LIST="${!KERNEL_PACKAGES_PREFIX}-headers-$(uname -r) ${!KERNEL_PACKAGES_PREFIX}-devel-$(uname -r) ${!KERNEL_PACKAGES_PREFIX}-modules-extra-$(uname -r)" Repository="BaseOS" - [[ ! $OS =~ (rocky8|rhel8) ]] && PACKAGE_LIST+=" ${!KERNEL_PACKAGES_PREFIX}-devel-matched-$(uname -r)" && Repository="AppStream" + [[ $OS =~ (rocky9|rhel9) ]] && PACKAGE_LIST+=" ${!KERNEL_PACKAGES_PREFIX}-devel-matched-$(uname -r)" && Repository="AppStream" if [[ $OS =~ rocky ]]; then for PACKAGE in ${!PACKAGE_LIST} From 7f6e76ef2a173e0d87cd6f0f9ca2b280c9433e34 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Thu, 18 Dec 2025 13:13:03 -0500 Subject: [PATCH 26/29] [BuildImage] On Ubuntu, disable snap refresh during AMI build process to prevent background updates of SSM agent that could lead to reboot failures. 
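For reference, a minimal sketch of the idea (the actual change lives in the build
recipes rather than in this CHANGELOG-only diff, and it assumes a snapd recent
enough to support refresh holds):

    # hold snap auto-refresh for the duration of the AMI build, so the
    # amazon-ssm-agent snap is not updated (and restarted) mid-build
    sudo snap refresh --hold
    # ... ParallelCluster build image steps run here ...
    # release the hold once the image has been baked
    sudo snap refresh --unhold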
--- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5fca0d07f0..d390d5ffe3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,7 @@ CHANGELOG - Reduce EFA installation time for Ubuntu by ~20 minutes by only holding kernel packages for the installed kernel. - Add GetFunction and GetPolicy permissions to PClusterBuildImageCleanupRole to prevent AccessDenied errors during build image stack deletion. - Fix validation error messages when `DevSettings` is null or `DevSettings/InstanceTypesData` is missing required fields. +- Disable snap auto-refresh on Ubuntu during build image to prevent intermittent reboot failures. 3.14.0 ------ From fed03e61e514b803ea3e7470eb92024f5acb5f20 Mon Sep 17 00:00:00 2001 From: Xuanqi He <93849823+hehe7318@users.noreply.github.com> Date: Tue, 23 Dec 2025 09:50:38 -0500 Subject: [PATCH 27/29] [Release-3.14][E2E Test] Prevent the bad IndexError when build image fails (#7171) When build image fails, the image_list is empty but the code directly accesses image_list[0], causing IndexError that masks the real error. Add safety check to only access list elements when list is not empty. --- .../integration-tests/tests/createami/test_createami.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/integration-tests/tests/createami/test_createami.py b/tests/integration-tests/tests/createami/test_createami.py index f634e4bd27..ae9d42c6f0 100644 --- a/tests/integration-tests/tests/createami/test_createami.py +++ b/tests/integration-tests/tests/createami/test_createami.py @@ -430,10 +430,11 @@ def _test_image_tag_and_volume(image): raise ImageNotFound() assert_that(len(image_list)).is_equal_to(1) - created_image = image_list[0] - volume_size = created_image.get("BlockDeviceMappings")[0].get("Ebs").get("VolumeSize") - assert_that(volume_size).is_equal_to(200) - assert_that(created_image["Tags"]).contains({"Key": "dummyImageTag", "Value": "dummyImageTag"}) + if len(image_list) > 0: + created_image = image_list[0] + volume_size = created_image.get("BlockDeviceMappings")[0].get("Ebs").get("VolumeSize") + assert_that(volume_size).is_equal_to(200) + assert_that(created_image["Tags"]).contains({"Key": "dummyImageTag", "Value": "dummyImageTag"}) @pytest.fixture() From a2f160305ecc7f7f91e72958197314a82e9b97c5 Mon Sep 17 00:00:00 2001 From: Xuanqi He <93849823+hehe7318@users.noreply.github.com> Date: Tue, 23 Dec 2025 10:33:17 -0500 Subject: [PATCH 28/29] [Release-3.14.1][E2E Test] Skip dcv installation when test_build_image on Ubuntu24 (#7164) * Skip dcv installation when test_build_image on ubuntu24.04 --- tests/integration-tests/tests/createami/test_createami.py | 5 +++++ .../test_createami/test_build_image/image.config.yaml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/tests/integration-tests/tests/createami/test_createami.py b/tests/integration-tests/tests/createami/test_createami.py index ae9d42c6f0..d2284a1dff 100644 --- a/tests/integration-tests/tests/createami/test_createami.py +++ b/tests/integration-tests/tests/createami/test_createami.py @@ -164,6 +164,10 @@ def test_build_image( enable_lustre_client = False if os in ["alinux2", "alinux2023", "rocky9"]: update_os_packages = True + + # Disable DCV installation for Ubuntu 24.04 to avoid build failures with DLAMI + enable_dcv = os != "ubuntu2404" + image_config = pcluster_config_reader( config_file="image.config.yaml", parent_image=base_ami, @@ -172,6 +176,7 @@ def test_build_image( enable_nvidia=str(enable_nvidia and 
get_gpu_count(instance) > 0).lower(), update_os_packages=str(update_os_packages).lower(), enable_lustre_client=str(enable_lustre_client).lower(), + enable_dcv=str(enable_dcv).lower(), ) image = images_factory(image_id, image_config, region) diff --git a/tests/integration-tests/tests/createami/test_createami/test_build_image/image.config.yaml b/tests/integration-tests/tests/createami/test_createami/test_build_image/image.config.yaml index db15a8ece3..2a3a8bd9b1 100644 --- a/tests/integration-tests/tests/createami/test_createami/test_build_image/image.config.yaml +++ b/tests/integration-tests/tests/createami/test_createami/test_build_image/image.config.yaml @@ -36,3 +36,8 @@ DeploymentSettings: - {{ default_vpc_security_group_id }} DevSettings: TerminateInstanceOnFailure: True +{% if enable_dcv == "false" %} + Cookbook: + ExtraChefAttributes: | + {"cluster": {"dcv": {"install_enabled": false}}} +{% endif %} From c1d212f4a86be72ddbe66d2b9545a3ccb5ba7ba4 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Wed, 24 Dec 2025 15:18:25 -0500 Subject: [PATCH 29/29] Fix changelog --- CHANGELOG.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d390d5ffe3..27cd7ba5be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,12 +45,6 @@ CHANGELOG - Add GetFunction and GetPolicy permissions to PClusterBuildImageCleanupRole to prevent AccessDenied errors during build image stack deletion. - Fix validation error messages when `DevSettings` is null or `DevSettings/InstanceTypesData` is missing required fields. -**BUG FIXES** -- Reduce EFA installation time for Ubuntu by ~20 minutes by only holding kernel packages for the installed kernel. -- Add GetFunction and GetPolicy permissions to PClusterBuildImageCleanupRole to prevent AccessDenied errors during build image stack deletion. -- Fix validation error messages when `DevSettings` is null or `DevSettings/InstanceTypesData` is missing required fields. -- Disable snap auto-refresh on Ubuntu during build image to prevent intermittent reboot failures. - 3.14.0 ------