aws · hehe7318 · Dec 25, 2025 · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025
@@ -108,7 +108,7 @@ phases:
 
                 PACKAGE_LIST="${!KERNEL_PACKAGES_PREFIX}-headers-$(uname -r) ${!KERNEL_PACKAGES_PREFIX}-devel-$(uname -r) ${!KERNEL_PACKAGES_PREFIX}-modules-extra-$(uname -r)"
                 Repository="BaseOS"
-                [[ ! $OS =~ (rocky8|rhel8) ]] && PACKAGE_LIST+=" ${!KERNEL_PACKAGES_PREFIX}-devel-matched-$(uname -r)" && Repository="AppStream"
+                [[ $OS =~ (rocky9|rhel9) ]] && PACKAGE_LIST+=" ${!KERNEL_PACKAGES_PREFIX}-devel-matched-$(uname -r)" && Repository="AppStream"
 
                 if [[ $OS =~ rocky ]]; then
                   for PACKAGE in ${!PACKAGE_LIST}

@@ -865,12 +865,6 @@ test-suites:
           instances: [ "c5n.18xlarge" ]
           oss: [{{ NO_ROCKY_OS_X86_0 }}] # ParallelCluster does not release official Rocky images. Skip the test.
           schedulers: [ "slurm" ]
-    test_openfoam.py::test_openfoam:
-      dimensions:
-        - regions: [ {{ c5n_18xlarge_CAPACITY_RESERVATION_35_INSTANCES_2_HOURS_YESPG_NO_ROCKY_OS_X86_0 }} ]
-          instances: [ "c5n.18xlarge" ]
-          oss: [{{ NO_ROCKY_OS_X86_0 }}] # ParallelCluster does not release official Rocky images. Skip the test.
-          schedulers: [ "slurm" ]
     test_startup_time.py::test_startup_time:
       dimensions:
         - regions: [ "us-east-1" ]

@@ -234,6 +234,12 @@ test-suites:
 #          oss: {{ OSS }}
 #          schedulers: {{ SCHEDULERS }}
   networking:
+    test_cluster_networking.py::test_cluster_with_subnet_prioritization:
+      dimensions:
+        - regions: {{ REGIONS }}
+          instances: {{ INSTANCES }}
+          oss: {{ OSS }}
+          schedulers: {{ SCHEDULERS }}
     test_cluster_networking.py::test_cluster_in_private_subnet:
       dimensions:
         - regions: {{ REGIONS }}
@@ -607,6 +613,13 @@ test-suites:
           instances: {{ INSTANCES }}
           oss: {{ OSS }}
           schedulers: {{ SCHEDULERS }}
+  ultraserver:
+    test_gb200.py::test_gb200:
+      dimensions:
+        - regions: {{ REGIONS }}
+          instances: ["g4dn.2xlarge"]
+          oss: {{ OSS }}
+          schedulers: {{ SCHEDULERS }}
 # These tests cannot be executed in US isolated regions
 # because the feature Custom Resource is not supported in these regions.
 #  custom_resource:

@@ -54,6 +54,7 @@
     "cn-north-1": ["cnn1-az1", "cnn1-az2"],
     # Should only consider supported AZs
     "us-isob-east-1": ["usibe1-az2", "usibe1-az3"],
+    "us-iso-east-1": ["usie1-az1", "usie1-az2"],
 }
 
 # used to map a ZoneId to the corresponding region

@@ -58,15 +58,33 @@
 
 # Remarkable AMIs are latest deep learning base AMI and FPGA developer AMI without pcluster infrastructure
 OS_TO_REMARKABLE_AMI_NAME_OWNER_MAP = {
-    "alinux2": {"name": "Deep Learning Base AMI (Amazon Linux 2)*", "owners": ["amazon"]},
+    # Use a patched DLAMI from which openssl11-devel, openssl11-libs and openssl11-pkcs have been
+    # uninstalled, so that it does not conflict with the pcluster build image.
+    "alinux2": {
+        "name": "Deep Learning OSS Nvidia Driver AMI (Amazon Linux 2) Version 83.9 for ParallelCluster*",
+        # If you are running in your personal account, then you must have this patched AMI
+        "owners": ["self"],
+    },
+    "alinux2023": {
+        "name": {
+            "x86_64": "Deep Learning Base OSS Nvidia Driver GPU AMI (Amazon Linux 2023)*",
+            "arm64": "Deep Learning ARM64 Base OSS Nvidia Driver GPU AMI (Amazon Linux 2023)*",
+        },
+        "owners": ["amazon"],
+    },
+    "ubuntu2204": {
+        "name": {
+            "x86_64": "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*",
+            "arm64": "Deep Learning ARM64 Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*",
+        },
+        "owners": ["amazon"],
+    },
     "ubuntu2404": {
         "name": "ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-*-server-*",
         "owners": ["099720109477"],
     },
     # Simple redhat8 to be able to build in remarkable test
-    # FIXME: when fixed upstream, unpin the timestamp introduced because the `kernel-devel` package was missing for
-    # the kernel released in 20231127 RHEL 8.8 AMI
-    "rhel8": {"name": "RHEL-8.8*_HVM-202309*", "owners": RHEL_OWNERS},
+    "rhel8": {"name": "RHEL-8.8*_HVM-*", "owners": RHEL_OWNERS},
     "rocky8": {"name": "Rocky-8-EC2-Base-8.10*", "owners": ["792107900819"]},  # TODO add china and govcloud accounts
     "rhel8.9": {"name": "RHEL-8.9*_HVM-*", "owners": RHEL_OWNERS},
     "rocky8.9": {"name": "Rocky-8-EC2-Base-8.9*", "owners": ["792107900819"]},  # TODO add china and govcloud accounts
@@ -128,7 +146,7 @@ def retrieve_latest_ami(
         if ami_type == "pcluster":
             ami_name = "aws-parallelcluster-{version}-{ami_name}".format(
                 version=get_installed_parallelcluster_version(),
-                ami_name=_get_ami_for_os(ami_type, os).get("name"),
+                ami_name=_get_ami_for_os(ami_type, os, architecture).get("name"),
             )
             if (
                 request
@@ -141,14 +159,14 @@ def retrieve_latest_ami(
                 # Then retrieve public pcluster AMIs
                 additional_filters.append({"Name": "is-public", "Values": ["true"]})
         else:
-            ami_name = _get_ami_for_os(ami_type, os).get("name")
+            ami_name = _get_ami_for_os(ami_type, os, architecture).get("name")
         logging.info("Parent image name %s" % ami_name)
         paginator = boto3.client("ec2", region_name=region).get_paginator("describe_images")
         page_iterator = paginator.paginate(
             Filters=[{"Name": "name", "Values": [ami_name]}, {"Name": "architecture", "Values": [architecture]}]
             + additional_filters,
-            Owners=_get_ami_for_os(ami_type, os).get("owners"),
-            IncludeDeprecated=_get_ami_for_os(ami_type, os).get("includeDeprecated", False),
+            Owners=_get_ami_for_os(ami_type, os, architecture).get("owners"),
+            IncludeDeprecated=_get_ami_for_os(ami_type, os, architecture).get("includeDeprecated", False),
         )
         images = []
         for page in page_iterator:
@@ -166,13 +184,32 @@ def retrieve_latest_ami(
         raise
 
 
-def _get_ami_for_os(ami_type, os):
+def _get_ami_for_os(ami_type, os, architecture="x86_64"):
+    """
+    Return the AMI lookup entry for the given AMI type and OS.
+
+    :param ami_type: key of AMI_TYPE_DICT selecting the OS-to-AMI mapping
+    :param os: key of the selected mapping
+    :param architecture: key used to pick the AMI name when the entry's "name" is a per-architecture dict
+    :return: the entry, whose "name" is the name pattern for the requested architecture
+    :raises Exception: if ami_type, os, or (for per-architecture names) architecture is not mapped
+    """
     ami_dict = AMI_TYPE_DICT.get(ami_type)
     if not ami_dict:
         raise Exception(f"'{ami_type}' not found in the dict 'AMI_TYPE_DICT'")
     os_ami = ami_dict.get(os)
     if not os_ami:
         raise Exception(f"'{os}' not found in the '{ami_type}' mapping referenced in the 'AMI_TYPE_DICT'")
+
+    # Get correct AMI names as per architecture
+    name = os_ami.get("name")
+    if isinstance(name, dict):
+        if architecture not in name:
+            raise Exception(f"'{architecture}' not found in the AMI names of '{os}' in the '{ami_type}' mapping")
+        # Copy the entry so keys other than "name" (e.g. "includeDeprecated") are kept
+        # and the shared mapping is not modified.
+        return {**os_ami, "name": name[architecture]}
+
     return os_ami
 
 

@@ -80,8 +80,8 @@ def test_invalid_config(
 
     # Test Suppression of a validator
 
-    # Get base AMI -- remarkable AMIs are not available for ARM and ubuntu2204, alinux2023 yet
-    if os not in ["ubuntu2204", "alinux2023"]:
+    # Get base AMI -- remarkable AL2 AMIs are failing because of conflicts between openssl-devel packages
+    if os not in ["alinux2"]:
         base_ami = retrieve_latest_ami(region, os, ami_type="remarkable", architecture=architecture)
     else:
         base_ami = retrieve_latest_ami(region, os, architecture=architecture)
@@ -142,7 +142,7 @@ def test_build_image(
         enable_nvidia = False
 
     # Get base AMI
-    if os in ["alinux2", "ubuntu2004"]:
+    if os in ["ubuntu2204"]:
         # Test Deep Learning AMIs
         base_ami = retrieve_latest_ami(region, os, ami_type="remarkable", architecture=architecture)
         enable_nvidia = False  # Deep learning AMIs have Nvidia pre-installed
@@ -164,6 +164,10 @@ def test_build_image(
             enable_lustre_client = False
     if os in ["alinux2", "alinux2023", "rocky9"]:
         update_os_packages = True
+
+    # Disable DCV installation for Ubuntu 24.04 to avoid build failures with DLAMI
+    enable_dcv = os != "ubuntu2404"
+
     image_config = pcluster_config_reader(
         config_file="image.config.yaml",
         parent_image=base_ami,
@@ -172,6 +176,7 @@ def test_build_image(
         enable_nvidia=str(enable_nvidia and get_gpu_count(instance) > 0).lower(),
         update_os_packages=str(update_os_packages).lower(),
         enable_lustre_client=str(enable_lustre_client).lower(),
+        enable_dcv=str(enable_dcv).lower(),
     )
 
     image = images_factory(image_id, image_config, region)
@@ -430,10 +435,11 @@ def _test_image_tag_and_volume(image):
         raise ImageNotFound()
     assert_that(len(image_list)).is_equal_to(1)
 
-    created_image = image_list[0]
-    volume_size = created_image.get("BlockDeviceMappings")[0].get("Ebs").get("VolumeSize")
-    assert_that(volume_size).is_equal_to(200)
-    assert_that(created_image["Tags"]).contains({"Key": "dummyImageTag", "Value": "dummyImageTag"})
+    if len(image_list) > 0:
+        created_image = image_list[0]
+        volume_size = created_image.get("BlockDeviceMappings")[0].get("Ebs").get("VolumeSize")
+        assert_that(volume_size).is_equal_to(200)
+        assert_that(created_image["Tags"]).contains({"Key": "dummyImageTag", "Value": "dummyImageTag"})
 
 
 @pytest.fixture()

@@ -36,3 +36,8 @@ DeploymentSettings:
         - {{ default_vpc_security_group_id }}
 DevSettings:
     TerminateInstanceOnFailure: True
+{% if enable_dcv == "false" %}
+    Cookbook:
+        ExtraChefAttributes: |
+            {"cluster": {"dcv": {"install_enabled": false}}}
+{% endif %}
@@ -51,10 +51,15 @@ def test_efa(
 
     Grouped all tests in a single function so that cluster can be reused for all of them.
     """
-    if architecture == "x86_64":
-        head_node_instance = "c5.18xlarge"
+    if instance.startswith("p") or instance.startswith("hpc"):
+        if architecture == "x86_64":
+            head_node_instance = "c5.18xlarge"
+        else:
+            head_node_instance = "c6g.16xlarge"
     else:
-        head_node_instance = "c6g.16xlarge"
+        # Use the same instance type for both compute node and head node
+        # when the instance type is available in open capacity pool
+        head_node_instance = instance
     max_queue_size = 2
     p6_b200_capacity_reservation_id = None
     if instance == "p6-b200.48xlarge":

@@ -9,6 +9,7 @@
 # or in the "LICENSE.txt" file accompanying this file.
 # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
+import json
 import logging
 import re
 
@@ -33,11 +34,12 @@
 
 
 @pytest.mark.usefixtures("serial_execution_by_instance")
-@pytest.mark.flaky(reruns=0)
+@pytest.mark.parametrize("in_place_update_on_fleet_enabled", ["true", "false"])
 def test_osu(
     os,
     region,
     scheduler,
+    in_place_update_on_fleet_enabled,
     instance,
     pcluster_config_reader,
     clusters_factory,
@@ -48,22 +50,25 @@ def test_osu(
     scheduler_commands_factory,
     request,
 ):
+    if in_place_update_on_fleet_enabled == "true":
+        message = "Skipping the test as we want to compare performance when cfn-hup is disabled"
+        logging.warn(message)
+        pytest.skip(message)
+
     if instance not in OSU_BENCHMARKS_INSTANCES:
         raise Exception(
             f"OSU benchmarks can't be run on instance {instance}. "
             f"Only these instances are supported: {OSU_BENCHMARKS_INSTANCES}"
         )
 
-    if architecture == "x86_64":
-        head_node_instance = "c5.18xlarge"
-    else:
-        head_node_instance = "c6g.16xlarge"
-
     max_queue_size = 32
     capacity_type = "ONDEMAND"
     capacity_reservation_id = None
     placement_group_enabled = True
 
+    chef_attributes_dict = {"cluster": {"in_place_update_on_fleet_enabled": in_place_update_on_fleet_enabled}}
+    extra_chef_attributes = json.dumps(chef_attributes_dict)
+
     if instance in ["p6-b200.48xlarge", "p5en.48xlarge"]:
         max_queue_size = 2
         capacity_type = "CAPACITY_BLOCK"
@@ -79,11 +84,11 @@ def test_osu(
 
     slots_per_instance = fetch_instance_slots(region, instance, multithreading_disabled=True)
     cluster_config = pcluster_config_reader(
-        head_node_instance=head_node_instance,
         max_queue_size=max_queue_size,
         capacity_type=capacity_type,
         capacity_reservation_id=capacity_reservation_id,
         placement_group_enabled=placement_group_enabled,
+        extra_chef_attributes=extra_chef_attributes,
     )
     cluster = clusters_factory(cluster_config)
     remote_command_executor = RemoteCommandExecutor(cluster)

@@ -1,7 +1,7 @@
 Image:
   Os: {{ os }}
 HeadNode:
-  InstanceType: {{ head_node_instance }}
+  InstanceType: {{ instance }}
   Networking:
     SubnetId: {{ public_subnet_id }}
   Ssh:
@@ -38,3 +38,6 @@ SharedStorage:
   - MountDir: /shared
     Name: name1
     StorageType: Ebs
+DevSettings:
+  Cookbook:
+    ExtraChefAttributes: '{{ extra_chef_attributes }}'
@@ -58,29 +58,39 @@ def calculate_observed_value(result, remote_command_executor, scheduler_commands
 
 
 @pytest.mark.usefixtures("serial_execution_by_instance")
-@pytest.mark.flaky(reruns=0)
+@pytest.mark.parametrize("in_place_update_on_fleet_enabled", ["true", "false"])
 def test_starccm(
     vpc_stack,
     instance,
     os,
     region,
+    in_place_update_on_fleet_enabled,
     scheduler,
     pcluster_config_reader,
     clusters_factory,
     test_datadir,
     scheduler_commands_factory,
     s3_bucket_factory,
 ):
+    if in_place_update_on_fleet_enabled == "true":
+        message = "Skipping the test as we want to compare performance when cfn-hup is disabled"
+        logging.warn(message)
+        pytest.skip(message)
+
     number_of_nodes = [8, 16, 32]
     # Create S3 bucket for custom actions scripts
     bucket_name = s3_bucket_factory()
     s3 = boto3.client("s3")
     s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh")
 
+    chef_attributes_dict = {"cluster": {"in_place_update_on_fleet_enabled": in_place_update_on_fleet_enabled}}
+    extra_chef_attributes = json.dumps(chef_attributes_dict)
+
     cluster_config = pcluster_config_reader(
         bucket_name=bucket_name,
         install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS,
         number_of_nodes=max(number_of_nodes),
+        extra_chef_attributes=extra_chef_attributes,
     )
     cluster = clusters_factory(cluster_config)
     logging.info("Cluster Created")
@@ -109,9 +119,8 @@ def test_starccm(
         instance_slots = fetch_instance_slots(region, instance, multithreading_disabled=True)
         run_command = f'sbatch --ntasks={num_of_nodes * instance_slots} starccm.slurm.sh "{podkey}" "{licpath}"'
         multiple_runs = []
-        # Run at least four times up to whatever parallelism allows to maximize usage of available nodes
-        # Running the test multiple times reduces and get the average value improves stability of the result.
-        number_of_runs = max(parallelism, 4)
+        # Run at least twice up to whatever parallelism allows to maximize usage of available nodes
+        number_of_runs = max(parallelism, 2)
         for _ in range(number_of_runs):
             multiple_runs.append(remote_command_executor.run_remote_command(run_command))
         for run in multiple_runs:

@@ -62,3 +62,6 @@ SharedStorage:
       DeploymentType: PERSISTENT_1
       PerUnitStorageThroughput: 100
       StorageType: SSD
+DevSettings:
+  Cookbook:
+    ExtraChefAttributes: '{{ extra_chef_attributes }}'
@@ -2594,7 +2594,7 @@ def _test_slurm_behavior_when_updating_schedulable_memory_with_already_running_j
         submit_command_args={
             "nodes": 1,
             "slots": 1,
-            "command": "srun ./a.out 3000000000 390",
+            "command": "srun ./a.out 2000000000 390",
             "other_options": "-w queue1-st-ondemand1-i1-1",
             "raise_on_error": False,
         }

@@ -1,3 +1,6 @@
+DevSettings:
+  Timeouts:
+    HeadNodeBootstrapTimeout: 2400  # Increasing Timeout as FSX takes 20+ minutes to create if throttled
 Image:
   Os: {{ os }}
 HeadNode: