diff --git a/cli/src/pcluster/resources/imagebuilder/parallelcluster.yaml b/cli/src/pcluster/resources/imagebuilder/parallelcluster.yaml index b64296b2b9..eb25b13bda 100644 --- a/cli/src/pcluster/resources/imagebuilder/parallelcluster.yaml +++ b/cli/src/pcluster/resources/imagebuilder/parallelcluster.yaml @@ -108,7 +108,7 @@ phases: PACKAGE_LIST="${!KERNEL_PACKAGES_PREFIX}-headers-$(uname -r) ${!KERNEL_PACKAGES_PREFIX}-devel-$(uname -r) ${!KERNEL_PACKAGES_PREFIX}-modules-extra-$(uname -r)" Repository="BaseOS" - [[ ! $OS =~ (rocky8|rhel8) ]] && PACKAGE_LIST+=" ${!KERNEL_PACKAGES_PREFIX}-devel-matched-$(uname -r)" && Repository="AppStream" + [[ $OS =~ (rocky9|rhel9) ]] && PACKAGE_LIST+=" ${!KERNEL_PACKAGES_PREFIX}-devel-matched-$(uname -r)" && Repository="AppStream" if [[ $OS =~ rocky ]]; then for PACKAGE in ${!PACKAGE_LIST} diff --git a/tests/integration-tests/configs/develop.yaml b/tests/integration-tests/configs/develop.yaml index 98f8cdc65b..e219f5200a 100644 --- a/tests/integration-tests/configs/develop.yaml +++ b/tests/integration-tests/configs/develop.yaml @@ -865,12 +865,6 @@ test-suites: instances: [ "c5n.18xlarge" ] oss: [{{ NO_ROCKY_OS_X86_0 }}] # ParallelCluster does not release official Rocky images. Skip the test. schedulers: [ "slurm" ] - test_openfoam.py::test_openfoam: - dimensions: - - regions: [ {{ c5n_18xlarge_CAPACITY_RESERVATION_35_INSTANCES_2_HOURS_YESPG_NO_ROCKY_OS_X86_0 }} ] - instances: [ "c5n.18xlarge" ] - oss: [{{ NO_ROCKY_OS_X86_0 }}] # ParallelCluster does not release official Rocky images. Skip the test. 
- schedulers: [ "slurm" ] test_startup_time.py::test_startup_time: dimensions: - regions: [ "us-east-1" ] diff --git a/tests/integration-tests/configs/isolated_regions.yaml b/tests/integration-tests/configs/isolated_regions.yaml index 0013b30679..df98c45b15 100644 --- a/tests/integration-tests/configs/isolated_regions.yaml +++ b/tests/integration-tests/configs/isolated_regions.yaml @@ -234,6 +234,12 @@ test-suites: # oss: {{ OSS }} # schedulers: {{ SCHEDULERS }} networking: + test_cluster_networking.py::test_cluster_with_subnet_prioritization: + dimensions: + - regions: {{ REGIONS }} + instances: {{ INSTANCES }} + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} test_cluster_networking.py::test_cluster_in_private_subnet: dimensions: - regions: {{ REGIONS }} @@ -607,6 +613,13 @@ test-suites: instances: {{ INSTANCES }} oss: {{ OSS }} schedulers: {{ SCHEDULERS }} + ultraserver: + test_gb200.py::test_gb200: + dimensions: + - regions: {{ REGIONS }} + instances: ["g4dn.2xlarge"] + oss: {{ OSS }} + schedulers: {{ SCHEDULERS }} # These tests cannot be executed in US isolated regions # because the feature Custom Resource is not supported in these regions. 
# custom_resource: diff --git a/tests/integration-tests/conftest_networking.py b/tests/integration-tests/conftest_networking.py index 70f3add6e1..f7a1aaaa78 100644 --- a/tests/integration-tests/conftest_networking.py +++ b/tests/integration-tests/conftest_networking.py @@ -54,6 +54,7 @@ "cn-north-1": ["cnn1-az1", "cnn1-az2"], # Should only consider supported AZs "us-isob-east-1": ["usibe1-az2", "usibe1-az3"], + "us-iso-east-1": ["usie1-az1", "usie1-az2"], } # used to map a ZoneId to the corresponding region diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index 2138811779..4ba334bbd3 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -58,15 +58,33 @@ # Remarkable AMIs are latest deep learning base AMI and FPGA developer AMI without pcluster infrastructure OS_TO_REMARKABLE_AMI_NAME_OWNER_MAP = { - "alinux2": {"name": "Deep Learning Base AMI (Amazon Linux 2)*", "owners": ["amazon"]}, + # Using a patched DLAMI which has uninstalled openssl11-devel, openssl11-libs and openssl11-pkcs + # so that it will not conflict with pcluster build image. 
+ "alinux2": { + "name": "Deep Learning OSS Nvidia Driver AMI (Amazon Linux 2) Version 83.9 for ParallelCluster*", + # If you are running in your personal account, then you must have this patched AMI + "owners": ["self"], + }, + "alinux2023": { + "name": { + "x86_64": "Deep Learning Base OSS Nvidia Driver GPU AMI (Amazon Linux 2023)*", + "arm64": "Deep Learning ARM64 Base OSS Nvidia Driver GPU AMI (Amazon Linux 2023)*", + }, + "owners": ["amazon"], + }, + "ubuntu2204": { + "name": { + "x86_64": "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*", + "arm64": "Deep Learning ARM64 Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*", + }, + "owners": ["amazon"], + }, "ubuntu2404": { "name": "ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-*-server-*", "owners": ["099720109477"], }, # Simple redhat8 to be able to build in remarkable test - # FIXME: when fixed upstream, unpin the timestamp introduced because the `kernel-devel` package was missing for - # the kernel released in 20231127 RHEL 8.8 AMI - "rhel8": {"name": "RHEL-8.8*_HVM-202309*", "owners": RHEL_OWNERS}, + "rhel8": {"name": "RHEL-8.8*_HVM-*", "owners": RHEL_OWNERS}, "rocky8": {"name": "Rocky-8-EC2-Base-8.10*", "owners": ["792107900819"]}, # TODO add china and govcloud accounts "rhel8.9": {"name": "RHEL-8.9*_HVM-*", "owners": RHEL_OWNERS}, "rocky8.9": {"name": "Rocky-8-EC2-Base-8.9*", "owners": ["792107900819"]}, # TODO add china and govcloud accounts @@ -128,7 +146,7 @@ def retrieve_latest_ami( if ami_type == "pcluster": ami_name = "aws-parallelcluster-{version}-{ami_name}".format( version=get_installed_parallelcluster_version(), - ami_name=_get_ami_for_os(ami_type, os).get("name"), + ami_name=_get_ami_for_os(ami_type, os, architecture).get("name"), ) if ( request @@ -141,14 +159,14 @@ def retrieve_latest_ami( # Then retrieve public pcluster AMIs additional_filters.append({"Name": "is-public", "Values": ["true"]}) else: - ami_name = _get_ami_for_os(ami_type, os).get("name") + ami_name = 
_get_ami_for_os(ami_type, os, architecture).get("name") logging.info("Parent image name %s" % ami_name) paginator = boto3.client("ec2", region_name=region).get_paginator("describe_images") page_iterator = paginator.paginate( Filters=[{"Name": "name", "Values": [ami_name]}, {"Name": "architecture", "Values": [architecture]}] + additional_filters, - Owners=_get_ami_for_os(ami_type, os).get("owners"), - IncludeDeprecated=_get_ami_for_os(ami_type, os).get("includeDeprecated", False), + Owners=_get_ami_for_os(ami_type, os, architecture).get("owners"), + IncludeDeprecated=_get_ami_for_os(ami_type, os, architecture).get("includeDeprecated", False), ) images = [] for page in page_iterator: @@ -166,13 +184,19 @@ def retrieve_latest_ami( raise -def _get_ami_for_os(ami_type, os): +def _get_ami_for_os(ami_type, os, architecture="x86_64"): ami_dict = AMI_TYPE_DICT.get(ami_type) if not ami_dict: raise Exception(f"'{ami_type}' not found in the dict 'AMI_TYPE_DICT'") os_ami = ami_dict.get(os) if not os_ami: raise Exception(f"'{os}' not found in the '{ami_type}' mapping referenced in the 'AMI_TYPE_DICT'") + + # Resolve the architecture-specific AMI name; copy every other key (owners, includeDeprecated, ...) unchanged + if isinstance(os_ami.get("name"), dict): + name = os_ami["name"].get(architecture) + return {**os_ami, "name": name} + return os_ami diff --git a/tests/integration-tests/tests/createami/test_createami.py b/tests/integration-tests/tests/createami/test_createami.py index 7e4570ca92..d2284a1dff 100644 --- a/tests/integration-tests/tests/createami/test_createami.py +++ b/tests/integration-tests/tests/createami/test_createami.py @@ -80,8 +80,8 @@ def test_invalid_config( # Test Suppression of a validator - # Get base AMI -- remarkable AMIs are not available for ARM and ubuntu2204, alinux2023 yet - if os not in ["ubuntu2204", "alinux2023"]: + # Get base AMI -- remarkable AL2 AMIs are failing because of conflicts between openssl-devel packages + if os not in ["alinux2"]: base_ami = retrieve_latest_ami(region, os,
ami_type="remarkable", architecture=architecture) else: base_ami = retrieve_latest_ami(region, os, architecture=architecture) @@ -142,7 +142,7 @@ def test_build_image( enable_nvidia = False # Get base AMI - if os in ["alinux2", "ubuntu2004"]: + if os in ["ubuntu2204"]: # Test Deep Learning AMIs base_ami = retrieve_latest_ami(region, os, ami_type="remarkable", architecture=architecture) enable_nvidia = False # Deep learning AMIs have Nvidia pre-installed @@ -164,6 +164,10 @@ def test_build_image( enable_lustre_client = False if os in ["alinux2", "alinux2023", "rocky9"]: update_os_packages = True + + # Disable DCV installation for Ubuntu 24.04 to avoid build failures with DLAMI + enable_dcv = os != "ubuntu2404" + image_config = pcluster_config_reader( config_file="image.config.yaml", parent_image=base_ami, @@ -172,6 +176,7 @@ def test_build_image( enable_nvidia=str(enable_nvidia and get_gpu_count(instance) > 0).lower(), update_os_packages=str(update_os_packages).lower(), enable_lustre_client=str(enable_lustre_client).lower(), + enable_dcv=str(enable_dcv).lower(), ) image = images_factory(image_id, image_config, region) @@ -430,10 +435,11 @@ def _test_image_tag_and_volume(image): raise ImageNotFound() assert_that(len(image_list)).is_equal_to(1) - created_image = image_list[0] - volume_size = created_image.get("BlockDeviceMappings")[0].get("Ebs").get("VolumeSize") - assert_that(volume_size).is_equal_to(200) - assert_that(created_image["Tags"]).contains({"Key": "dummyImageTag", "Value": "dummyImageTag"}) + if len(image_list) > 0: + created_image = image_list[0] + volume_size = created_image.get("BlockDeviceMappings")[0].get("Ebs").get("VolumeSize") + assert_that(volume_size).is_equal_to(200) + assert_that(created_image["Tags"]).contains({"Key": "dummyImageTag", "Value": "dummyImageTag"}) @pytest.fixture() diff --git a/tests/integration-tests/tests/createami/test_createami/test_build_image/image.config.yaml 
b/tests/integration-tests/tests/createami/test_createami/test_build_image/image.config.yaml index db15a8ece3..2a3a8bd9b1 100644 --- a/tests/integration-tests/tests/createami/test_createami/test_build_image/image.config.yaml +++ b/tests/integration-tests/tests/createami/test_createami/test_build_image/image.config.yaml @@ -36,3 +36,8 @@ DeploymentSettings: - {{ default_vpc_security_group_id }} DevSettings: TerminateInstanceOnFailure: True +{% if enable_dcv == "false" %} + Cookbook: + ExtraChefAttributes: | + {"cluster": {"dcv": {"install_enabled": false}}} +{% endif %} diff --git a/tests/integration-tests/tests/efa/test_efa.py b/tests/integration-tests/tests/efa/test_efa.py index d6962207ed..8afb932431 100644 --- a/tests/integration-tests/tests/efa/test_efa.py +++ b/tests/integration-tests/tests/efa/test_efa.py @@ -51,10 +51,15 @@ def test_efa( Grouped all tests in a single function so that cluster can be reused for all of them. """ - if architecture == "x86_64": - head_node_instance = "c5.18xlarge" + if instance.startswith("p") or instance.startswith("hpc"): + if architecture == "x86_64": + head_node_instance = "c5.18xlarge" + else: + head_node_instance = "c6g.16xlarge" else: - head_node_instance = "c6g.16xlarge" + # Use the same instance type for both compute node and head node + # when the instance type is available in open capacity pool + head_node_instance = instance max_queue_size = 2 p6_b200_capacity_reservation_id = None if instance == "p6-b200.48xlarge": diff --git a/tests/integration-tests/tests/performance_tests/test_osu.py b/tests/integration-tests/tests/performance_tests/test_osu.py index 783e719273..d7d712f917 100644 --- a/tests/integration-tests/tests/performance_tests/test_osu.py +++ b/tests/integration-tests/tests/performance_tests/test_osu.py @@ -9,6 +9,7 @@ # or in the "LICENSE.txt" file accompanying this file. # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. 
# See the License for the specific language governing permissions and limitations under the License. +import json import logging import re @@ -33,11 +34,12 @@ @pytest.mark.usefixtures("serial_execution_by_instance") -@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize("in_place_update_on_fleet_enabled", ["true", "false"]) def test_osu( os, region, scheduler, + in_place_update_on_fleet_enabled, instance, pcluster_config_reader, clusters_factory, @@ -48,22 +50,25 @@ def test_osu( scheduler_commands_factory, request, ): + if in_place_update_on_fleet_enabled == "true": + message = "Skipping the test as we want to compare performance when cfn-hup is disabled" + logging.warn(message) + pytest.skip(message) + if instance not in OSU_BENCHMARKS_INSTANCES: raise Exception( f"OSU benchmarks can't be run on instance {instance}. " f"Only these instances are supported: {OSU_BENCHMARKS_INSTANCES}" ) - if architecture == "x86_64": - head_node_instance = "c5.18xlarge" - else: - head_node_instance = "c6g.16xlarge" - max_queue_size = 32 capacity_type = "ONDEMAND" capacity_reservation_id = None placement_group_enabled = True + chef_attributes_dict = {"cluster": {"in_place_update_on_fleet_enabled": in_place_update_on_fleet_enabled}} + extra_chef_attributes = json.dumps(chef_attributes_dict) + if instance in ["p6-b200.48xlarge", "p5en.48xlarge"]: max_queue_size = 2 capacity_type = "CAPACITY_BLOCK" @@ -79,11 +84,11 @@ def test_osu( slots_per_instance = fetch_instance_slots(region, instance, multithreading_disabled=True) cluster_config = pcluster_config_reader( - head_node_instance=head_node_instance, max_queue_size=max_queue_size, capacity_type=capacity_type, capacity_reservation_id=capacity_reservation_id, placement_group_enabled=placement_group_enabled, + extra_chef_attributes=extra_chef_attributes, ) cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) diff --git 
a/tests/integration-tests/tests/performance_tests/test_osu/test_osu/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_osu/test_osu/pcluster.config.yaml index b42253da24..511630d55b 100644 --- a/tests/integration-tests/tests/performance_tests/test_osu/test_osu/pcluster.config.yaml +++ b/tests/integration-tests/tests/performance_tests/test_osu/test_osu/pcluster.config.yaml @@ -1,7 +1,7 @@ Image: Os: {{ os }} HeadNode: - InstanceType: {{ head_node_instance }} + InstanceType: {{ instance }} Networking: SubnetId: {{ public_subnet_id }} Ssh: @@ -38,3 +38,6 @@ SharedStorage: - MountDir: /shared Name: name1 StorageType: Ebs +DevSettings: + Cookbook: + ExtraChefAttributes: '{{ extra_chef_attributes }}' diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index 837e0f870a..fd77405be6 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -58,12 +58,13 @@ def calculate_observed_value(result, remote_command_executor, scheduler_commands @pytest.mark.usefixtures("serial_execution_by_instance") -@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize("in_place_update_on_fleet_enabled", ["true", "false"]) def test_starccm( vpc_stack, instance, os, region, + in_place_update_on_fleet_enabled, scheduler, pcluster_config_reader, clusters_factory, @@ -71,16 +72,25 @@ def test_starccm( scheduler_commands_factory, s3_bucket_factory, ): + if in_place_update_on_fleet_enabled == "true": + message = "Skipping the test as we want to compare performance when cfn-hup is disabled" + logging.warn(message) + pytest.skip(message) + number_of_nodes = [8, 16, 32] # Create S3 bucket for custom actions scripts bucket_name = s3_bucket_factory() s3 = boto3.client("s3") s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") + 
chef_attributes_dict = {"cluster": {"in_place_update_on_fleet_enabled": in_place_update_on_fleet_enabled}} + extra_chef_attributes = json.dumps(chef_attributes_dict) + cluster_config = pcluster_config_reader( bucket_name=bucket_name, install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, number_of_nodes=max(number_of_nodes), + extra_chef_attributes=extra_chef_attributes, ) cluster = clusters_factory(cluster_config) logging.info("Cluster Created") @@ -109,9 +119,8 @@ def test_starccm( instance_slots = fetch_instance_slots(region, instance, multithreading_disabled=True) run_command = f'sbatch --ntasks={num_of_nodes * instance_slots} starccm.slurm.sh "{podkey}" "{licpath}"' multiple_runs = [] - # Run at least four times up to whatever parallelism allows to maximize usage of available nodes - # Running the test multiple times reduces and get the average value improves stability of the result. - number_of_runs = max(parallelism, 4) + # Run at least twice up to whatever parallelism allows to maximize usage of available nodes + number_of_runs = max(parallelism, 2) for _ in range(number_of_runs): multiple_runs.append(remote_command_executor.run_remote_command(run_command)) for run in multiple_runs: diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml index 6bf23c1cd7..57bcba628d 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml +++ b/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml @@ -62,3 +62,6 @@ SharedStorage: DeploymentType: PERSISTENT_1 PerUnitStorageThroughput: 100 StorageType: SSD +DevSettings: + Cookbook: + ExtraChefAttributes: '{{ extra_chef_attributes }}' \ No newline at end of file diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index 
483ec9d8bb..6ea6008be4 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -2594,7 +2594,7 @@ def _test_slurm_behavior_when_updating_schedulable_memory_with_already_running_j submit_command_args={ "nodes": 1, "slots": 1, - "command": "srun ./a.out 3000000000 390", + "command": "srun ./a.out 2000000000 390", "other_options": "-w queue1-st-ondemand1-i1-1", "raise_on_error": False, } diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre_configuration_options/pcluster.config.yaml b/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre_configuration_options/pcluster.config.yaml index 682b942012..d0449f1996 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre_configuration_options/pcluster.config.yaml +++ b/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre_configuration_options/pcluster.config.yaml @@ -1,3 +1,6 @@ +DevSettings: + Timeouts: + HeadNodeBootstrapTimeout: 2400 # Increasing Timeout as FSX takes 20+ minutes to create if throttled Image: Os: {{ os }} HeadNode: diff --git a/tests/integration-tests/tests/ultraserver/test_gb200.py b/tests/integration-tests/tests/ultraserver/test_gb200.py index 1baec1d8e4..fe752d1856 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200.py +++ b/tests/integration-tests/tests/ultraserver/test_gb200.py @@ -368,6 +368,7 @@ def test_gb200( instance, scheduler, os, + architecture, scheduler_commands_factory, request, ): @@ -402,7 +403,14 @@ def test_gb200( """ capacity_reservation_id = None max_queue_size = 2 - headnode_instance_type = "c7g.4xlarge" + headnode_instance_type = "" + if os == "alinux2": + pytest.skip(f"Skipping the test as nvidia-imex is not installed in {os}") + + if architecture == "x86_64": + headnode_instance_type = "c5.xlarge" + else: + headnode_instance_type = "c7g.4xlarge" if instance == "p6e-gb200.36xlarge": 
ultraserver_reservations_ids = get_capacity_reservation_id(request, instance, region, max_queue_size, os) if ultraserver_reservations_ids: @@ -451,6 +459,7 @@ def test_gb200( compute_resource_without_imex=compute_resource_without_imex, capacity_block_reservation_id=capacity_block_reservation_id, headnode_instance_type=headnode_instance_type, + architecture=architecture, ) slots_per_instance = fetch_instance_slots(region, instance, multithreading_disabled=True) suppress_validators = ["type:UltraserverCapacityBlockSizeValidator"] @@ -497,6 +506,7 @@ def test_gb200( queue_without_imex=queue_without_imex, compute_resource_without_imex=compute_resource_without_imex, headnode_instance_type=headnode_instance_type, + architecture=architecture, ) cluster.stop() @@ -544,6 +554,7 @@ def test_gb200( queue_without_imex=queue_without_imex, compute_resource_without_imex=compute_resource_without_imex, headnode_instance_type=headnode_instance_type, + architecture=architecture, ) cluster.stop() diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh index 4d5118dc55..794778891d 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh +++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh @@ -22,7 +22,7 @@ SCONTROL_CMD="/opt/slurm/bin/scontrol" IMEX_START_TIMEOUT=60 IMEX_STOP_TIMEOUT=15 #TODO In production, specify p6e-gb200, only. We added g5g only for testing purposes. 
-ALLOWED_INSTANCE_TYPES="^(p6e-gb200|g5g)" +ALLOWED_INSTANCE_TYPES="^(p6e-gb200|g5g|g4d)" IMEX_SERVICE="nvidia-imex" IMEX_NODES_CONFIG="/etc/nvidia-imex/nodes_config.cfg" diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.final.yaml b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.final.yaml index 14e767d42c..9654620b09 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.final.yaml +++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.final.yaml @@ -53,9 +53,13 @@ Scheduling: ComputeResources: - Name: {{ compute_resource_without_imex }} Instances: + {% if architecture=="x86_64" %} + - InstanceType: {{ headnode_instance_type }} + {% else %} - InstanceType: m6g.xlarge - InstanceType: t4g.xlarge - InstanceType: c6gn.xlarge + {% endif %} MinCount: {{ min_queue_size_without_imex }} MaxCount: 1 Networking: diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.update.yaml b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.update.yaml index 08cbabc7a6..33d35e542c 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.update.yaml +++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.update.yaml @@ -51,9 +51,13 @@ Scheduling: ComputeResources: - Name: {{ compute_resource_without_imex }} Instances: + {% if architecture=="x86_64" %} + - InstanceType: {{ headnode_instance_type }} + {% else %} - InstanceType: m6g.xlarge - InstanceType: t4g.xlarge - InstanceType: c6gn.xlarge + {% endif %} MinCount: {{ min_queue_size_without_imex }} MaxCount: 1 Networking: diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.yaml b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.yaml index 02d9333a58..164f565db3 100644 --- 
a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.yaml +++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/pcluster.config.yaml @@ -48,9 +48,13 @@ Scheduling: ComputeResources: - Name: {{ compute_resource_without_imex }} Instances: + {% if architecture=="x86_64" %} + - InstanceType: {{ headnode_instance_type }} + {% else %} - InstanceType: m6g.xlarge - InstanceType: t4g.xlarge - InstanceType: c6gn.xlarge + {% endif %} MinCount: {{ min_queue_size_without_imex }} MaxCount: 1 Networking: diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index a674d8a0b5..53b5db8c70 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -83,10 +83,10 @@ def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory, cluster ]: bucket.upload_file(str(test_datadir / script), f"scripts/{script}") - spot_instance_types = ["t3.small", "t3.medium"] + spot_instance_types = ["t3.medium"] try: - boto3.client("ec2").describe_instance_types(InstanceTypes=["t3a.small"]) - spot_instance_types.extend(["t3a.small", "t3a.medium"]) + boto3.client("ec2").describe_instance_types(InstanceTypes=["t3a.medium"]) + spot_instance_types.extend(["t3a.medium"]) except Exception: pass