diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 1bba87cc9..f6415e05e 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -501,17 +501,20 @@ function install_nvidia_gpu_driver() { elif is_rocky ; then - # Ensure the Correct Kernel Development Packages are Installed - execute_with_retries "dnf -y -q update --exclude=systemd*,kernel*" - execute_with_retries "dnf -y -q install pciutils kernel-devel gcc" + # Install kernel development packages + execute_with_retries "dnf install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r)" - readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" - execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" - execute_with_retries "dnf clean all" - configure_dkms_certs - execute_with_retries "dnf -y -q module install nvidia-driver:latest-dkms" - clear_dkms_key - execute_with_retries "dnf -y -q install cuda-toolkit" + # Download the CUDA installer run file + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 30 -o driver.run \ + "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/cuda_${CUDA_VERSION}_${NVIDIA_DRIVER_VERSION}_linux.run" + + # Run the installer in silent mode + execute_with_retries "bash driver.run --silent --driver --toolkit --no-opengl-libs" + + # Remove the installer file after installation to clean up + rm driver.run + + # Load the NVIDIA kernel module modprobe nvidia else diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index 6e03f2d62..a29a20f1d 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -58,8 +58,8 @@ def verify_spark_job_sql(self): ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids(self, configuration, machine_suffixes, accelerator): - if self.getImageOs() == "rocky": - self.skipTest("Not supported for Rocky OS") + if self.getImageVersion() > pkg_resources.parse_version("2.0") and self.getImageOs() == "rocky": + self.skipTest("Not supported for Rocky 9") if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in 2.0 and earlier images") @@ -88,8 +88,8 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator): ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): - if self.getImageOs() == "rocky": - self.skipTest("Not supported for Rocky OS") + if self.getImageVersion() > pkg_resources.parse_version("2.0") and self.getImageOs() == "rocky": + self.skipTest("Not supported for Rocky 9") if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in 2.0 and earlier images") @@ -118,8 +118,8 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): def test_non_default_cuda_versions(self, configuration, machine_suffixes, accelerator, cuda_version, driver_version): - if self.getImageOs() == "rocky": - self.skipTest("Not supported for Rocky OS") + if self.getImageVersion() > pkg_resources.parse_version("2.0") and self.getImageOs() == "rocky": + self.skipTest("Not supported for Rocky 9") if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in 2.0 and earlier images")