From d8e2156da99774d79aed5f82ac88df95ff928996 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Wed, 24 Sep 2025 15:56:22 -0700 Subject: [PATCH 1/2] Refactor NVIDIA driver installation for Rocky Linux Updated the installation process for the NVIDIA GPU driver on Rocky Linux. The script now installs kernel development packages directly and downloads the CUDA installer run file, executing it in silent mode. The installer file is removed post-installation to clean up. This change simplifies the installation steps and ensures the correct driver version is used. --- spark-rapids/spark-rapids.sh | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 1bba87cc9..bde8d8541 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -501,17 +501,20 @@ function install_nvidia_gpu_driver() { elif is_rocky ; then - # Ensure the Correct Kernel Development Packages are Installed - execute_with_retries "dnf -y -q update --exclude=systemd*,kernel*" - execute_with_retries "dnf -y -q install pciutils kernel-devel gcc" + # Install kernel development packages + execute_with_retries "dnf install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r)" - readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" - execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" - execute_with_retries "dnf clean all" - configure_dkms_certs - execute_with_retries "dnf -y -q module install nvidia-driver:latest-dkms" - clear_dkms_key - execute_with_retries "dnf -y -q install cuda-toolkit" + # Download the CUDA installer run file + curl -o driver.run \ + "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/cuda_${CUDA_VERSION}_${NVIDIA_DRIVER_VERSION}_linux.run" + + # Run the installer in silent mode + bash driver.run --silent + + # Remove the installer file after installation to clean up + rm driver.run + + 
# Load the NVIDIA kernel module modprobe nvidia else From 7e52b5f0f20c42e248ed0dd734c19f72da184b8a Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Oct 2025 19:38:50 +0000 Subject: [PATCH 2/2] feat: Enable spark-rapids on Dataproc 2.1 Rocky Linux 8 This commit integrates changes to enable the spark-rapids initialization action on Dataproc 2.1-rocky8 images. - Updates the NVIDIA driver installation process in `spark-rapids.sh` for Rocky Linux: - Uses `curl` with retry and fail-fast options for downloading the CUDA installer. - Executes the NVIDIA installer with `--silent --driver --toolkit --no-opengl-libs` flags and wraps it in `execute_with_retries`. - Modifies `test_spark_rapids.py` to enable tests for Rocky Linux on Dataproc 2.1 and below, while keeping them skipped for 2.2+ (Rocky 9). This resolves the installation issues on Rocky 8. Further work is required to support Rocky 9 (Dataproc 2.2). --- spark-rapids/spark-rapids.sh | 4 ++-- spark-rapids/test_spark_rapids.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index bde8d8541..f6415e05e 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -505,11 +505,11 @@ function install_nvidia_gpu_driver() { execute_with_retries "dnf install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r)" # Download the CUDA installer run file - curl -o driver.run \ + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 30 -o driver.run \ "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/cuda_${CUDA_VERSION}_${NVIDIA_DRIVER_VERSION}_linux.run" # Run the installer in silent mode - bash driver.run --silent + execute_with_retries "bash driver.run --silent --driver --toolkit --no-opengl-libs" # Remove the installer file after installation to clean up rm driver.run diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index 
6e03f2d62..a29a20f1d 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -58,8 +58,8 @@ def verify_spark_job_sql(self): ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids(self, configuration, machine_suffixes, accelerator): - if self.getImageOs() == "rocky": - self.skipTest("Not supported for Rocky OS") + if self.getImageVersion() >= pkg_resources.parse_version("2.2") and self.getImageOs() == "rocky": + self.skipTest("Not supported for Rocky 9") if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in 2.0 and earlier images") @@ -88,8 +88,8 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator): ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): - if self.getImageOs() == "rocky": - self.skipTest("Not supported for Rocky OS") + if self.getImageVersion() >= pkg_resources.parse_version("2.2") and self.getImageOs() == "rocky": + self.skipTest("Not supported for Rocky 9") if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in 2.0 and earlier images") @@ -118,8 +118,8 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): def test_non_default_cuda_versions(self, configuration, machine_suffixes, accelerator, cuda_version, driver_version): - if self.getImageOs() == "rocky": - self.skipTest("Not supported for Rocky OS") + if self.getImageVersion() >= pkg_resources.parse_version("2.2") and self.getImageOs() == "rocky": + self.skipTest("Not supported for Rocky 9") if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in 2.0 and earlier images")