Skip to content

Commit 16c30ce

Browse files
committed
Add nightly regression tests for CUDA 12.8 and 12.9 on H100/A100
1 parent aa21b80 commit 16c30ce

File tree

1 file changed

+80
-0
lines changed

1 file changed

+80
-0
lines changed
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
name: Run CUDA Nightly Regression Tests (12.8, 12.9)

# Triggers: pushes and pull requests targeting main or gh/** branches, a daily
# schedule, and manual dispatch.
on:
  # TODO: Remove push/pull_request trigger after initial CI validation
  push:
    branches:
      - main
      - 'gh/**'
  pull_request:
    branches:
      - main
      - 'gh/**'
  schedule:
    # Daily at 15:00 UTC. GitHub evaluates cron expressions in UTC, so this is
    # 7 am PST in winter and 8 am PDT in summer.
    - cron: "0 15 * * *"
  workflow_dispatch:

# Concurrency grouping:
#   - on refs/heads/main the group key ends in the run number, so every run
#     gets its own group and is never cancelled by a later run;
#   - on any other ref the key ends in the ref itself, so a newer run for the
#     same ref cancels the one still in progress.
concurrency:
  group: regression_test_cuda_nightly-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

env:
  # Token taken from the repository secret HF_TOKEN.
  # NOTE(review): the only job in this workflow calls a reusable workflow via
  # `uses:`. GitHub does not propagate workflow-level `env` into called
  # workflows, so confirm the test script actually receives HF_TOKEN (it may
  # need to be passed through the reusable workflow's secrets mechanism).
  HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
  # Installs a nightly PyTorch build and runs the test suite once for each
  # (CUDA version, GPU runner) entry in the matrix below.
  test:
    strategy:
      # Keep the remaining matrix entries running when one of them fails.
      fail-fast: false
      matrix:
        # Each entry supplies:
        #   name             - human-readable label (not referenced by `with:`)
        #   runs-on          - runner label forwarded as the `runner` input
        #   torch-spec       - arguments handed to `pip install` for torch
        #   gpu-arch-type    - forwarded to the reusable workflow
        #   gpu-arch-version - forwarded to the reusable workflow; quoted so
        #                      it stays a string rather than a float
        include:
          # CUDA 12.8 on H100
          - name: CUDA 12.8 H100
            runs-on: linux.aws.h100
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.8"

          # CUDA 12.8 on A100
          - name: CUDA 12.8 A100
            runs-on: linux.aws.a100
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.8"

          # CUDA 12.9 on H100
          - name: CUDA 12.9 H100
            runs-on: linux.aws.h100
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu129'
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.9"

          # CUDA 12.9 on A100
          - name: CUDA 12.9 A100
            runs-on: linux.aws.a100
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu129'
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.9"

    permissions:
      id-token: write
      contents: read
    # NOTE(review): pinned to the moving `main` branch of pytorch/test-infra,
    # so upstream changes to the reusable workflow take effect immediately.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      # Presumably minutes - confirm against the linux_job_v2 input docs.
      timeout: 180
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      script: |
        conda create -n venv python=3.10 -y
        conda activate venv
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        python -m pip install --upgrade pip
        # torch-spec is left unquoted on purpose: it expands to several
        # separate pip arguments (--pre, the package, --index-url, the URL).
        pip install ${{ matrix.torch-spec }}
        pip install -r dev-requirements.txt
        pip install . --no-build-isolation
        # Root of the conda installation (two levels above the conda binary);
        # its lib/ directory is prepended to the dynamic loader search path.
        export CONDA="$(dirname "$(dirname "$(which conda)")")"
        export LD_LIBRARY_PATH="$CONDA/lib/:${LD_LIBRARY_PATH:-}"
        pytest test --verbose -s

0 commit comments

Comments
 (0)