Commit ffdcb72

FlashAttention Benchmark update (#96)

FA4 now automatically picks up nvidia-cutlass-dsl from the project requirements. This fixes the failures from the last few days, where we were installing an outdated package. The test output now clearly states the system power limit, and the Docker image version in the workflow has been updated.

1 parent 4f5ace1 commit ffdcb72
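The dependency fix is visible in the diff below: instead of pinning the DSL by hand inside the container, the workflow now does an editable install of the CuTe package, letting pip resolve nvidia-cutlass-dsl from the project's own requirements. A minimal before/after sketch, assuming flash_attn/cute/ declares nvidia-cutlass-dsl in its packaging metadata:

    # Before (#96): the workflow pinned the DSL by hand inside the container,
    # which went stale and caused the recent failures
    pip install nvidia-cutlass-dsl==4.1.0

    # After: an editable install of the CuTe package lets pip resolve
    # nvidia-cutlass-dsl from the project requirements, so the pin
    # can no longer drift out of date in the workflow itself
    pip install -e flash_attn/cute/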

File tree

1 file changed: +8 −10 lines

.github/workflows/flash_attention.yml

Lines changed: 8 additions & 10 deletions
@@ -34,7 +34,7 @@ jobs:
 
       - name: Run Flash Attention benchmark in Docker
         env:
-          DOCKER_IMAGE: nvcr.io/nvidia/pytorch:25.06-py3
+          DOCKER_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3
         run: |
           set -eux
@@ -52,21 +52,19 @@
             "${DOCKER_IMAGE}"
           )
 
-          # Install CuTe DSL
-          docker exec -t "${container_name}" bash -c "
-            set -x
-            echo 'Installing nvidia-cutlass-dsl'
-            pip install nvidia-cutlass-dsl==4.1.0
-          "
-
           # Build and run FlashAttention CuTe DSL
           docker exec -t "${container_name}" bash -c "
             set -x
             pushd fa4
             python setup.py install
-
-            echo '<h1>B200 1000W</h1>' >> /tmp/workspace/fa4_output.txt
+            pip install -e flash_attn/cute/
+
             nvidia-smi
+
+            echo '<h1>B200' >> /tmp/workspace/fa4_output.txt
+            nvidia-smi -q -d POWER | grep 'Current Power Limit' | head -1 | cut -d : -f 2 >> /tmp/workspace/fa4_output.txt
+            echo '</h1>' >> /tmp/workspace/fa4_output.txt
+
             export PYTHONPATH=\$(pwd)
             python benchmarks/benchmark_attn.py >> /tmp/workspace/fa4_output.txt
             popd
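The added pipeline replaces the hardcoded "1000W" in the report header with the power limit the GPU actually reports. A minimal sketch of that pipeline on its own (the printed value is illustrative, and the exact nvidia-smi field layout can vary by driver version):

    # Query power info, keep the first 'Current Power Limit' line,
    # and strip everything up to the colon, leaving just the wattage
    $ nvidia-smi -q -d POWER | grep 'Current Power Limit' | head -1 | cut -d : -f 2
     1000.00 W

grep keeps only the power-limit lines, head -1 takes the first GPU's entry, and cut drops the field name, so only the wattage lands inside the <h1> header in fa4_output.txt.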
