PyFR · WillTrojak · Nov 30, 2025 · Dec 1, 2025 · FreddieWitherden · Dec 1, 2025
diff --git a/gimmik/cuda.py b/gimmik/cuda.py
@@ -10,43 +10,44 @@ class CUDAMatMul(MatMul):
 
     def _kernel_generators(self, dtype, dsize, *, compute_capability=None):
         # B loading, C streaming kernel
-        yield ('cstream', {}, {})
+        yield ('cstream', {'blocksz': self.basemeta['block'][0]}, {})
 
         # B streaming, C accumulation kernel
-        yield ('bstream', {}, {})
+        yield ('bstream', {'blocksz': self.basemeta['block'][0]}, {})
 
         # Four-way m-split B streaming, C accumulation kernel
         ms, bsz, blkx = 4, 24, 32
-        args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx}
+        args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx, 'blocksz': blkx*ms}
         meta = {'block': (blkx, ms, 1), 'shared': 2*bsz*blkx*dsize}
         yield ('bstream-msplit', args, meta)
 
         # Two-way k-split B loading, C streaming kernel
         ks, csz, blkx = 2, 24, 32
-        args = {'ksplit': ks, 'csz': csz, 'blockx': blkx}
+        args = {'ksplit': ks, 'csz': csz, 'blockx': blkx, 'blocksz': blkx*ks}
         meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize}
         yield ('cstream-ksplit', args, meta)
 
         # At single precision also consider vectorized kernels
         if (dtype == 'float' and
             self.aligne is not None and self.aligne % 2 == 0):
             # Vector B loading, C streaming kernel
-            args = {'dtype': 'float2', 'width': 2}
+            args = {'dtype': 'float2', 'width': 2,
+                    'blocksz': self.basemeta['block'][0]}
             meta = {'width': 2}
             yield ('cstream', args, meta)
 
             # Vector four-way m-split B streaming, C accumulation kernel
             ms, bsz, blkx = 4, 16, 32
-            args = {'dtype': 'float2', 'width': 2, 'msplit': ms,
-                    'bsz': bsz, 'blockx': blkx}
+            args = {'dtype': 'float2', 'width': 2, 'msplit': ms, 'bsz': bsz,
+                    'blockx': blkx, 'blocksz': blkx*ms}
             meta = {'block': (blkx, ms, 1), 'width': 2,
                     'shared': 2*blkx*bsz*2*dsize}
             yield ('bstream-msplit', args, meta)
 
             # Vector two-way k-split B loading, C streaming kernel
             ks, csz, blkx = 2, 24, 32
             args = {'dtype': 'float2', 'width': 2, 'ksplit': ks,
-                    'csz': csz, 'blockx': blkx}
+                    'csz': csz, 'blockx': blkx, 'blocksz': blkx*ks}
             meta = {'block': (blkx, ks, 1), 'width': 2,
                     'shared': 2*(ks - 1)*csz*blkx*dsize}
             yield ('cstream-ksplit', args, meta)

diff --git a/gimmik/kernels/cuda/bstream-msplit.mako b/gimmik/kernels/cuda/bstream-msplit.mako
@@ -5,7 +5,7 @@ mx = partition(A, into=msplit, by='rows')
 bchunks = chunk(bix, bsz)
 %>
 
-__global__ void
+__global__ __launch_bounds__(${blocksz}) void
 % if n is None:
 ${kname}(int n,
          const ${dtype}* __restrict__ b, int ldb,

diff --git a/gimmik/kernels/cuda/bstream.mako b/gimmik/kernels/cuda/bstream.mako
@@ -1,11 +1,15 @@
 <%inherit file='base'/>
 
-__global__ void
+__global__ __launch_bounds__(${blocksz}) void
 % if n is None:
 ${kname}(int n,
          const ${dtype}* __restrict__ b, int ldb,
          ${dtype}* __restrict__ c, int ldc)
 {
+#if ( __CUDACC_VER_MAJOR__ >= 13 )
+    asm volatile (".pragma \"enable_smem_spilling\";");
+#endif
+
   % if width > 1:
     n = ((n + ${width} - 1) / ${width}) * ${width};
     ldb /= ${width};
@@ -14,6 +18,9 @@ ${kname}(int n,
 % else:
 ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
 {
+#if ( __CUDACC_VER_MAJOR__ >= 13 )
+    asm volatile (".pragma \"enable_smem_spilling\";");
+#endif
     const int n = ${-(-n // width)};
     const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
     const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};

diff --git a/gimmik/kernels/cuda/cstream-ksplit.mako b/gimmik/kernels/cuda/cstream-ksplit.mako
@@ -6,7 +6,7 @@ cchunks = chunk(range(m), csz)
 loaded = set()
 %>
 
-__global__ void
+__global__ __launch_bounds__(${blocksz}) void
 % if n is None:
 ${kname}(int n,
          const ${dtype}* __restrict__ b, int ldb,

diff --git a/gimmik/kernels/cuda/cstream.mako b/gimmik/kernels/cuda/cstream.mako
@@ -2,12 +2,15 @@
 
 <% ksplit = 2 if m < 36 else 1 %>
 
-__global__ void
+__global__ __launch_bounds__(${blocksz}) void
 % if n is None:
 ${kname}(int n,
          const ${dtype}* __restrict__ b, int ldb,
          ${dtype}* __restrict__ c, int ldc)
 {
+#if ( __CUDACC_VER_MAJOR__ >= 13 )
+    asm volatile (".pragma \"enable_smem_spilling\";");
+#endif
   % if width > 1:
     n = ((n + ${width} - 1) / ${width}) * ${width};
     ldb /= ${width};
@@ -16,6 +19,9 @@ ${kname}(int n,
 % else:
 ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
 {
+#if ( __CUDACC_VER_MAJOR__ >= 13 )
+    asm volatile (".pragma \"enable_smem_spilling\";");
+#endif
     const int n = ${-(-n // width)};
     const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
     const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};