From 55b9908a45f371715ee290803a027bb6cd4979cb Mon Sep 17 00:00:00 2001 From: Will Trojak Date: Mon, 1 Dec 2025 14:35:38 +0000 Subject: [PATCH 1/2] added float4 and double2 --- gimmik/cuda.py | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/gimmik/cuda.py b/gimmik/cuda.py index b18c509..974d4b9 100644 --- a/gimmik/cuda.py +++ b/gimmik/cuda.py @@ -7,6 +7,8 @@ class CUDAMatMul(MatMul): platform = 'cuda' basemeta = {'block': (128, 1, 1), 'width': 1, 'shared': 0, 'dynamic_shared': 0} + vtypes = {'float': {'float2': (2, 2), 'float4': (4, 4)}, + 'double': {'double2': (2, 2)}} def _kernel_generators(self, dtype, dsize, *, compute_capability=None): # B loading, C streaming kernel @@ -27,29 +29,29 @@ def _kernel_generators(self, dtype, dsize, *, compute_capability=None): meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize} yield ('cstream-ksplit', args, meta) - # At single precision also consider vectorized kernels - if (dtype == 'float' and - self.aligne is not None and self.aligne % 2 == 0): - # Vector B loading, C streaming kernel - args = {'dtype': 'float2', 'width': 2} - meta = {'width': 2} - yield ('cstream', args, meta) - - # Vector four-way m-split B streaming, C accumulation kernel - ms, bsz, blkx = 4, 16, 32 - args = {'dtype': 'float2', 'width': 2, 'msplit': ms, - 'bsz': bsz, 'blockx': blkx} - meta = {'block': (blkx, ms, 1), 'width': 2, - 'shared': 2*blkx*bsz*2*dsize} - yield ('bstream-msplit', args, meta) - - # Vector two-way k-split B loading, C streaming kernel - ks, csz, blkx = 2, 24, 32 - args = {'dtype': 'float2', 'width': 2, 'ksplit': ks, - 'csz': csz, 'blockx': blkx} - meta = {'block': (blkx, ks, 1), 'width': 2, - 'shared': 2*(ks - 1)*csz*blkx*dsize} - yield ('cstream-ksplit', args, meta) + # Consider some vector types + for vtype, (width, aligne) in self.vtypes[dtype]: + if self.aligne is not None and self.aligne % aligne == 0: + # Vector B loading, C streaming kernel + args = {'dtype': vtype, 'width': width} + meta = {'width': width} + yield ('cstream', args, meta) + + # Vector four-way m-split B streaming, C accumulation kernel + ms, bsz, blkx = 4, 16, 32 + args = {'dtype': vtype, 'width': width, 'msplit': ms, + 'bsz': bsz, 'blockx': blkx} + meta = {'block': (blkx, ms, 1), 'width': width, + 'shared': width*blkx*bsz*2*dsize} + yield ('bstream-msplit', args, meta) + + # Vector two-way k-split B loading, C streaming kernel + ks, csz, blkx = 2, 24, 32 + args = {'dtype': vtype, 'width': width, 'ksplit': ks, + 'csz': csz, 'blockx': blkx} + meta = {'block': (blkx, ks, 1), 'width': width, + 'shared': width*(ks - 1)*csz*blkx*dsize} + yield ('cstream-ksplit', args, meta) def _process_meta(self, meta): if self.n is not None: From 1d64a0c1eedf7f305671ebb021ee28effb8a21d8 Mon Sep 17 00:00:00 2001 From: Will Trojak Date: Wed, 3 Dec 2025 18:16:27 +0000 Subject: [PATCH 2/2] fix alignment bug --- gimmik/cuda.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gimmik/cuda.py b/gimmik/cuda.py index 974d4b9..17ab5e2 100644 --- a/gimmik/cuda.py +++ b/gimmik/cuda.py @@ -7,8 +7,8 @@ class CUDAMatMul(MatMul): platform = 'cuda' basemeta = {'block': (128, 1, 1), 'width': 1, 'shared': 0, 'dynamic_shared': 0} - vtypes = {'float': {'float2': (2, 2), 'float4': (4, 4)}, - 'double': {'double2': (2, 2)}} + vtypes = {'float': {'float2': 2, 'float4': 4}, + 'double': {'double2': 2}} def _kernel_generators(self, dtype, dsize, *, compute_capability=None): # B loading, C streaming kernel @@ -30,8 +30,8 @@ def _kernel_generators(self, dtype, dsize, *, compute_capability=None): yield ('cstream-ksplit', args, meta) # Consider some vector types - for vtype, (width, aligne) in self.vtypes[dtype]: - if self.aligne is not None and self.aligne % aligne == 0: + for vtype, width in self.vtypes[dtype].items(): + if self.aligne is not None and self.aligne % width == 0: # Vector B loading, C streaming kernel args = {'dtype': vtype, 'width': width} meta = {'width': width}