diff --git a/gimmik/cuda.py b/gimmik/cuda.py
index b18c509..17ab5e2 100644
--- a/gimmik/cuda.py
+++ b/gimmik/cuda.py
@@ -7,6 +7,8 @@ class CUDAMatMul(MatMul):
     platform = 'cuda'
     basemeta = {'block': (128, 1, 1), 'width': 1, 'shared': 0,
                 'dynamic_shared': 0}
+    vtypes = {'float': {'float2': 2, 'float4': 4},
+              'double': {'double2': 2}}
 
     def _kernel_generators(self, dtype, dsize, *, compute_capability=None):
         # B loading, C streaming kernel
@@ -27,29 +29,29 @@ def _kernel_generators(self, dtype, dsize, *, compute_capability=None):
         meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize}
         yield ('cstream-ksplit', args, meta)
 
-        # At single precision also consider vectorized kernels
-        if (dtype == 'float' and
-            self.aligne is not None and self.aligne % 2 == 0):
-            # Vector B loading, C streaming kernel
-            args = {'dtype': 'float2', 'width': 2}
-            meta = {'width': 2}
-            yield ('cstream', args, meta)
-
-            # Vector four-way m-split B streaming, C accumulation kernel
-            ms, bsz, blkx = 4, 16, 32
-            args = {'dtype': 'float2', 'width': 2, 'msplit': ms,
-                    'bsz': bsz, 'blockx': blkx}
-            meta = {'block': (blkx, ms, 1), 'width': 2,
-                    'shared': 2*blkx*bsz*2*dsize}
-            yield ('bstream-msplit', args, meta)
-
-            # Vector two-way k-split B loading, C streaming kernel
-            ks, csz, blkx = 2, 24, 32
-            args = {'dtype': 'float2', 'width': 2, 'ksplit': ks,
-                    'csz': csz, 'blockx': blkx}
-            meta = {'block': (blkx, ks, 1), 'width': 2,
-                    'shared': 2*(ks - 1)*csz*blkx*dsize}
-            yield ('cstream-ksplit', args, meta)
+        # Also consider vectorized kernels for each vector type of this dtype
+        for vtype, width in self.vtypes[dtype].items():
+            if self.aligne is not None and self.aligne % width == 0:
+                # Vector B loading, C streaming kernel
+                args = {'dtype': vtype, 'width': width}
+                meta = {'width': width}
+                yield ('cstream', args, meta)
+
+                # Vector four-way m-split B streaming, C accumulation kernel
+                ms, bsz, blkx = 4, 16, 32
+                args = {'dtype': vtype, 'width': width, 'msplit': ms,
+                        'bsz': bsz, 'blockx': blkx}
+                meta = {'block': (blkx, ms, 1), 'width': width,
+                        'shared': width*blkx*bsz*2*dsize}
+                yield ('bstream-msplit', args, meta)
+
+                # Vector two-way k-split B loading, C streaming kernel
+                ks, csz, blkx = 2, 24, 32
+                args = {'dtype': vtype, 'width': width, 'ksplit': ks,
+                        'csz': csz, 'blockx': blkx}
+                meta = {'block': (blkx, ks, 1), 'width': width,
+                        'shared': width*(ks - 1)*csz*blkx*dsize}
+                yield ('cstream-ksplit', args, meta)
 
     def _process_meta(self, meta):
         if self.n is not None: