@@ -10188,8 +10188,8 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
1018810188; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
1018910189; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
1019010190; SI-NEXT: v_mov_b32_e32 v1, s4
10191- ; SI-NEXT: v_readlane_b32 s30, v18, 28
1019210191; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
10192+ ; SI-NEXT: v_readlane_b32 s30, v18, 28
1019310193; SI-NEXT: v_readlane_b32 s31, v18, 29
1019410194; SI-NEXT: v_readlane_b32 s85, v18, 27
1019510195; SI-NEXT: v_readlane_b32 s84, v18, 26
@@ -10640,8 +10640,8 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
1064010640; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
1064110641; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
1064210642; VI-NEXT: v_mov_b32_e32 v1, s4
10643- ; VI-NEXT: v_readlane_b32 s30, v18, 18
1064410643; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
10644+ ; VI-NEXT: v_readlane_b32 s30, v18, 18
1064510645; VI-NEXT: v_readlane_b32 s31, v18, 19
1064610646; VI-NEXT: v_readlane_b32 s67, v18, 17
1064710647; VI-NEXT: v_readlane_b32 s66, v18, 16
@@ -11063,8 +11063,8 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
1106311063; GFX9-NEXT: s_or_b32 s4, s4, s5
1106411064; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
1106511065; GFX9-NEXT: v_mov_b32_e32 v1, s4
11066- ; GFX9-NEXT: v_readlane_b32 s30, v18, 14
1106711066; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
11067+ ; GFX9-NEXT: v_readlane_b32 s30, v18, 14
1106811068; GFX9-NEXT: v_readlane_b32 s31, v18, 15
1106911069; GFX9-NEXT: v_readlane_b32 s55, v18, 13
1107011070; GFX9-NEXT: v_readlane_b32 s54, v18, 12
@@ -11423,12 +11423,12 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
1142311423; GFX11-NEXT: s_or_b32 s3, s4, s5
1142411424; GFX11-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1
1142511425; GFX11-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3
11426- ; GFX11-NEXT: v_readlane_b32 s30, v17, 7
1142711426; GFX11-NEXT: s_clause 0x3
1142811427; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
1142911428; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
1143011429; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
1143111430; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
11431+ ; GFX11-NEXT: v_readlane_b32 s30, v17, 7
1143211432; GFX11-NEXT: v_readlane_b32 s31, v17, 8
1143311433; GFX11-NEXT: v_readlane_b32 s48, v17, 6
1143411434; GFX11-NEXT: v_readlane_b32 s39, v17, 5
@@ -25294,8 +25294,8 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
2529425294; SI-NEXT: v_or_b32_e32 v2, v3, v2
2529525295; SI-NEXT: v_or_b32_e32 v1, v1, v2
2529625296; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
25297- ; SI-NEXT: v_readlane_b32 s30, v40, 28
2529825297; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
25298+ ; SI-NEXT: v_readlane_b32 s30, v40, 28
2529925299; SI-NEXT: v_readlane_b32 s31, v40, 29
2530025300; SI-NEXT: v_readlane_b32 s85, v40, 27
2530125301; SI-NEXT: v_readlane_b32 s84, v40, 26
@@ -26621,12 +26621,12 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
2662126621; GFX11-NEXT: v_or_b32_e32 v2, v4, v10
2662226622; GFX11-NEXT: v_or_b32_e32 v3, v11, v7
2662326623; GFX11-NEXT: v_or_b32_e32 v4, v12, v8
26624- ; GFX11-NEXT: v_readlane_b32 s30, v40, 8
2662526624; GFX11-NEXT: s_clause 0x3
2662626625; GFX11-NEXT: scratch_store_b128 v0, v[82:85], off
2662726626; GFX11-NEXT: scratch_store_b128 v0, v[23:26], off offset:16
2662826627; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:32
2662926628; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
26629+ ; GFX11-NEXT: v_readlane_b32 s30, v40, 8
2663026630; GFX11-NEXT: v_readlane_b32 s31, v40, 9
2663126631; GFX11-NEXT: v_readlane_b32 s49, v40, 7
2663226632; GFX11-NEXT: v_readlane_b32 s48, v40, 6
@@ -39802,8 +39802,8 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
3980239802; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
3980339803; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
3980439804; SI-NEXT: v_mov_b32_e32 v1, s4
39805- ; SI-NEXT: v_readlane_b32 s30, v18, 28
3980639805; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
39806+ ; SI-NEXT: v_readlane_b32 s30, v18, 28
3980739807; SI-NEXT: v_readlane_b32 s31, v18, 29
3980839808; SI-NEXT: v_readlane_b32 s85, v18, 27
3980939809; SI-NEXT: v_readlane_b32 s84, v18, 26
@@ -40254,8 +40254,8 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
4025440254; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
4025540255; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
4025640256; VI-NEXT: v_mov_b32_e32 v1, s4
40257- ; VI-NEXT: v_readlane_b32 s30, v18, 18
4025840257; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
40258+ ; VI-NEXT: v_readlane_b32 s30, v18, 18
4025940259; VI-NEXT: v_readlane_b32 s31, v18, 19
4026040260; VI-NEXT: v_readlane_b32 s67, v18, 17
4026140261; VI-NEXT: v_readlane_b32 s66, v18, 16
@@ -40677,8 +40677,8 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
4067740677; GFX9-NEXT: s_or_b32 s4, s4, s5
4067840678; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
4067940679; GFX9-NEXT: v_mov_b32_e32 v1, s4
40680- ; GFX9-NEXT: v_readlane_b32 s30, v18, 14
4068140680; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
40681+ ; GFX9-NEXT: v_readlane_b32 s30, v18, 14
4068240682; GFX9-NEXT: v_readlane_b32 s31, v18, 15
4068340683; GFX9-NEXT: v_readlane_b32 s55, v18, 13
4068440684; GFX9-NEXT: v_readlane_b32 s54, v18, 12
@@ -41037,12 +41037,12 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
4103741037; GFX11-NEXT: s_or_b32 s3, s4, s5
4103841038; GFX11-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1
4103941039; GFX11-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3
41040- ; GFX11-NEXT: v_readlane_b32 s30, v17, 7
4104141040; GFX11-NEXT: s_clause 0x3
4104241041; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
4104341042; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
4104441043; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
4104541044; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
41045+ ; GFX11-NEXT: v_readlane_b32 s30, v17, 7
4104641046; GFX11-NEXT: v_readlane_b32 s31, v17, 8
4104741047; GFX11-NEXT: v_readlane_b32 s48, v17, 6
4104841048; GFX11-NEXT: v_readlane_b32 s39, v17, 5
@@ -53428,8 +53428,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
5342853428; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
5342953429; SI-NEXT: s_waitcnt expcnt(0)
5343053430; SI-NEXT: v_mov_b32_e32 v1, s4
53431- ; SI-NEXT: v_readlane_b32 s30, v40, 30
5343253431; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
53432+ ; SI-NEXT: v_readlane_b32 s30, v40, 30
5343353433; SI-NEXT: v_readlane_b32 s31, v40, 31
5343453434; SI-NEXT: v_readlane_b32 s87, v40, 29
5343553435; SI-NEXT: v_readlane_b32 s86, v40, 28
@@ -53876,8 +53876,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
5387653876; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
5387753877; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
5387853878; VI-NEXT: v_mov_b32_e32 v1, s4
53879- ; VI-NEXT: v_readlane_b32 s30, v40, 18
5388053879; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
53880+ ; VI-NEXT: v_readlane_b32 s30, v40, 18
5388153881; VI-NEXT: v_readlane_b32 s31, v40, 19
5388253882; VI-NEXT: v_readlane_b32 s67, v40, 17
5388353883; VI-NEXT: v_readlane_b32 s66, v40, 16
@@ -54293,8 +54293,8 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
5429354293; GFX9-NEXT: s_or_b32 s4, s4, s5
5429454294; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
5429554295; GFX9-NEXT: v_mov_b32_e32 v1, s4
54296- ; GFX9-NEXT: v_readlane_b32 s30, v40, 14
5429754296; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
54297+ ; GFX9-NEXT: v_readlane_b32 s30, v40, 14
5429854298; GFX9-NEXT: v_readlane_b32 s31, v40, 15
5429954299; GFX9-NEXT: v_readlane_b32 s55, v40, 13
5430054300; GFX9-NEXT: v_readlane_b32 s54, v40, 12
@@ -54671,12 +54671,12 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
5467154671; GFX11-NEXT: v_or_b32_e32 v3, v3, v2
5467254672; GFX11-NEXT: v_mov_b32_e32 v2, s0
5467354673; GFX11-NEXT: v_mov_b32_e32 v4, s1
54674- ; GFX11-NEXT: v_readlane_b32 s30, v33, 8
5467554674; GFX11-NEXT: s_clause 0x3
5467654675; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off
5467754676; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:16
5467854677; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
5467954678; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
54679+ ; GFX11-NEXT: v_readlane_b32 s30, v33, 8
5468054680; GFX11-NEXT: v_readlane_b32 s31, v33, 9
5468154681; GFX11-NEXT: v_readlane_b32 s49, v33, 7
5468254682; GFX11-NEXT: v_readlane_b32 s48, v33, 6
@@ -66748,11 +66748,9 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
6674866748; SI-NEXT: v_writelane_b32 v21, s17, 13
6674966749; SI-NEXT: .LBB97_3: ; %end
6675066750; SI-NEXT: v_readlane_b32 s18, v21, 0
66751- ; SI-NEXT: v_readlane_b32 s19, v21, 1
66751+ ; SI-NEXT: s_and_b32 s16, s40, 0xff
6675266752; SI-NEXT: s_lshl_b32 s17, s18, 8
6675366753; SI-NEXT: v_readlane_b32 s18, v21, 2
66754- ; SI-NEXT: s_and_b32 s16, s40, 0xff
66755- ; SI-NEXT: v_readlane_b32 s19, v21, 3
6675666754; SI-NEXT: s_or_b32 s16, s16, s17
6675766755; SI-NEXT: s_and_b32 s17, s18, 0xff
6675866756; SI-NEXT: v_readlane_b32 s18, v21, 4
@@ -66774,9 +66772,8 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
6677466772; SI-NEXT: v_mov_b32_e32 v2, s16
6677566773; SI-NEXT: v_readlane_b32 s16, v21, 6
6677666774; SI-NEXT: s_and_b32 s14, s14, 0xff
66777- ; SI-NEXT: v_readlane_b32 s17, v21, 7
6677866775; SI-NEXT: s_lshl_b32 s16, s16, 8
66779- ; SI-NEXT: v_readlane_b32 s19 , v21, 5
66776+ ; SI-NEXT: v_readlane_b32 s17 , v21, 7
6678066777; SI-NEXT: s_or_b32 s14, s14, s16
6678166778; SI-NEXT: v_readlane_b32 s16, v21, 8
6678266779; SI-NEXT: v_readlane_b32 s17, v21, 9
@@ -66808,8 +66805,8 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
6680866805; SI-NEXT: v_mov_b32_e32 v2, s14
6680966806; SI-NEXT: v_readlane_b32 s14, v21, 12
6681066807; SI-NEXT: s_and_b32 s10, s10, 0xff
66811- ; SI-NEXT: v_readlane_b32 s15, v21, 13
6681266808; SI-NEXT: s_lshl_b32 s14, s14, 8
66809+ ; SI-NEXT: v_readlane_b32 s15, v21, 13
6681366810; SI-NEXT: s_or_b32 s10, s10, s14
6681466811; SI-NEXT: v_readlane_b32 s14, v21, 14
6681566812; SI-NEXT: v_readlane_b32 s15, v21, 15
@@ -66960,17 +66957,20 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
6696066957; SI-NEXT: s_and_b32 s5, s89, 0xff
6696166958; SI-NEXT: s_lshl_b32 s5, s5, 16
6696266959; SI-NEXT: s_lshl_b32 s6, s91, 24
66960+ ; SI-NEXT: v_readlane_b32 s19, v21, 1
6696366961; SI-NEXT: s_and_b32 s4, s4, 0xffff
6696466962; SI-NEXT: s_or_b32 s5, s6, s5
66963+ ; SI-NEXT: v_readlane_b32 s19, v21, 3
6696566964; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0
6696666965; SI-NEXT: s_or_b32 s4, s4, s5
66966+ ; SI-NEXT: v_readlane_b32 s19, v21, 5
6696766967; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
6696866968; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
6696966969; SI-NEXT: v_mov_b32_e32 v1, s4
66970- ; SI-NEXT: v_readlane_b32 s30, v20, 34
6697166970; SI-NEXT: v_readlane_b32 s19, v21, 11
6697266971; SI-NEXT: v_readlane_b32 s17, v21, 17
6697366972; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
66973+ ; SI-NEXT: v_readlane_b32 s30, v20, 34
6697466974; SI-NEXT: v_readlane_b32 s31, v20, 35
6697566975; SI-NEXT: v_readlane_b32 s99, v20, 33
6697666976; SI-NEXT: v_readlane_b32 s98, v20, 32
@@ -67018,6 +67018,28 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
6701867018; SI-NEXT: v_writelane_b32 v21, s4, 0
6701967019; SI-NEXT: v_writelane_b32 v21, s5, 1
6702067020; SI-NEXT: ; implicit-def: $sgpr4
67021+ ; SI-NEXT: v_writelane_b32 v21, s4, 2
67022+ ; SI-NEXT: v_writelane_b32 v21, s5, 3
67023+ ; SI-NEXT: ; implicit-def: $sgpr4
67024+ ; SI-NEXT: v_writelane_b32 v21, s4, 4
67025+ ; SI-NEXT: v_writelane_b32 v21, s5, 5
67026+ ; SI-NEXT: ; implicit-def: $sgpr4
67027+ ; SI-NEXT: v_writelane_b32 v21, s4, 6
67028+ ; SI-NEXT: v_writelane_b32 v21, s5, 7
67029+ ; SI-NEXT: ; implicit-def: $sgpr4
67030+ ; SI-NEXT: v_writelane_b32 v21, s4, 8
67031+ ; SI-NEXT: v_writelane_b32 v21, s5, 9
67032+ ; SI-NEXT: ; implicit-def: $sgpr4
67033+ ; SI-NEXT: v_writelane_b32 v21, s4, 10
67034+ ; SI-NEXT: v_writelane_b32 v21, s5, 11
67035+ ; SI-NEXT: ; implicit-def: $sgpr4
67036+ ; SI-NEXT: v_writelane_b32 v21, s4, 12
67037+ ; SI-NEXT: v_writelane_b32 v21, s5, 13
67038+ ; SI-NEXT: ; implicit-def: $sgpr4
67039+ ; SI-NEXT: v_writelane_b32 v21, s4, 14
67040+ ; SI-NEXT: v_writelane_b32 v21, s5, 15
67041+ ; SI-NEXT: ; implicit-def: $sgpr4
67042+ ; SI-NEXT: v_writelane_b32 v21, s4, 16
6702167043; SI-NEXT: ; implicit-def: $sgpr40
6702267044; SI-NEXT: ; implicit-def: $sgpr60
6702367045; SI-NEXT: ; implicit-def: $sgpr74
@@ -67045,6 +67067,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
6704567067; SI-NEXT: ; implicit-def: $sgpr79
6704667068; SI-NEXT: ; implicit-def: $sgpr89
6704767069; SI-NEXT: ; implicit-def: $sgpr91
67070+ ; SI-NEXT: v_writelane_b32 v21, s5, 17
6704867071; SI-NEXT: ; implicit-def: $sgpr42
6704967072; SI-NEXT: ; implicit-def: $sgpr66
6705067073; SI-NEXT: ; implicit-def: $sgpr64
@@ -67061,33 +67084,10 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
6706167084; SI-NEXT: ; implicit-def: $sgpr30
6706267085; SI-NEXT: ; implicit-def: $sgpr94
6706367086; SI-NEXT: ; implicit-def: $sgpr92
67087+ ; SI-NEXT: ; implicit-def: $sgpr4
6706467088; SI-NEXT: ; implicit-def: $sgpr90
6706567089; SI-NEXT: ; implicit-def: $sgpr88
6706667090; SI-NEXT: ; implicit-def: $sgpr78
67067- ; SI-NEXT: v_writelane_b32 v21, s4, 2
67068- ; SI-NEXT: v_writelane_b32 v21, s5, 3
67069- ; SI-NEXT: ; implicit-def: $sgpr4
67070- ; SI-NEXT: v_writelane_b32 v21, s4, 4
67071- ; SI-NEXT: v_writelane_b32 v21, s5, 5
67072- ; SI-NEXT: ; implicit-def: $sgpr4
67073- ; SI-NEXT: v_writelane_b32 v21, s4, 6
67074- ; SI-NEXT: v_writelane_b32 v21, s5, 7
67075- ; SI-NEXT: ; implicit-def: $sgpr4
67076- ; SI-NEXT: v_writelane_b32 v21, s4, 8
67077- ; SI-NEXT: v_writelane_b32 v21, s5, 9
67078- ; SI-NEXT: ; implicit-def: $sgpr4
67079- ; SI-NEXT: v_writelane_b32 v21, s4, 10
67080- ; SI-NEXT: v_writelane_b32 v21, s5, 11
67081- ; SI-NEXT: ; implicit-def: $sgpr4
67082- ; SI-NEXT: v_writelane_b32 v21, s4, 12
67083- ; SI-NEXT: v_writelane_b32 v21, s5, 13
67084- ; SI-NEXT: ; implicit-def: $sgpr4
67085- ; SI-NEXT: v_writelane_b32 v21, s4, 14
67086- ; SI-NEXT: v_writelane_b32 v21, s5, 15
67087- ; SI-NEXT: ; implicit-def: $sgpr4
67088- ; SI-NEXT: v_writelane_b32 v21, s4, 16
67089- ; SI-NEXT: v_writelane_b32 v21, s5, 17
67090- ; SI-NEXT: ; implicit-def: $sgpr4
6709167091; SI-NEXT: s_branch .LBB97_2
6709267092;
6709367093; VI-LABEL: bitcast_v32i16_to_v64i8_scalar:
@@ -67519,8 +67519,8 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
6751967519; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
6752067520; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
6752167521; VI-NEXT: v_mov_b32_e32 v1, s4
67522- ; VI-NEXT: v_readlane_b32 s30, v18, 18
6752367522; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
67523+ ; VI-NEXT: v_readlane_b32 s30, v18, 18
6752467524; VI-NEXT: v_readlane_b32 s31, v18, 19
6752567525; VI-NEXT: v_readlane_b32 s67, v18, 17
6752667526; VI-NEXT: v_readlane_b32 s66, v18, 16
@@ -68414,12 +68414,12 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
6841468414; GFX11-NEXT: v_or_b32_e32 v2, v4, v10
6841568415; GFX11-NEXT: v_or_b32_e32 v3, v11, v7
6841668416; GFX11-NEXT: v_or_b32_e32 v4, v12, v8
68417- ; GFX11-NEXT: v_readlane_b32 s30, v40, 8
6841868417; GFX11-NEXT: s_clause 0x3
6841968418; GFX11-NEXT: scratch_store_b128 v0, v[82:85], off
6842068419; GFX11-NEXT: scratch_store_b128 v0, v[23:26], off offset:16
6842168420; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:32
6842268421; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
68422+ ; GFX11-NEXT: v_readlane_b32 s30, v40, 8
6842368423; GFX11-NEXT: v_readlane_b32 s31, v40, 9
6842468424; GFX11-NEXT: v_readlane_b32 s49, v40, 7
6842568425; GFX11-NEXT: v_readlane_b32 s48, v40, 6
@@ -79546,8 +79546,8 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
7954679546; SI-NEXT: v_or_b32_e32 v1, v2, v1
7954779547; SI-NEXT: v_or_b32_e32 v1, s4, v1
7954879548; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
79549- ; SI-NEXT: v_readlane_b32 s30, v40, 4
7955079549; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
79550+ ; SI-NEXT: v_readlane_b32 s30, v40, 4
7955179551; SI-NEXT: v_readlane_b32 s31, v40, 5
7955279552; SI-NEXT: v_readlane_b32 s37, v40, 3
7955379553; SI-NEXT: v_readlane_b32 s36, v40, 2
@@ -80950,12 +80950,12 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
8095080950; GFX11-NEXT: v_or_b32_e32 v2, v4, v10
8095180951; GFX11-NEXT: v_or_b32_e32 v3, v11, v7
8095280952; GFX11-NEXT: v_or_b32_e32 v4, v12, v8
80953- ; GFX11-NEXT: v_readlane_b32 s30, v40, 8
8095480953; GFX11-NEXT: s_clause 0x3
8095580954; GFX11-NEXT: scratch_store_b128 v0, v[82:85], off
8095680955; GFX11-NEXT: scratch_store_b128 v0, v[23:26], off offset:16
8095780956; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:32
8095880957; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
80958+ ; GFX11-NEXT: v_readlane_b32 s30, v40, 8
8095980959; GFX11-NEXT: v_readlane_b32 s31, v40, 9
8096080960; GFX11-NEXT: v_readlane_b32 s49, v40, 7
8096180961; GFX11-NEXT: v_readlane_b32 s48, v40, 6
@@ -88414,8 +88414,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
8841488414; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24
8841588415; SI-NEXT: s_waitcnt expcnt(0)
8841688416; SI-NEXT: v_writelane_b32 v41, s4, 0
88417- ; SI-NEXT: v_writelane_b32 v41, s5, 1
8841888417; SI-NEXT: v_readfirstlane_b32 s4, v6
88418+ ; SI-NEXT: v_writelane_b32 v41, s5, 1
8841988419; SI-NEXT: s_lshr_b32 s5, s4, 16
8842088420; SI-NEXT: v_readfirstlane_b32 s4, v7
8842188421; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16
@@ -88895,9 +88895,9 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
8889588895; SI-NEXT: v_or_b32_e32 v1, s5, v1
8889688896; SI-NEXT: v_or_b32_e32 v1, s4, v1
8889788897; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
88898- ; SI-NEXT: v_readlane_b32 s30, v40, 34
8889988898; SI-NEXT: v_readlane_b32 s75, v41, 1
8890088899; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
88900+ ; SI-NEXT: v_readlane_b32 s30, v40, 34
8890188901; SI-NEXT: v_readlane_b32 s31, v40, 35
8890288902; SI-NEXT: v_readlane_b32 s99, v40, 33
8890388903; SI-NEXT: v_readlane_b32 s98, v40, 32
@@ -89735,8 +89735,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
8973589735; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
8973689736; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
8973789737; VI-NEXT: v_mov_b32_e32 v1, s4
89738- ; VI-NEXT: v_readlane_b32 s30, v18, 26
8973989738; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
89739+ ; VI-NEXT: v_readlane_b32 s30, v18, 26
8974089740; VI-NEXT: v_readlane_b32 s31, v18, 27
8974189741; VI-NEXT: v_readlane_b32 s83, v18, 25
8974289742; VI-NEXT: v_readlane_b32 s82, v18, 24
@@ -90519,8 +90519,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
9051990519; GFX9-NEXT: s_or_b32 s4, s4, s5
9052090520; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
9052190521; GFX9-NEXT: v_mov_b32_e32 v1, s4
90522- ; GFX9-NEXT: v_readlane_b32 s30, v18, 14
9052390522; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
90523+ ; GFX9-NEXT: v_readlane_b32 s30, v18, 14
9052490524; GFX9-NEXT: v_readlane_b32 s31, v18, 15
9052590525; GFX9-NEXT: v_readlane_b32 s55, v18, 13
9052690526; GFX9-NEXT: v_readlane_b32 s54, v18, 12
@@ -91258,12 +91258,12 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
9125891258; GFX11-NEXT: s_or_b32 s3, s4, s5
9125991259; GFX11-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1
9126091260; GFX11-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3
91261- ; GFX11-NEXT: v_readlane_b32 s30, v17, 9
9126291261; GFX11-NEXT: s_clause 0x3
9126391262; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
9126491263; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
9126591264; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
9126691265; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
91266+ ; GFX11-NEXT: v_readlane_b32 s30, v17, 9
9126791267; GFX11-NEXT: v_readlane_b32 s31, v17, 10
9126891268; GFX11-NEXT: v_readlane_b32 s51, v17, 8
9126991269; GFX11-NEXT: v_readlane_b32 s50, v17, 7
0 commit comments