Skip to content

Commit d6d21c8

Browse files
doru1004 authored and adelejjeh committed
Allow sinking of free vector ops
1 parent 94a627f commit d6d21c8

File tree

3 files changed

+170
-44
lines changed

3 files changed

+170
-44
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,6 +1301,90 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
13011301

13021302
if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
13031303
Ops.push_back(&Op);
1304+
1305+
// Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
1306+
// will be optimized away, and sinking them can help SDAG combines.
1307+
DataLayout DL = I->getModule()->getDataLayout();
1308+
auto IsFreeExtractInsert = [&DL, this](VectorType *VecType,
1309+
unsigned VecIndex) {
1310+
unsigned EltSize = DL.getTypeSizeInBits(VecType->getElementType());
1311+
return EltSize >= 32 ||
1312+
(EltSize == 16 && VecIndex == 0 && ST->has16BitInsts());
1313+
};
1314+
1315+
uint64_t VecIndex;
1316+
Value *Vec;
1317+
if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
1318+
Instruction *VecOpInst =
1319+
dyn_cast<Instruction>(cast<Instruction>(Op.get())->getOperand(0));
1320+
// If a zero cost extractelement instruction is the only use of the vector,
1321+
// then it may be combined with the def.
1322+
if (VecOpInst && VecOpInst->hasOneUse())
1323+
continue;
1324+
1325+
if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
1326+
Ops.push_back(&Op);
1327+
1328+
continue;
1329+
}
1330+
1331+
if (match(Op.get(),
1332+
m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
1333+
if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
1334+
Ops.push_back(&Op);
1335+
1336+
continue;
1337+
}
1338+
1339+
if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1340+
if (Shuffle->isIdentity()) {
1341+
Ops.push_back(&Op);
1342+
continue;
1343+
}
1344+
1345+
unsigned EltSize = DL.getTypeSizeInBits(
1346+
cast<VectorType>(cast<VectorType>(Shuffle->getType()))
1347+
->getElementType());
1348+
1349+
// For i32 (or greater) shufflevectors, these will be lowered into a
1350+
// series of insert / extract elements, which will be coalesced away.
1351+
if (EltSize >= 32) {
1352+
Ops.push_back(&Op);
1353+
continue;
1354+
}
1355+
1356+
if (EltSize < 16 || !ST->has16BitInsts())
1357+
continue;
1358+
1359+
int NumSubElts, SubIndex;
1360+
if (Shuffle->changesLength()) {
1361+
if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1362+
Ops.push_back(&Op);
1363+
continue;
1364+
}
1365+
1366+
if (Shuffle->isExtractSubvectorMask(SubIndex) ||
1367+
Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
1368+
if (!(SubIndex % 2)) {
1369+
Ops.push_back(&Op);
1370+
continue;
1371+
}
1372+
}
1373+
}
1374+
1375+
if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1376+
Shuffle->isSingleSource()) {
1377+
Ops.push_back(&Op);
1378+
continue;
1379+
}
1380+
1381+
if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
1382+
if (!(SubIndex % 2)) {
1383+
Ops.push_back(&Op);
1384+
continue;
1385+
}
1386+
}
1387+
}
13041388
}
13051389

13061390
return !Ops.empty();

llvm/test/CodeGen/AMDGPU/srem.ll

Lines changed: 86 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -6135,6 +6135,13 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
61356135
; TONGA-NEXT: s_addc_u32 s1, s7, 0
61366136
; TONGA-NEXT: v_mov_b32_e32 v5, s1
61376137
; TONGA-NEXT: v_mov_b32_e32 v4, s0
6138+
; TONGA-NEXT: s_add_u32 s0, s6, 32
6139+
; TONGA-NEXT: s_addc_u32 s1, s7, 0
6140+
; TONGA-NEXT: v_mov_b32_e32 v0, s6
6141+
; TONGA-NEXT: v_mov_b32_e32 v9, s1
6142+
; TONGA-NEXT: v_mov_b32_e32 v1, s7
6143+
; TONGA-NEXT: v_mov_b32_e32 v8, s0
6144+
; TONGA-NEXT: s_add_u32 s0, s6, 16
61386145
; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
61396146
; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
61406147
; TONGA-NEXT: s_waitcnt vmcnt(3)
@@ -6573,27 +6580,28 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
65736580
; TONGA-NEXT: v_subb_u32_e32 v18, vcc, 0, v16, vcc
65746581
; TONGA-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
65756582
; TONGA-NEXT: v_rcp_f32_e32 v0, v0
6583+
; TONGA-NEXT: s_mov_b32 s27, s26
65766584
; TONGA-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
65776585
; TONGA-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
65786586
; TONGA-NEXT: v_trunc_f32_e32 v1, v1
65796587
; TONGA-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
6580-
; TONGA-NEXT: v_cvt_u32_f32_e32 v14, v1
6581-
; TONGA-NEXT: v_cvt_u32_f32_e32 v15, v0
6582-
; TONGA-NEXT: v_mul_lo_u32 v3, v17, v14
6583-
; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v17, v15, 0
6584-
; TONGA-NEXT: v_mul_lo_u32 v4, v18, v15
6585-
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v3
6586-
; TONGA-NEXT: v_add_u32_e32 v19, vcc, v1, v4
6587-
; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v15, v19, 0
6588-
; TONGA-NEXT: v_mul_hi_u32 v1, v15, v0
6589-
; TONGA-NEXT: v_add_u32_e32 v20, vcc, v1, v3
6590-
; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v14, v0, 0
6591-
; TONGA-NEXT: v_addc_u32_e32 v21, vcc, 0, v4, vcc
6592-
; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v14, v19, 0
6593-
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v20, v0
6594-
; TONGA-NEXT: v_addc_u32_e32 v0, vcc, v21, v1, vcc
6595-
; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
6596-
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v3
6588+
; TONGA-NEXT: v_cvt_u32_f32_e32 v4, v1
6589+
; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v0
6590+
; TONGA-NEXT: v_mul_lo_u32 v2, s2, v4
6591+
; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v5, 0
6592+
; TONGA-NEXT: v_mul_lo_u32 v3, s3, v5
6593+
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v2
6594+
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v1, v3
6595+
; TONGA-NEXT: v_mul_hi_u32 v6, v5, v0
6596+
; TONGA-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v5, v3, 0
6597+
; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v1
6598+
; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v0, 0
6599+
; TONGA-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc
6600+
; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v3, 0
6601+
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v6, v0
6602+
; TONGA-NEXT: v_addc_u32_e32 v0, vcc, v7, v1, vcc
6603+
; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
6604+
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2
65976605
; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
65986606
; TONGA-NEXT: v_add_u32_e32 v19, vcc, v15, v0
65996607
; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v14, v1, vcc
@@ -6635,25 +6643,60 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
66356643
; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v5, v3, 0
66366644
; TONGA-NEXT: v_mul_lo_u32 v3, v16, v3
66376645
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v4, v1
6646+
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
6647+
; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v1, 0
6648+
; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v1, 0
6649+
; TONGA-NEXT: v_add_u32_e32 v4, vcc, v8, v4
6650+
; TONGA-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
6651+
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v4, v2
6652+
; TONGA-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc
6653+
; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6654+
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0
6655+
; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6656+
; TONGA-NEXT: s_add_u32 s0, s18, s26
6657+
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v0
6658+
; TONGA-NEXT: s_addc_u32 s1, s19, s26
6659+
; TONGA-NEXT: v_addc_u32_e32 v3, vcc, v7, v1, vcc
6660+
; TONGA-NEXT: s_xor_b64 s[28:29], s[0:1], s[26:27]
6661+
; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s28, v3, 0
6662+
; TONGA-NEXT: v_mul_hi_u32 v4, s28, v2
6663+
; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v0
6664+
; TONGA-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6665+
; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s29, v2, 0
6666+
; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s29, v3, 0
6667+
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0
6668+
; TONGA-NEXT: v_addc_u32_e32 v0, vcc, v5, v1, vcc
6669+
; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
6670+
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v0, v2
6671+
; TONGA-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc
6672+
; TONGA-NEXT: v_mul_lo_u32 v3, s22, v0
6673+
; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s22, v2, 0
6674+
; TONGA-NEXT: v_mul_lo_u32 v2, s23, v2
66386675
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
6639-
; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v7, v1
6640-
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v14, v0
6641-
; TONGA-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v16, vcc
6642-
; TONGA-NEXT: v_sub_u32_e64 v4, s[0:1], v0, v5
6643-
; TONGA-NEXT: v_subbrev_u32_e64 v14, s[2:3], 0, v3, s[0:1]
6644-
; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v16
6645-
; TONGA-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[2:3]
6646-
; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v4, v5
6647-
; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc
6648-
; TONGA-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[2:3]
6649-
; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v14, v16
6650-
; TONGA-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v16, s[0:1]
6651-
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v16
6652-
; TONGA-NEXT: v_cndmask_b32_e64 v17, v17, v18, s[2:3]
6653-
; TONGA-NEXT: v_sub_u32_e64 v18, s[0:1], v4, v5
6654-
; TONGA-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
6655-
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
6656-
; TONGA-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
6676+
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v1
6677+
; TONGA-NEXT: v_sub_u32_e32 v2, vcc, s29, v1
6678+
; TONGA-NEXT: v_mov_b32_e32 v3, s23
6679+
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, s28, v0
6680+
; TONGA-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
6681+
; TONGA-NEXT: v_subrev_u32_e64 v4, s[0:1], s22, v0
6682+
; TONGA-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
6683+
; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s23, v5
6684+
; TONGA-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3]
6685+
; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s22, v4
6686+
; TONGA-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
6687+
; TONGA-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3]
6688+
; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], s23, v5
6689+
; TONGA-NEXT: v_subrev_u32_e64 v3, s[0:1], s22, v4
6690+
; TONGA-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3]
6691+
; TONGA-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
6692+
; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
6693+
; TONGA-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1]
6694+
; TONGA-NEXT: v_mov_b32_e32 v4, s29
6695+
; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
6696+
; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s23, v1
6697+
; TONGA-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
6698+
; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s22, v0
6699+
; TONGA-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1]
66576700
; TONGA-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
66586701
; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v1, v16
66596702
; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v17
@@ -6675,14 +6718,14 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
66756718
; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0
66766719
; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
66776720
; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0
6678-
; TONGA-NEXT: v_mul_lo_u32 v1, v1, v0
6721+
; TONGA-NEXT: v_mul_lo_u32 v1, s0, v0
66796722
; TONGA-NEXT: v_mul_hi_u32 v1, v0, v1
66806723
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
6681-
; TONGA-NEXT: v_mul_hi_u32 v0, v6, v0
6682-
; TONGA-NEXT: v_mul_lo_u32 v0, v0, v2
6683-
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v6, v0
6684-
; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v2, v0
6685-
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
6724+
; TONGA-NEXT: v_mul_hi_u32 v0, s18, v0
6725+
; TONGA-NEXT: v_mul_lo_u32 v0, v0, s20
6726+
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, s18, v0
6727+
; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s20, v0
6728+
; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s20, v0
66866729
; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
66876730
; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v2, v0
66886731
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
@@ -6691,11 +6734,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
66916734
; TONGA-NEXT: v_mov_b32_e32 v0, s4
66926735
; TONGA-NEXT: v_mov_b32_e32 v1, s5
66936736
; TONGA-NEXT: s_add_u32 s0, s4, 16
6694-
; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
6737+
; TONGA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
66956738
; TONGA-NEXT: s_addc_u32 s1, s5, 0
66966739
; TONGA-NEXT: v_mov_b32_e32 v0, s0
66976740
; TONGA-NEXT: v_mov_b32_e32 v1, s1
6698-
; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
6741+
; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
66996742
; TONGA-NEXT: s_endpgm
67006743
; TONGA-NEXT: .LBB12_15:
67016744
; TONGA-NEXT: s_branch .LBB12_7

llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
6767
; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[58:59]
6868
; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[62:63]
6969
; CHECK-NEXT: s_waitcnt vmcnt(0)
70-
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
7170
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
7271
; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[46:47] glc
7372
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)

0 commit comments

Comments
 (0)