Skip to content

Commit 29611f4

Browse files
authored
[DAGCombiner] Relax nsz constraint for FP optimizations (#165011)
Some floating-point optimization don't trigger because they can produce incorrect results around signed zeros, and rely on the existence of the nsz flag which commonly appears when fast-math is enabled. However, this flag is not a hard requirement when all of the users of the combined value are either guaranteed to overwrite the sign-bit or simply ignore it (comparisons, etc.). The optimizations affected: - fadd x, +0.0 -> x - fsub x, -0.0 -> x - fsub +0.0, x -> fneg x - fdiv(x, sqrt(x)) -> sqrt(x) - frem lowering with power-of-2 divisors
1 parent a94f01a commit 29611f4

File tree

6 files changed

+132
-10
lines changed

6 files changed

+132
-10
lines changed

llvm/include/llvm/CodeGen/SelectionDAG.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2334,6 +2334,12 @@ class SelectionDAG {
23342334
/// +nan are considered positive, -0.0, -inf and -nan are not.
23352335
LLVM_ABI bool cannotBeOrderedNegativeFP(SDValue Op) const;
23362336

2337+
/// Check if a use of a float value is insensitive to signed zeros.
2338+
LLVM_ABI bool canIgnoreSignBitOfZero(const SDUse &Use) const;
2339+
2340+
/// Check if at most two uses of a value are insensitive to signed zeros.
2341+
LLVM_ABI bool canIgnoreSignBitOfZero(SDValue Op) const;
2342+
23372343
/// Test whether two SDValues are known to compare equal. This
23382344
/// is true if they are the same value, or if one is negative zero and the
23392345
/// other positive zero.

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17920,7 +17920,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
1792017920
// N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math)
1792117921
ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true);
1792217922
if (N1C && N1C->isZero())
17923-
if (N1C->isNegative() || Flags.hasNoSignedZeros())
17923+
if (N1C->isNegative() || Flags.hasNoSignedZeros() ||
17924+
DAG.canIgnoreSignBitOfZero(SDValue(N, 0)))
1792417925
return N0;
1792517926

1792617927
if (SDValue NewSel = foldBinOpIntoSelect(N))
@@ -18132,7 +18133,8 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
1813218133

1813318134
// (fsub A, 0) -> A
1813418135
if (N1CFP && N1CFP->isZero()) {
18135-
if (!N1CFP->isNegative() || Flags.hasNoSignedZeros()) {
18136+
if (!N1CFP->isNegative() || Flags.hasNoSignedZeros() ||
18137+
DAG.canIgnoreSignBitOfZero(SDValue(N, 0))) {
1813618138
return N0;
1813718139
}
1813818140
}
@@ -18145,7 +18147,8 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
1814518147

1814618148
// (fsub -0.0, N1) -> -N1
1814718149
if (N0CFP && N0CFP->isZero()) {
18148-
if (N0CFP->isNegative() || Flags.hasNoSignedZeros()) {
18150+
if (N0CFP->isNegative() || Flags.hasNoSignedZeros() ||
18151+
DAG.canIgnoreSignBitOfZero(SDValue(N, 0))) {
1814918152
// We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are
1815018153
// flushed to zero, unless all users treat denorms as zero (DAZ).
1815118154
// FIXME: This transform will change the sign of a NaN and the behavior
@@ -18795,7 +18798,8 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
1879518798
}
1879618799

1879718800
// Fold X/Sqrt(X) -> Sqrt(X)
18798-
if (Flags.hasNoSignedZeros() && Flags.hasAllowReassociation())
18801+
if ((Flags.hasNoSignedZeros() || DAG.canIgnoreSignBitOfZero(SDValue(N, 0))) &&
18802+
Flags.hasAllowReassociation())
1879918803
if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0))
1880018804
return N1;
1880118805

@@ -18846,8 +18850,9 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
1884618850
TLI.isOperationLegalOrCustom(ISD::FDIV, VT) &&
1884718851
TLI.isOperationLegalOrCustom(ISD::FTRUNC, VT) &&
1884818852
DAG.isKnownToBeAPowerOfTwoFP(N1)) {
18849-
bool NeedsCopySign =
18850-
!Flags.hasNoSignedZeros() && !DAG.cannotBeOrderedNegativeFP(N0);
18853+
bool NeedsCopySign = !Flags.hasNoSignedZeros() &&
18854+
!DAG.canIgnoreSignBitOfZero(SDValue(N, 0)) &&
18855+
!DAG.cannotBeOrderedNegativeFP(N0);
1885118856
SDValue Div = DAG.getNode(ISD::FDIV, DL, VT, N0, N1);
1885218857
SDValue Rnd = DAG.getNode(ISD::FTRUNC, DL, VT, Div);
1885318858
SDValue MLA;

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6160,6 +6160,46 @@ bool SelectionDAG::cannotBeOrderedNegativeFP(SDValue Op) const {
61606160
llvm_unreachable("covered opcode switch");
61616161
}
61626162

6163+
bool SelectionDAG::canIgnoreSignBitOfZero(const SDUse &Use) const {
6164+
assert(Use.getValueType().isFloatingPoint());
6165+
const SDNode *User = Use.getUser();
6166+
unsigned OperandNo = Use.getOperandNo();
6167+
// Check if this use is insensitive to the sign of zero
6168+
switch (User->getOpcode()) {
6169+
case ISD::SETCC:
6170+
// Comparisons: IEEE-754 specifies +0.0 == -0.0.
6171+
case ISD::FABS:
6172+
// fabs always produces +0.0.
6173+
return true;
6174+
case ISD::FCOPYSIGN:
6175+
// copysign overwrites the sign bit of the first operand.
6176+
return OperandNo == 0;
6177+
case ISD::FADD:
6178+
case ISD::FSUB: {
6179+
// Arithmetic with non-zero constants fixes the uncertainty around the
6180+
// sign bit.
6181+
SDValue Other = User->getOperand(1 - OperandNo);
6182+
return isKnownNeverZeroFloat(Other);
6183+
}
6184+
case ISD::FP_TO_SINT:
6185+
case ISD::FP_TO_UINT:
6186+
// fp-to-int conversions normalize signed zeros.
6187+
return true;
6188+
default:
6189+
return false;
6190+
}
6191+
}
6192+
6193+
bool SelectionDAG::canIgnoreSignBitOfZero(SDValue Op) const {
6194+
// FIXME: Limit the amount of checked uses to not introduce a compile-time
6195+
// regression. Ideally, this should be implemented as a demanded-bits
6196+
// optimization that stems from the users.
6197+
if (Op->use_size() > 2)
6198+
return false;
6199+
return all_of(Op->uses(),
6200+
[&](const SDUse &Use) { return canIgnoreSignBitOfZero(Use); });
6201+
}
6202+
61636203
bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
61646204
// Check the obvious case.
61656205
if (A == B) return true;
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
3+
4+
; Test that nsz constraint can be bypassed when all uses are sign-insensitive.
5+
6+
define i1 @test_fadd_neg_zero_fcmp(float %x) {
7+
; CHECK-LABEL: test_fadd_neg_zero_fcmp:
8+
; CHECK: // %bb.0:
9+
; CHECK-NEXT: fmov s1, #1.00000000
10+
; CHECK-NEXT: fcmp s0, s1
11+
; CHECK-NEXT: cset w0, eq
12+
; CHECK-NEXT: ret
13+
%add = fadd float %x, -0.0
14+
%cmp = fcmp oeq float %add, 1.0
15+
ret i1 %cmp
16+
}
17+
18+
define float @test_fsub_zero_fabs(float %x) {
19+
; CHECK-LABEL: test_fsub_zero_fabs:
20+
; CHECK: // %bb.0:
21+
; CHECK-NEXT: fabs s0, s0
22+
; CHECK-NEXT: ret
23+
%sub = fsub float %x, 0.0
24+
%abs = call float @llvm.fabs.f32(float %sub)
25+
ret float %abs
26+
}
27+
28+
define float @test_fsub_neg_zero_copysign(float %x, float %y) {
29+
; CHECK-LABEL: test_fsub_neg_zero_copysign:
30+
; CHECK: // %bb.0:
31+
; CHECK-NEXT: mvni v2.4s, #128, lsl #24
32+
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
33+
; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
34+
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
35+
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
36+
; CHECK-NEXT: ret
37+
%sub = fsub float -0.0, %x
38+
%copysign = call float @llvm.copysign.f32(float %sub, float %y)
39+
ret float %copysign
40+
}
41+
42+
define i1 @test_div_sqrt_fcmp(float %x) {
43+
; CHECK-LABEL: test_div_sqrt_fcmp:
44+
; CHECK: // %bb.0:
45+
; CHECK-NEXT: fsqrt s0, s0
46+
; CHECK-NEXT: fcmp s0, #0.0
47+
; CHECK-NEXT: cset w0, gt
48+
; CHECK-NEXT: ret
49+
%sqrt = call float @llvm.sqrt.f32(float %x)
50+
%div = fdiv reassoc float %x, %sqrt
51+
%cmp = fcmp ogt float %div, 0.0
52+
ret i1 %cmp
53+
}
54+
55+
define float @test_frem_fabs(float %x) {
56+
; CHECK-LABEL: test_frem_fabs:
57+
; CHECK: // %bb.0:
58+
; CHECK-NEXT: fmov s1, #0.50000000
59+
; CHECK-NEXT: fmov s2, #-2.00000000
60+
; CHECK-NEXT: fmul s1, s0, s1
61+
; CHECK-NEXT: frintz s1, s1
62+
; CHECK-NEXT: fmadd s0, s1, s2, s0
63+
; CHECK-NEXT: fabs s0, s0
64+
; CHECK-NEXT: ret
65+
%rem = frem float %x, 2.0
66+
%abs = call float @llvm.fabs.f32(float %rem)
67+
ret float %abs
68+
}
69+
70+
declare float @llvm.fabs.f32(float)
71+
declare float @llvm.copysign.f32(float, float)
72+
declare float @llvm.sqrt.f32(float)

llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(ptr addrspace(1
371371
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
372372
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
373373
%load = load float, ptr addrspace(1) %gep, align 4
374-
%v0 = fadd float %load, 0.0
374+
%v0 = fadd float %load, 1.0
375375
%v = tail call float @llvm.fabs.f32(float %v0)
376376
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
377377
store float %canonicalized, ptr addrspace(1) %gep, align 4

llvm/test/CodeGen/AMDGPU/swdev380865.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,13 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
2828
; CHECK-NEXT: v_mov_b32_e32 v1, s7
2929
; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader
3030
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
31-
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0
3231
; CHECK-NEXT: s_mov_b32 s6, 0
3332
; CHECK-NEXT: s_mov_b32 s7, 0x40140000
34-
; CHECK-NEXT: s_add_i32 s1, s1, s0
35-
; CHECK-NEXT: s_cmpk_lt_i32 s1, 0xa00
3633
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7]
3734
; CHECK-NEXT: s_mov_b32 s6, 0
3835
; CHECK-NEXT: s_mov_b32 s7, 0x40180000
36+
; CHECK-NEXT: s_add_i32 s1, s1, s0
37+
; CHECK-NEXT: s_cmpk_lt_i32 s1, 0xa00
3938
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7]
4039
; CHECK-NEXT: s_mov_b32 s6, 0
4140
; CHECK-NEXT: s_mov_b32 s7, 0x401c0000

0 commit comments

Comments
 (0)