Skip to content

Commit f587fe3

Browse files
[AArch64] Eliminate redundant setcc on vector comparison results
Vector comparisons produce all-zeros or all-ones per lane. For values with this property, comparing < 0 is an identity operation. This eliminates redundant cmlt instructions after cmgt when the result is used for select/bsl operations. The optimization traces through shuffles, narrowing bitcasts, and DUPLANE operations that preserve the all-zeros-or-all-ones property.
1 parent 5c6918f commit f587fe3

File tree

2 files changed

+174
-0
lines changed

2 files changed

+174
-0
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26556,6 +26556,46 @@ performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
2655626556
return SDValue();
2655726557
}
2655826558

26559+
// Check if the value is derived from a vector comparison through operations
26560+
// that preserve the all-zeros-or-all-ones property per lane.
26561+
static bool isDerivedFromVectorCompare(SDValue V) {
26562+
switch (V.getOpcode()) {
26563+
case ISD::SETCC:
26564+
// Found a vector comparison - this is the source of 0/-1 values
26565+
return true;
26566+
26567+
case ISD::VECTOR_SHUFFLE:
26568+
case ISD::EXTRACT_SUBVECTOR:
26569+
// Any shuffle or subvector extract preserves the property
26570+
return isDerivedFromVectorCompare(V.getOperand(0));
26571+
26572+
case ISD::BITCAST: {
26573+
// Bitcast preserves the property only if source element size >=
26574+
// destination element size. This ensures each destination element
26575+
// is entirely within one source element.
26576+
// E.g., v4i32 -> v16i8 is safe (each byte is 0x00 or 0xFF)
26577+
// But v16i8 -> v4i32 is NOT safe (mixing bytes can create non-0/-1)
26578+
SDValue Src = V.getOperand(0);
26579+
EVT SrcVT = Src.getValueType();
26580+
EVT DstVT = V.getValueType();
26581+
if (SrcVT.isVector() && DstVT.isVector() &&
26582+
SrcVT.getScalarSizeInBits() >= DstVT.getScalarSizeInBits())
26583+
return isDerivedFromVectorCompare(Src);
26584+
return false;
26585+
}
26586+
26587+
case AArch64ISD::DUPLANE8:
26588+
case AArch64ISD::DUPLANE16:
26589+
case AArch64ISD::DUPLANE32:
26590+
case AArch64ISD::DUPLANE64:
26591+
// DUPLANE broadcasts one lane - preserves the property
26592+
return isDerivedFromVectorCompare(V.getOperand(0));
26593+
26594+
default:
26595+
return false;
26596+
}
26597+
}
26598+
2655926599
static SDValue performSETCCCombine(SDNode *N,
2656026600
TargetLowering::DAGCombinerInfo &DCI,
2656126601
SelectionDAG &DAG) {
@@ -26628,6 +26668,25 @@ static SDValue performSETCCCombine(SDNode *N,
2662826668

2662926669
EVT CmpVT = LHS.getValueType();
2663026670

26671+
// setcc X, 0, setlt --> X (when X is derived from a vector comparison)
26672+
// setcc 0, X, setgt --> X (equivalent form)
26673+
//
26674+
// Vector comparisons produce all-zeros or all-ones per lane. For any value
26675+
// where each lane is either 0 or -1, comparing < 0 is an identity operation.
26676+
if (VT.isVector() && VT == CmpVT) {
26677+
SDValue Candidate;
26678+
// Match: setcc LHS, 0, setlt
26679+
if (Cond == ISD::SETLT && ISD::isConstantSplatVectorAllZeros(RHS.getNode()))
26680+
Candidate = LHS;
26681+
// Match: setcc 0, RHS, setgt (equivalent to RHS < 0)
26682+
else if (Cond == ISD::SETGT &&
26683+
ISD::isConstantSplatVectorAllZeros(LHS.getNode()))
26684+
Candidate = RHS;
26685+
26686+
if (Candidate && isDerivedFromVectorCompare(Candidate))
26687+
return Candidate;
26688+
}
26689+
2663126690
// NOTE: This exists as a combine only because it proved too awkward to match
2663226691
// splat(1) across all the NEON types during isel.
2663326692
APInt SplatLHSVal;
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
3+
4+
; Direct case on v4i32: the select mask is `sext(icmp slt a, b) < 0`.
; The expected output is a single cmgt feeding bsl, with no cmlt in between.
define <4 x i32> @direct_setcc_lt0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) {
5+
; CHECK-LABEL: direct_setcc_lt0:
6+
; CHECK: // %bb.0:
7+
; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
8+
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
9+
; CHECK-NEXT: ret
10+
%cmp = icmp slt <4 x i32> %a, %b
11+
%sext = sext <4 x i1> %cmp to <4 x i32>
12+
%lt0 = icmp slt <4 x i32> %sext, zeroinitializer
13+
%sel = select <4 x i1> %lt0, <4 x i32> %x, <4 x i32> %y
14+
ret <4 x i32> %sel
15+
}
16+
17+
; Splat case: lane 2 of the sign-extended compare is broadcast by a
; shufflevector before the `< 0` test. Expected output: cmgt, dup, bsl.
define <4 x i32> @shuffle_setcc_lt0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) {
18+
; CHECK-LABEL: shuffle_setcc_lt0:
19+
; CHECK: // %bb.0:
20+
; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
21+
; CHECK-NEXT: dup v0.4s, v0.s[2]
22+
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
23+
; CHECK-NEXT: ret
24+
%cmp = icmp slt <4 x i32> %a, %b
25+
%sext = sext <4 x i1> %cmp to <4 x i32>
26+
%dup = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
27+
%lt0 = icmp slt <4 x i32> %dup, zeroinitializer
28+
%sel = select <4 x i1> %lt0, <4 x i32> %x, <4 x i32> %y
29+
ret <4 x i32> %sel
30+
}
31+
32+
; Commuted form of the direct case: the mask is `0 > sext(icmp slt a, b)`,
; written as icmp sgt with zero on the left-hand side. The expected output is
; the same cmgt + bsl sequence as for the `< 0` form.
define <4 x i32> @direct_setcc_0gt(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) {
33+
; CHECK-LABEL: direct_setcc_0gt:
34+
; CHECK: // %bb.0:
35+
; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
36+
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
37+
; CHECK-NEXT: ret
38+
%cmp = icmp slt <4 x i32> %a, %b
39+
%sext = sext <4 x i1> %cmp to <4 x i32>
40+
%gt0 = icmp sgt <4 x i32> zeroinitializer, %sext
41+
%sel = select <4 x i1> %gt0, <4 x i32> %x, <4 x i32> %y
42+
ret <4 x i32> %sel
43+
}
44+
45+
; v8i16 variant of the direct `sext(icmp) < 0` case. The expected output uses
; the .8h arrangement for cmgt and has no cmlt.
define <8 x i16> @direct_setcc_lt0_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %x, <8 x i16> %y) {
46+
; CHECK-LABEL: direct_setcc_lt0_v8i16:
47+
; CHECK: // %bb.0:
48+
; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
49+
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
50+
; CHECK-NEXT: ret
51+
%cmp = icmp slt <8 x i16> %a, %b
52+
%sext = sext <8 x i1> %cmp to <8 x i16>
53+
%lt0 = icmp slt <8 x i16> %sext, zeroinitializer
54+
%sel = select <8 x i1> %lt0, <8 x i16> %x, <8 x i16> %y
55+
ret <8 x i16> %sel
56+
}
57+
58+
; Non-splat case: the compare result is permuted with mask <3, 1, 2, 0> before
; the `< 0` test. Unlike the other functions in this file, the assertions here
; only pin the leading cmgt and require that no cmlt follows; the instructions
; used to lower the permutation are not matched.
define <4 x i32> @non_splat_shuffle(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) {
59+
; CHECK-LABEL: non_splat_shuffle:
60+
; CHECK: // %bb.0:
61+
; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
62+
; CHECK-NOT: cmlt
63+
%cmp = icmp slt <4 x i32> %a, %b
64+
%sext = sext <4 x i1> %cmp to <4 x i32>
65+
%shuf = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
66+
%lt0 = icmp slt <4 x i32> %shuf, zeroinitializer
67+
%sel = select <4 x i1> %lt0, <4 x i32> %x, <4 x i32> %y
68+
ret <4 x i32> %sel
69+
}
70+
71+
; Narrowing bitcast: the v4i32 compare result is reinterpreted as v16i8 before
; the `< 0` test, so every i8 lane lies inside a single i32 compare lane.
; The expected output is cmgt + bsl with no cmlt.
define <16 x i8> @bitcast_narrow(<4 x i32> %a, <4 x i32> %b, <16 x i8> %x, <16 x i8> %y) {
72+
; CHECK-LABEL: bitcast_narrow:
73+
; CHECK: // %bb.0:
74+
; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
75+
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
76+
; CHECK-NEXT: ret
77+
%cmp = icmp slt <4 x i32> %a, %b
78+
%sext = sext <4 x i1> %cmp to <4 x i32>
79+
%bc = bitcast <4 x i32> %sext to <16 x i8>
80+
%lt0 = icmp slt <16 x i8> %bc, zeroinitializer
81+
%sel = select <16 x i1> %lt0, <16 x i8> %x, <16 x i8> %y
82+
ret <16 x i8> %sel
83+
}
84+
85+
; Chained case: a lane-2 splat of the v4i32 compare result is followed by a
; narrowing bitcast to v8i16, and only then tested with `< 0`.
; Expected output: cmgt, dup, bsl.
define <8 x i16> @chain_shuffle_bitcast(<4 x i32> %a, <4 x i32> %b, <8 x i16> %x, <8 x i16> %y) {
86+
; CHECK-LABEL: chain_shuffle_bitcast:
87+
; CHECK: // %bb.0:
88+
; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
89+
; CHECK-NEXT: dup v0.4s, v0.s[2]
90+
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
91+
; CHECK-NEXT: ret
92+
%cmp = icmp slt <4 x i32> %a, %b
93+
%sext = sext <4 x i1> %cmp to <4 x i32>
94+
%shuf = shufflevector <4 x i32> %sext, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
95+
%bc = bitcast <4 x i32> %shuf to <8 x i16>
96+
%lt0 = icmp slt <8 x i16> %bc, zeroinitializer
97+
%sel = select <8 x i1> %lt0, <8 x i16> %x, <8 x i16> %y
98+
ret <8 x i16> %sel
99+
}
100+
101+
; NEGATIVE TEST: Widening bitcast should NOT be optimized
; The v16i8 compare result is reinterpreted as v4i32, so one i32 lane is built
; from four independent i8 compare lanes. The expected output therefore keeps
; the `cmlt ..., #0` between cmgt and bsl.
102+
define <4 x i32> @bitcast_widen_negative(<16 x i8> %a, <16 x i8> %b, <4 x i32> %x, <4 x i32> %y) {
103+
; CHECK-LABEL: bitcast_widen_negative:
104+
; CHECK: // %bb.0:
105+
; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
106+
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
107+
; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
108+
; CHECK-NEXT: ret
109+
%cmp = icmp slt <16 x i8> %a, %b
110+
%sext = sext <16 x i1> %cmp to <16 x i8>
111+
%bc = bitcast <16 x i8> %sext to <4 x i32>
112+
%lt0 = icmp slt <4 x i32> %bc, zeroinitializer
113+
%sel = select <4 x i1> %lt0, <4 x i32> %x, <4 x i32> %y
114+
ret <4 x i32> %sel
115+
}

0 commit comments

Comments
 (0)