Skip to content
24 changes: 15 additions & 9 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4802,16 +4802,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
// Limit to loops with trip counts that are cheap to expand.
UP.SCEVExpansionBudget = 1;

// Try to unroll small, single block loops, if they have load/store
// dependencies, to expose more parallel memory access streams.
// Try to unroll small loops with few blocks and a low size budget, if they
// have load/store dependencies, to expose more parallel memory access streams,
// or if they do little work inside a block (i.e. load -> X -> store pattern).
BasicBlock *Header = L->getHeader();
if (Header == L->getLoopLatch()) {
// Estimate the size of the loop.
unsigned Size;
if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
unsigned Width=10;
if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
return;

SmallPtrSet<Value *, 8> LoadedValues;
SmallPtrSet<Value *, 8> LoadedValuesPlus;
SmallVector<StoreInst *> Stores;
for (auto *BB : L->blocks()) {
for (auto &I : *BB) {
Expand All @@ -4821,9 +4823,13 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
if (SE.isLoopInvariant(PtrSCEV, L))
continue;
if (isa<LoadInst>(&I))
LoadedValues.insert(&I);
else
if (isa<LoadInst>(&I)) {
LoadedValuesPlus.insert(&I);
// Include in-loop 1st users of loaded values.
for (auto *U : I.users())
if (L->contains(cast<Instruction>(U)))
LoadedValuesPlus.insert(U);
} else
Stores.push_back(cast<StoreInst>(&I));
}
}
Expand All @@ -4846,8 +4852,8 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
UC++;
}

if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
return LoadedValues.contains(SI->getOperand(0));
if (BestUC == 1 || none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
return LoadedValuesPlus.contains(SI->getOperand(0));
}))
return;

Expand Down
198 changes: 198 additions & 0 deletions llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,204 @@ exit:
ret void
}

; Single-block loop that loads a float from %src (indexed by %iv * %scale),
; adds the loop-invariant argument %k to it and stores the result to %dst[%iv].
; The stored value is therefore a direct user of the loaded value
; (load -> fadd -> store) rather than the loaded value itself.
; Both the APPLE and OTHER check prefixes expect the loop body to be duplicated
; once (the induction variable advances by 2 per iteration of the main loop),
; followed by an epilogue block that handles the remaining iteration when %N
; is odd.
define void @load_op_store_loop(ptr %src, ptr %dst, i64 %N, i64 %scale, float %k) {
; APPLE-LABEL: define void @load_op_store_loop(
; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] {
; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
; APPLE: [[ENTRY_NEW]]:
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
; APPLE-NEXT: br label %[[LOOP:.*]]
; APPLE: [[LOOP]]:
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]]
; APPLE-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]]
; APPLE-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
; APPLE-NEXT: [[O:%.*]] = fadd float [[L]], [[K]]
; APPLE-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
; APPLE-NEXT: store float [[O]], ptr [[GEP_DST]], align 4
; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; APPLE-NEXT: [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT]], [[SCALE]]
; APPLE-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
; APPLE-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
; APPLE-NEXT: [[O_1:%.*]] = fadd float [[L_1]], [[K]]
; APPLE-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT]]
; APPLE-NEXT: store float [[O_1]], ptr [[GEP_DST_1]], align 4
; APPLE-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
; APPLE-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
; APPLE-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
; APPLE-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
; APPLE: [[EXIT_UNR_LCSSA]]:
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
; APPLE: [[LOOP_EPIL_PREHEADER]]:
; APPLE-NEXT: br label %[[LOOP_EPIL:.*]]
; APPLE: [[LOOP_EPIL]]:
; APPLE-NEXT: [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_UNR]], [[SCALE]]
; APPLE-NEXT: [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
; APPLE-NEXT: [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
; APPLE-NEXT: [[O_EPIL:%.*]] = fadd float [[L_EPIL]], [[K]]
; APPLE-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_UNR]]
; APPLE-NEXT: store float [[O_EPIL]], ptr [[GEP_DST_EPIL]], align 4
; APPLE-NEXT: br label %[[EXIT]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
; OTHER-LABEL: define void @load_op_store_loop(
; OTHER-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] {
; OTHER-NEXT: [[ENTRY:.*]]:
; OTHER-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
; OTHER-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
; OTHER-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
; OTHER-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
; OTHER: [[ENTRY_NEW]]:
; OTHER-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
; OTHER-NEXT: br label %[[LOOP:.*]]
; OTHER: [[LOOP]]:
; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
; OTHER-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
; OTHER-NEXT: [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]]
; OTHER-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]]
; OTHER-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
; OTHER-NEXT: [[O:%.*]] = fadd float [[L]], [[K]]
; OTHER-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
; OTHER-NEXT: store float [[O]], ptr [[GEP_DST]], align 4
; OTHER-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; OTHER-NEXT: [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT]], [[SCALE]]
; OTHER-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
; OTHER-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
; OTHER-NEXT: [[O_1:%.*]] = fadd float [[L_1]], [[K]]
; OTHER-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT]]
; OTHER-NEXT: store float [[O_1]], ptr [[GEP_DST_1]], align 4
; OTHER-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
; OTHER-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
; OTHER-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
; OTHER-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
; OTHER: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
; OTHER-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
; OTHER-NEXT: br label %[[EXIT_UNR_LCSSA]]
; OTHER: [[EXIT_UNR_LCSSA]]:
; OTHER-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
; OTHER-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; OTHER-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
; OTHER: [[LOOP_EPIL_PREHEADER]]:
; OTHER-NEXT: br label %[[LOOP_EPIL:.*]]
; OTHER: [[LOOP_EPIL]]:
; OTHER-NEXT: [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_UNR]], [[SCALE]]
; OTHER-NEXT: [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
; OTHER-NEXT: [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
; OTHER-NEXT: [[O_EPIL:%.*]] = fadd float [[L_EPIL]], [[K]]
; OTHER-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_UNR]]
; OTHER-NEXT: store float [[O_EPIL]], ptr [[GEP_DST_EPIL]], align 4
; OTHER-NEXT: br label %[[EXIT]]
; OTHER: [[EXIT]]:
; OTHER-NEXT: ret void
;
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
; The source index is %iv * %scale, so the load address changes every
; iteration and is not loop-invariant.
%scaled.iv = mul nuw nsw i64 %iv, %scale
%gep.src = getelementptr inbounds float, ptr %src, i64 %scaled.iv
%l = load float, ptr %gep.src, align 4
; %o is the only user of the loaded value and is the value that gets stored.
%o = fadd float %l, %k
%gep.dst = getelementptr inbounds float, ptr %dst, i64 %iv
store float %o, ptr %gep.dst, align 4
%iv.next = add nuw nsw i64 %iv, 1
; The loop exits once %iv.next reaches the argument %N.
%ec = icmp eq i64 %iv.next, %N
br i1 %ec, label %exit, label %loop

exit:
ret void
}

; Multi-block variant of a load -> op -> store loop: the value loaded from
; %src (indexed by %iv * %scale) is negated in %loopodd on odd iterations,
; merged with the un-negated value by a phi in %loopcont, added to the
; argument %k and stored to %dst[%iv]. The loop header (%loop) and the latch
; (%loopcont) are different blocks here.
; Both the APPLE and OTHER check prefixes expect the loop to be left as it is:
; a single copy of the body and no epilogue.
; NOTE(review): the check lines capture the load as [[L]] while the IR value is
; named %l1. The capture is a regex, so it still matches, but the checks were
; presumably generated before the value was renamed - consider regenerating.
define void @load_op_store_loop_multiblock(ptr %src, ptr %dst, i64 %N, i64 %scale, float %k) {
; APPLE-LABEL: define void @load_op_store_loop_multiblock(
; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] {
; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: br label %[[LOOP:.*]]
; APPLE: [[LOOP]]:
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOPCONT:.*]] ]
; APPLE-NEXT: [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]]
; APPLE-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]]
; APPLE-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
; APPLE-NEXT: [[AND:%.*]] = and i64 [[IV]], 1
; APPLE-NEXT: [[ODD:%.*]] = icmp eq i64 [[AND]], 1
; APPLE-NEXT: br i1 [[ODD]], label %[[LOOPODD:.*]], label %[[LOOPCONT]]
; APPLE: [[LOOPCONT]]:
; APPLE-NEXT: [[D:%.*]] = phi float [ [[L2:%.*]], %[[LOOPODD]] ], [ [[L]], %[[LOOP]] ]
; APPLE-NEXT: [[O:%.*]] = fadd float [[D]], [[K]]
; APPLE-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
; APPLE-NEXT: store float [[O]], ptr [[GEP_DST]], align 4
; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
; APPLE: [[LOOPODD]]:
; APPLE-NEXT: [[L2]] = fneg float [[L]]
; APPLE-NEXT: br label %[[LOOPCONT]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
; OTHER-LABEL: define void @load_op_store_loop_multiblock(
; OTHER-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] {
; OTHER-NEXT: [[ENTRY:.*]]:
; OTHER-NEXT: br label %[[LOOP:.*]]
; OTHER: [[LOOP]]:
; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOPCONT:.*]] ]
; OTHER-NEXT: [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]]
; OTHER-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]]
; OTHER-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
; OTHER-NEXT: [[AND:%.*]] = and i64 [[IV]], 1
; OTHER-NEXT: [[ODD:%.*]] = icmp eq i64 [[AND]], 1
; OTHER-NEXT: br i1 [[ODD]], label %[[LOOPODD:.*]], label %[[LOOPCONT]]
; OTHER: [[LOOPCONT]]:
; OTHER-NEXT: [[D:%.*]] = phi float [ [[L2:%.*]], %[[LOOPODD]] ], [ [[L]], %[[LOOP]] ]
; OTHER-NEXT: [[O:%.*]] = fadd float [[D]], [[K]]
; OTHER-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
; OTHER-NEXT: store float [[O]], ptr [[GEP_DST]], align 4
; OTHER-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; OTHER-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; OTHER-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
; OTHER: [[LOOPODD]]:
; OTHER-NEXT: [[L2]] = fneg float [[L]]
; OTHER-NEXT: br label %[[LOOPCONT]]
; OTHER: [[EXIT]]:
; OTHER-NEXT: ret void
;
entry:
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loopcont ]
%scaled.iv = mul nuw nsw i64 %iv, %scale
%gep.src = getelementptr inbounds float, ptr %src, i64 %scaled.iv
%l1 = load float, ptr %gep.src, align 4
; Branch on the low bit of %iv: odd iterations take the detour via %loopodd.
%and = and i64 %iv, 1
%odd = icmp eq i64 %and, 1
br i1 %odd, label %loopodd, label %loopcont
loopcont:
; %d is the loaded value, negated when the iteration came through %loopodd.
%d = phi float [ %l2, %loopodd ], [ %l1, %loop]
%o = fadd float %d, %k
%gep.dst = getelementptr inbounds float, ptr %dst, i64 %iv
store float %o, ptr %gep.dst, align 4
%iv.next = add nuw nsw i64 %iv, 1
; The loop exits once %iv.next reaches the argument %N.
%ec = icmp eq i64 %iv.next, %N
br i1 %ec, label %exit, label %loop
loopodd:
%l2 = fneg float %l1
br label %loopcont
exit:
ret void
}

@A = external constant [9 x i8], align 1
@B = external constant [8 x i32], align 4
@C = external constant [8 x i32], align 4
Expand Down