Skip to content

Commit e88a83a

Browse files
authored
[GlobalOpt][FMV] Perform expensive checks when NumVersions < Threshold (#168054)
Extends the static resolution algorithm to handle cases where we can infer additional information on why a prior caller version of higher priority was skipped, based on the features of the current caller version. For example, let's say the current caller is aes+sve2 and a previous caller was mops+sve2. Knowing that sve2 is available, we could deduce that mops is unavailable. This would allow us to skip callee versions which depend on mops. This comes at the expense of performing more checks. However, we can control the threshold (number of versions) which decides whether the expensive checks will be performed or not.
1 parent b341885 commit e88a83a

File tree

2 files changed

+136
-18
lines changed

2 files changed

+136
-18
lines changed

llvm/lib/Transforms/IPO/GlobalOpt.cpp

Lines changed: 46 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,11 @@ static cl::opt<bool>
9999
"functions from non-versioned callers."),
100100
cl::init(true), cl::Hidden);
101101

102+
static cl::opt<unsigned> MaxIFuncVersions(
103+
"max-ifunc-versions", cl::Hidden, cl::init(5),
104+
cl::desc("Maximum number of caller/callee versions that is allowed for "
105+
"using the expensive (cubic) static resolution algorithm."));
106+
102107
static cl::opt<bool>
103108
EnableColdCCStressTest("enable-coldcc-stress-test",
104109
cl::desc("Enable stress test of coldcc by adding "
@@ -2632,31 +2637,56 @@ static bool OptimizeNonTrivialIFuncs(
26322637
LLVM_DEBUG(dbgs() << "Statically resolving calls to function "
26332638
<< CalleeIF->getResolverFunction()->getName() << "\n");
26342639

2635-
// The complexity of this algorithm is linear: O(NumCallers + NumCallees).
2636-
// TODO
2637-
// A limitation it has is that we are not using information about the
2638-
// current caller to deduce why an earlier caller of higher priority was
2639-
// skipped. For example let's say the current caller is aes+sve2 and a
2640-
// previous caller was mops+sve2. Knowing that sve2 is available we could
2641-
// infer that mops is unavailable. This would allow us to skip callee
2642-
// versions which depend on mops. I tried implementing this but the
2643-
// complexity was cubic :/
2640+
// The complexity of this algorithm is linear: O(NumCallers + NumCallees)
2641+
// if NumCallers > MaxIFuncVersions || NumCallees > MaxIFuncVersions,
2642+
// otherwise it is cubic: O((NumCallers ^ 2) x NumCallees).
26442643
auto staticallyResolveCalls = [&](ArrayRef<Function *> Callers,
26452644
ArrayRef<Function *> Callees,
26462645
bool CallerIsFMV) {
2646+
bool AllowExpensiveChecks = CallerIsFMV &&
2647+
Callers.size() <= MaxIFuncVersions &&
2648+
Callees.size() <= MaxIFuncVersions;
26472649
// Index to the highest callee candidate.
2648-
unsigned I = 0;
2650+
unsigned J = 0;
26492651

2650-
for (Function *const &Caller : Callers) {
2651-
if (I == Callees.size())
2652+
for (unsigned I = 0, E = Callers.size(); I < E; ++I) {
2653+
// There are no callee candidates left.
2654+
if (J == Callees.size())
26522655
break;
26532656

2657+
Function *Caller = Callers[I];
2658+
APInt CallerBits = FeatureMask[Caller];
2659+
2660+
// Compare the feature bits of the best callee candidate with all the
2661+
// caller versions preceding the current one. For each prior caller
2662+
// discard feature bits that are known to be available in the current
2663+
// caller. As long as the known missing feature bits are a subset of the
2664+
// callee feature bits, advance to the next callee and start over.
2665+
auto eliminateAvailableFeatures = [&](unsigned BestCandidate) {
2666+
unsigned K = 0;
2667+
while (K < I && BestCandidate < Callees.size()) {
2668+
APInt MissingBits = FeatureMask[Callers[K]] & ~CallerBits;
2669+
if (MissingBits.isSubsetOf(FeatureMask[Callees[BestCandidate]])) {
2670+
++BestCandidate;
2671+
// Start over.
2672+
K = 0;
2673+
} else
2674+
++K;
2675+
}
2676+
return BestCandidate;
2677+
};
2678+
2679+
unsigned BestCandidate =
2680+
AllowExpensiveChecks ? eliminateAvailableFeatures(J) : J;
2681+
// No callee candidate was found for this caller.
2682+
if (BestCandidate == Callees.size())
2683+
continue;
2684+
26542685
LLVM_DEBUG(dbgs() << " Examining "
26552686
<< (CallerIsFMV ? "FMV" : "regular") << " caller "
26562687
<< Caller->getName() << "\n");
26572688

2658-
Function *Callee = Callees[I];
2659-
APInt CallerBits = FeatureMask[Caller];
2689+
Function *Callee = Callees[BestCandidate];
26602690
APInt CalleeBits = FeatureMask[Callee];
26612691

26622692
// Statically resolve calls from the current caller to the current
@@ -2682,8 +2712,8 @@ static bool OptimizeNonTrivialIFuncs(
26822712
// the callee feature bits, advance to the next callee. This effectively
26832713
// prevents considering the current callee as a candidate for static
26842714
// resolution by following callers.
2685-
while (CallerBits.isSubsetOf(FeatureMask[Callees[I]]) &&
2686-
++I < Callees.size())
2715+
while (CallerBits.isSubsetOf(FeatureMask[Callees[J]]) &&
2716+
++J < Callees.size())
26872717
;
26882718
}
26892719
};

llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll

Lines changed: 90 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers)" --version 4
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers|test_known_bits)" --version 4
22

33
; REQUIRES: aarch64-registered-target
44

@@ -14,13 +14,15 @@ $test_non_fmv_caller.resolver = comdat any
1414
$test_priority.resolver = comdat any
1515
$test_alternative_names.resolver = comdat any
1616
$test_unrelated_callers.resolver = comdat any
17+
$test_known_bits.resolver = comdat any
1718
$caller1.resolver = comdat any
1819
$caller2.resolver = comdat any
1920
$caller3.resolver = comdat any
2021
$caller6.resolver = comdat any
2122
$caller7.resolver = comdat any
2223
$caller8.resolver = comdat any
2324
$caller9.resolver = comdat any
25+
$caller11.resolver = comdat any
2426

2527
@__aarch64_cpu_features = external local_unnamed_addr global { i64 }
2628

@@ -31,13 +33,15 @@ $caller9.resolver = comdat any
3133
@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver
3234
@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver
3335
@test_unrelated_callers = weak_odr ifunc i32 (), ptr @test_unrelated_callers.resolver
36+
@test_known_bits = weak_odr ifunc i32 (), ptr @test_known_bits.resolver
3437
@caller1 = weak_odr ifunc i32 (), ptr @caller1.resolver
3538
@caller2 = weak_odr ifunc i32 (), ptr @caller2.resolver
3639
@caller3 = weak_odr ifunc i32 (), ptr @caller3.resolver
3740
@caller6 = weak_odr ifunc i32 (), ptr @caller6.resolver
3841
@caller7 = weak_odr ifunc i32 (), ptr @caller7.resolver
3942
@caller8 = weak_odr ifunc i32 (), ptr @caller8.resolver
4043
@caller9 = weak_odr ifunc i32 (), ptr @caller9.resolver
44+
@caller11 = weak_odr ifunc i32 (), ptr @caller11.resolver
4145

4246
declare void @__init_cpu_features_resolver() local_unnamed_addr
4347

@@ -509,7 +513,7 @@ entry:
509513
define dso_local i32 @caller8._Msve2() #2 {
510514
; CHECK-LABEL: define dso_local i32 @caller8._Msve2(
511515
; CHECK-SAME: ) #[[ATTR2]] {
512-
; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers()
516+
; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Msve2()
513517
;
514518
entry:
515519
%call = tail call i32 @test_unrelated_callers()
@@ -591,6 +595,89 @@ entry:
591595
ret i32 %call
592596
}
593597

598+
declare i32 @test_known_bits._Mmops() #3
599+
declare i32 @test_known_bits._Maes() #6
600+
declare i32 @test_known_bits.default() #0
601+
602+
define weak_odr ptr @test_known_bits.resolver() comdat {
603+
; CHECK-LABEL: define weak_odr ptr @test_known_bits.resolver() comdat {
604+
resolver_entry:
605+
tail call void @__init_cpu_features_resolver()
606+
%0 = load i64, ptr @__aarch64_cpu_features, align 8
607+
%1 = and i64 %0, 576460752303423488
608+
%.not = icmp eq i64 %1, 0
609+
%2 = and i64 %0, 33536
610+
%3 = icmp eq i64 %2, 33536
611+
%test_known_bits._Maes.test_known_bits.default = select i1 %3, ptr @test_known_bits._Maes, ptr @test_known_bits.default
612+
%common.ret.op = select i1 %.not, ptr %test_known_bits._Maes.test_known_bits.default, ptr @test_known_bits._Mmops
613+
ret ptr %common.ret.op
614+
}
615+
616+
define i32 @caller11._MmopsMsve2() #4 {
617+
; CHECK-LABEL: define i32 @caller11._MmopsMsve2(
618+
; CHECK-SAME: ) #[[ATTR4]] {
619+
; CHECK: [[CALL:%.*]] = tail call i32 @test_known_bits._Mmops()
620+
;
621+
entry:
622+
%call = tail call i32 @test_known_bits()
623+
ret i32 %call
624+
}
625+
626+
define i32 @caller11._Msme() #5 {
627+
; CHECK-LABEL: define i32 @caller11._Msme(
628+
; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
629+
; CHECK: [[CALL:%.*]] = tail call i32 @test_known_bits()
630+
;
631+
entry:
632+
%call = tail call i32 @test_known_bits()
633+
ret i32 %call
634+
}
635+
636+
define noundef i32 @caller11._MaesMsve2() #19 {
637+
; CHECK-LABEL: define noundef i32 @caller11._MaesMsve2(
638+
; CHECK-SAME: ) #[[ATTR19:[0-9]+]] {
639+
; CHECK: [[CALL:%.*]] = tail call i32 @test_known_bits._Maes()
640+
;
641+
entry:
642+
%call = tail call i32 @test_known_bits()
643+
ret i32 %call
644+
}
645+
646+
define i32 @caller11.default() #0 {
647+
; CHECK-LABEL: define i32 @caller11.default(
648+
; CHECK-SAME: ) #[[ATTR0]] {
649+
; CHECK: [[CALL:%.*]] = tail call i32 @test_known_bits()
650+
;
651+
entry:
652+
%call = tail call i32 @test_known_bits()
653+
ret i32 %call
654+
}
655+
656+
define weak_odr ptr @caller11.resolver() comdat {
657+
; CHECK-LABEL: define weak_odr ptr @caller11.resolver() comdat {
658+
resolver_entry:
659+
tail call void @__init_cpu_features_resolver()
660+
%0 = load i64, ptr @__aarch64_cpu_features, align 8
661+
%1 = and i64 %0, 576460822096707840
662+
%2 = icmp eq i64 %1, 576460822096707840
663+
br i1 %2, label %common.ret, label %resolver_else
664+
665+
common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry
666+
%common.ret.op = phi ptr [ @caller11._MmopsMsve2, %resolver_entry ], [ @caller11._Msme, %resolver_else ], [ %caller11._MaesMsve2.caller11.default, %resolver_else2 ]
667+
ret ptr %common.ret.op
668+
669+
resolver_else: ; preds = %resolver_entry
670+
%3 = and i64 %0, 4398180795136
671+
%4 = icmp eq i64 %3, 4398180795136
672+
br i1 %4, label %common.ret, label %resolver_else2
673+
674+
resolver_else2: ; preds = %resolver_else
675+
%5 = and i64 %0, 69793317632
676+
%6 = icmp eq i64 %5, 69793317632
677+
%caller11._MaesMsve2.caller11.default = select i1 %6, ptr @caller11._MaesMsve2, ptr @caller11.default
678+
br label %common.ret
679+
}
680+
594681
attributes #0 = { "fmv-features" }
595682
attributes #1 = { "fmv-features"="sve" }
596683
attributes #2 = { "fmv-features"="sve2" }
@@ -610,3 +697,4 @@ attributes #15 = { "fmv-features"="flagm2,frintts" }
610697
attributes #16 = { "fmv-features"="rcpc2" }
611698
attributes #17 = { "fmv-features"="frintts" }
612699
attributes #18 = { "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+sve,+v8a" }
700+
attributes #19 = { "fmv-features"="aes,sve2" }

0 commit comments

Comments
 (0)