@@ -27096,21 +27096,37 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
                              : AtomicExpansionKind::LLSC;
 }
 
+// Return true if the atomic operation expansion will lower to use a library
+// call, and is thus ineligible to use an LLSC expansion.
+static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
+                                   const AtomicRMWInst *RMW) {
+  if (!RMW->isFloatingPointOperation())
+    return false;
+  switch (RMW->getType()->getScalarType()->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+  case Type::HalfTyID:
+  case Type::BFloatTyID:
+    // Will use soft float
+    return !Subtarget.hasFPARMv8();
+  default:
+    // fp128 will emit library calls.
+    return true;
+  }
+
+  llvm_unreachable("covered type switch");
+}
+
 // The "default" for integer RMW operations is to expand to an LL/SC loop.
 // However, with the LSE instructions (or outline-atomics mode, which provides
 // library routines in place of the LSE-instructions), we can directly emit many
 // operations instead.
-//
-// Floating-point operations are always emitted to a cmpxchg loop, because they
-// may trigger a trap which aborts an LLSC sequence.
 TargetLowering::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
-  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+  Type *Ty = AI->getType();
+  unsigned Size = Ty->getPrimitiveSizeInBits();
   assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
 
-  if (AI->isFloatingPointOperation())
-    return AtomicExpansionKind::CmpXChg;
-
   bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
                       (AI->getOperation() == AtomicRMWInst::Xchg ||
                        AI->getOperation() == AtomicRMWInst::Or ||
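The removed comment above stated the old rule: floating-point RMW operations were unconditionally expanded to a cmpxchg loop. The new `rmwOpMayLowerToLibcall` helper makes the decision per type: FP atomics whose arithmetic lowers to a native instruction can now use the LL/SC expansion, and only the libcall-bound cases (half/bfloat/float/double under soft float, and fp128 always) must avoid it, since a function call between the load-exclusive and store-exclusive can clobber the exclusive monitor. Below is a standalone sketch of the same classification, parameterized on the FP feature bit so it runs outside the backend; the names and the `main` driver are ours, not part of the patch:

```cpp
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cstdio>

using namespace llvm;

// Mirrors the patch's switch; assumes Ty comes from a floating-point
// atomicrmw (the real helper bails out first for integer operations).
static bool mayLowerToLibcall(Type *Ty, bool HasFPARMv8) {
  switch (Ty->getScalarType()->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
  case Type::HalfTyID:
  case Type::BFloatTyID:
    return !HasFPARMv8; // soft float: fadd becomes a call such as __addsf3
  default:
    return true; // fp128 arithmetic is always a compiler-rt call (__addtf3)
  }
}

int main() {
  LLVMContext Ctx;
  // With FP registers, float stays native and remains LLSC-eligible (0)...
  printf("float, +fp: %d\n", mayLowerToLibcall(Type::getFloatTy(Ctx), true));
  // ...without them it is libcall-bound (1), as is fp128 either way.
  printf("float, -fp: %d\n", mayLowerToLibcall(Type::getFloatTy(Ctx), false));
  printf("fp128, +fp: %d\n", mayLowerToLibcall(Type::getFP128Ty(Ctx), true));
}
```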
@@ -27120,7 +27136,8 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
 
   // Nand is not supported in LSE.
   // Leave 128 bits to LLSC or CmpXChg.
-  if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
+  if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
+      !AI->isFloatingPointOperation()) {
     if (Subtarget->hasLSE())
       return AtomicExpansionKind::None;
     if (Subtarget->outlineAtomics()) {
@@ -27146,7 +27163,7 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
   // we have a single CAS instruction that can replace the loop.
   if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
-      Subtarget->hasLSE())
+      Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
     return AtomicExpansionKind::CmpXChg;
 
   return AtomicExpansionKind::LLSC;
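`shouldExpandAtomicRMWInIR` now routes FP atomics in two steps: the fast path above is gated on `!AI->isFloatingPointOperation()` because LSE and the outline-atomics routines only cover integer RMW operations, and here any libcall-bound operation joins the -O0 and LSE cases in taking `CmpXChg`. The point of the CAS shape is that the (possibly soft-float) arithmetic runs outside any exclusive region. A rough C++ analogue of the loop the cmpxchg expansion produces, for illustration only, not the pass's literal output:

```cpp
#include <atomic>

// Sketch of the cmpxchg-loop shape for 'atomicrmw fadd' on a soft-float
// target: the add, which may itself be a libcall (e.g. __addsf3), executes
// between atomic accesses rather than inside an LDXR/STXR pair, so no call
// can invalidate an exclusive monitor mid-sequence.
float atomicFAddViaCAS(std::atomic<float> &P, float V) {
  float Old = P.load(std::memory_order_relaxed);
  // On failure, compare_exchange_weak reloads Old, and the add is redone
  // with the fresh value.
  while (!P.compare_exchange_weak(Old, Old + V))
    ;
  return Old; // atomicrmw yields the value before the operation
}
```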
@@ -27193,10 +27210,14 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
 
     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
-    Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
-    Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
-    return Builder.CreateOr(
-        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
+
+    auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
+    Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
+    Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
+
+    Value *Or = Builder.CreateOr(
+        Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
+    return Builder.CreateBitCast(Or, ValueTy);
   }
 
   Type *Tys[] = { Addr->getType() };
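In `emitLoadLinked`, the 128-bit path previously built the result directly in `ValueTy`, which was fine when that type was always `i128`; with FP atomics it may now be `fp128`, on which `zext`, `shl`, and `or` are not legal, so the halves are reassembled in an explicit `i128` and only the final `bitcast` reintroduces the value type. Condensed into a standalone helper, as a sketch under the assumption that `Lo` and `Hi` are the two `i64` results of `ldaxp`/`ldxp`:

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Joins the i64 halves returned by ldaxp/ldxp into a single value of
// ValueTy; the bitcast folds to a no-op when ValueTy is already i128.
static Value *joinExclusiveHalves(IRBuilderBase &B, Value *Lo, Value *Hi,
                                  Type *ValueTy) {
  Type *Int128Ty = Type::getInt128Ty(B.getContext());
  Lo = B.CreateZExt(Lo, Int128Ty, "lo64");
  Hi = B.CreateZExt(Hi, Int128Ty, "hi64");
  Value *Or = B.CreateOr(
      Lo, B.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
  return B.CreateBitCast(Or, ValueTy);
}
```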
@@ -27207,8 +27228,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
   const DataLayout &DL = M->getDataLayout();
   IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
   CallInst *CI = Builder.CreateCall(Ldxr, Addr);
-  CI->addParamAttr(
-      0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
+  CI->addParamAttr(0, Attribute::get(Builder.getContext(),
+                                     Attribute::ElementType, IntEltTy));
   Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
 
   return Builder.CreateBitCast(Trunc, ValueTy);
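The sub-128-bit path gets a companion fix: `llvm.aarch64.ldxr` is typed only by its pointer operand, so the backend learns the access width from the `elementtype` parameter attribute, and that must name the integer type actually loaded (`IntEltTy`) rather than `ValueTy`, which can now be a floating-point type. Pulled together for the concrete case of a `float`, the path looks roughly like this; a sketch, with the helper name ours:

```cpp
#include "llvm/IR/Attributes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Emits a load-exclusive of a float: the intrinsic call and truncation stay
// in the integer domain (i32 here); only the last bitcast is FP-typed.
static Value *loadLinkedFloat(IRBuilderBase &B, Module *M, Value *Addr) {
  const DataLayout &DL = M->getDataLayout();
  Type *FloatTy = B.getFloatTy();
  IntegerType *IntEltTy = B.getIntNTy(DL.getTypeSizeInBits(FloatTy)); // i32
  Function *Ldxr = Intrinsic::getDeclaration(M, Intrinsic::aarch64_ldxr,
                                             {Addr->getType()});
  CallInst *CI = B.CreateCall(Ldxr, Addr); // declared as returning i64
  // Annotate the width of the exclusive access; elementtype(float) would
  // misdescribe the i32 load the intrinsic actually performs.
  CI->addParamAttr(0, Attribute::get(B.getContext(),
                                     Attribute::ElementType, IntEltTy));
  return B.CreateBitCast(B.CreateTrunc(CI, IntEltTy), FloatTy);
}
```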
@@ -27234,9 +27255,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
         IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
     Function *Stxr = Intrinsic::getDeclaration(M, Int);
     Type *Int64Ty = Type::getInt64Ty(M->getContext());
+    Type *Int128Ty = Type::getInt128Ty(M->getContext());
+
+    Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
 
-    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
-    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
+    Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
+    Value *Hi =
+        Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
     return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
   }
 
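`emitStoreConditional` is the mirror image: the incoming value is bitcast to `i128` first, so the `lshr`/`trunc` pair that splits it into the two `i64` halves for `stlxp`/`stxp` stays in the integer domain even when `Val` is `fp128`. The split in isolation, as a sketch; the identity bitcast again folds away for `i128` inputs:

```cpp
#include "llvm/IR/IRBuilder.h"
#include <utility>

using namespace llvm;

// Splits a 128-bit value (i128 or fp128) into the {lo, hi} i64 halves
// expected by the stxp/stlxp intrinsics.
static std::pair<Value *, Value *> splitTo64BitHalves(IRBuilderBase &B,
                                                      Value *Val) {
  Type *Int64Ty = B.getInt64Ty();
  Type *Int128Ty = Type::getInt128Ty(B.getContext());
  Value *CastVal = B.CreateBitCast(Val, Int128Ty); // no-op for i128 inputs
  Value *Lo = B.CreateTrunc(CastVal, Int64Ty, "lo");
  Value *Hi = B.CreateTrunc(B.CreateLShr(CastVal, 64), Int64Ty, "hi");
  return {Lo, Hi};
}
```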