Skip to content

Commit bcea769

Browse files
authored
merge main into amd-staging (#622)
2 parents a9c2a06 + 3d3844f commit bcea769

File tree

50 files changed

+923
-163
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+923
-163
lines changed

.github/workflows/release-binaries.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,9 +188,6 @@ jobs:
188188
with:
189189
ref: ${{ needs.prepare.outputs.ref }}
190190

191-
- name: Install Ninja
192-
uses: llvm/actions/install-ninja@5dd955034a6742a2e21d82bf165fcb1050ae7b49 # main
193-
194191
- name: Set Build Prefix
195192
id: setup-stage
196193
shell: bash

flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
#include "mlir/Dialect/OpenACC/OpenACC.h"
1717

1818
namespace fir {
19+
class AddrOfOp;
1920
class DeclareOp;
21+
class GlobalOp;
2022
} // namespace fir
2123

2224
namespace hlfir {
@@ -53,6 +55,18 @@ struct PartialEntityAccessModel<hlfir::DeclareOp>
5355
bool isCompleteView(mlir::Operation *op) const;
5456
};
5557

58+
struct AddressOfGlobalModel
59+
: public mlir::acc::AddressOfGlobalOpInterface::ExternalModel<
60+
AddressOfGlobalModel, fir::AddrOfOp> {
61+
mlir::SymbolRefAttr getSymbol(mlir::Operation *op) const;
62+
};
63+
64+
struct GlobalVariableModel
65+
: public mlir::acc::GlobalVariableOpInterface::ExternalModel<
66+
GlobalVariableModel, fir::GlobalOp> {
67+
bool isConstant(mlir::Operation *op) const;
68+
};
69+
5670
} // namespace fir::acc
5771

5872
#endif // FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_

flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,4 +59,13 @@ bool PartialEntityAccessModel<hlfir::DeclareOp>::isCompleteView(
5959
return !getBaseEntity(op);
6060
}
6161

62+
mlir::SymbolRefAttr AddressOfGlobalModel::getSymbol(mlir::Operation *op) const {
63+
return mlir::cast<fir::AddrOfOp>(op).getSymbolAttr();
64+
}
65+
66+
bool GlobalVariableModel::isConstant(mlir::Operation *op) const {
67+
auto globalOp = mlir::cast<fir::GlobalOp>(op);
68+
return globalOp.getConstant().has_value();
69+
}
70+
6271
} // namespace fir::acc

flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ void registerOpenACCExtensions(mlir::DialectRegistry &registry) {
4949
PartialEntityAccessModel<fir::CoordinateOp>>(*ctx);
5050
fir::DeclareOp::attachInterface<PartialEntityAccessModel<fir::DeclareOp>>(
5151
*ctx);
52+
53+
fir::AddrOfOp::attachInterface<AddressOfGlobalModel>(*ctx);
54+
fir::GlobalOp::attachInterface<GlobalVariableModel>(*ctx);
5255
});
5356

5457
// Register HLFIR operation interfaces

llvm/docs/NVPTXUsage.rst

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -796,6 +796,112 @@ every time. For more information, refer PTX ISA
796796
Membar/Fences
797797
-------------
798798

799+
'``llvm.nvvm.fence.acquire/release.sync_restrict.*``'
800+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
801+
802+
Syntax:
803+
"""""""
804+
805+
.. code-block:: llvm
806+
807+
declare void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster()
808+
declare void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster()
809+
810+
Overview:
811+
"""""""""
812+
813+
The `nvvm.fence.{semantics}.sync_restrict.*` intrinsics restrict the class of memory
814+
operations for which the fence instruction provides the memory ordering guarantees.
815+
When `.sync_restrict` is restricted to `shared_cta`, then memory semantics must
816+
be `release` and the effect of the fence operation only applies to operations
817+
performed on objects in `shared_cta` space. Likewise, when `sync_restrict` is
818+
restricted to `shared_cluster`, then memory semantics must be `acquire` and the
819+
effect of the fence operation only applies to operations performed on objects in
820+
`shared_cluster` memory space. The scope for both operations is `cluster`. For more details,
821+
please refer to the `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar>`__
822+
823+
'``llvm.nvvm.fence.mbarrier_init.release.cluster``'
824+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
825+
826+
Syntax:
827+
"""""""
828+
829+
.. code-block:: llvm
830+
831+
declare void @llvm.nvvm.fence.mbarrier_init.release.cluster()
832+
833+
Overview:
834+
"""""""""
835+
836+
`nvvm.fence.mbarrier_init.release.cluster` intrinsic restricts the class of
837+
memory operations for which the fence instruction provides the memory ordering
838+
guarantees. The `mbarrier_init` modifier restricts the synchronizing effect to
839+
the prior `mbarrier_init` operation executed by the same thread on mbarrier objects
840+
in `shared_cta` memory space. For more details, please refer to the `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar>`__
841+
842+
'``llvm.nvvm.fence.proxy.async_generic.acquire/release.sync_restrict``'
843+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
844+
845+
Syntax:
846+
"""""""
847+
848+
.. code-block:: llvm
849+
850+
declare void @llvm.nvvm.fence.proxy.async.generic.acquire.sync_restrict.space.cluster.scope.cluster()
851+
declare void @llvm.nvvm.fence.proxy.async.generic.release.sync_restrict.space.cta.scope.cluster()
852+
853+
Overview:
854+
"""""""""
855+
856+
`nvvm.fence.proxy.async_generic.{semantics}.sync_restrict` are used to establish
857+
ordering between a prior memory access performed via the `async proxy <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#proxies>`__
858+
and a subsequent memory access performed via the generic proxy.
859+
``nvvm.fence.proxy.async_generic.release.sync_restrict`` can form a release
860+
sequence that synchronizes with an acquire sequence that contains the
861+
``nvvm.fence.proxy.async_generic.acquire.sync_restrict`` proxy fence. When
862+
`.sync_restrict` is restricted to `shared_cta`, then memory semantics must
863+
be `release` and the effect of the fence operation only applies to operations
864+
performed on objects in `shared_cta` space. Likewise, when `sync_restrict` is
865+
restricted to `shared_cluster`, then memory semantics must be `acquire` and the
866+
effect of the fence operation only applies to operations performed on objects in
867+
`shared_cluster` memory space. The scope for both operations is `cluster`.
868+
For more details, please refer to the `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar>`__
869+
870+
'``llvm.nvvm.fence.proxy.<proxykind>``'
871+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
872+
873+
Syntax:
874+
"""""""
875+
876+
.. code-block:: llvm
877+
878+
declare void @llvm.nvvm.fence.proxy.alias()
879+
declare void @llvm.nvvm.fence.proxy.async()
880+
declare void @llvm.nvvm.fence.proxy.async.global()
881+
declare void @llvm.nvvm.fence.proxy.async.shared_cluster()
882+
declare void @llvm.nvvm.fence.proxy.async.shared_cta()
883+
884+
Overview:
885+
"""""""""
886+
887+
`nvvm.fence.proxy.{proxykind}` intrinsics represent a fence with bi-directional
888+
proxy ordering that is established between the memory accesses done between the
889+
`generic proxy <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#proxies>`__
890+
and the proxy specified by `proxykind`. A `bi-directional proxy` ordering between
891+
two proxykinds establishes two `uni-directional` proxy orderings: one from the
892+
first proxykind to the second proxykind and the other from the second proxykind
893+
to the first proxykind.
894+
895+
`alias` proxykind refers to memory accesses performed using virtually aliased
896+
addresses to the same memory location.
897+
898+
`async` proxykind specifies that the memory ordering is established between the
899+
`async proxy` and the `generic proxy`. The memory ordering is limited only to
900+
operations performed on objects in the state space specified (`generic`, `global`,
901+
`shared_cluster`, `shared_cta`). If no state space is specified, then the memory
902+
ordering applies to all state spaces. For more details, please refer to the
903+
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar>`__
904+
799905
'``llvm.nvvm.fence.proxy.tensormap_generic.*``'
800906
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
801907

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,32 @@ struct HardwareLoopInfo {
123123
LLVM_ABI bool canAnalyze(LoopInfo &LI);
124124
};
125125

126+
/// Information for memory intrinsic cost model.
127+
class MemIntrinsicCostAttributes {
128+
/// Vector type of the data to be loaded or stored.
129+
Type *DataTy = nullptr;
130+
131+
/// ID of the memory intrinsic.
132+
Intrinsic::ID IID;
133+
134+
/// Address space of the pointer.
135+
unsigned AddressSpace = 0;
136+
137+
/// Alignment of single element.
138+
Align Alignment;
139+
140+
public:
141+
LLVM_ABI MemIntrinsicCostAttributes(Intrinsic::ID Id, Type *DataTy,
142+
Align Alignment, unsigned AddressSpace)
143+
: DataTy(DataTy), IID(Id), AddressSpace(AddressSpace),
144+
Alignment(Alignment) {}
145+
146+
Intrinsic::ID getID() const { return IID; }
147+
Type *getDataType() const { return DataTy; }
148+
unsigned getAddressSpace() const { return AddressSpace; }
149+
Align getAlignment() const { return Alignment; }
150+
};
151+
126152
class IntrinsicCostAttributes {
127153
const IntrinsicInst *II = nullptr;
128154
Type *RetTy = nullptr;
@@ -1556,7 +1582,7 @@ class TargetTransformInfo {
15561582

15571583
/// \return The cost of masked Load and Store instructions.
15581584
LLVM_ABI InstructionCost getMaskedMemoryOpCost(
1559-
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
1585+
const MemIntrinsicCostAttributes &MICA,
15601586
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
15611587

15621588
/// \return The cost of Gather or Scatter operation

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -842,8 +842,7 @@ class TargetTransformInfoImplBase {
842842
}
843843

844844
virtual InstructionCost
845-
getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
846-
unsigned AddressSpace,
845+
getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
847846
TTI::TargetCostKind CostKind) const {
848847
return 1;
849848
}

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,9 +1558,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
15581558
}
15591559

15601560
InstructionCost
1561-
getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment,
1562-
unsigned AddressSpace,
1561+
getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
15631562
TTI::TargetCostKind CostKind) const override {
1563+
Type *DataTy = MICA.getDataType();
1564+
Align Alignment = MICA.getAlignment();
1565+
unsigned Opcode = MICA.getID() == Intrinsic::masked_load
1566+
? Instruction::Load
1567+
: Instruction::Store;
15641568
// TODO: Pass on AddressSpace when we have test coverage.
15651569
return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
15661570
CostKind);
@@ -1617,10 +1621,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
16171621

16181622
// Firstly, the cost of load/store operation.
16191623
InstructionCost Cost;
1620-
if (UseMaskForCond || UseMaskForGaps)
1621-
Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1622-
AddressSpace, CostKind);
1623-
else
1624+
if (UseMaskForCond || UseMaskForGaps) {
1625+
unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load
1626+
: Intrinsic::masked_store;
1627+
Cost = thisT()->getMaskedMemoryOpCost(
1628+
{IID, VecTy, Alignment, AddressSpace}, CostKind);
1629+
} else
16241630
Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
16251631
CostKind);
16261632

@@ -2403,14 +2409,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
24032409
case Intrinsic::masked_store: {
24042410
Type *Ty = Tys[0];
24052411
Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2406-
return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2407-
CostKind);
2412+
return thisT()->getMaskedMemoryOpCost({IID, Ty, TyAlign, 0}, CostKind);
24082413
}
24092414
case Intrinsic::masked_load: {
24102415
Type *Ty = RetTy;
24112416
Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2412-
return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2413-
CostKind);
2417+
return thisT()->getMaskedMemoryOpCost({IID, Ty, TyAlign, 0}, CostKind);
24142418
}
24152419
case Intrinsic::experimental_vp_strided_store: {
24162420
auto *Ty = cast<VectorType>(ICA.getArgTypes()[0]);

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1746,33 +1746,65 @@ let TargetPrefix = "nvvm" in {
17461746
def int_nvvm_barrier_cluster_wait_aligned : Intrinsic<[]>;
17471747
}
17481748

1749-
//
1750-
// Membar
1751-
//
1752-
let IntrProperties = [IntrNoCallback] in {
1749+
//
1750+
// Membar / Fence
1751+
//
1752+
let IntrProperties = [IntrNoCallback] in {
17531753
def int_nvvm_membar_cta : NVVMBuiltin, Intrinsic<[]>;
17541754
def int_nvvm_membar_gl : NVVMBuiltin, Intrinsic<[]>;
17551755
def int_nvvm_membar_sys : NVVMBuiltin, Intrinsic<[]>;
17561756
def int_nvvm_fence_sc_cluster : Intrinsic<[]>;
1757-
}
17581757

1759-
//
1760-
// Proxy fence (uni-directional)
1761-
//
1758+
// Operation fence
1759+
def int_nvvm_fence_mbarrier_init_release_cluster: Intrinsic<[], [], [],
1760+
"llvm.nvvm.fence.mbarrier_init.release.cluster">;
1761+
1762+
// Thread fence
1763+
def int_nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster :
1764+
Intrinsic<[], [], [],
1765+
"llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster">;
1766+
1767+
def int_nvvm_fence_release_sync_restrict_space_cta_scope_cluster :
1768+
Intrinsic<[], [], [],
1769+
"llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster">;
1770+
1771+
//
1772+
// Proxy fence (uni-directional)
1773+
//
1774+
1775+
def int_nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster :
1776+
Intrinsic<[], [], [],
1777+
"llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster">;
1778+
1779+
def int_nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster :
1780+
Intrinsic<[], [], [],
1781+
"llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster">;
1782+
17621783
foreach scope = ["cta", "cluster", "gpu", "sys"] in {
17631784

17641785
def int_nvvm_fence_proxy_tensormap_generic_release_ # scope :
1765-
Intrinsic<[], [], [IntrNoCallback],
1786+
Intrinsic<[], [], [],
17661787
"llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>;
17671788

17681789
// The imm-arg 'size' can only be 128.
17691790
def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope :
1770-
Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty],
1771-
[IntrNoCallback, IntrArgMemOnly, ImmArg<ArgIndex<1>>,
1772-
Range<ArgIndex<1>, 128, 129>],
1773-
"llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope>;
1791+
Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [],
1792+
"llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope> {
1793+
let IntrProperties = [IntrNoCallback, IntrArgMemOnly,
1794+
ImmArg<ArgIndex<1>>, Range<ArgIndex<1>, 128, 129>];
1795+
}
17741796
}
17751797

1798+
//
1799+
// Proxy fence (bi-directional)
1800+
//
1801+
foreach proxykind = ["alias", "async", "async.global", "async.shared_cta",
1802+
"async.shared_cluster"] in {
1803+
defvar Intr = IntrinsicName<"llvm.nvvm.fence.proxy." # proxykind>;
1804+
def Intr.record_name: Intrinsic<[], [], [], Intr.intr_name>;
1805+
}
1806+
}
1807+
17761808
//
17771809
// Async Copy
17781810
//

0 commit comments

Comments
 (0)