//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements the memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};
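
// Note: the SIAtomicScope enumerators below are ordered from narrowest to
// widest scope. SIMemOpInfo relies on this ordering when it narrows a scope
// with std::min.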

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;
  bool IsLastUse = false;

  SIMemOpInfo(
      AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
      SIAtomicScope Scope = SIAtomicScope::SYSTEM,
      SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
      SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
      bool IsCrossAddressSpaceOrdering = true,
      AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
      bool IsVolatile = false, bool IsNonTemporal = false,
      bool IsLastUse = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
        OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
        IsLastUse(IsLastUse) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is last use, false otherwise.
  bool isLastUse() const { return IsLastUse; }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  const AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns The bit set of SI atomic address spaces corresponding to the
  /// LLVM address space \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions, using the machine module info \p MMI.
  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or
  /// nontemporal/last-use. Return true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal,
                                              bool IsLastUse = false) const = 0;

  /// Hook for any additional handling a target needs for system-scope stores;
  /// the default is a no-op. Returns true iff \p MI was modified.
  virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
    return false;
  }

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

  /// Attempts to force the SC0/SC1 cache-policy bits on store \p MI where the
  /// subtarget requires it; the default is a no-op. Returns true iff \p MI was
  /// modified.
  virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) const {
    return false;
  }
};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
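
// The concrete cache controls below form a hierarchy that mirrors the
// hardware generations; SICacheControl::create() selects the most specific
// implementation for the subtarget:
//
//   SICacheControl
//     SIGfx6CacheControl
//       SIGfx7CacheControl
//         SIGfx90ACacheControl
//           SIGfx940CacheControl
//         SIGfx10CacheControl
//           SIGfx11CacheControl
//             SIGfx12CacheControl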
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:

  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                           MachineBasicBlock::iterator &MI) const override {
    bool Changed = false;
    if (ST.hasForceStoreSC0SC1() &&
        (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
                                    SIAtomicAddrSpace::GLOBAL |
                                    SIAtomicAddrSpace::OTHER)) !=
            SIAtomicAddrSpace::NONE) {
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
    }
    return Changed;
  }
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;
};

class SIGfx12CacheControl : public SIGfx11CacheControl {
protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
  // \returns Returns true if \p MI is modified, false otherwise.
  bool setTH(const MachineBasicBlock::iterator MI,
             AMDGPU::CPol::CPol Value) const;
  // Sets Scope policy to \p Value if CPol operand is present in instruction \p
  // MI. \returns Returns true if \p MI is modified, false otherwise.
  bool setScope(const MachineBasicBlock::iterator MI,
                AMDGPU::CPol::CPol Value) const;

  // Stores with system scope (SCOPE_SYS) need to wait for:
  // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
  // - non-returning-atomics - wait for STORECNT==0
  // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
  // since it does not distinguish atomics-with-return from regular stores.
  // There is no need to wait if memory is cached (mtype != UC).
  bool
  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;

  // Sets the CPol scope bits on \p MI to match \p Scope for accesses to
  // \p AddrSpace. \returns Returns true if \p MI is modified, false otherwise.
  bool setAtomicScope(const MachineBasicBlock::iterator &MI,
                      SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;

public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override {
    return setAtomicScope(MI, Scope, AddrSpace);
  }
};
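
// High-level flow (see SIMemoryLegalizer::runOnMachineFunction): each memory
// instruction is summarized as a SIMemOpInfo via SIMemOpAccess, and the
// SICacheControl callbacks above are then used to set cache-policy bits and
// to insert the waits, invalidates, and writebacks the memory model requires.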

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

static const StringMap<SIAtomicAddrSpace> ASNames = {{
    {"global", SIAtomicAddrSpace::GLOBAL},
    {"local", SIAtomicAddrSpace::LDS},
}};

void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
  const MachineFunction *MF = MI.getMF();
  const Function &Fn = MF->getFunction();
  SmallString<128> Str;
  raw_svector_ostream OS(Str);
  OS << "unknown address space '" << AS << "'; expected one of ";
  ListSeparator LS;
  for (const auto &[Name, Val] : ASNames)
    OS << LS << '\'' << Name << '\'';
  DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
  Fn.getContext().diagnose(BadTag);
}

/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
/// If this tag isn't present, or if it has no meaningful values, returns \p
/// Default. Otherwise returns all the address spaces concerned by the MMRA.
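///
/// Illustrative example (hand-written here, with assumed MMRA IR syntax; not
/// taken from this file): a fence such as
/// \code
///   fence syncscope("agent") release, !mmra !0
///   !0 = !{!"amdgpu-as", !"local"}
/// \endcode
/// would be narrowed to order only the LDS address space, per ASNames above.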
static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
                                               SIAtomicAddrSpace Default) {
  static constexpr StringLiteral FenceASPrefix = "amdgpu-as";

  auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
  if (!MMRA)
    return Default;

  SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
  for (const auto &[Prefix, Suffix] : MMRA) {
    if (Prefix != FenceASPrefix)
      continue;

    if (auto It = ASNames.find(Suffix); It != ASNames.end())
      Result |= It->second;
    else
      diagnoseUnknownMMRAASName(MI, Suffix);
  }

  return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
}

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getAgentSSID())
    return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getWavefrontSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == SyncScope::SingleThread)
    return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SYSTEM,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::AGENT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WORKGROUP,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SINGLETHREAD,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  return std::nullopt;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
    : MMI(&MMI_) {}

std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;
  bool IsLastUse = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    IsLastUse |= MMO->getFlags() & MOLastUse;
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal, IsLastUse);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return std::nullopt;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return std::nullopt;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      *ScopeOrNone;

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return std::nullopt;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
9430b57cec5SDimitry Andric if (MI->getNumMemOperands() == 0) 9440b57cec5SDimitry Andric return SIMemOpInfo(); 9450b57cec5SDimitry Andric 9460b57cec5SDimitry Andric return constructFromMIWithMMO(MI); 9470b57cec5SDimitry Andric } 9480b57cec5SDimitry Andric 949e8d8bef9SDimitry Andric SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { 9500b57cec5SDimitry Andric TII = ST.getInstrInfo(); 9510b57cec5SDimitry Andric IV = getIsaVersion(ST.getCPU()); 952e8d8bef9SDimitry Andric InsertCacheInv = !AmdgcnSkipCacheInvalidations; 9530b57cec5SDimitry Andric } 9540b57cec5SDimitry Andric 955fe6060f1SDimitry Andric bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, 956fe6060f1SDimitry Andric AMDGPU::CPol::CPol Bit) const { 957fe6060f1SDimitry Andric MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); 958fe6060f1SDimitry Andric if (!CPol) 959fe6060f1SDimitry Andric return false; 960fe6060f1SDimitry Andric 961fe6060f1SDimitry Andric CPol->setImm(CPol->getImm() | Bit); 962fe6060f1SDimitry Andric return true; 963fe6060f1SDimitry Andric } 964fe6060f1SDimitry Andric 9650b57cec5SDimitry Andric /* static */ 9660b57cec5SDimitry Andric std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { 9670b57cec5SDimitry Andric GCNSubtarget::Generation Generation = ST.getGeneration(); 96881ad6265SDimitry Andric if (ST.hasGFX940Insts()) 96981ad6265SDimitry Andric return std::make_unique<SIGfx940CacheControl>(ST); 970fe6060f1SDimitry Andric if (ST.hasGFX90AInsts()) 971fe6060f1SDimitry Andric return std::make_unique<SIGfx90ACacheControl>(ST); 9720b57cec5SDimitry Andric if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) 9738bcb0991SDimitry Andric return std::make_unique<SIGfx6CacheControl>(ST); 9740b57cec5SDimitry Andric if (Generation < AMDGPUSubtarget::GFX10) 9758bcb0991SDimitry Andric return std::make_unique<SIGfx7CacheControl>(ST); 97681ad6265SDimitry Andric if (Generation < AMDGPUSubtarget::GFX11) 977e8d8bef9SDimitry Andric return std::make_unique<SIGfx10CacheControl>(ST); 9781db9f3b2SDimitry Andric if (Generation < AMDGPUSubtarget::GFX12) 97981ad6265SDimitry Andric return std::make_unique<SIGfx11CacheControl>(ST); 9801db9f3b2SDimitry Andric return std::make_unique<SIGfx12CacheControl>(ST); 9810b57cec5SDimitry Andric } 9820b57cec5SDimitry Andric 9830b57cec5SDimitry Andric bool SIGfx6CacheControl::enableLoadCacheBypass( 9840b57cec5SDimitry Andric const MachineBasicBlock::iterator &MI, 9850b57cec5SDimitry Andric SIAtomicScope Scope, 9860b57cec5SDimitry Andric SIAtomicAddrSpace AddrSpace) const { 9870b57cec5SDimitry Andric assert(MI->mayLoad() && !MI->mayStore()); 9880b57cec5SDimitry Andric bool Changed = false; 9890b57cec5SDimitry Andric 9900b57cec5SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 9910b57cec5SDimitry Andric switch (Scope) { 9920b57cec5SDimitry Andric case SIAtomicScope::SYSTEM: 9930b57cec5SDimitry Andric case SIAtomicScope::AGENT: 9944824e7fdSDimitry Andric // Set L1 cache policy to MISS_EVICT. 9954824e7fdSDimitry Andric // Note: there is no L2 cache bypass policy at the ISA level. 9960b57cec5SDimitry Andric Changed |= enableGLCBit(MI); 9970b57cec5SDimitry Andric break; 9980b57cec5SDimitry Andric case SIAtomicScope::WORKGROUP: 9990b57cec5SDimitry Andric case SIAtomicScope::WAVEFRONT: 10000b57cec5SDimitry Andric case SIAtomicScope::SINGLETHREAD: 10010b57cec5SDimitry Andric // No cache to bypass. 
10020b57cec5SDimitry Andric break;
10030b57cec5SDimitry Andric default:
10040b57cec5SDimitry Andric llvm_unreachable("Unsupported synchronization scope");
10050b57cec5SDimitry Andric }
10060b57cec5SDimitry Andric }
10070b57cec5SDimitry Andric
10080b57cec5SDimitry Andric /// The scratch address space does not need the global memory caches
10090b57cec5SDimitry Andric /// to be bypassed as all memory operations by the same thread are
10100b57cec5SDimitry Andric /// sequentially consistent, and no other thread can access scratch
10110b57cec5SDimitry Andric /// memory.
10120b57cec5SDimitry Andric
1013e8d8bef9SDimitry Andric /// Other address spaces do not have a cache.
10140b57cec5SDimitry Andric
10150b57cec5SDimitry Andric return Changed;
10160b57cec5SDimitry Andric }
10170b57cec5SDimitry Andric
1018fe6060f1SDimitry Andric bool SIGfx6CacheControl::enableStoreCacheBypass(
1019fe6060f1SDimitry Andric const MachineBasicBlock::iterator &MI,
1020fe6060f1SDimitry Andric SIAtomicScope Scope,
1021fe6060f1SDimitry Andric SIAtomicAddrSpace AddrSpace) const {
1022fe6060f1SDimitry Andric assert(!MI->mayLoad() && MI->mayStore());
1023fe6060f1SDimitry Andric bool Changed = false;
1024fe6060f1SDimitry Andric
1025fe6060f1SDimitry Andric /// The L1 cache is write-through, so it does not need to be bypassed. There is
1026fe6060f1SDimitry Andric /// no bypass control for the L2 cache at the ISA level.
1027fe6060f1SDimitry Andric
1028fe6060f1SDimitry Andric return Changed;
1029fe6060f1SDimitry Andric }
1030fe6060f1SDimitry Andric
1031fe6060f1SDimitry Andric bool SIGfx6CacheControl::enableRMWCacheBypass(
1032fe6060f1SDimitry Andric const MachineBasicBlock::iterator &MI,
1033fe6060f1SDimitry Andric SIAtomicScope Scope,
1034fe6060f1SDimitry Andric SIAtomicAddrSpace AddrSpace) const {
1035fe6060f1SDimitry Andric assert(MI->mayLoad() && MI->mayStore());
1036fe6060f1SDimitry Andric bool Changed = false;
1037fe6060f1SDimitry Andric
10384824e7fdSDimitry Andric /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
10394824e7fdSDimitry Andric /// bypassed, and the GLC bit is instead used to indicate if they are
10404824e7fdSDimitry Andric /// return or no-return.
10414824e7fdSDimitry Andric /// Note: there is no L2 cache coherent bypass control at the ISA level.
1042fe6060f1SDimitry Andric
1043fe6060f1SDimitry Andric return Changed;
1044fe6060f1SDimitry Andric }
1045fe6060f1SDimitry Andric
1046e8d8bef9SDimitry Andric bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1047e8d8bef9SDimitry Andric MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1048*0fca6ea1SDimitry Andric bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1049e8d8bef9SDimitry Andric // Only handle load and store, not atomic read-modify-write instructions. The
1050e8d8bef9SDimitry Andric // latter use glc to indicate if the atomic returns a result and so must not
1051e8d8bef9SDimitry Andric // be used for cache control.
10520b57cec5SDimitry Andric assert(MI->mayLoad() ^ MI->mayStore());
1053e8d8bef9SDimitry Andric
1054e8d8bef9SDimitry Andric // Only update load and store, not LLVM IR atomic read-modify-write
1055e8d8bef9SDimitry Andric // instructions. The latter are always marked as volatile, so we cannot
1056e8d8bef9SDimitry Andric // sensibly handle their volatility, as we do not want to pessimize all
1057e8d8bef9SDimitry Andric // atomics. They also do not support the nontemporal attribute.
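// Illustrative sketch (hand-written for exposition, not output copied from a
// test): a volatile global load reaching this point on GFX6 ends up roughly as
//   BUFFER_LOAD_DWORD ... glc   ; L1 policy MISS_EVICT
//   S_WAITCNT vmcnt(0)          ; completed at system scope
// while a volatile store only gets the trailing wait, since GFX6 stores write
// through the L1 anyway.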
1058e8d8bef9SDimitry Andric assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 1059e8d8bef9SDimitry Andric 10600b57cec5SDimitry Andric bool Changed = false; 10610b57cec5SDimitry Andric 1062e8d8bef9SDimitry Andric if (IsVolatile) { 10634824e7fdSDimitry Andric // Set L1 cache policy to be MISS_EVICT for load instructions 10644824e7fdSDimitry Andric // and MISS_LRU for store instructions. 10654824e7fdSDimitry Andric // Note: there is no L2 cache bypass policy at the ISA level. 1066e8d8bef9SDimitry Andric if (Op == SIMemOp::LOAD) 10670b57cec5SDimitry Andric Changed |= enableGLCBit(MI); 1068e8d8bef9SDimitry Andric 1069e8d8bef9SDimitry Andric // Ensure operation has completed at system scope to cause all volatile 1070e8d8bef9SDimitry Andric // operations to be visible outside the program in a global order. Do not 1071e8d8bef9SDimitry Andric // request cross address space as only the global address space can be 1072e8d8bef9SDimitry Andric // observable outside the program, so no need to cause a waitcnt for LDS 1073e8d8bef9SDimitry Andric // address space operations. 1074e8d8bef9SDimitry Andric Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1075e8d8bef9SDimitry Andric Position::AFTER); 10760b57cec5SDimitry Andric 10770b57cec5SDimitry Andric return Changed; 10780b57cec5SDimitry Andric } 10790b57cec5SDimitry Andric 1080e8d8bef9SDimitry Andric if (IsNonTemporal) { 10814824e7fdSDimitry Andric // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT 10824824e7fdSDimitry Andric // for both loads and stores, and the L2 cache policy to STREAM. 1083e8d8bef9SDimitry Andric Changed |= enableGLCBit(MI); 1084e8d8bef9SDimitry Andric Changed |= enableSLCBit(MI); 1085e8d8bef9SDimitry Andric return Changed; 1086e8d8bef9SDimitry Andric } 1087e8d8bef9SDimitry Andric 1088e8d8bef9SDimitry Andric return Changed; 1089e8d8bef9SDimitry Andric } 1090e8d8bef9SDimitry Andric 1091e8d8bef9SDimitry Andric bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, 1092e8d8bef9SDimitry Andric SIAtomicScope Scope, 1093e8d8bef9SDimitry Andric SIAtomicAddrSpace AddrSpace, 1094e8d8bef9SDimitry Andric SIMemOp Op, 1095e8d8bef9SDimitry Andric bool IsCrossAddrSpaceOrdering, 1096e8d8bef9SDimitry Andric Position Pos) const { 1097e8d8bef9SDimitry Andric bool Changed = false; 1098e8d8bef9SDimitry Andric 1099e8d8bef9SDimitry Andric MachineBasicBlock &MBB = *MI->getParent(); 1100e8d8bef9SDimitry Andric DebugLoc DL = MI->getDebugLoc(); 1101e8d8bef9SDimitry Andric 1102e8d8bef9SDimitry Andric if (Pos == Position::AFTER) 1103e8d8bef9SDimitry Andric ++MI; 1104e8d8bef9SDimitry Andric 1105e8d8bef9SDimitry Andric bool VMCnt = false; 1106e8d8bef9SDimitry Andric bool LGKMCnt = false; 1107e8d8bef9SDimitry Andric 1108e8d8bef9SDimitry Andric if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 1109e8d8bef9SDimitry Andric SIAtomicAddrSpace::NONE) { 1110e8d8bef9SDimitry Andric switch (Scope) { 1111e8d8bef9SDimitry Andric case SIAtomicScope::SYSTEM: 1112e8d8bef9SDimitry Andric case SIAtomicScope::AGENT: 1113e8d8bef9SDimitry Andric VMCnt |= true; 1114e8d8bef9SDimitry Andric break; 1115e8d8bef9SDimitry Andric case SIAtomicScope::WORKGROUP: 1116e8d8bef9SDimitry Andric case SIAtomicScope::WAVEFRONT: 1117e8d8bef9SDimitry Andric case SIAtomicScope::SINGLETHREAD: 1118e8d8bef9SDimitry Andric // The L1 cache keeps all memory operations in order for 1119e8d8bef9SDimitry Andric // wavefronts in the same work-group. 
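// (So, for example, a workgroup-scope release on GFX6 needs no
// "S_WAITCNT vmcnt(0)"; only agent and system scope set VMCnt above.)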
1120e8d8bef9SDimitry Andric break;
1121e8d8bef9SDimitry Andric default:
1122e8d8bef9SDimitry Andric llvm_unreachable("Unsupported synchronization scope");
1123e8d8bef9SDimitry Andric }
1124e8d8bef9SDimitry Andric }
1125e8d8bef9SDimitry Andric
1126e8d8bef9SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1127e8d8bef9SDimitry Andric switch (Scope) {
1128e8d8bef9SDimitry Andric case SIAtomicScope::SYSTEM:
1129e8d8bef9SDimitry Andric case SIAtomicScope::AGENT:
1130e8d8bef9SDimitry Andric case SIAtomicScope::WORKGROUP:
1131e8d8bef9SDimitry Andric // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1132e8d8bef9SDimitry Andric // not needed as LDS operations for all waves are executed in a total
1133e8d8bef9SDimitry Andric // global ordering as observed by all waves. Required if also
1134e8d8bef9SDimitry Andric // synchronizing with global/GDS memory as LDS operations could be
1135e8d8bef9SDimitry Andric // reordered with respect to later global/GDS memory operations of the
1136e8d8bef9SDimitry Andric // same wave.
1137e8d8bef9SDimitry Andric LGKMCnt |= IsCrossAddrSpaceOrdering;
1138e8d8bef9SDimitry Andric break;
1139e8d8bef9SDimitry Andric case SIAtomicScope::WAVEFRONT:
1140e8d8bef9SDimitry Andric case SIAtomicScope::SINGLETHREAD:
1141e8d8bef9SDimitry Andric // The LDS keeps all memory operations in order for
114281ad6265SDimitry Andric // the same wavefront.
1143e8d8bef9SDimitry Andric break;
1144e8d8bef9SDimitry Andric default:
1145e8d8bef9SDimitry Andric llvm_unreachable("Unsupported synchronization scope");
1146e8d8bef9SDimitry Andric }
1147e8d8bef9SDimitry Andric }
1148e8d8bef9SDimitry Andric
1149e8d8bef9SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1150e8d8bef9SDimitry Andric switch (Scope) {
1151e8d8bef9SDimitry Andric case SIAtomicScope::SYSTEM:
1152e8d8bef9SDimitry Andric case SIAtomicScope::AGENT:
1153e8d8bef9SDimitry Andric // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1154e8d8bef9SDimitry Andric // is not needed as GDS operations for all waves are executed in a total
1155e8d8bef9SDimitry Andric // global ordering as observed by all waves. Required if also
1156e8d8bef9SDimitry Andric // synchronizing with global/LDS memory as GDS operations could be
1157e8d8bef9SDimitry Andric // reordered with respect to later global/LDS memory operations of the
1158e8d8bef9SDimitry Andric // same wave.
1159e8d8bef9SDimitry Andric LGKMCnt |= IsCrossAddrSpaceOrdering;
1160e8d8bef9SDimitry Andric break;
1161e8d8bef9SDimitry Andric case SIAtomicScope::WORKGROUP:
1162e8d8bef9SDimitry Andric case SIAtomicScope::WAVEFRONT:
1163e8d8bef9SDimitry Andric case SIAtomicScope::SINGLETHREAD:
1164e8d8bef9SDimitry Andric // The GDS keeps all memory operations in order for
1165e8d8bef9SDimitry Andric // the same work-group.
1166e8d8bef9SDimitry Andric break;
1167e8d8bef9SDimitry Andric default:
1168e8d8bef9SDimitry Andric llvm_unreachable("Unsupported synchronization scope");
1169e8d8bef9SDimitry Andric }
1170e8d8bef9SDimitry Andric }
1171e8d8bef9SDimitry Andric
1172e8d8bef9SDimitry Andric if (VMCnt || LGKMCnt) {
1173e8d8bef9SDimitry Andric unsigned WaitCntImmediate =
1174e8d8bef9SDimitry Andric AMDGPU::encodeWaitcnt(IV,
1175e8d8bef9SDimitry Andric VMCnt ? 0 : getVmcntBitMask(IV),
1176e8d8bef9SDimitry Andric getExpcntBitMask(IV),
1177e8d8bef9SDimitry Andric LGKMCnt ?
0 : getLgkmcntBitMask(IV)); 11785f757f3fSDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) 11795f757f3fSDimitry Andric .addImm(WaitCntImmediate); 1180e8d8bef9SDimitry Andric Changed = true; 1181e8d8bef9SDimitry Andric } 1182e8d8bef9SDimitry Andric 1183e8d8bef9SDimitry Andric if (Pos == Position::AFTER) 1184e8d8bef9SDimitry Andric --MI; 1185e8d8bef9SDimitry Andric 1186e8d8bef9SDimitry Andric return Changed; 1187e8d8bef9SDimitry Andric } 1188e8d8bef9SDimitry Andric 1189e8d8bef9SDimitry Andric bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 11900b57cec5SDimitry Andric SIAtomicScope Scope, 11910b57cec5SDimitry Andric SIAtomicAddrSpace AddrSpace, 11920b57cec5SDimitry Andric Position Pos) const { 11935ffd83dbSDimitry Andric if (!InsertCacheInv) 11945ffd83dbSDimitry Andric return false; 11955ffd83dbSDimitry Andric 11960b57cec5SDimitry Andric bool Changed = false; 11970b57cec5SDimitry Andric 11980b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI->getParent(); 11990b57cec5SDimitry Andric DebugLoc DL = MI->getDebugLoc(); 12000b57cec5SDimitry Andric 12010b57cec5SDimitry Andric if (Pos == Position::AFTER) 12020b57cec5SDimitry Andric ++MI; 12030b57cec5SDimitry Andric 12040b57cec5SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 12050b57cec5SDimitry Andric switch (Scope) { 12060b57cec5SDimitry Andric case SIAtomicScope::SYSTEM: 12070b57cec5SDimitry Andric case SIAtomicScope::AGENT: 12080b57cec5SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); 12090b57cec5SDimitry Andric Changed = true; 12100b57cec5SDimitry Andric break; 12110b57cec5SDimitry Andric case SIAtomicScope::WORKGROUP: 12120b57cec5SDimitry Andric case SIAtomicScope::WAVEFRONT: 12130b57cec5SDimitry Andric case SIAtomicScope::SINGLETHREAD: 12140b57cec5SDimitry Andric // No cache to invalidate. 12150b57cec5SDimitry Andric break; 12160b57cec5SDimitry Andric default: 12170b57cec5SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 12180b57cec5SDimitry Andric } 12190b57cec5SDimitry Andric } 12200b57cec5SDimitry Andric 12210b57cec5SDimitry Andric /// The scratch address space does not need the global memory cache 12220b57cec5SDimitry Andric /// to be flushed as all memory operations by the same thread are 12230b57cec5SDimitry Andric /// sequentially consistent, and no other thread can access scratch 12240b57cec5SDimitry Andric /// memory. 12250b57cec5SDimitry Andric 1226e8d8bef9SDimitry Andric /// Other address spaces do not have a cache. 
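// Illustrative sketch (hand-written, not from a test): after this pass an
// agent-scope acquire load on GFX6 reads roughly as
//   BUFFER_LOAD_DWORD ... glc
//   S_WAITCNT vmcnt(0)
//   BUFFER_WBINVL1
// i.e. the cache bypass from above, a wait, then the invalidate inserted here.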
12270b57cec5SDimitry Andric 12280b57cec5SDimitry Andric if (Pos == Position::AFTER) 12290b57cec5SDimitry Andric --MI; 12300b57cec5SDimitry Andric 12310b57cec5SDimitry Andric return Changed; 12320b57cec5SDimitry Andric } 12330b57cec5SDimitry Andric 1234e8d8bef9SDimitry Andric bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 12350b57cec5SDimitry Andric SIAtomicScope Scope, 12360b57cec5SDimitry Andric SIAtomicAddrSpace AddrSpace, 12370b57cec5SDimitry Andric bool IsCrossAddrSpaceOrdering, 12380b57cec5SDimitry Andric Position Pos) const { 1239e8d8bef9SDimitry Andric return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 1240e8d8bef9SDimitry Andric IsCrossAddrSpaceOrdering, Pos); 12410b57cec5SDimitry Andric } 12420b57cec5SDimitry Andric 1243e8d8bef9SDimitry Andric bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 12440b57cec5SDimitry Andric SIAtomicScope Scope, 12450b57cec5SDimitry Andric SIAtomicAddrSpace AddrSpace, 12460b57cec5SDimitry Andric Position Pos) const { 12475ffd83dbSDimitry Andric if (!InsertCacheInv) 12485ffd83dbSDimitry Andric return false; 12495ffd83dbSDimitry Andric 12500b57cec5SDimitry Andric bool Changed = false; 12510b57cec5SDimitry Andric 12520b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI->getParent(); 12530b57cec5SDimitry Andric DebugLoc DL = MI->getDebugLoc(); 12540b57cec5SDimitry Andric 12550b57cec5SDimitry Andric const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); 12560b57cec5SDimitry Andric 1257e8d8bef9SDimitry Andric const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() 12580b57cec5SDimitry Andric ? AMDGPU::BUFFER_WBINVL1 12590b57cec5SDimitry Andric : AMDGPU::BUFFER_WBINVL1_VOL; 12600b57cec5SDimitry Andric 12610b57cec5SDimitry Andric if (Pos == Position::AFTER) 12620b57cec5SDimitry Andric ++MI; 12630b57cec5SDimitry Andric 12640b57cec5SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 12650b57cec5SDimitry Andric switch (Scope) { 12660b57cec5SDimitry Andric case SIAtomicScope::SYSTEM: 12670b57cec5SDimitry Andric case SIAtomicScope::AGENT: 1268e8d8bef9SDimitry Andric BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); 12690b57cec5SDimitry Andric Changed = true; 12700b57cec5SDimitry Andric break; 12710b57cec5SDimitry Andric case SIAtomicScope::WORKGROUP: 12720b57cec5SDimitry Andric case SIAtomicScope::WAVEFRONT: 12730b57cec5SDimitry Andric case SIAtomicScope::SINGLETHREAD: 12740b57cec5SDimitry Andric // No cache to invalidate. 12750b57cec5SDimitry Andric break; 12760b57cec5SDimitry Andric default: 12770b57cec5SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 12780b57cec5SDimitry Andric } 12790b57cec5SDimitry Andric } 12800b57cec5SDimitry Andric 12810b57cec5SDimitry Andric /// The scratch address space does not need the global memory cache 12820b57cec5SDimitry Andric /// to be flushed as all memory operations by the same thread are 12830b57cec5SDimitry Andric /// sequentially consistent, and no other thread can access scratch 12840b57cec5SDimitry Andric /// memory. 12850b57cec5SDimitry Andric 1286e8d8bef9SDimitry Andric /// Other address spaces do not have a cache. 
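// Design note: the only GFX7 difference from the GFX6 sequence above is the
// invalidate opcode selected into InvalidateL1, i.e. BUFFER_WBINVL1_VOL on
// non-PAL, non-Mesa3D targets versus BUFFER_WBINVL1 otherwise.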
12870b57cec5SDimitry Andric 12880b57cec5SDimitry Andric if (Pos == Position::AFTER) 12890b57cec5SDimitry Andric --MI; 12900b57cec5SDimitry Andric 12910b57cec5SDimitry Andric return Changed; 12920b57cec5SDimitry Andric } 12930b57cec5SDimitry Andric 1294fe6060f1SDimitry Andric bool SIGfx90ACacheControl::enableLoadCacheBypass( 1295fe6060f1SDimitry Andric const MachineBasicBlock::iterator &MI, 1296fe6060f1SDimitry Andric SIAtomicScope Scope, 1297fe6060f1SDimitry Andric SIAtomicAddrSpace AddrSpace) const { 1298fe6060f1SDimitry Andric assert(MI->mayLoad() && !MI->mayStore()); 1299fe6060f1SDimitry Andric bool Changed = false; 1300fe6060f1SDimitry Andric 1301fe6060f1SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1302fe6060f1SDimitry Andric switch (Scope) { 1303fe6060f1SDimitry Andric case SIAtomicScope::SYSTEM: 1304fe6060f1SDimitry Andric case SIAtomicScope::AGENT: 13054824e7fdSDimitry Andric // Set the L1 cache policy to MISS_LRU. 13064824e7fdSDimitry Andric // Note: there is no L2 cache bypass policy at the ISA level. 1307fe6060f1SDimitry Andric Changed |= enableGLCBit(MI); 1308fe6060f1SDimitry Andric break; 1309fe6060f1SDimitry Andric case SIAtomicScope::WORKGROUP: 1310fe6060f1SDimitry Andric // In threadgroup split mode the waves of a work-group can be executing on 1311fe6060f1SDimitry Andric // different CUs. Therefore need to bypass the L1 which is per CU. 1312fe6060f1SDimitry Andric // Otherwise in non-threadgroup split mode all waves of a work-group are 1313fe6060f1SDimitry Andric // on the same CU, and so the L1 does not need to be bypassed. 1314349cc55cSDimitry Andric if (ST.isTgSplitEnabled()) 1315349cc55cSDimitry Andric Changed |= enableGLCBit(MI); 1316fe6060f1SDimitry Andric break; 1317fe6060f1SDimitry Andric case SIAtomicScope::WAVEFRONT: 1318fe6060f1SDimitry Andric case SIAtomicScope::SINGLETHREAD: 1319fe6060f1SDimitry Andric // No cache to bypass. 1320fe6060f1SDimitry Andric break; 1321fe6060f1SDimitry Andric default: 1322fe6060f1SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 1323fe6060f1SDimitry Andric } 1324fe6060f1SDimitry Andric } 1325fe6060f1SDimitry Andric 1326fe6060f1SDimitry Andric /// The scratch address space does not need the global memory caches 1327fe6060f1SDimitry Andric /// to be bypassed as all memory operations by the same thread are 1328fe6060f1SDimitry Andric /// sequentially consistent, and no other thread can access scratch 1329fe6060f1SDimitry Andric /// memory. 1330fe6060f1SDimitry Andric 1331fe6060f1SDimitry Andric /// Other address spaces do not have a cache. 1332fe6060f1SDimitry Andric 1333fe6060f1SDimitry Andric return Changed; 1334fe6060f1SDimitry Andric } 1335fe6060f1SDimitry Andric 1336fe6060f1SDimitry Andric bool SIGfx90ACacheControl::enableStoreCacheBypass( 1337fe6060f1SDimitry Andric const MachineBasicBlock::iterator &MI, 1338fe6060f1SDimitry Andric SIAtomicScope Scope, 1339fe6060f1SDimitry Andric SIAtomicAddrSpace AddrSpace) const { 1340fe6060f1SDimitry Andric assert(!MI->mayLoad() && MI->mayStore()); 1341fe6060f1SDimitry Andric bool Changed = false; 1342fe6060f1SDimitry Andric 1343fe6060f1SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1344fe6060f1SDimitry Andric switch (Scope) { 1345fe6060f1SDimitry Andric case SIAtomicScope::SYSTEM: 1346fe6060f1SDimitry Andric case SIAtomicScope::AGENT: 1347fe6060f1SDimitry Andric /// Do not set glc for store atomic operations as they implicitly write 1348fe6060f1SDimitry Andric /// through the L1 cache. 
1349fe6060f1SDimitry Andric break;
1350fe6060f1SDimitry Andric case SIAtomicScope::WORKGROUP:
1351fe6060f1SDimitry Andric case SIAtomicScope::WAVEFRONT:
1352fe6060f1SDimitry Andric case SIAtomicScope::SINGLETHREAD:
1353fe6060f1SDimitry Andric // No cache to bypass. Store atomics implicitly write through the L1
1354fe6060f1SDimitry Andric // cache.
1355fe6060f1SDimitry Andric break;
1356fe6060f1SDimitry Andric default:
1357fe6060f1SDimitry Andric llvm_unreachable("Unsupported synchronization scope");
1358fe6060f1SDimitry Andric }
1359fe6060f1SDimitry Andric }
1360fe6060f1SDimitry Andric
1361fe6060f1SDimitry Andric /// The scratch address space does not need the global memory caches
1362fe6060f1SDimitry Andric /// to be bypassed as all memory operations by the same thread are
1363fe6060f1SDimitry Andric /// sequentially consistent, and no other thread can access scratch
1364fe6060f1SDimitry Andric /// memory.
1365fe6060f1SDimitry Andric
1366fe6060f1SDimitry Andric /// Other address spaces do not have a cache.
1367fe6060f1SDimitry Andric
1368fe6060f1SDimitry Andric return Changed;
1369fe6060f1SDimitry Andric }
1370fe6060f1SDimitry Andric
1371fe6060f1SDimitry Andric bool SIGfx90ACacheControl::enableRMWCacheBypass(
1372fe6060f1SDimitry Andric const MachineBasicBlock::iterator &MI,
1373fe6060f1SDimitry Andric SIAtomicScope Scope,
1374fe6060f1SDimitry Andric SIAtomicAddrSpace AddrSpace) const {
1375fe6060f1SDimitry Andric assert(MI->mayLoad() && MI->mayStore());
1376fe6060f1SDimitry Andric bool Changed = false;
1377fe6060f1SDimitry Andric
1378fe6060f1SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1379fe6060f1SDimitry Andric switch (Scope) {
1380fe6060f1SDimitry Andric case SIAtomicScope::SYSTEM:
1381fe6060f1SDimitry Andric case SIAtomicScope::AGENT:
1382fe6060f1SDimitry Andric /// Do not set glc for RMW atomic operations as they implicitly bypass
1383fe6060f1SDimitry Andric /// the L1 cache, and the glc bit is instead used to indicate if they are
1384fe6060f1SDimitry Andric /// return or no-return.
1385fe6060f1SDimitry Andric break;
1386fe6060f1SDimitry Andric case SIAtomicScope::WORKGROUP:
1387fe6060f1SDimitry Andric case SIAtomicScope::WAVEFRONT:
1388fe6060f1SDimitry Andric case SIAtomicScope::SINGLETHREAD:
1389fe6060f1SDimitry Andric // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1390fe6060f1SDimitry Andric break;
1391fe6060f1SDimitry Andric default:
1392fe6060f1SDimitry Andric llvm_unreachable("Unsupported synchronization scope");
1393fe6060f1SDimitry Andric }
1394fe6060f1SDimitry Andric }
1395fe6060f1SDimitry Andric
1396fe6060f1SDimitry Andric return Changed;
1397fe6060f1SDimitry Andric }
1398fe6060f1SDimitry Andric
1399fe6060f1SDimitry Andric bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1400fe6060f1SDimitry Andric MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1401*0fca6ea1SDimitry Andric bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1402fe6060f1SDimitry Andric // Only handle load and store, not atomic read-modify-write instructions. The
1403fe6060f1SDimitry Andric // latter use glc to indicate if the atomic returns a result and so must not
1404fe6060f1SDimitry Andric // be used for cache control.
1405fe6060f1SDimitry Andric assert(MI->mayLoad() ^ MI->mayStore());
1406fe6060f1SDimitry Andric
1407fe6060f1SDimitry Andric // Only update load and store, not LLVM IR atomic read-modify-write
1408fe6060f1SDimitry Andric // instructions. The latter are always marked as volatile, so we cannot
1409fe6060f1SDimitry Andric // sensibly handle their volatility, as we do not want to pessimize all
1410fe6060f1SDimitry Andric // atomics. They also do not support the nontemporal attribute.
1411fe6060f1SDimitry Andric assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1412fe6060f1SDimitry Andric
1413fe6060f1SDimitry Andric bool Changed = false;
1414fe6060f1SDimitry Andric
1415fe6060f1SDimitry Andric if (IsVolatile) {
14164824e7fdSDimitry Andric // Set L1 cache policy to be MISS_EVICT for load instructions
14174824e7fdSDimitry Andric // and MISS_LRU for store instructions.
14184824e7fdSDimitry Andric // Note: there is no L2 cache bypass policy at the ISA level.
1419349cc55cSDimitry Andric if (Op == SIMemOp::LOAD)
1420fe6060f1SDimitry Andric Changed |= enableGLCBit(MI);
1421fe6060f1SDimitry Andric
1422fe6060f1SDimitry Andric // Ensure operation has completed at system scope to cause all volatile
1423fe6060f1SDimitry Andric // operations to be visible outside the program in a global order. Do not
1424fe6060f1SDimitry Andric // request cross address space as only the global address space can be
1425fe6060f1SDimitry Andric // observable outside the program, so no need to cause a waitcnt for LDS
1426fe6060f1SDimitry Andric // address space operations.
1427fe6060f1SDimitry Andric Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1428fe6060f1SDimitry Andric Position::AFTER);
1429fe6060f1SDimitry Andric
1430fe6060f1SDimitry Andric return Changed;
1431fe6060f1SDimitry Andric }
1432fe6060f1SDimitry Andric
1433fe6060f1SDimitry Andric if (IsNonTemporal) {
14344824e7fdSDimitry Andric // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
14354824e7fdSDimitry Andric // for both loads and stores, and the L2 cache policy to STREAM.
1436fe6060f1SDimitry Andric Changed |= enableGLCBit(MI);
1437fe6060f1SDimitry Andric Changed |= enableSLCBit(MI);
1438fe6060f1SDimitry Andric return Changed;
1439fe6060f1SDimitry Andric }
1440fe6060f1SDimitry Andric
1441fe6060f1SDimitry Andric return Changed;
1442fe6060f1SDimitry Andric }
1443fe6060f1SDimitry Andric
1444fe6060f1SDimitry Andric bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1445fe6060f1SDimitry Andric SIAtomicScope Scope,
1446fe6060f1SDimitry Andric SIAtomicAddrSpace AddrSpace,
1447fe6060f1SDimitry Andric SIMemOp Op,
1448fe6060f1SDimitry Andric bool IsCrossAddrSpaceOrdering,
1449fe6060f1SDimitry Andric Position Pos) const {
1450fe6060f1SDimitry Andric if (ST.isTgSplitEnabled()) {
1451fe6060f1SDimitry Andric // In threadgroup split mode the waves of a work-group can be executing on
1452fe6060f1SDimitry Andric // different CUs. Therefore need to wait for global or GDS memory operations
1453fe6060f1SDimitry Andric // to complete to ensure they are visible to waves in the other CUs.
1454fe6060f1SDimitry Andric // Otherwise in non-threadgroup split mode all waves of a work-group are on
1455fe6060f1SDimitry Andric // the same CU, so no need to wait for global memory as all waves in the
1456fe6060f1SDimitry Andric // work-group access the same L1, nor wait for GDS as accesses are ordered
1457fe6060f1SDimitry Andric // on a CU.
1458fe6060f1SDimitry Andric if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1459fe6060f1SDimitry Andric SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1460fe6060f1SDimitry Andric (Scope == SIAtomicScope::WORKGROUP)) {
1461fe6060f1SDimitry Andric // Same as GFX7 using agent scope.
1462fe6060f1SDimitry Andric Scope = SIAtomicScope::AGENT; 1463fe6060f1SDimitry Andric } 1464fe6060f1SDimitry Andric // In threadgroup split mode LDS cannot be allocated so no need to wait for 1465fe6060f1SDimitry Andric // LDS memory operations. 1466fe6060f1SDimitry Andric AddrSpace &= ~SIAtomicAddrSpace::LDS; 1467fe6060f1SDimitry Andric } 1468fe6060f1SDimitry Andric return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, 1469fe6060f1SDimitry Andric IsCrossAddrSpaceOrdering, Pos); 1470fe6060f1SDimitry Andric } 1471fe6060f1SDimitry Andric 1472fe6060f1SDimitry Andric bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 1473fe6060f1SDimitry Andric SIAtomicScope Scope, 1474fe6060f1SDimitry Andric SIAtomicAddrSpace AddrSpace, 1475fe6060f1SDimitry Andric Position Pos) const { 1476fe6060f1SDimitry Andric if (!InsertCacheInv) 1477fe6060f1SDimitry Andric return false; 1478fe6060f1SDimitry Andric 1479fe6060f1SDimitry Andric bool Changed = false; 1480fe6060f1SDimitry Andric 1481fe6060f1SDimitry Andric MachineBasicBlock &MBB = *MI->getParent(); 1482fe6060f1SDimitry Andric DebugLoc DL = MI->getDebugLoc(); 1483fe6060f1SDimitry Andric 1484fe6060f1SDimitry Andric if (Pos == Position::AFTER) 1485fe6060f1SDimitry Andric ++MI; 1486fe6060f1SDimitry Andric 1487fe6060f1SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1488fe6060f1SDimitry Andric switch (Scope) { 1489fe6060f1SDimitry Andric case SIAtomicScope::SYSTEM: 1490fe6060f1SDimitry Andric // Ensures that following loads will not see stale remote VMEM data or 1491fe6060f1SDimitry Andric // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and 1492fe6060f1SDimitry Andric // CC will never be stale due to the local memory probes. 1493fe6060f1SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); 1494fe6060f1SDimitry Andric // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the 1495fe6060f1SDimitry Andric // hardware does not reorder memory operations by the same wave with 1496fe6060f1SDimitry Andric // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to 1497fe6060f1SDimitry Andric // remove any cache lines of earlier writes by the same wave and ensures 1498fe6060f1SDimitry Andric // later reads by the same wave will refetch the cache lines. 1499fe6060f1SDimitry Andric Changed = true; 1500fe6060f1SDimitry Andric break; 1501fe6060f1SDimitry Andric case SIAtomicScope::AGENT: 1502fe6060f1SDimitry Andric // Same as GFX7. 1503fe6060f1SDimitry Andric break; 1504fe6060f1SDimitry Andric case SIAtomicScope::WORKGROUP: 1505fe6060f1SDimitry Andric // In threadgroup split mode the waves of a work-group can be executing on 1506fe6060f1SDimitry Andric // different CUs. Therefore need to invalidate the L1 which is per CU. 1507fe6060f1SDimitry Andric // Otherwise in non-threadgroup split mode all waves of a work-group are 1508fe6060f1SDimitry Andric // on the same CU, and so the L1 does not need to be invalidated. 1509fe6060f1SDimitry Andric if (ST.isTgSplitEnabled()) { 1510fe6060f1SDimitry Andric // Same as GFX7 using agent scope. 1511fe6060f1SDimitry Andric Scope = SIAtomicScope::AGENT; 1512fe6060f1SDimitry Andric } 1513fe6060f1SDimitry Andric break; 1514fe6060f1SDimitry Andric case SIAtomicScope::WAVEFRONT: 1515fe6060f1SDimitry Andric case SIAtomicScope::SINGLETHREAD: 1516fe6060f1SDimitry Andric // Same as GFX7. 
1517fe6060f1SDimitry Andric break; 1518fe6060f1SDimitry Andric default: 1519fe6060f1SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 1520fe6060f1SDimitry Andric } 1521fe6060f1SDimitry Andric } 1522fe6060f1SDimitry Andric 1523fe6060f1SDimitry Andric /// The scratch address space does not need the global memory cache 1524fe6060f1SDimitry Andric /// to be flushed as all memory operations by the same thread are 1525fe6060f1SDimitry Andric /// sequentially consistent, and no other thread can access scratch 1526fe6060f1SDimitry Andric /// memory. 1527fe6060f1SDimitry Andric 1528fe6060f1SDimitry Andric /// Other address spaces do not have a cache. 1529fe6060f1SDimitry Andric 1530fe6060f1SDimitry Andric if (Pos == Position::AFTER) 1531fe6060f1SDimitry Andric --MI; 1532fe6060f1SDimitry Andric 1533fe6060f1SDimitry Andric Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); 1534fe6060f1SDimitry Andric 1535fe6060f1SDimitry Andric return Changed; 1536fe6060f1SDimitry Andric } 1537fe6060f1SDimitry Andric 1538fe6060f1SDimitry Andric bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, 1539fe6060f1SDimitry Andric SIAtomicScope Scope, 1540fe6060f1SDimitry Andric SIAtomicAddrSpace AddrSpace, 1541fe6060f1SDimitry Andric bool IsCrossAddrSpaceOrdering, 1542fe6060f1SDimitry Andric Position Pos) const { 1543fe6060f1SDimitry Andric bool Changed = false; 1544fe6060f1SDimitry Andric 1545fe6060f1SDimitry Andric MachineBasicBlock &MBB = *MI->getParent(); 15461db9f3b2SDimitry Andric const DebugLoc &DL = MI->getDebugLoc(); 1547fe6060f1SDimitry Andric 1548fe6060f1SDimitry Andric if (Pos == Position::AFTER) 1549fe6060f1SDimitry Andric ++MI; 1550fe6060f1SDimitry Andric 1551fe6060f1SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 1552fe6060f1SDimitry Andric switch (Scope) { 1553fe6060f1SDimitry Andric case SIAtomicScope::SYSTEM: 1554fe6060f1SDimitry Andric // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 1555fe6060f1SDimitry Andric // hardware does not reorder memory operations by the same wave with 1556fe6060f1SDimitry Andric // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 1557fe6060f1SDimitry Andric // to initiate writeback of any dirty cache lines of earlier writes by the 1558fe6060f1SDimitry Andric // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 1559fe6060f1SDimitry Andric // writeback has completed. 156081ad6265SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 156181ad6265SDimitry Andric // Set SC bits to indicate system scope. 156281ad6265SDimitry Andric .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 1563fe6060f1SDimitry Andric // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT 1564fe6060f1SDimitry Andric // vmcnt(0)" needed by the "BUFFER_WBL2". 1565fe6060f1SDimitry Andric Changed = true; 1566fe6060f1SDimitry Andric break; 1567fe6060f1SDimitry Andric case SIAtomicScope::AGENT: 1568fe6060f1SDimitry Andric case SIAtomicScope::WORKGROUP: 1569fe6060f1SDimitry Andric case SIAtomicScope::WAVEFRONT: 1570fe6060f1SDimitry Andric case SIAtomicScope::SINGLETHREAD: 1571fe6060f1SDimitry Andric // Same as GFX7. 
1572fe6060f1SDimitry Andric break; 1573fe6060f1SDimitry Andric default: 1574fe6060f1SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 1575fe6060f1SDimitry Andric } 1576fe6060f1SDimitry Andric } 1577fe6060f1SDimitry Andric 1578fe6060f1SDimitry Andric if (Pos == Position::AFTER) 1579fe6060f1SDimitry Andric --MI; 1580fe6060f1SDimitry Andric 1581fe6060f1SDimitry Andric Changed |= 1582fe6060f1SDimitry Andric SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, 1583fe6060f1SDimitry Andric IsCrossAddrSpaceOrdering, Pos); 1584fe6060f1SDimitry Andric 1585fe6060f1SDimitry Andric return Changed; 1586fe6060f1SDimitry Andric } 1587fe6060f1SDimitry Andric 158881ad6265SDimitry Andric bool SIGfx940CacheControl::enableLoadCacheBypass( 158981ad6265SDimitry Andric const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 159081ad6265SDimitry Andric SIAtomicAddrSpace AddrSpace) const { 159181ad6265SDimitry Andric assert(MI->mayLoad() && !MI->mayStore()); 159281ad6265SDimitry Andric bool Changed = false; 159381ad6265SDimitry Andric 159481ad6265SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 159581ad6265SDimitry Andric switch (Scope) { 159681ad6265SDimitry Andric case SIAtomicScope::SYSTEM: 159781ad6265SDimitry Andric // Set SC bits to indicate system scope. 159881ad6265SDimitry Andric Changed |= enableSC0Bit(MI); 159981ad6265SDimitry Andric Changed |= enableSC1Bit(MI); 160081ad6265SDimitry Andric break; 160181ad6265SDimitry Andric case SIAtomicScope::AGENT: 160281ad6265SDimitry Andric // Set SC bits to indicate agent scope. 160381ad6265SDimitry Andric Changed |= enableSC1Bit(MI); 160481ad6265SDimitry Andric break; 160581ad6265SDimitry Andric case SIAtomicScope::WORKGROUP: 160681ad6265SDimitry Andric // In threadgroup split mode the waves of a work-group can be executing on 160781ad6265SDimitry Andric // different CUs. Therefore need to bypass the L1 which is per CU. 160881ad6265SDimitry Andric // Otherwise in non-threadgroup split mode all waves of a work-group are 160981ad6265SDimitry Andric // on the same CU, and so the L1 does not need to be bypassed. Setting SC 161081ad6265SDimitry Andric // bits to indicate work-group scope will do this automatically. 161181ad6265SDimitry Andric Changed |= enableSC0Bit(MI); 161281ad6265SDimitry Andric break; 161381ad6265SDimitry Andric case SIAtomicScope::WAVEFRONT: 161481ad6265SDimitry Andric case SIAtomicScope::SINGLETHREAD: 161581ad6265SDimitry Andric // Leave SC bits unset to indicate wavefront scope. 161681ad6265SDimitry Andric break; 161781ad6265SDimitry Andric default: 161881ad6265SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 161981ad6265SDimitry Andric } 162081ad6265SDimitry Andric } 162181ad6265SDimitry Andric 162281ad6265SDimitry Andric /// The scratch address space does not need the global memory caches 162381ad6265SDimitry Andric /// to be bypassed as all memory operations by the same thread are 162481ad6265SDimitry Andric /// sequentially consistent, and no other thread can access scratch 162581ad6265SDimitry Andric /// memory. 162681ad6265SDimitry Andric 162781ad6265SDimitry Andric /// Other address spaces do not have a cache. 
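// Illustrative summary of the switch above (GFX940 atomic loads), e.g. an
// agent-scope atomic load comes out roughly as "GLOBAL_LOAD_DWORD ... sc1":
//   SYSTEM                 -> sc0 sc1
//   AGENT                  -> sc1
//   WORKGROUP              -> sc0
//   WAVEFRONT/SINGLETHREAD -> no SC bits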
162881ad6265SDimitry Andric 162981ad6265SDimitry Andric return Changed; 163081ad6265SDimitry Andric } 163181ad6265SDimitry Andric 163281ad6265SDimitry Andric bool SIGfx940CacheControl::enableStoreCacheBypass( 163381ad6265SDimitry Andric const MachineBasicBlock::iterator &MI, 163481ad6265SDimitry Andric SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { 163581ad6265SDimitry Andric assert(!MI->mayLoad() && MI->mayStore()); 163681ad6265SDimitry Andric bool Changed = false; 163781ad6265SDimitry Andric 163881ad6265SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 163981ad6265SDimitry Andric switch (Scope) { 164081ad6265SDimitry Andric case SIAtomicScope::SYSTEM: 164181ad6265SDimitry Andric // Set SC bits to indicate system scope. 164281ad6265SDimitry Andric Changed |= enableSC0Bit(MI); 164381ad6265SDimitry Andric Changed |= enableSC1Bit(MI); 164481ad6265SDimitry Andric break; 164581ad6265SDimitry Andric case SIAtomicScope::AGENT: 164681ad6265SDimitry Andric // Set SC bits to indicate agent scope. 164781ad6265SDimitry Andric Changed |= enableSC1Bit(MI); 164881ad6265SDimitry Andric break; 164981ad6265SDimitry Andric case SIAtomicScope::WORKGROUP: 165081ad6265SDimitry Andric // Set SC bits to indicate workgroup scope. 165181ad6265SDimitry Andric Changed |= enableSC0Bit(MI); 165281ad6265SDimitry Andric break; 165381ad6265SDimitry Andric case SIAtomicScope::WAVEFRONT: 165481ad6265SDimitry Andric case SIAtomicScope::SINGLETHREAD: 165581ad6265SDimitry Andric // Leave SC bits unset to indicate wavefront scope. 165681ad6265SDimitry Andric break; 165781ad6265SDimitry Andric default: 165881ad6265SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 165981ad6265SDimitry Andric } 166081ad6265SDimitry Andric } 166181ad6265SDimitry Andric 166281ad6265SDimitry Andric /// The scratch address space does not need the global memory caches 166381ad6265SDimitry Andric /// to be bypassed as all memory operations by the same thread are 166481ad6265SDimitry Andric /// sequentially consistent, and no other thread can access scratch 166581ad6265SDimitry Andric /// memory. 166681ad6265SDimitry Andric 166781ad6265SDimitry Andric /// Other address spaces do not have a cache. 166881ad6265SDimitry Andric 166981ad6265SDimitry Andric return Changed; 167081ad6265SDimitry Andric } 167181ad6265SDimitry Andric 167281ad6265SDimitry Andric bool SIGfx940CacheControl::enableRMWCacheBypass( 167381ad6265SDimitry Andric const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 167481ad6265SDimitry Andric SIAtomicAddrSpace AddrSpace) const { 167581ad6265SDimitry Andric assert(MI->mayLoad() && MI->mayStore()); 167681ad6265SDimitry Andric bool Changed = false; 167781ad6265SDimitry Andric 167881ad6265SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 167981ad6265SDimitry Andric switch (Scope) { 168081ad6265SDimitry Andric case SIAtomicScope::SYSTEM: 168181ad6265SDimitry Andric // Set SC1 bit to indicate system scope. 168281ad6265SDimitry Andric Changed |= enableSC1Bit(MI); 168381ad6265SDimitry Andric break; 168481ad6265SDimitry Andric case SIAtomicScope::AGENT: 168581ad6265SDimitry Andric case SIAtomicScope::WORKGROUP: 168681ad6265SDimitry Andric case SIAtomicScope::WAVEFRONT: 168781ad6265SDimitry Andric case SIAtomicScope::SINGLETHREAD: 168881ad6265SDimitry Andric // RMW atomic operations implicitly bypass the L1 cache and only use SC1 168981ad6265SDimitry Andric // to indicate system or agent scope. 
The SC0 bit is used to indicate if
169081ad6265SDimitry Andric // they are return or no-return. Leave SC1 bit unset to indicate agent
169181ad6265SDimitry Andric // scope.
169281ad6265SDimitry Andric break;
169381ad6265SDimitry Andric default:
169481ad6265SDimitry Andric llvm_unreachable("Unsupported synchronization scope");
169581ad6265SDimitry Andric }
169681ad6265SDimitry Andric }
169781ad6265SDimitry Andric
169881ad6265SDimitry Andric return Changed;
169981ad6265SDimitry Andric }
170081ad6265SDimitry Andric
170181ad6265SDimitry Andric bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
170281ad6265SDimitry Andric MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1703*0fca6ea1SDimitry Andric bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
170481ad6265SDimitry Andric // Only handle load and store, not atomic read-modify-write instructions. The
170581ad6265SDimitry Andric // latter use glc to indicate if the atomic returns a result and so must not
170681ad6265SDimitry Andric // be used for cache control.
170781ad6265SDimitry Andric assert(MI->mayLoad() ^ MI->mayStore());
170881ad6265SDimitry Andric
170981ad6265SDimitry Andric // Only update load and store, not LLVM IR atomic read-modify-write
171081ad6265SDimitry Andric // instructions. The latter are always marked as volatile, so we cannot
171181ad6265SDimitry Andric // sensibly handle their volatility, as we do not want to pessimize all
171281ad6265SDimitry Andric // atomics. They also do not support the nontemporal attribute.
171381ad6265SDimitry Andric assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
171481ad6265SDimitry Andric
171581ad6265SDimitry Andric bool Changed = false;
171681ad6265SDimitry Andric
171781ad6265SDimitry Andric if (IsVolatile) {
171881ad6265SDimitry Andric // Set SC bits to indicate system scope.
171981ad6265SDimitry Andric Changed |= enableSC0Bit(MI);
172081ad6265SDimitry Andric Changed |= enableSC1Bit(MI);
172181ad6265SDimitry Andric
172281ad6265SDimitry Andric // Ensure operation has completed at system scope to cause all volatile
172381ad6265SDimitry Andric // operations to be visible outside the program in a global order. Do not
172481ad6265SDimitry Andric // request cross address space as only the global address space can be
172581ad6265SDimitry Andric // observable outside the program, so no need to cause a waitcnt for LDS
172681ad6265SDimitry Andric // address space operations.
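// Illustrative (hand-written sketch; the exact opcode depends on the selected
// load): a volatile global load on GFX940 therefore ends up roughly as
//   GLOBAL_LOAD_DWORD ... sc0 sc1
//   S_WAITCNT vmcnt(0)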
172781ad6265SDimitry Andric Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
172881ad6265SDimitry Andric Position::AFTER);
172981ad6265SDimitry Andric
173081ad6265SDimitry Andric return Changed;
173181ad6265SDimitry Andric }
173281ad6265SDimitry Andric
173381ad6265SDimitry Andric if (IsNonTemporal) {
173481ad6265SDimitry Andric Changed |= enableNTBit(MI);
173581ad6265SDimitry Andric return Changed;
173681ad6265SDimitry Andric }
173781ad6265SDimitry Andric
173881ad6265SDimitry Andric return Changed;
173981ad6265SDimitry Andric }
174081ad6265SDimitry Andric
174181ad6265SDimitry Andric bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
174281ad6265SDimitry Andric SIAtomicScope Scope,
174381ad6265SDimitry Andric SIAtomicAddrSpace AddrSpace,
174481ad6265SDimitry Andric Position Pos) const {
174581ad6265SDimitry Andric if (!InsertCacheInv)
174681ad6265SDimitry Andric return false;
174781ad6265SDimitry Andric
174881ad6265SDimitry Andric bool Changed = false;
174981ad6265SDimitry Andric
175081ad6265SDimitry Andric MachineBasicBlock &MBB = *MI->getParent();
175181ad6265SDimitry Andric DebugLoc DL = MI->getDebugLoc();
175281ad6265SDimitry Andric
175381ad6265SDimitry Andric if (Pos == Position::AFTER)
175481ad6265SDimitry Andric ++MI;
175581ad6265SDimitry Andric
175681ad6265SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
175781ad6265SDimitry Andric switch (Scope) {
175881ad6265SDimitry Andric case SIAtomicScope::SYSTEM:
175981ad6265SDimitry Andric // Ensures that following loads will not see stale remote VMEM data or
176081ad6265SDimitry Andric // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
176181ad6265SDimitry Andric // CC will never be stale due to the local memory probes.
176281ad6265SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
176381ad6265SDimitry Andric // Set SC bits to indicate system scope.
176481ad6265SDimitry Andric .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
176581ad6265SDimitry Andric // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
176681ad6265SDimitry Andric // hardware does not reorder memory operations by the same wave with
176781ad6265SDimitry Andric // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
176881ad6265SDimitry Andric // remove any cache lines of earlier writes by the same wave and ensures
176981ad6265SDimitry Andric // later reads by the same wave will refetch the cache lines.
177081ad6265SDimitry Andric Changed = true;
177181ad6265SDimitry Andric break;
177281ad6265SDimitry Andric case SIAtomicScope::AGENT:
177381ad6265SDimitry Andric // Ensures that following loads will not see stale remote data or local
177481ad6265SDimitry Andric // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
177581ad6265SDimitry Andric // due to the memory probes.
177681ad6265SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
177781ad6265SDimitry Andric // Set SC bits to indicate agent scope.
177881ad6265SDimitry Andric .addImm(AMDGPU::CPol::SC1);
177981ad6265SDimitry Andric // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
178081ad6265SDimitry Andric // does not reorder memory operations with respect to a preceding buffer
178181ad6265SDimitry Andric // invalidate. The invalidate is guaranteed to remove any cache lines of
178281ad6265SDimitry Andric // earlier writes and ensures later reads will refetch the cache lines.
178381ad6265SDimitry Andric Changed = true;
178481ad6265SDimitry Andric break;
178581ad6265SDimitry Andric case SIAtomicScope::WORKGROUP:
178681ad6265SDimitry Andric // In threadgroup split mode the waves of a work-group can be executing on
178781ad6265SDimitry Andric // different CUs. Therefore need to invalidate the L1 which is per CU.
178881ad6265SDimitry Andric // Otherwise in non-threadgroup split mode all waves of a work-group are
178981ad6265SDimitry Andric // on the same CU, and so the L1 does not need to be invalidated.
179081ad6265SDimitry Andric if (ST.isTgSplitEnabled()) {
179181ad6265SDimitry Andric // Ensures L1 is invalidated if in threadgroup split mode. In
179281ad6265SDimitry Andric // non-threadgroup split mode it is a NOP, but there is no point
179381ad6265SDimitry Andric // generating it in that case if we know we are not in that mode.
179481ad6265SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
179581ad6265SDimitry Andric // Set SC bits to indicate work-group scope.
179681ad6265SDimitry Andric .addImm(AMDGPU::CPol::SC0);
179781ad6265SDimitry Andric // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
179881ad6265SDimitry Andric // does not reorder memory operations with respect to a preceding buffer
179981ad6265SDimitry Andric // invalidate. The invalidate is guaranteed to remove any cache lines of
180081ad6265SDimitry Andric // earlier writes and ensures later reads will refetch the cache lines.
180181ad6265SDimitry Andric Changed = true;
180281ad6265SDimitry Andric }
180381ad6265SDimitry Andric break;
180481ad6265SDimitry Andric case SIAtomicScope::WAVEFRONT:
180581ad6265SDimitry Andric case SIAtomicScope::SINGLETHREAD:
180681ad6265SDimitry Andric // Could generate "BUFFER_INV" but it would do nothing as there are no
180781ad6265SDimitry Andric // caches to invalidate.
180881ad6265SDimitry Andric break;
180981ad6265SDimitry Andric default:
181081ad6265SDimitry Andric llvm_unreachable("Unsupported synchronization scope");
181181ad6265SDimitry Andric }
181281ad6265SDimitry Andric }
181381ad6265SDimitry Andric
181481ad6265SDimitry Andric /// The scratch address space does not need the global memory cache
181581ad6265SDimitry Andric /// to be flushed as all memory operations by the same thread are
181681ad6265SDimitry Andric /// sequentially consistent, and no other thread can access scratch
181781ad6265SDimitry Andric /// memory.
181881ad6265SDimitry Andric
181981ad6265SDimitry Andric /// Other address spaces do not have a cache.
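// Illustrative: for an agent-scope acquire fence on GFX940 the pass as a
// whole emits roughly
//   S_WAITCNT vmcnt(0) lgkmcnt(0)
//   BUFFER_INV sc1
// (hand-written sketch: the wait comes from insertWait at the fence site, the
// invalidate from this function).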
182081ad6265SDimitry Andric 182181ad6265SDimitry Andric if (Pos == Position::AFTER) 182281ad6265SDimitry Andric --MI; 182381ad6265SDimitry Andric 182481ad6265SDimitry Andric return Changed; 182581ad6265SDimitry Andric } 182681ad6265SDimitry Andric 182781ad6265SDimitry Andric bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 182881ad6265SDimitry Andric SIAtomicScope Scope, 182981ad6265SDimitry Andric SIAtomicAddrSpace AddrSpace, 183081ad6265SDimitry Andric bool IsCrossAddrSpaceOrdering, 183181ad6265SDimitry Andric Position Pos) const { 183281ad6265SDimitry Andric bool Changed = false; 183381ad6265SDimitry Andric 183481ad6265SDimitry Andric MachineBasicBlock &MBB = *MI->getParent(); 183581ad6265SDimitry Andric DebugLoc DL = MI->getDebugLoc(); 183681ad6265SDimitry Andric 183781ad6265SDimitry Andric if (Pos == Position::AFTER) 183881ad6265SDimitry Andric ++MI; 183981ad6265SDimitry Andric 184081ad6265SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 184181ad6265SDimitry Andric switch (Scope) { 184281ad6265SDimitry Andric case SIAtomicScope::SYSTEM: 184381ad6265SDimitry Andric // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the 184481ad6265SDimitry Andric // hardware does not reorder memory operations by the same wave with 184581ad6265SDimitry Andric // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed 184681ad6265SDimitry Andric // to initiate writeback of any dirty cache lines of earlier writes by the 184781ad6265SDimitry Andric // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the 184881ad6265SDimitry Andric // writeback has completed. 184981ad6265SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 185081ad6265SDimitry Andric // Set SC bits to indicate system scope. 185181ad6265SDimitry Andric .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); 185281ad6265SDimitry Andric // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 185381ad6265SDimitry Andric // SIAtomicScope::SYSTEM, the following insertWait will generate the 185481ad6265SDimitry Andric // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". 185581ad6265SDimitry Andric Changed = true; 185681ad6265SDimitry Andric break; 185781ad6265SDimitry Andric case SIAtomicScope::AGENT: 185881ad6265SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) 185981ad6265SDimitry Andric // Set SC bits to indicate agent scope. 186081ad6265SDimitry Andric .addImm(AMDGPU::CPol::SC1); 186181ad6265SDimitry Andric 186281ad6265SDimitry Andric // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is 186381ad6265SDimitry Andric // SIAtomicScope::AGENT, the following insertWait will generate the 186481ad6265SDimitry Andric // required "S_WAITCNT vmcnt(0)". 186581ad6265SDimitry Andric Changed = true; 186681ad6265SDimitry Andric break; 186781ad6265SDimitry Andric case SIAtomicScope::WORKGROUP: 186881ad6265SDimitry Andric case SIAtomicScope::WAVEFRONT: 186981ad6265SDimitry Andric case SIAtomicScope::SINGLETHREAD: 187081ad6265SDimitry Andric // Do not generate "BUFFER_WBL2" as there are no caches it would 187181ad6265SDimitry Andric // writeback, and would require an otherwise unnecessary 187281ad6265SDimitry Andric // "S_WAITCNT vmcnt(0)". 
187381ad6265SDimitry Andric break; 187481ad6265SDimitry Andric default: 187581ad6265SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 187681ad6265SDimitry Andric } 187781ad6265SDimitry Andric } 187881ad6265SDimitry Andric 187981ad6265SDimitry Andric if (Pos == Position::AFTER) 188081ad6265SDimitry Andric --MI; 188181ad6265SDimitry Andric 188281ad6265SDimitry Andric // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other 188381ad6265SDimitry Andric // S_WAITCNT needed. 188481ad6265SDimitry Andric Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 188581ad6265SDimitry Andric IsCrossAddrSpaceOrdering, Pos); 188681ad6265SDimitry Andric 188781ad6265SDimitry Andric return Changed; 188881ad6265SDimitry Andric } 188981ad6265SDimitry Andric 18900b57cec5SDimitry Andric bool SIGfx10CacheControl::enableLoadCacheBypass( 18910b57cec5SDimitry Andric const MachineBasicBlock::iterator &MI, 18920b57cec5SDimitry Andric SIAtomicScope Scope, 18930b57cec5SDimitry Andric SIAtomicAddrSpace AddrSpace) const { 18940b57cec5SDimitry Andric assert(MI->mayLoad() && !MI->mayStore()); 18950b57cec5SDimitry Andric bool Changed = false; 18960b57cec5SDimitry Andric 18970b57cec5SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 18980b57cec5SDimitry Andric switch (Scope) { 18990b57cec5SDimitry Andric case SIAtomicScope::SYSTEM: 19000b57cec5SDimitry Andric case SIAtomicScope::AGENT: 19014824e7fdSDimitry Andric // Set the L0 and L1 cache policies to MISS_EVICT. 19024824e7fdSDimitry Andric // Note: there is no L2 cache coherent bypass control at the ISA level. 19030b57cec5SDimitry Andric Changed |= enableGLCBit(MI); 19040b57cec5SDimitry Andric Changed |= enableDLCBit(MI); 19050b57cec5SDimitry Andric break; 19060b57cec5SDimitry Andric case SIAtomicScope::WORKGROUP: 19070b57cec5SDimitry Andric // In WGP mode the waves of a work-group can be executing on either CU of 19080b57cec5SDimitry Andric // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in 1909e8d8bef9SDimitry Andric // CU mode all waves of a work-group are on the same CU, and so the L0 1910e8d8bef9SDimitry Andric // does not need to be bypassed. 1911349cc55cSDimitry Andric if (!ST.isCuModeEnabled()) 1912349cc55cSDimitry Andric Changed |= enableGLCBit(MI); 19130b57cec5SDimitry Andric break; 19140b57cec5SDimitry Andric case SIAtomicScope::WAVEFRONT: 19150b57cec5SDimitry Andric case SIAtomicScope::SINGLETHREAD: 19160b57cec5SDimitry Andric // No cache to bypass. 19170b57cec5SDimitry Andric break; 19180b57cec5SDimitry Andric default: 19190b57cec5SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 19200b57cec5SDimitry Andric } 19210b57cec5SDimitry Andric } 19220b57cec5SDimitry Andric 19230b57cec5SDimitry Andric /// The scratch address space does not need the global memory caches 19240b57cec5SDimitry Andric /// to be bypassed as all memory operations by the same thread are 19250b57cec5SDimitry Andric /// sequentially consistent, and no other thread can access scratch 19260b57cec5SDimitry Andric /// memory. 19270b57cec5SDimitry Andric 1928e8d8bef9SDimitry Andric /// Other address spaces do not have a cache. 
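// Illustrative: an agent-scope atomic load on GFX10 is emitted roughly as
//   GLOBAL_LOAD_DWORD ... glc dlc
// forcing a miss in both the per-CU L0 and the L1, whereas a workgroup-scope
// load in WGP mode only sets glc.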
19290b57cec5SDimitry Andric 19300b57cec5SDimitry Andric return Changed; 19310b57cec5SDimitry Andric } 19320b57cec5SDimitry Andric 1933e8d8bef9SDimitry Andric bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( 1934e8d8bef9SDimitry Andric MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 1935*0fca6ea1SDimitry Andric bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { 1936e8d8bef9SDimitry Andric 1937e8d8bef9SDimitry Andric // Only handle load and store, not atomic read-modify-write instructions. The 1938e8d8bef9SDimitry Andric // latter use glc to indicate if the atomic returns a result and so must not 1939e8d8bef9SDimitry Andric // be used for cache control. 19400b57cec5SDimitry Andric assert(MI->mayLoad() ^ MI->mayStore()); 1941e8d8bef9SDimitry Andric 1942e8d8bef9SDimitry Andric // Only update load and store, not LLVM IR atomic read-modify-write 1943e8d8bef9SDimitry Andric // instructions. The latter are always marked as volatile, so they cannot 1944e8d8bef9SDimitry Andric // sensibly be handled here without pessimizing all atomics. They also do not 1945e8d8bef9SDimitry Andric // support the nontemporal attribute. 1946e8d8bef9SDimitry Andric assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 1947e8d8bef9SDimitry Andric 19480b57cec5SDimitry Andric bool Changed = false; 19490b57cec5SDimitry Andric 1950e8d8bef9SDimitry Andric if (IsVolatile) { 19514824e7fdSDimitry Andric // Set L0 and L1 cache policy to be MISS_EVICT for load instructions 19524824e7fdSDimitry Andric // and MISS_LRU for store instructions. 19534824e7fdSDimitry Andric // Note: there is no L2 cache coherent bypass control at the ISA level. 1954e8d8bef9SDimitry Andric if (Op == SIMemOp::LOAD) { 1955e8d8bef9SDimitry Andric Changed |= enableGLCBit(MI); 1956e8d8bef9SDimitry Andric Changed |= enableDLCBit(MI); 1957e8d8bef9SDimitry Andric } 1958e8d8bef9SDimitry Andric 1959e8d8bef9SDimitry Andric // Ensure operation has completed at system scope to cause all volatile 1960e8d8bef9SDimitry Andric // operations to be visible outside the program in a global order. Do not 1961e8d8bef9SDimitry Andric // request cross address space as only the global address space can be 1962e8d8bef9SDimitry Andric // observable outside the program, so no need to cause a waitcnt for LDS 1963e8d8bef9SDimitry Andric // address space operations. 1964e8d8bef9SDimitry Andric Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 1965e8d8bef9SDimitry Andric Position::AFTER); 19660b57cec5SDimitry Andric return Changed; 19670b57cec5SDimitry Andric } 19680b57cec5SDimitry Andric 1969e8d8bef9SDimitry Andric if (IsNonTemporal) { 19704824e7fdSDimitry Andric // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT 19714824e7fdSDimitry Andric // and L2 cache policy to STREAM. 19724824e7fdSDimitry Andric // For stores setting both GLC and SLC configures L0 and L1 cache policy 19734824e7fdSDimitry Andric // to MISS_EVICT and the L2 cache policy to STREAM.
19744824e7fdSDimitry Andric if (Op == SIMemOp::STORE) 19754824e7fdSDimitry Andric Changed |= enableGLCBit(MI); 1976e8d8bef9SDimitry Andric Changed |= enableSLCBit(MI); 19774824e7fdSDimitry Andric 1978e8d8bef9SDimitry Andric return Changed; 19790b57cec5SDimitry Andric } 19800b57cec5SDimitry Andric 19810b57cec5SDimitry Andric return Changed; 19820b57cec5SDimitry Andric } 19830b57cec5SDimitry Andric 19840b57cec5SDimitry Andric bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, 19850b57cec5SDimitry Andric SIAtomicScope Scope, 19860b57cec5SDimitry Andric SIAtomicAddrSpace AddrSpace, 19870b57cec5SDimitry Andric SIMemOp Op, 19880b57cec5SDimitry Andric bool IsCrossAddrSpaceOrdering, 19890b57cec5SDimitry Andric Position Pos) const { 19900b57cec5SDimitry Andric bool Changed = false; 19910b57cec5SDimitry Andric 19920b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI->getParent(); 19930b57cec5SDimitry Andric DebugLoc DL = MI->getDebugLoc(); 19940b57cec5SDimitry Andric 19950b57cec5SDimitry Andric if (Pos == Position::AFTER) 19960b57cec5SDimitry Andric ++MI; 19970b57cec5SDimitry Andric 19980b57cec5SDimitry Andric bool VMCnt = false; 19990b57cec5SDimitry Andric bool VSCnt = false; 20000b57cec5SDimitry Andric bool LGKMCnt = false; 20010b57cec5SDimitry Andric 2002e8d8bef9SDimitry Andric if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 2003e8d8bef9SDimitry Andric SIAtomicAddrSpace::NONE) { 20040b57cec5SDimitry Andric switch (Scope) { 20050b57cec5SDimitry Andric case SIAtomicScope::SYSTEM: 20060b57cec5SDimitry Andric case SIAtomicScope::AGENT: 20070b57cec5SDimitry Andric if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 20080b57cec5SDimitry Andric VMCnt |= true; 20090b57cec5SDimitry Andric if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 20100b57cec5SDimitry Andric VSCnt |= true; 20110b57cec5SDimitry Andric break; 20120b57cec5SDimitry Andric case SIAtomicScope::WORKGROUP: 20130b57cec5SDimitry Andric // In WGP mode the waves of a work-group can be executing on either CU of 20140b57cec5SDimitry Andric // the WGP. Therefore need to wait for operations to complete to ensure 20150b57cec5SDimitry Andric // they are visible to waves in the other CU as the L0 is per CU. 20160b57cec5SDimitry Andric // Otherwise in CU mode all waves of a work-group are on the same CU, 20170b57cec5SDimitry Andric // which shares the same L0. 2018e8d8bef9SDimitry Andric if (!ST.isCuModeEnabled()) { 20190b57cec5SDimitry Andric if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 20200b57cec5SDimitry Andric VMCnt |= true; 20210b57cec5SDimitry Andric if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 20220b57cec5SDimitry Andric VSCnt |= true; 20230b57cec5SDimitry Andric } 20240b57cec5SDimitry Andric break; 20250b57cec5SDimitry Andric case SIAtomicScope::WAVEFRONT: 20260b57cec5SDimitry Andric case SIAtomicScope::SINGLETHREAD: 20270b57cec5SDimitry Andric // The L0 cache keeps all memory operations in order for 20280b57cec5SDimitry Andric // work-items in the same wavefront.
20290b57cec5SDimitry Andric break; 20300b57cec5SDimitry Andric default: 20310b57cec5SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 20320b57cec5SDimitry Andric } 20330b57cec5SDimitry Andric } 20340b57cec5SDimitry Andric 20350b57cec5SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 20360b57cec5SDimitry Andric switch (Scope) { 20370b57cec5SDimitry Andric case SIAtomicScope::SYSTEM: 20380b57cec5SDimitry Andric case SIAtomicScope::AGENT: 20390b57cec5SDimitry Andric case SIAtomicScope::WORKGROUP: 2040e8d8bef9SDimitry Andric // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 2041e8d8bef9SDimitry Andric // not needed as LDS operations for all waves are executed in a total 2042e8d8bef9SDimitry Andric // global ordering as observed by all waves. Required if also 2043e8d8bef9SDimitry Andric // synchronizing with global/GDS memory as LDS operations could be 2044e8d8bef9SDimitry Andric // reordered with respect to later global/GDS memory operations of the 2045e8d8bef9SDimitry Andric // same wave. 20460b57cec5SDimitry Andric LGKMCnt |= IsCrossAddrSpaceOrdering; 20470b57cec5SDimitry Andric break; 20480b57cec5SDimitry Andric case SIAtomicScope::WAVEFRONT: 20490b57cec5SDimitry Andric case SIAtomicScope::SINGLETHREAD: 20500b57cec5SDimitry Andric // The LDS keeps all memory operations in order for 205181ad6265SDimitry Andric // the same wavefront. 20520b57cec5SDimitry Andric break; 20530b57cec5SDimitry Andric default: 20540b57cec5SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 20550b57cec5SDimitry Andric } 20560b57cec5SDimitry Andric } 20570b57cec5SDimitry Andric 20580b57cec5SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { 20590b57cec5SDimitry Andric switch (Scope) { 20600b57cec5SDimitry Andric case SIAtomicScope::SYSTEM: 20610b57cec5SDimitry Andric case SIAtomicScope::AGENT: 2062e8d8bef9SDimitry Andric // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)" 2063e8d8bef9SDimitry Andric // is not needed as GDS operations for all waves are executed in a total 2064e8d8bef9SDimitry Andric // global ordering as observed by all waves. Required if also 2065e8d8bef9SDimitry Andric // synchronizing with global/LDS memory as GDS operations could be 2066e8d8bef9SDimitry Andric // reordered with respect to later global/LDS memory operations of the 2067e8d8bef9SDimitry Andric // same wave. 20680b57cec5SDimitry Andric LGKMCnt |= IsCrossAddrSpaceOrdering; 20690b57cec5SDimitry Andric break; 20700b57cec5SDimitry Andric case SIAtomicScope::WORKGROUP: 20710b57cec5SDimitry Andric case SIAtomicScope::WAVEFRONT: 20720b57cec5SDimitry Andric case SIAtomicScope::SINGLETHREAD: 20730b57cec5SDimitry Andric // The GDS keeps all memory operations in order for 20740b57cec5SDimitry Andric // the same work-group. 20750b57cec5SDimitry Andric break; 20760b57cec5SDimitry Andric default: 20770b57cec5SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 20780b57cec5SDimitry Andric } 20790b57cec5SDimitry Andric } 20800b57cec5SDimitry Andric 20810b57cec5SDimitry Andric if (VMCnt || LGKMCnt) { 20820b57cec5SDimitry Andric unsigned WaitCntImmediate = 20830b57cec5SDimitry Andric AMDGPU::encodeWaitcnt(IV, 20840b57cec5SDimitry Andric VMCnt ? 0 : getVmcntBitMask(IV), 20850b57cec5SDimitry Andric getExpcntBitMask(IV), 20860b57cec5SDimitry Andric LGKMCnt ?
0 : getLgkmcntBitMask(IV)); 20875f757f3fSDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) 20885f757f3fSDimitry Andric .addImm(WaitCntImmediate); 20890b57cec5SDimitry Andric Changed = true; 20900b57cec5SDimitry Andric } 20910b57cec5SDimitry Andric 20920b57cec5SDimitry Andric if (VSCnt) { 20935f757f3fSDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) 20940b57cec5SDimitry Andric .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 20950b57cec5SDimitry Andric .addImm(0); 20960b57cec5SDimitry Andric Changed = true; 20970b57cec5SDimitry Andric } 20980b57cec5SDimitry Andric 20990b57cec5SDimitry Andric if (Pos == Position::AFTER) 21000b57cec5SDimitry Andric --MI; 21010b57cec5SDimitry Andric 21020b57cec5SDimitry Andric return Changed; 21030b57cec5SDimitry Andric } 21040b57cec5SDimitry Andric 2105e8d8bef9SDimitry Andric bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 2106e8d8bef9SDimitry Andric SIAtomicScope Scope, 2107e8d8bef9SDimitry Andric SIAtomicAddrSpace AddrSpace, 2108e8d8bef9SDimitry Andric Position Pos) const { 2109e8d8bef9SDimitry Andric if (!InsertCacheInv) 2110e8d8bef9SDimitry Andric return false; 2111e8d8bef9SDimitry Andric 2112e8d8bef9SDimitry Andric bool Changed = false; 2113e8d8bef9SDimitry Andric 2114e8d8bef9SDimitry Andric MachineBasicBlock &MBB = *MI->getParent(); 2115e8d8bef9SDimitry Andric DebugLoc DL = MI->getDebugLoc(); 2116e8d8bef9SDimitry Andric 2117e8d8bef9SDimitry Andric if (Pos == Position::AFTER) 2118e8d8bef9SDimitry Andric ++MI; 2119e8d8bef9SDimitry Andric 2120e8d8bef9SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2121e8d8bef9SDimitry Andric switch (Scope) { 2122e8d8bef9SDimitry Andric case SIAtomicScope::SYSTEM: 2123e8d8bef9SDimitry Andric case SIAtomicScope::AGENT: 2124*0fca6ea1SDimitry Andric // The order of invalidates matters here. We must invalidate "outer in" 2125*0fca6ea1SDimitry Andric // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is 2126*0fca6ea1SDimitry Andric // invalidated. 2127e8d8bef9SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); 2128*0fca6ea1SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2129e8d8bef9SDimitry Andric Changed = true; 2130e8d8bef9SDimitry Andric break; 2131e8d8bef9SDimitry Andric case SIAtomicScope::WORKGROUP: 2132e8d8bef9SDimitry Andric // In WGP mode the waves of a work-group can be executing on either CU of 2133e8d8bef9SDimitry Andric // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise 2134e8d8bef9SDimitry Andric // in CU mode all waves of a work-group are on the same CU, and so the 2135e8d8bef9SDimitry Andric // L0 does not need to be invalidated. 2136e8d8bef9SDimitry Andric if (!ST.isCuModeEnabled()) { 2137e8d8bef9SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); 2138e8d8bef9SDimitry Andric Changed = true; 2139e8d8bef9SDimitry Andric } 2140e8d8bef9SDimitry Andric break; 2141e8d8bef9SDimitry Andric case SIAtomicScope::WAVEFRONT: 2142e8d8bef9SDimitry Andric case SIAtomicScope::SINGLETHREAD: 2143e8d8bef9SDimitry Andric // No cache to invalidate.
2144e8d8bef9SDimitry Andric break; 2145e8d8bef9SDimitry Andric default: 2146e8d8bef9SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 2147e8d8bef9SDimitry Andric } 2148e8d8bef9SDimitry Andric } 2149e8d8bef9SDimitry Andric 2150e8d8bef9SDimitry Andric /// The scratch address space does not need the global memory cache 2151e8d8bef9SDimitry Andric /// to be flushed as all memory operations by the same thread are 2152e8d8bef9SDimitry Andric /// sequentially consistent, and no other thread can access scratch 2153e8d8bef9SDimitry Andric /// memory. 2154e8d8bef9SDimitry Andric 2155e8d8bef9SDimitry Andric /// Other address spaces do not have a cache. 2156e8d8bef9SDimitry Andric 2157e8d8bef9SDimitry Andric if (Pos == Position::AFTER) 2158e8d8bef9SDimitry Andric --MI; 2159e8d8bef9SDimitry Andric 2160e8d8bef9SDimitry Andric return Changed; 2161e8d8bef9SDimitry Andric } 2162e8d8bef9SDimitry Andric 216381ad6265SDimitry Andric bool SIGfx11CacheControl::enableLoadCacheBypass( 216481ad6265SDimitry Andric const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, 216581ad6265SDimitry Andric SIAtomicAddrSpace AddrSpace) const { 216681ad6265SDimitry Andric assert(MI->mayLoad() && !MI->mayStore()); 216781ad6265SDimitry Andric bool Changed = false; 216881ad6265SDimitry Andric 216981ad6265SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 217081ad6265SDimitry Andric switch (Scope) { 217181ad6265SDimitry Andric case SIAtomicScope::SYSTEM: 217281ad6265SDimitry Andric case SIAtomicScope::AGENT: 217381ad6265SDimitry Andric // Set the L0 and L1 cache policies to MISS_EVICT. 217481ad6265SDimitry Andric // Note: there is no L2 cache coherent bypass control at the ISA level. 217581ad6265SDimitry Andric Changed |= enableGLCBit(MI); 217681ad6265SDimitry Andric break; 217781ad6265SDimitry Andric case SIAtomicScope::WORKGROUP: 217881ad6265SDimitry Andric // In WGP mode the waves of a work-group can be executing on either CU of 217981ad6265SDimitry Andric // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in 218081ad6265SDimitry Andric // CU mode all waves of a work-group are on the same CU, and so the L0 218181ad6265SDimitry Andric // does not need to be bypassed. 218281ad6265SDimitry Andric if (!ST.isCuModeEnabled()) 218381ad6265SDimitry Andric Changed |= enableGLCBit(MI); 218481ad6265SDimitry Andric break; 218581ad6265SDimitry Andric case SIAtomicScope::WAVEFRONT: 218681ad6265SDimitry Andric case SIAtomicScope::SINGLETHREAD: 218781ad6265SDimitry Andric // No cache to bypass. 218881ad6265SDimitry Andric break; 218981ad6265SDimitry Andric default: 219081ad6265SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 219181ad6265SDimitry Andric } 219281ad6265SDimitry Andric } 219381ad6265SDimitry Andric 219481ad6265SDimitry Andric /// The scratch address space does not need the global memory caches 219581ad6265SDimitry Andric /// to be bypassed as all memory operations by the same thread are 219681ad6265SDimitry Andric /// sequentially consistent, and no other thread can access scratch 219781ad6265SDimitry Andric /// memory. 219881ad6265SDimitry Andric 219981ad6265SDimitry Andric /// Other address spaces do not have a cache. 
220081ad6265SDimitry Andric 220181ad6265SDimitry Andric return Changed; 220281ad6265SDimitry Andric } 220381ad6265SDimitry Andric 220481ad6265SDimitry Andric bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( 220581ad6265SDimitry Andric MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2206*0fca6ea1SDimitry Andric bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { 220781ad6265SDimitry Andric 220881ad6265SDimitry Andric // Only handle load and store, not atomic read-modify-write instructions. The 220981ad6265SDimitry Andric // latter use glc to indicate if the atomic returns a result and so must not 221081ad6265SDimitry Andric // be used for cache control. 221181ad6265SDimitry Andric assert(MI->mayLoad() ^ MI->mayStore()); 221281ad6265SDimitry Andric 221381ad6265SDimitry Andric // Only update load and store, not LLVM IR atomic read-modify-write 221481ad6265SDimitry Andric // instructions. The latter are always marked as volatile, so they cannot 221581ad6265SDimitry Andric // sensibly be handled here without pessimizing all atomics. They also do not 221681ad6265SDimitry Andric // support the nontemporal attribute. 221781ad6265SDimitry Andric assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 221881ad6265SDimitry Andric 221981ad6265SDimitry Andric bool Changed = false; 222081ad6265SDimitry Andric 222181ad6265SDimitry Andric if (IsVolatile) { 222281ad6265SDimitry Andric // Set L0 and L1 cache policy to be MISS_EVICT for load instructions 222381ad6265SDimitry Andric // and MISS_LRU for store instructions. 222481ad6265SDimitry Andric // Note: there is no L2 cache coherent bypass control at the ISA level. 222581ad6265SDimitry Andric if (Op == SIMemOp::LOAD) 222681ad6265SDimitry Andric Changed |= enableGLCBit(MI); 222781ad6265SDimitry Andric 222881ad6265SDimitry Andric // Set MALL NOALLOC for load and store instructions. 222981ad6265SDimitry Andric Changed |= enableDLCBit(MI); 223081ad6265SDimitry Andric 223181ad6265SDimitry Andric // Ensure operation has completed at system scope to cause all volatile 223281ad6265SDimitry Andric // operations to be visible outside the program in a global order. Do not 223381ad6265SDimitry Andric // request cross address space as only the global address space can be 223481ad6265SDimitry Andric // observable outside the program, so no need to cause a waitcnt for LDS 223581ad6265SDimitry Andric // address space operations. 223681ad6265SDimitry Andric Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 223781ad6265SDimitry Andric Position::AFTER); 223881ad6265SDimitry Andric return Changed; 223981ad6265SDimitry Andric } 224081ad6265SDimitry Andric 224181ad6265SDimitry Andric if (IsNonTemporal) { 224281ad6265SDimitry Andric // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT 224381ad6265SDimitry Andric // and L2 cache policy to STREAM. 224481ad6265SDimitry Andric // For stores setting both GLC and SLC configures L0 and L1 cache policy 224581ad6265SDimitry Andric // to MISS_EVICT and the L2 cache policy to STREAM. 224681ad6265SDimitry Andric if (Op == SIMemOp::STORE) 224781ad6265SDimitry Andric Changed |= enableGLCBit(MI); 224881ad6265SDimitry Andric Changed |= enableSLCBit(MI); 224981ad6265SDimitry Andric 225081ad6265SDimitry Andric // Set MALL NOALLOC for load and store instructions.
225181ad6265SDimitry Andric Changed |= enableDLCBit(MI); 225281ad6265SDimitry Andric return Changed; 225381ad6265SDimitry Andric } 225481ad6265SDimitry Andric 225581ad6265SDimitry Andric return Changed; 225681ad6265SDimitry Andric } 225781ad6265SDimitry Andric 22587a6dacacSDimitry Andric bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, 22597a6dacacSDimitry Andric AMDGPU::CPol::CPol Value) const { 22607a6dacacSDimitry Andric MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 22617a6dacacSDimitry Andric if (!CPol) 22627a6dacacSDimitry Andric return false; 22637a6dacacSDimitry Andric 22647a6dacacSDimitry Andric uint64_t NewTH = Value & AMDGPU::CPol::TH; 22657a6dacacSDimitry Andric if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { 22667a6dacacSDimitry Andric CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); 22677a6dacacSDimitry Andric return true; 22687a6dacacSDimitry Andric } 22697a6dacacSDimitry Andric 22707a6dacacSDimitry Andric return false; 22717a6dacacSDimitry Andric } 22727a6dacacSDimitry Andric 22737a6dacacSDimitry Andric bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, 22747a6dacacSDimitry Andric AMDGPU::CPol::CPol Value) const { 22757a6dacacSDimitry Andric MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 22767a6dacacSDimitry Andric if (!CPol) 22777a6dacacSDimitry Andric return false; 22787a6dacacSDimitry Andric 22797a6dacacSDimitry Andric uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; 22807a6dacacSDimitry Andric if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { 22817a6dacacSDimitry Andric CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); 22827a6dacacSDimitry Andric return true; 22837a6dacacSDimitry Andric } 22847a6dacacSDimitry Andric 22857a6dacacSDimitry Andric return false; 22867a6dacacSDimitry Andric } 22877a6dacacSDimitry Andric 2288*0fca6ea1SDimitry Andric bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore( 2289*0fca6ea1SDimitry Andric const MachineBasicBlock::iterator MI) const { 2290*0fca6ea1SDimitry Andric // TODO: implement flag for frontend to give us a hint not to insert waits. 
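// Descriptive note (not in the original source): the waits emitted below
// conservatively zero every outstanding counter before a system-scope store.
// The "_soft" variants are used so that a later pass (e.g. SIInsertWaitcnts)
// can relax or remove any wait it can prove redundant.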
2291*0fca6ea1SDimitry Andric 2292*0fca6ea1SDimitry Andric MachineBasicBlock &MBB = *MI->getParent(); 2293*0fca6ea1SDimitry Andric const DebugLoc &DL = MI->getDebugLoc(); 2294*0fca6ea1SDimitry Andric 2295*0fca6ea1SDimitry Andric BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0); 2296*0fca6ea1SDimitry Andric BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0); 2297*0fca6ea1SDimitry Andric BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0); 2298*0fca6ea1SDimitry Andric BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0); 2299*0fca6ea1SDimitry Andric BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0); 2300*0fca6ea1SDimitry Andric 2301*0fca6ea1SDimitry Andric return true; 2302*0fca6ea1SDimitry Andric } 2303*0fca6ea1SDimitry Andric 23047a6dacacSDimitry Andric bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, 23057a6dacacSDimitry Andric SIAtomicScope Scope, 23067a6dacacSDimitry Andric SIAtomicAddrSpace AddrSpace, SIMemOp Op, 23077a6dacacSDimitry Andric bool IsCrossAddrSpaceOrdering, 23087a6dacacSDimitry Andric Position Pos) const { 23097a6dacacSDimitry Andric bool Changed = false; 23107a6dacacSDimitry Andric 23117a6dacacSDimitry Andric MachineBasicBlock &MBB = *MI->getParent(); 23127a6dacacSDimitry Andric DebugLoc DL = MI->getDebugLoc(); 23137a6dacacSDimitry Andric 23147a6dacacSDimitry Andric bool LOADCnt = false; 23157a6dacacSDimitry Andric bool DSCnt = false; 23167a6dacacSDimitry Andric bool STORECnt = false; 23177a6dacacSDimitry Andric 23187a6dacacSDimitry Andric if (Pos == Position::AFTER) 23197a6dacacSDimitry Andric ++MI; 23207a6dacacSDimitry Andric 23217a6dacacSDimitry Andric if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != 23227a6dacacSDimitry Andric SIAtomicAddrSpace::NONE) { 23237a6dacacSDimitry Andric switch (Scope) { 23247a6dacacSDimitry Andric case SIAtomicScope::SYSTEM: 23257a6dacacSDimitry Andric case SIAtomicScope::AGENT: 23267a6dacacSDimitry Andric if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 23277a6dacacSDimitry Andric LOADCnt |= true; 23287a6dacacSDimitry Andric if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 23297a6dacacSDimitry Andric STORECnt |= true; 23307a6dacacSDimitry Andric break; 23317a6dacacSDimitry Andric case SIAtomicScope::WORKGROUP: 23327a6dacacSDimitry Andric // In WGP mode the waves of a work-group can be executing on either CU of 23337a6dacacSDimitry Andric // the WGP. Therefore need to wait for operations to complete to ensure 23347a6dacacSDimitry Andric // they are visible to waves in the other CU as the L0 is per CU. 23357a6dacacSDimitry Andric // Otherwise in CU mode all waves of a work-group are on the same CU, 23367a6dacacSDimitry Andric // which shares the same L0. 23377a6dacacSDimitry Andric if (!ST.isCuModeEnabled()) { 23387a6dacacSDimitry Andric if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) 23397a6dacacSDimitry Andric LOADCnt |= true; 23407a6dacacSDimitry Andric if ((Op & SIMemOp::STORE) != SIMemOp::NONE) 23417a6dacacSDimitry Andric STORECnt |= true; 23427a6dacacSDimitry Andric } 23437a6dacacSDimitry Andric break; 23447a6dacacSDimitry Andric case SIAtomicScope::WAVEFRONT: 23457a6dacacSDimitry Andric case SIAtomicScope::SINGLETHREAD: 23467a6dacacSDimitry Andric // The L0 cache keeps all memory operations in order for 23477a6dacacSDimitry Andric // work-items in the same wavefront.
23487a6dacacSDimitry Andric break; 23497a6dacacSDimitry Andric default: 23507a6dacacSDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 23517a6dacacSDimitry Andric } 23527a6dacacSDimitry Andric } 23537a6dacacSDimitry Andric 23547a6dacacSDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { 23557a6dacacSDimitry Andric switch (Scope) { 23567a6dacacSDimitry Andric case SIAtomicScope::SYSTEM: 23577a6dacacSDimitry Andric case SIAtomicScope::AGENT: 23587a6dacacSDimitry Andric case SIAtomicScope::WORKGROUP: 23597a6dacacSDimitry Andric // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is 23607a6dacacSDimitry Andric // not needed as LDS operations for all waves are executed in a total 23617a6dacacSDimitry Andric // global ordering as observed by all waves. Required if also 23627a6dacacSDimitry Andric // synchronizing with global/GDS memory as LDS operations could be 23637a6dacacSDimitry Andric // reordered with respect to later global/GDS memory operations of the 23647a6dacacSDimitry Andric // same wave. 23657a6dacacSDimitry Andric DSCnt |= IsCrossAddrSpaceOrdering; 23667a6dacacSDimitry Andric break; 23677a6dacacSDimitry Andric case SIAtomicScope::WAVEFRONT: 23687a6dacacSDimitry Andric case SIAtomicScope::SINGLETHREAD: 23697a6dacacSDimitry Andric // The LDS keeps all memory operations in order for 23707a6dacacSDimitry Andric // the same wavefront. 23717a6dacacSDimitry Andric break; 23727a6dacacSDimitry Andric default: 23737a6dacacSDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 23747a6dacacSDimitry Andric } 23757a6dacacSDimitry Andric } 23767a6dacacSDimitry Andric 23777a6dacacSDimitry Andric if (LOADCnt) { 23787a6dacacSDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); 23797a6dacacSDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); 23807a6dacacSDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); 23817a6dacacSDimitry Andric Changed = true; 23827a6dacacSDimitry Andric } 23837a6dacacSDimitry Andric 23847a6dacacSDimitry Andric if (STORECnt) { 23857a6dacacSDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0); 23867a6dacacSDimitry Andric Changed = true; 23877a6dacacSDimitry Andric } 23887a6dacacSDimitry Andric 23897a6dacacSDimitry Andric if (DSCnt) { 23907a6dacacSDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0); 23917a6dacacSDimitry Andric Changed = true; 23927a6dacacSDimitry Andric } 23937a6dacacSDimitry Andric 23947a6dacacSDimitry Andric if (Pos == Position::AFTER) 23957a6dacacSDimitry Andric --MI; 23967a6dacacSDimitry Andric 23977a6dacacSDimitry Andric return Changed; 23987a6dacacSDimitry Andric } 23997a6dacacSDimitry Andric 24001db9f3b2SDimitry Andric bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, 24011db9f3b2SDimitry Andric SIAtomicScope Scope, 24021db9f3b2SDimitry Andric SIAtomicAddrSpace AddrSpace, 24031db9f3b2SDimitry Andric Position Pos) const { 24041db9f3b2SDimitry Andric if (!InsertCacheInv) 24051db9f3b2SDimitry Andric return false; 24061db9f3b2SDimitry Andric 24071db9f3b2SDimitry Andric MachineBasicBlock &MBB = *MI->getParent(); 24081db9f3b2SDimitry Andric DebugLoc DL = MI->getDebugLoc(); 24091db9f3b2SDimitry Andric 24101db9f3b2SDimitry Andric /// The scratch address space does not need the global memory cache 24111db9f3b2SDimitry Andric /// to be flushed as all memory operations by the same thread are 
24121db9f3b2SDimitry Andric /// sequentially consistent, and no other thread can access scratch 24131db9f3b2SDimitry Andric /// memory. 24141db9f3b2SDimitry Andric 24151db9f3b2SDimitry Andric /// Other address spaces do not have a cache. 24161db9f3b2SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) 24171db9f3b2SDimitry Andric return false; 24181db9f3b2SDimitry Andric 24191db9f3b2SDimitry Andric AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; 24201db9f3b2SDimitry Andric switch (Scope) { 24211db9f3b2SDimitry Andric case SIAtomicScope::SYSTEM: 24221db9f3b2SDimitry Andric ScopeImm = AMDGPU::CPol::SCOPE_SYS; 24231db9f3b2SDimitry Andric break; 24241db9f3b2SDimitry Andric case SIAtomicScope::AGENT: 24251db9f3b2SDimitry Andric ScopeImm = AMDGPU::CPol::SCOPE_DEV; 24261db9f3b2SDimitry Andric break; 24271db9f3b2SDimitry Andric case SIAtomicScope::WORKGROUP: 24281db9f3b2SDimitry Andric // In WGP mode the waves of a work-group can be executing on either CU of 24291db9f3b2SDimitry Andric // the WGP. Therefore we need to invalidate the L0 which is per CU. 24301db9f3b2SDimitry Andric // Otherwise in CU mode all waves of a work-group are on the same CU, and so 24311db9f3b2SDimitry Andric // the L0 does not need to be invalidated. 24321db9f3b2SDimitry Andric if (ST.isCuModeEnabled()) 24331db9f3b2SDimitry Andric return false; 24341db9f3b2SDimitry Andric 24351db9f3b2SDimitry Andric ScopeImm = AMDGPU::CPol::SCOPE_SE; 24361db9f3b2SDimitry Andric break; 24371db9f3b2SDimitry Andric case SIAtomicScope::WAVEFRONT: 24381db9f3b2SDimitry Andric case SIAtomicScope::SINGLETHREAD: 24391db9f3b2SDimitry Andric // No cache to invalidate. 24401db9f3b2SDimitry Andric return false; 24411db9f3b2SDimitry Andric default: 24421db9f3b2SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 24431db9f3b2SDimitry Andric } 24441db9f3b2SDimitry Andric 24451db9f3b2SDimitry Andric if (Pos == Position::AFTER) 24461db9f3b2SDimitry Andric ++MI; 24471db9f3b2SDimitry Andric 24481db9f3b2SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm); 24491db9f3b2SDimitry Andric 24501db9f3b2SDimitry Andric if (Pos == Position::AFTER) 24511db9f3b2SDimitry Andric --MI; 24521db9f3b2SDimitry Andric 24531db9f3b2SDimitry Andric return true; 24541db9f3b2SDimitry Andric } 24551db9f3b2SDimitry Andric 2456*0fca6ea1SDimitry Andric bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, 2457*0fca6ea1SDimitry Andric SIAtomicScope Scope, 2458*0fca6ea1SDimitry Andric SIAtomicAddrSpace AddrSpace, 2459*0fca6ea1SDimitry Andric bool IsCrossAddrSpaceOrdering, 2460*0fca6ea1SDimitry Andric Position Pos) const { 2461*0fca6ea1SDimitry Andric MachineBasicBlock &MBB = *MI->getParent(); 2462*0fca6ea1SDimitry Andric DebugLoc DL = MI->getDebugLoc(); 2463*0fca6ea1SDimitry Andric 2464*0fca6ea1SDimitry Andric // The scratch address space does not need the global memory cache 2465*0fca6ea1SDimitry Andric // writeback as all memory operations by the same thread are 2466*0fca6ea1SDimitry Andric // sequentially consistent, and no other thread can access scratch 2467*0fca6ea1SDimitry Andric // memory. 2468*0fca6ea1SDimitry Andric 2469*0fca6ea1SDimitry Andric // Other address spaces do not have a cache. 
2470*0fca6ea1SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) 2471*0fca6ea1SDimitry Andric return false; 2472*0fca6ea1SDimitry Andric 2473*0fca6ea1SDimitry Andric if (Pos == Position::AFTER) 2474*0fca6ea1SDimitry Andric ++MI; 2475*0fca6ea1SDimitry Andric 2476*0fca6ea1SDimitry Andric // GLOBAL_WB is always needed, even for write-through caches, as it 2477*0fca6ea1SDimitry Andric // additionally ensures all operations have reached the desired cache level. 2478*0fca6ea1SDimitry Andric bool SkipWB = false; 2479*0fca6ea1SDimitry Andric AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2480*0fca6ea1SDimitry Andric switch (Scope) { 2481*0fca6ea1SDimitry Andric case SIAtomicScope::SYSTEM: 2482*0fca6ea1SDimitry Andric ScopeImm = AMDGPU::CPol::SCOPE_SYS; 2483*0fca6ea1SDimitry Andric break; 2484*0fca6ea1SDimitry Andric case SIAtomicScope::AGENT: 2485*0fca6ea1SDimitry Andric ScopeImm = AMDGPU::CPol::SCOPE_DEV; 2486*0fca6ea1SDimitry Andric break; 2487*0fca6ea1SDimitry Andric case SIAtomicScope::WORKGROUP: 2488*0fca6ea1SDimitry Andric // In WGP mode the waves of a work-group can be executing on either CU of 2489*0fca6ea1SDimitry Andric // the WGP. Therefore we need to ensure all operations have reached L1, 2490*0fca6ea1SDimitry Andric // hence the SCOPE_SE WB. 2491*0fca6ea1SDimitry Andric // For CU mode, we need operations to reach L0, so the wait is enough - 2492*0fca6ea1SDimitry Andric // there are no ways for an operation to report completion without reaching 2493*0fca6ea1SDimitry Andric // at least L0. 2494*0fca6ea1SDimitry Andric if (ST.isCuModeEnabled()) 2495*0fca6ea1SDimitry Andric SkipWB = true; 2496*0fca6ea1SDimitry Andric else 2497*0fca6ea1SDimitry Andric ScopeImm = AMDGPU::CPol::SCOPE_SE; 2498*0fca6ea1SDimitry Andric break; 2499*0fca6ea1SDimitry Andric case SIAtomicScope::WAVEFRONT: 2500*0fca6ea1SDimitry Andric case SIAtomicScope::SINGLETHREAD: 2501*0fca6ea1SDimitry Andric // No cache to invalidate. 2502*0fca6ea1SDimitry Andric return false; 2503*0fca6ea1SDimitry Andric default: 2504*0fca6ea1SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 2505*0fca6ea1SDimitry Andric } 2506*0fca6ea1SDimitry Andric 2507*0fca6ea1SDimitry Andric if (!SkipWB) 2508*0fca6ea1SDimitry Andric BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm); 2509*0fca6ea1SDimitry Andric 2510*0fca6ea1SDimitry Andric if (Pos == Position::AFTER) 2511*0fca6ea1SDimitry Andric --MI; 2512*0fca6ea1SDimitry Andric 2513*0fca6ea1SDimitry Andric // We always have to wait for previous memory operations (load/store) to 2514*0fca6ea1SDimitry Andric // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), 2515*0fca6ea1SDimitry Andric // we of course need to wait for that as well. 2516*0fca6ea1SDimitry Andric insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 2517*0fca6ea1SDimitry Andric IsCrossAddrSpaceOrdering, Pos); 2518*0fca6ea1SDimitry Andric 2519*0fca6ea1SDimitry Andric return true; 2520*0fca6ea1SDimitry Andric } 2521*0fca6ea1SDimitry Andric 25227a6dacacSDimitry Andric bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( 25237a6dacacSDimitry Andric MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, 2524*0fca6ea1SDimitry Andric bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { 25257a6dacacSDimitry Andric 25267a6dacacSDimitry Andric // Only handle load and store, not atomic read-modify-write instructions. 
25277a6dacacSDimitry Andric assert(MI->mayLoad() ^ MI->mayStore()); 25287a6dacacSDimitry Andric 25297a6dacacSDimitry Andric // Only update load and store, not LLVM IR atomic read-modify-write 25307a6dacacSDimitry Andric // instructions. The latter are always marked as volatile, so they cannot 25317a6dacacSDimitry Andric // sensibly be handled here without pessimizing all atomics. They also do not 25327a6dacacSDimitry Andric // support the nontemporal attribute. 25337a6dacacSDimitry Andric assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); 25347a6dacacSDimitry Andric 25357a6dacacSDimitry Andric bool Changed = false; 25367a6dacacSDimitry Andric 2537*0fca6ea1SDimitry Andric if (IsLastUse) { 2538*0fca6ea1SDimitry Andric // Set last-use hint. 2539*0fca6ea1SDimitry Andric Changed |= setTH(MI, AMDGPU::CPol::TH_LU); 2540*0fca6ea1SDimitry Andric } else if (IsNonTemporal) { 25415678d1d9SDimitry Andric // Set non-temporal hint for all cache levels. 25425678d1d9SDimitry Andric Changed |= setTH(MI, AMDGPU::CPol::TH_NT); 25435678d1d9SDimitry Andric } 25445678d1d9SDimitry Andric 25457a6dacacSDimitry Andric if (IsVolatile) { 25467a6dacacSDimitry Andric Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); 25477a6dacacSDimitry Andric 2548*0fca6ea1SDimitry Andric if (Op == SIMemOp::STORE) 2549*0fca6ea1SDimitry Andric Changed |= insertWaitsBeforeSystemScopeStore(MI); 2550*0fca6ea1SDimitry Andric 25517a6dacacSDimitry Andric // Ensure operation has completed at system scope to cause all volatile 25527a6dacacSDimitry Andric // operations to be visible outside the program in a global order. Do not 25537a6dacacSDimitry Andric // request cross address space as only the global address space can be 25547a6dacacSDimitry Andric // observable outside the program, so no need to cause a waitcnt for LDS 25557a6dacacSDimitry Andric // address space operations.
25567a6dacacSDimitry Andric Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, 25577a6dacacSDimitry Andric Position::AFTER); 25587a6dacacSDimitry Andric } 25597a6dacacSDimitry Andric 25607a6dacacSDimitry Andric return Changed; 25617a6dacacSDimitry Andric } 25627a6dacacSDimitry Andric 2563*0fca6ea1SDimitry Andric bool SIGfx12CacheControl::expandSystemScopeStore( 2564*0fca6ea1SDimitry Andric MachineBasicBlock::iterator &MI) const { 2565*0fca6ea1SDimitry Andric MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); 2566*0fca6ea1SDimitry Andric if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) 2567*0fca6ea1SDimitry Andric return insertWaitsBeforeSystemScopeStore(MI); 2568*0fca6ea1SDimitry Andric 2569*0fca6ea1SDimitry Andric return false; 2570*0fca6ea1SDimitry Andric } 2571*0fca6ea1SDimitry Andric 2572*0fca6ea1SDimitry Andric bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, 2573*0fca6ea1SDimitry Andric SIAtomicScope Scope, 2574*0fca6ea1SDimitry Andric SIAtomicAddrSpace AddrSpace) const { 2575*0fca6ea1SDimitry Andric bool Changed = false; 2576*0fca6ea1SDimitry Andric 2577*0fca6ea1SDimitry Andric if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { 2578*0fca6ea1SDimitry Andric switch (Scope) { 2579*0fca6ea1SDimitry Andric case SIAtomicScope::SYSTEM: 2580*0fca6ea1SDimitry Andric Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); 2581*0fca6ea1SDimitry Andric break; 2582*0fca6ea1SDimitry Andric case SIAtomicScope::AGENT: 2583*0fca6ea1SDimitry Andric Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV); 2584*0fca6ea1SDimitry Andric break; 2585*0fca6ea1SDimitry Andric case SIAtomicScope::WORKGROUP: 2586*0fca6ea1SDimitry Andric // In WGP mode, SCOPE_SE is needed as waves can execute on different 2587*0fca6ea1SDimitry Andric // CUs that access different L0s. 2588*0fca6ea1SDimitry Andric if (!ST.isCuModeEnabled()) 2589*0fca6ea1SDimitry Andric Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE); 2590*0fca6ea1SDimitry Andric break; 2591*0fca6ea1SDimitry Andric case SIAtomicScope::WAVEFRONT: 2592*0fca6ea1SDimitry Andric case SIAtomicScope::SINGLETHREAD: 2593*0fca6ea1SDimitry Andric // No cache to bypass. 2594*0fca6ea1SDimitry Andric break; 2595*0fca6ea1SDimitry Andric default: 2596*0fca6ea1SDimitry Andric llvm_unreachable("Unsupported synchronization scope"); 2597*0fca6ea1SDimitry Andric } 2598*0fca6ea1SDimitry Andric } 2599*0fca6ea1SDimitry Andric 2600*0fca6ea1SDimitry Andric // The scratch address space does not need the global memory caches 2601*0fca6ea1SDimitry Andric // to be bypassed as all memory operations by the same thread are 2602*0fca6ea1SDimitry Andric // sequentially consistent, and no other thread can access scratch 2603*0fca6ea1SDimitry Andric // memory. 2604*0fca6ea1SDimitry Andric 2605*0fca6ea1SDimitry Andric // Other address spaces do not have a cache.
2606*0fca6ea1SDimitry Andric 2607*0fca6ea1SDimitry Andric return Changed; 2608*0fca6ea1SDimitry Andric } 2609*0fca6ea1SDimitry Andric 26100b57cec5SDimitry Andric bool SIMemoryLegalizer::removeAtomicPseudoMIs() { 26110b57cec5SDimitry Andric if (AtomicPseudoMIs.empty()) 26120b57cec5SDimitry Andric return false; 26130b57cec5SDimitry Andric 26140b57cec5SDimitry Andric for (auto &MI : AtomicPseudoMIs) 26150b57cec5SDimitry Andric MI->eraseFromParent(); 26160b57cec5SDimitry Andric 26170b57cec5SDimitry Andric AtomicPseudoMIs.clear(); 26180b57cec5SDimitry Andric return true; 26190b57cec5SDimitry Andric } 26200b57cec5SDimitry Andric 26210b57cec5SDimitry Andric bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, 26220b57cec5SDimitry Andric MachineBasicBlock::iterator &MI) { 26230b57cec5SDimitry Andric assert(MI->mayLoad() && !MI->mayStore()); 26240b57cec5SDimitry Andric 26250b57cec5SDimitry Andric bool Changed = false; 26260b57cec5SDimitry Andric 26270b57cec5SDimitry Andric if (MOI.isAtomic()) { 26280b57cec5SDimitry Andric if (MOI.getOrdering() == AtomicOrdering::Monotonic || 26290b57cec5SDimitry Andric MOI.getOrdering() == AtomicOrdering::Acquire || 26300b57cec5SDimitry Andric MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 26310b57cec5SDimitry Andric Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(), 26320b57cec5SDimitry Andric MOI.getOrderingAddrSpace()); 26330b57cec5SDimitry Andric } 26340b57cec5SDimitry Andric 26350b57cec5SDimitry Andric if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 26360b57cec5SDimitry Andric Changed |= CC->insertWait(MI, MOI.getScope(), 26370b57cec5SDimitry Andric MOI.getOrderingAddrSpace(), 26380b57cec5SDimitry Andric SIMemOp::LOAD | SIMemOp::STORE, 26390b57cec5SDimitry Andric MOI.getIsCrossAddressSpaceOrdering(), 26400b57cec5SDimitry Andric Position::BEFORE); 26410b57cec5SDimitry Andric 26420b57cec5SDimitry Andric if (MOI.getOrdering() == AtomicOrdering::Acquire || 26430b57cec5SDimitry Andric MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 26440b57cec5SDimitry Andric Changed |= CC->insertWait(MI, MOI.getScope(), 26450b57cec5SDimitry Andric MOI.getInstrAddrSpace(), 26460b57cec5SDimitry Andric SIMemOp::LOAD, 26470b57cec5SDimitry Andric MOI.getIsCrossAddressSpaceOrdering(), 26480b57cec5SDimitry Andric Position::AFTER); 2649e8d8bef9SDimitry Andric Changed |= CC->insertAcquire(MI, MOI.getScope(), 26500b57cec5SDimitry Andric MOI.getOrderingAddrSpace(), 26510b57cec5SDimitry Andric Position::AFTER); 26520b57cec5SDimitry Andric } 26530b57cec5SDimitry Andric 26540b57cec5SDimitry Andric return Changed; 26550b57cec5SDimitry Andric } 26560b57cec5SDimitry Andric 2657e8d8bef9SDimitry Andric // Atomic instructions already bypass caches to the scope specified by the 2658*0fca6ea1SDimitry Andric // SyncScope operand. Only non-atomic volatile and nontemporal/last-use 2659*0fca6ea1SDimitry Andric // instructions need additional treatment. 
2660*0fca6ea1SDimitry Andric Changed |= CC->enableVolatileAndOrNonTemporal( 2661*0fca6ea1SDimitry Andric MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(), 2662*0fca6ea1SDimitry Andric MOI.isNonTemporal(), MOI.isLastUse()); 2663*0fca6ea1SDimitry Andric 26640b57cec5SDimitry Andric return Changed; 26650b57cec5SDimitry Andric } 26660b57cec5SDimitry Andric 26670b57cec5SDimitry Andric bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, 26680b57cec5SDimitry Andric MachineBasicBlock::iterator &MI) { 26690b57cec5SDimitry Andric assert(!MI->mayLoad() && MI->mayStore()); 26700b57cec5SDimitry Andric 26710b57cec5SDimitry Andric bool Changed = false; 26720b57cec5SDimitry Andric 26730b57cec5SDimitry Andric if (MOI.isAtomic()) { 2674fe6060f1SDimitry Andric if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2675fe6060f1SDimitry Andric MOI.getOrdering() == AtomicOrdering::Release || 2676fe6060f1SDimitry Andric MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2677fe6060f1SDimitry Andric Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(), 2678fe6060f1SDimitry Andric MOI.getOrderingAddrSpace()); 2679fe6060f1SDimitry Andric } 2680fe6060f1SDimitry Andric 26810b57cec5SDimitry Andric if (MOI.getOrdering() == AtomicOrdering::Release || 26820b57cec5SDimitry Andric MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2683e8d8bef9SDimitry Andric Changed |= CC->insertRelease(MI, MOI.getScope(), 26840b57cec5SDimitry Andric MOI.getOrderingAddrSpace(), 26850b57cec5SDimitry Andric MOI.getIsCrossAddressSpaceOrdering(), 26860b57cec5SDimitry Andric Position::BEFORE); 26870b57cec5SDimitry Andric 26880b57cec5SDimitry Andric return Changed; 26890b57cec5SDimitry Andric } 26900b57cec5SDimitry Andric 2691e8d8bef9SDimitry Andric // Atomic instructions already bypass caches to the scope specified by the 2692e8d8bef9SDimitry Andric // SyncScope operand. Only non-atomic volatile and nontemporal instructions 2693e8d8bef9SDimitry Andric // need additional treatment. 2694e8d8bef9SDimitry Andric Changed |= CC->enableVolatileAndOrNonTemporal( 2695e8d8bef9SDimitry Andric MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(), 2696e8d8bef9SDimitry Andric MOI.isNonTemporal()); 2697*0fca6ea1SDimitry Andric 2698*0fca6ea1SDimitry Andric // GFX12 specific: scope (the desired coherence domain in the cache 2699*0fca6ea1SDimitry Andric // hierarchy) is an instruction field; do not confuse it with the atomic scope. 2700*0fca6ea1SDimitry Andric Changed |= CC->expandSystemScopeStore(MI); 27010b57cec5SDimitry Andric return Changed; 27020b57cec5SDimitry Andric } 27030b57cec5SDimitry Andric 27040b57cec5SDimitry Andric bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, 27050b57cec5SDimitry Andric MachineBasicBlock::iterator &MI) { 27060b57cec5SDimitry Andric assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); 27070b57cec5SDimitry Andric 27080b57cec5SDimitry Andric AtomicPseudoMIs.push_back(MI); 27090b57cec5SDimitry Andric bool Changed = false; 27100b57cec5SDimitry Andric 2711*0fca6ea1SDimitry Andric // Refine fenced address space based on MMRAs. 2712*0fca6ea1SDimitry Andric // 2713*0fca6ea1SDimitry Andric // TODO: Should we support this MMRA on other atomic operations?
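// Illustrative note (not in the original source): if an MMRA narrows the
// fence to, say, only the LDS address space, the release/acquire handling
// below skips global-memory waits and cache maintenance entirely; see
// AMDGPUUsage for the annotations that are actually supported.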
2714*0fca6ea1SDimitry Andric auto OrderingAddrSpace = 2715*0fca6ea1SDimitry Andric getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace()); 2716*0fca6ea1SDimitry Andric 27170b57cec5SDimitry Andric if (MOI.isAtomic()) { 271806c3fb27SDimitry Andric if (MOI.getOrdering() == AtomicOrdering::Acquire) 2719*0fca6ea1SDimitry Andric Changed |= CC->insertWait( 2720*0fca6ea1SDimitry Andric MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE, 2721*0fca6ea1SDimitry Andric MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); 272206c3fb27SDimitry Andric 272206c3fb27SDimitry Andric if (MOI.getOrdering() == AtomicOrdering::Release || 27240b57cec5SDimitry Andric MOI.getOrdering() == AtomicOrdering::AcquireRelease || 27250b57cec5SDimitry Andric MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 27260b57cec5SDimitry Andric /// TODO: This relies on a barrier always generating a waitcnt 27270b57cec5SDimitry Andric /// for LDS to ensure it is not reordered with the completion of 27280b57cec5SDimitry Andric /// the preceding LDS operations. If the barrier had a memory 27290b57cec5SDimitry Andric /// ordering and memory scope, then the library would not need to 27300b57cec5SDimitry Andric /// generate a fence. Could add support in this file for 27310b57cec5SDimitry Andric /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally 2732e8d8bef9SDimitry Andric /// adding S_WAITCNT before an S_BARRIER. 2733*0fca6ea1SDimitry Andric Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace, 27340b57cec5SDimitry Andric MOI.getIsCrossAddressSpaceOrdering(), 27350b57cec5SDimitry Andric Position::BEFORE); 27360b57cec5SDimitry Andric 2737e8d8bef9SDimitry Andric // TODO: If both release and invalidate are happening they could be combined 2738fe6060f1SDimitry Andric // to use the single "BUFFER_WBINV*" instruction. This could be done by 2739e8d8bef9SDimitry Andric // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to 2740e8d8bef9SDimitry Andric // track cache invalidate and write back instructions.
2741e8d8bef9SDimitry Andric 27420b57cec5SDimitry Andric if (MOI.getOrdering() == AtomicOrdering::Acquire || 27430b57cec5SDimitry Andric MOI.getOrdering() == AtomicOrdering::AcquireRelease || 27440b57cec5SDimitry Andric MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) 2745*0fca6ea1SDimitry Andric Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace, 27460b57cec5SDimitry Andric Position::BEFORE); 27470b57cec5SDimitry Andric 27480b57cec5SDimitry Andric return Changed; 27490b57cec5SDimitry Andric } 27500b57cec5SDimitry Andric 27510b57cec5SDimitry Andric return Changed; 27520b57cec5SDimitry Andric } 27530b57cec5SDimitry Andric 27540b57cec5SDimitry Andric bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, 27550b57cec5SDimitry Andric MachineBasicBlock::iterator &MI) { 27560b57cec5SDimitry Andric assert(MI->mayLoad() && MI->mayStore()); 27570b57cec5SDimitry Andric 27580b57cec5SDimitry Andric bool Changed = false; 27590b57cec5SDimitry Andric 27600b57cec5SDimitry Andric if (MOI.isAtomic()) { 2761fe6060f1SDimitry Andric if (MOI.getOrdering() == AtomicOrdering::Monotonic || 2762fe6060f1SDimitry Andric MOI.getOrdering() == AtomicOrdering::Acquire || 2763fe6060f1SDimitry Andric MOI.getOrdering() == AtomicOrdering::Release || 2764fe6060f1SDimitry Andric MOI.getOrdering() == AtomicOrdering::AcquireRelease || 2765fe6060f1SDimitry Andric MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { 2766fe6060f1SDimitry Andric Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(), 2767fe6060f1SDimitry Andric MOI.getInstrAddrSpace()); 2768fe6060f1SDimitry Andric } 2769fe6060f1SDimitry Andric 27700b57cec5SDimitry Andric if (MOI.getOrdering() == AtomicOrdering::Release || 27710b57cec5SDimitry Andric MOI.getOrdering() == AtomicOrdering::AcquireRelease || 27720b57cec5SDimitry Andric MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 27730b57cec5SDimitry Andric MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) 2774e8d8bef9SDimitry Andric Changed |= CC->insertRelease(MI, MOI.getScope(), 27750b57cec5SDimitry Andric MOI.getOrderingAddrSpace(), 27760b57cec5SDimitry Andric MOI.getIsCrossAddressSpaceOrdering(), 27770b57cec5SDimitry Andric Position::BEFORE); 27780b57cec5SDimitry Andric 27790b57cec5SDimitry Andric if (MOI.getOrdering() == AtomicOrdering::Acquire || 27800b57cec5SDimitry Andric MOI.getOrdering() == AtomicOrdering::AcquireRelease || 27810b57cec5SDimitry Andric MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || 27820b57cec5SDimitry Andric MOI.getFailureOrdering() == AtomicOrdering::Acquire || 27830b57cec5SDimitry Andric MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { 27840b57cec5SDimitry Andric Changed |= CC->insertWait(MI, MOI.getScope(), 2785fe6060f1SDimitry Andric MOI.getInstrAddrSpace(), 27860b57cec5SDimitry Andric isAtomicRet(*MI) ? 
SIMemOp::LOAD : 27870b57cec5SDimitry Andric SIMemOp::STORE, 27880b57cec5SDimitry Andric MOI.getIsCrossAddressSpaceOrdering(), 27890b57cec5SDimitry Andric Position::AFTER); 2790e8d8bef9SDimitry Andric Changed |= CC->insertAcquire(MI, MOI.getScope(), 27910b57cec5SDimitry Andric MOI.getOrderingAddrSpace(), 27920b57cec5SDimitry Andric Position::AFTER); 27930b57cec5SDimitry Andric } 27940b57cec5SDimitry Andric 27950b57cec5SDimitry Andric return Changed; 27960b57cec5SDimitry Andric } 27970b57cec5SDimitry Andric 27980b57cec5SDimitry Andric return Changed; 27990b57cec5SDimitry Andric } 28000b57cec5SDimitry Andric 28010b57cec5SDimitry Andric bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { 28020b57cec5SDimitry Andric bool Changed = false; 28030b57cec5SDimitry Andric 2804*0fca6ea1SDimitry Andric const MachineModuleInfo &MMI = 2805*0fca6ea1SDimitry Andric getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); 2806*0fca6ea1SDimitry Andric 2807*0fca6ea1SDimitry Andric SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>()); 28080b57cec5SDimitry Andric CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); 28090b57cec5SDimitry Andric 28100b57cec5SDimitry Andric for (auto &MBB : MF) { 28110b57cec5SDimitry Andric for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { 28125ffd83dbSDimitry Andric 2813e8d8bef9SDimitry Andric // Unbundle instructions after the post-RA scheduler. 2814fe6060f1SDimitry Andric if (MI->isBundle() && MI->mayLoadOrStore()) { 28155ffd83dbSDimitry Andric MachineBasicBlock::instr_iterator II(MI->getIterator()); 28165ffd83dbSDimitry Andric for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); 28175ffd83dbSDimitry Andric I != E && I->isBundledWithPred(); ++I) { 28185ffd83dbSDimitry Andric I->unbundleFromPred(); 28195ffd83dbSDimitry Andric for (MachineOperand &MO : I->operands()) 28205ffd83dbSDimitry Andric if (MO.isReg()) 28215ffd83dbSDimitry Andric MO.setIsInternalRead(false); 28225ffd83dbSDimitry Andric } 28235ffd83dbSDimitry Andric 28245ffd83dbSDimitry Andric MI->eraseFromParent(); 28255ffd83dbSDimitry Andric MI = II->getIterator(); 28265ffd83dbSDimitry Andric } 28275ffd83dbSDimitry Andric 28280b57cec5SDimitry Andric if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) 28290b57cec5SDimitry Andric continue; 28300b57cec5SDimitry Andric 28310b57cec5SDimitry Andric if (const auto &MOI = MOA.getLoadInfo(MI)) 2832bdd1243dSDimitry Andric Changed |= expandLoad(*MOI, MI); 283306c3fb27SDimitry Andric else if (const auto &MOI = MOA.getStoreInfo(MI)) { 2834bdd1243dSDimitry Andric Changed |= expandStore(*MOI, MI); 283506c3fb27SDimitry Andric Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); 283606c3fb27SDimitry Andric } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) 2837bdd1243dSDimitry Andric Changed |= expandAtomicFence(*MOI, MI); 28380b57cec5SDimitry Andric else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) 2839bdd1243dSDimitry Andric Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); 28400b57cec5SDimitry Andric } 28410b57cec5SDimitry Andric } 28420b57cec5SDimitry Andric 28430b57cec5SDimitry Andric Changed |= removeAtomicPseudoMIs(); 28440b57cec5SDimitry Andric return Changed; 28450b57cec5SDimitry Andric } 28460b57cec5SDimitry Andric 28470b57cec5SDimitry Andric INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false) 28480b57cec5SDimitry Andric 28490b57cec5SDimitry Andric char SIMemoryLegalizer::ID = 0; 28500b57cec5SDimitry Andric char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID; 28510b57cec5SDimitry Andric 
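// Descriptive note (not in the original source): SIMemoryLegalizer itself is
// declared in an anonymous namespace earlier in this file, so the out-of-line
// factory below is the hook the target uses to construct the pass.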
28520b57cec5SDimitry Andric FunctionPass *llvm::createSIMemoryLegalizerPass() { 28530b57cec5SDimitry Andric return new SIMemoryLegalizer(); 28540b57cec5SDimitry Andric } 2855
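// A minimal sketch, assuming the usual GCN pass-pipeline conventions (the
// exact hook may differ between LLVM versions), of where a target would
// typically schedule this pass:
//
//   void GCNPassConfig::addPreEmitPass() {
//     addPass(createSIMemoryLegalizerPass());
//     // ...
//   }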