1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "SIDefines.h"
20 #include "SIInstrInfo.h"
21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/BitmaskEnum.h"
24 #include "llvm/ADT/None.h"
25 #include "llvm/ADT/Optional.h"
26 #include "llvm/CodeGen/MachineBasicBlock.h"
27 #include "llvm/CodeGen/MachineFunction.h"
28 #include "llvm/CodeGen/MachineFunctionPass.h"
29 #include "llvm/CodeGen/MachineInstrBuilder.h"
30 #include "llvm/CodeGen/MachineMemOperand.h"
31 #include "llvm/CodeGen/MachineModuleInfo.h"
32 #include "llvm/CodeGen/MachineOperand.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Function.h"
36 #include "llvm/IR/LLVMContext.h"
37 #include "llvm/MC/MCInstrDesc.h"
38 #include "llvm/Pass.h"
39 #include "llvm/Support/AtomicOrdering.h"
40 #include "llvm/Support/MathExtras.h"
41 #include <cassert>
42 #include <list>
43 
44 using namespace llvm;
45 using namespace llvm::AMDGPU;
46 
47 #define DEBUG_TYPE "si-memory-legalizer"
48 #define PASS_NAME "SI Memory Legalizer"
49 
50 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
51     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
52     cl::desc("Use this to skip inserting cache invalidating instructions."));
53 
54 namespace {
55 
56 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
57 
58 /// Memory operation flags. Can be ORed together.
59 enum class SIMemOp {
60   NONE = 0u,
61   LOAD = 1u << 0,
62   STORE = 1u << 1,
63   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
64 };
65 
66 /// Position to insert a new instruction relative to an existing
67 /// instruction.
68 enum class Position {
69   BEFORE,
70   AFTER
71 };
72 
73 /// The atomic synchronization scopes supported by the AMDGPU target.
74 enum class SIAtomicScope {
75   NONE,
76   SINGLETHREAD,
77   WAVEFRONT,
78   WORKGROUP,
79   AGENT,
80   SYSTEM
81 };
82 
83 /// The distinct address spaces supported by the AMDGPU target for
84 /// atomic memory operations. Can be ORed together.
85 enum class SIAtomicAddrSpace {
86   NONE = 0u,
87   GLOBAL = 1u << 0,
88   LDS = 1u << 1,
89   SCRATCH = 1u << 2,
90   GDS = 1u << 3,
91   OTHER = 1u << 4,
92 
93   /// The address spaces that can be accessed by a FLAT instruction.
94   FLAT = GLOBAL | LDS | SCRATCH,
95 
96   /// The address spaces that support atomic instructions.
97   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
98 
99   /// All address spaces.
100   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
101 
102   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
103 };
104 
105 /// Sets named bit \p BitName to "true" if present in instruction \p MI.
106 /// \returns Returns true if \p MI is modified, false otherwise.
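/// For example, enableNamedBit<AMDGPU::OpName::glc>(MI) sets MI's glc operand
/// to 1 when that operand exists; the enableGLCBit/enableSLCBit/enableDLCBit
/// wrappers below are thin aliases for exactly this.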
107 template <uint16_t BitName>
108 bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
109   int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
110   if (BitIdx == -1)
111     return false;
112 
113   MachineOperand &Bit = MI->getOperand(BitIdx);
114   if (Bit.getImm() != 0)
115     return false;
116 
117   Bit.setImm(1);
118   return true;
119 }
120 
121 class SIMemOpInfo final {
122 private:
123 
124   friend class SIMemOpAccess;
125 
126   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
127   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
128   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
129   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
130   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
131   bool IsCrossAddressSpaceOrdering = false;
132   bool IsNonTemporal = false;
133 
134   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
135               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
136               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
137               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
138               bool IsCrossAddressSpaceOrdering = true,
139               AtomicOrdering FailureOrdering =
140                 AtomicOrdering::SequentiallyConsistent,
141               bool IsNonTemporal = false)
142     : Ordering(Ordering), FailureOrdering(FailureOrdering),
143       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
144       InstrAddrSpace(InstrAddrSpace),
145       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
146       IsNonTemporal(IsNonTemporal) {
147     // There is also no cross address space ordering if the ordering
148     // address space is the same as the instruction address space and
149     // only contains a single address space.
150     if ((OrderingAddrSpace == InstrAddrSpace) &&
151         isPowerOf2_32(uint32_t(InstrAddrSpace)))
152       this->IsCrossAddressSpaceOrdering = false;
153   }
154 
155 public:
156   /// \returns Atomic synchronization scope of the machine instruction used to
157   /// create this SIMemOpInfo.
158   SIAtomicScope getScope() const {
159     return Scope;
160   }
161 
162   /// \returns Ordering constraint of the machine instruction used to
163   /// create this SIMemOpInfo.
164   AtomicOrdering getOrdering() const {
165     return Ordering;
166   }
167 
168   /// \returns Failure ordering constraint of the machine instruction used to
169   /// create this SIMemOpInfo.
170   AtomicOrdering getFailureOrdering() const {
171     return FailureOrdering;
172   }
173 
174   /// \returns The address spaces accessed by the machine
175   /// instruction used to create this SIMemOpInfo.
176   SIAtomicAddrSpace getInstrAddrSpace() const {
177     return InstrAddrSpace;
178   }
179 
180   /// \returns The address spaces that must be ordered by the machine
181   /// instruction used to create this SIMemOpInfo.
182   SIAtomicAddrSpace getOrderingAddrSpace() const {
183     return OrderingAddrSpace;
184   }
185 
186   /// \returns True iff memory ordering of operations on
187   /// different address spaces is required.
188   bool getIsCrossAddressSpaceOrdering() const {
189     return IsCrossAddressSpaceOrdering;
190   }
191 
192   /// \returns True if memory access of the machine instruction used to
193   /// create this SIMemOpInfo is non-temporal, false otherwise.
194   bool isNonTemporal() const {
195     return IsNonTemporal;
196   }
197 
198   /// \returns True if ordering constraint of the machine instruction used to
199   /// create this SIMemOpInfo is unordered or higher, false otherwise.
200   bool isAtomic() const {
201     return Ordering != AtomicOrdering::NotAtomic;
202   }
203 
204 };
205 
206 class SIMemOpAccess final {
207 private:
208   AMDGPUMachineModuleInfo *MMI = nullptr;
209 
210   /// Reports unsupported message \p Msg for \p MI to LLVM context.
211   /// Reports unsupported message \p Msg for \p MI to the LLVM context.
212                          const char *Msg) const;
213 
214   /// Inspects the target synchronization scope \p SSID and determines
215   /// the SI atomic scope it corresponds to, the address spaces it
216   /// covers, and whether the memory ordering applies between address
217   /// spaces.
218   Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
219   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;
220 
221   /// \returns The bit set of SI atomic address spaces covered by address space \p AS.
222   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
223 
224   /// \returns Info constructed from \p MI, which has at least one machine
225   /// memory operand.
226   Optional<SIMemOpInfo> constructFromMIWithMMO(
227       const MachineBasicBlock::iterator &MI) const;
228 
229 public:
230   /// Construct class to support accessing the machine memory operands
231   /// of instructions in the machine function \p MF.
232   SIMemOpAccess(MachineFunction &MF);
233 
234   /// \returns Load info if \p MI is a load operation, "None" otherwise.
235   Optional<SIMemOpInfo> getLoadInfo(
236       const MachineBasicBlock::iterator &MI) const;
237 
238   /// \returns Store info if \p MI is a store operation, "None" otherwise.
239   Optional<SIMemOpInfo> getStoreInfo(
240       const MachineBasicBlock::iterator &MI) const;
241 
242   /// \returns Atomic fence info if \p MI is an atomic fence operation,
243   /// "None" otherwise.
244   Optional<SIMemOpInfo> getAtomicFenceInfo(
245       const MachineBasicBlock::iterator &MI) const;
246 
247   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
248   /// rmw operation, "None" otherwise.
249   Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
250       const MachineBasicBlock::iterator &MI) const;
251 };
252 
253 class SICacheControl {
254 protected:
255 
256   /// Instruction info.
257   const SIInstrInfo *TII = nullptr;
258 
259   IsaVersion IV;
260 
261   /// Whether to insert cache invalidating instructions.
262   bool InsertCacheInv;
263 
264   SICacheControl(const GCNSubtarget &ST);
265 
266 public:
267 
268   /// Create a cache control for the subtarget \p ST.
269   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
270 
271   /// Update \p MI memory load instruction to bypass any caches up to
272   /// the \p Scope memory scope for address spaces \p
273   /// AddrSpace. Return true iff the instruction was modified.
274   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
275                                      SIAtomicScope Scope,
276                                      SIAtomicAddrSpace AddrSpace) const = 0;
277 
278   /// Update \p MI memory instruction to indicate it is
279   /// nontemporal. Return true iff the instruction was modified.
280   virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
281     const = 0;
282 
283   /// Inserts any necessary instructions at position \p Pos relative
284   /// to instruction \p MI to ensure any caches associated with
285   /// address spaces \p AddrSpace for memory scopes up to memory scope
286   /// \p Scope are invalidated. Returns true iff any instructions
287   /// are inserted.
288   virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
289                                      SIAtomicScope Scope,
290                                      SIAtomicAddrSpace AddrSpace,
291                                      Position Pos) const = 0;
292 
293   /// Inserts any necessary instructions at position \p Pos relative
294   /// to instruction \p MI to ensure memory instructions of kind \p Op
295   /// associated with address spaces \p AddrSpace have completed as
296   /// observed by other memory instructions executing in memory scope
297   /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
298   /// ordering is between address spaces. Returns true iff any
299   /// instructions are inserted.
300   virtual bool insertWait(MachineBasicBlock::iterator &MI,
301                           SIAtomicScope Scope,
302                           SIAtomicAddrSpace AddrSpace,
303                           SIMemOp Op,
304                           bool IsCrossAddrSpaceOrdering,
305                           Position Pos) const = 0;
306 
307   /// Virtual destructor to allow derivations to be deleted.
308   virtual ~SICacheControl() = default;
309 
310 };
311 
312 class SIGfx6CacheControl : public SICacheControl {
313 protected:
314 
315   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
316   /// is modified, false otherwise.
317   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
318     return enableNamedBit<AMDGPU::OpName::glc>(MI);
319   }
320 
321   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
322   /// is modified, false otherwise.
323   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
324     return enableNamedBit<AMDGPU::OpName::slc>(MI);
325   }
326 
327 public:
328 
329   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
330 
331   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
332                              SIAtomicScope Scope,
333                              SIAtomicAddrSpace AddrSpace) const override;
334 
335   bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
336 
337   bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
338                              SIAtomicScope Scope,
339                              SIAtomicAddrSpace AddrSpace,
340                              Position Pos) const override;
341 
342   bool insertWait(MachineBasicBlock::iterator &MI,
343                   SIAtomicScope Scope,
344                   SIAtomicAddrSpace AddrSpace,
345                   SIMemOp Op,
346                   bool IsCrossAddrSpaceOrdering,
347                   Position Pos) const override;
348 };
349 
350 class SIGfx7CacheControl : public SIGfx6CacheControl {
351 public:
352 
353   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
354 
355   bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
356                              SIAtomicScope Scope,
357                              SIAtomicAddrSpace AddrSpace,
358                              Position Pos) const override;
359 
360 };
361 
362 class SIGfx10CacheControl : public SIGfx7CacheControl {
363 protected:
364   bool CuMode = false;
365 
366   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
367   /// is modified, false otherwise.
368   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
369     return enableNamedBit<AMDGPU::OpName::dlc>(MI);
370   }
371 
372 public:
373 
374   SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
375     SIGfx7CacheControl(ST), CuMode(CuMode) {};
376 
377   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
378                              SIAtomicScope Scope,
379                              SIAtomicAddrSpace AddrSpace) const override;
380 
381   bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
382 
383   bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
384                              SIAtomicScope Scope,
385                              SIAtomicAddrSpace AddrSpace,
386                              Position Pos) const override;
387 
388   bool insertWait(MachineBasicBlock::iterator &MI,
389                   SIAtomicScope Scope,
390                   SIAtomicAddrSpace AddrSpace,
391                   SIMemOp Op,
392                   bool IsCrossAddrSpaceOrdering,
393                   Position Pos) const override;
394 };
395 
396 class SIMemoryLegalizer final : public MachineFunctionPass {
397 private:
398 
399   /// Cache Control.
400   std::unique_ptr<SICacheControl> CC = nullptr;
401 
402   /// List of atomic pseudo instructions.
403   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
404 
405   /// Return true iff instruction \p MI is an atomic instruction that
406   /// returns a result (its opcode has a no-return counterpart).
407   bool isAtomicRet(const MachineInstr &MI) const {
408     return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
409   }
410 
411   /// Removes all processed atomic pseudo instructions from the current
412   /// function. Returns true if current function is modified, false otherwise.
413   bool removeAtomicPseudoMIs();
414 
415   /// Expands load operation \p MI. Returns true if instructions are
416   /// added/deleted or \p MI is modified, false otherwise.
417   bool expandLoad(const SIMemOpInfo &MOI,
418                   MachineBasicBlock::iterator &MI);
419   /// Expands store operation \p MI. Returns true if instructions are
420   /// added/deleted or \p MI is modified, false otherwise.
421   bool expandStore(const SIMemOpInfo &MOI,
422                    MachineBasicBlock::iterator &MI);
423   /// Expands atomic fence operation \p MI. Returns true if
424   /// instructions are added/deleted or \p MI is modified, false otherwise.
425   bool expandAtomicFence(const SIMemOpInfo &MOI,
426                          MachineBasicBlock::iterator &MI);
427   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
428   /// instructions are added/deleted or \p MI is modified, false otherwise.
429   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
430                                 MachineBasicBlock::iterator &MI);
431 
432 public:
433   static char ID;
434 
435   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
436 
437   void getAnalysisUsage(AnalysisUsage &AU) const override {
438     AU.setPreservesCFG();
439     MachineFunctionPass::getAnalysisUsage(AU);
440   }
441 
442   StringRef getPassName() const override {
443     return PASS_NAME;
444   }
445 
446   bool runOnMachineFunction(MachineFunction &MF) override;
447 };
448 
449 } // end anonymous namespace
450 
451 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
452                                       const char *Msg) const {
453   const Function &Func = MI->getParent()->getParent()->getFunction();
454   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
455   Func.getContext().diagnose(Diag);
456 }
457 
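// The "one address space" synchronization scopes do not require cross address
// space ordering, and they limit the ordered address spaces to those the
// instruction actually accesses.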
458 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
459 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
460                                SIAtomicAddrSpace InstrScope) const {
461   if (SSID == SyncScope::System)
462     return std::make_tuple(SIAtomicScope::SYSTEM,
463                            SIAtomicAddrSpace::ATOMIC,
464                            true);
465   if (SSID == MMI->getAgentSSID())
466     return std::make_tuple(SIAtomicScope::AGENT,
467                            SIAtomicAddrSpace::ATOMIC,
468                            true);
469   if (SSID == MMI->getWorkgroupSSID())
470     return std::make_tuple(SIAtomicScope::WORKGROUP,
471                            SIAtomicAddrSpace::ATOMIC,
472                            true);
473   if (SSID == MMI->getWavefrontSSID())
474     return std::make_tuple(SIAtomicScope::WAVEFRONT,
475                            SIAtomicAddrSpace::ATOMIC,
476                            true);
477   if (SSID == SyncScope::SingleThread)
478     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
479                            SIAtomicAddrSpace::ATOMIC,
480                            true);
481   if (SSID == MMI->getSystemOneAddressSpaceSSID())
482     return std::make_tuple(SIAtomicScope::SYSTEM,
483                            SIAtomicAddrSpace::ATOMIC & InstrScope,
484                            false);
485   if (SSID == MMI->getAgentOneAddressSpaceSSID())
486     return std::make_tuple(SIAtomicScope::AGENT,
487                            SIAtomicAddrSpace::ATOMIC & InstrScope,
488                            false);
489   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
490     return std::make_tuple(SIAtomicScope::WORKGROUP,
491                            SIAtomicAddrSpace::ATOMIC & InstrScope,
492                            false);
493   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
494     return std::make_tuple(SIAtomicScope::WAVEFRONT,
495                            SIAtomicAddrSpace::ATOMIC & InstrScope,
496                            false);
497   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
498     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
499                            SIAtomicAddrSpace::ATOMIC & InstrScope,
500                            false);
501   return None;
502 }
503 
504 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
505   if (AS == AMDGPUAS::FLAT_ADDRESS)
506     return SIAtomicAddrSpace::FLAT;
507   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
508     return SIAtomicAddrSpace::GLOBAL;
509   if (AS == AMDGPUAS::LOCAL_ADDRESS)
510     return SIAtomicAddrSpace::LDS;
511   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
512     return SIAtomicAddrSpace::SCRATCH;
513   if (AS == AMDGPUAS::REGION_ADDRESS)
514     return SIAtomicAddrSpace::GDS;
515 
516   return SIAtomicAddrSpace::OTHER;
517 }
518 
519 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
520   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
521 }
522 
523 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
524     const MachineBasicBlock::iterator &MI) const {
525   assert(MI->getNumMemOperands() > 0);
526 
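  // Start with the weakest scope and ordering, and assume the access is
  // nontemporal; each memory operand merged in below can only strengthen the
  // scope/ordering and clear the nontemporal flag.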
527   SyncScope::ID SSID = SyncScope::SingleThread;
528   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
529   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
530   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
531   bool IsNonTemporal = true;
532 
533   // Validator should check whether or not MMOs cover the entire set of
534   // locations accessed by the memory instruction.
535   for (const auto &MMO : MI->memoperands()) {
536     IsNonTemporal &= MMO->isNonTemporal();
537     InstrAddrSpace |=
538       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
539     AtomicOrdering OpOrdering = MMO->getOrdering();
540     if (OpOrdering != AtomicOrdering::NotAtomic) {
541       const auto &IsSyncScopeInclusion =
542           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
543       if (!IsSyncScopeInclusion) {
544         reportUnsupported(MI,
545           "Unsupported non-inclusive atomic synchronization scope");
546         return None;
547       }
548 
549       SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
550       Ordering =
551           isStrongerThan(Ordering, OpOrdering) ?
552               Ordering : MMO->getOrdering();
553       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
554              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
555       FailureOrdering =
556           isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
557               FailureOrdering : MMO->getFailureOrdering();
558     }
559   }
560 
561   SIAtomicScope Scope = SIAtomicScope::NONE;
562   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
563   bool IsCrossAddressSpaceOrdering = false;
564   if (Ordering != AtomicOrdering::NotAtomic) {
565     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
566     if (!ScopeOrNone) {
567       reportUnsupported(MI, "Unsupported atomic synchronization scope");
568       return None;
569     }
570     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
571       ScopeOrNone.getValue();
572     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
573         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
574       reportUnsupported(MI, "Unsupported atomic address space");
575       return None;
576     }
577   }
578   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
579                      IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
580 }
581 
582 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
583     const MachineBasicBlock::iterator &MI) const {
584   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
585 
586   if (!(MI->mayLoad() && !MI->mayStore()))
587     return None;
588 
589   // Be conservative if there are no memory operands.
590   if (MI->getNumMemOperands() == 0)
591     return SIMemOpInfo();
592 
593   return constructFromMIWithMMO(MI);
594 }
595 
596 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
597     const MachineBasicBlock::iterator &MI) const {
598   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
599 
600   if (!(!MI->mayLoad() && MI->mayStore()))
601     return None;
602 
603   // Be conservative if there are no memory operands.
604   if (MI->getNumMemOperands() == 0)
605     return SIMemOpInfo();
606 
607   return constructFromMIWithMMO(MI);
608 }
609 
610 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
611     const MachineBasicBlock::iterator &MI) const {
612   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
613 
614   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
615     return None;
616 
617   AtomicOrdering Ordering =
618     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
619 
620   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
621   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
622   if (!ScopeOrNone) {
623     reportUnsupported(MI, "Unsupported atomic synchronization scope");
624     return None;
625   }
626 
627   SIAtomicScope Scope = SIAtomicScope::NONE;
628   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
629   bool IsCrossAddressSpaceOrdering = false;
630   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
631     ScopeOrNone.getValue();
632 
633   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
634       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
635     reportUnsupported(MI, "Unsupported atomic address space");
636     return None;
637   }
638 
639   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
640                      IsCrossAddressSpaceOrdering);
641 }
642 
643 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
644     const MachineBasicBlock::iterator &MI) const {
645   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
646 
647   if (!(MI->mayLoad() && MI->mayStore()))
648     return None;
649 
650   // Be conservative if there are no memory operands.
651   if (MI->getNumMemOperands() == 0)
652     return SIMemOpInfo();
653 
654   return constructFromMIWithMMO(MI);
655 }
656 
657 SICacheControl::SICacheControl(const GCNSubtarget &ST) {
658   TII = ST.getInstrInfo();
659   IV = getIsaVersion(ST.getCPU());
660   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
661 }
662 
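/// Choose the cache-control implementation for the subtarget's generation:
/// Southern Islands targets get SIGfx6CacheControl, generations before GFX10
/// get SIGfx7CacheControl, and GFX10 onwards gets SIGfx10CacheControl, which
/// is also told whether the subtarget is running in CU mode.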
663 /* static */
664 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
665   GCNSubtarget::Generation Generation = ST.getGeneration();
666   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
667     return std::make_unique<SIGfx6CacheControl>(ST);
668   if (Generation < AMDGPUSubtarget::GFX10)
669     return std::make_unique<SIGfx7CacheControl>(ST);
670   return std::make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
671 }
672 
673 bool SIGfx6CacheControl::enableLoadCacheBypass(
674     const MachineBasicBlock::iterator &MI,
675     SIAtomicScope Scope,
676     SIAtomicAddrSpace AddrSpace) const {
677   assert(MI->mayLoad() && !MI->mayStore());
678   bool Changed = false;
679 
680   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
681     /// TODO: Do not set glc for rmw atomic operations as they
682     /// implicitly bypass the L1 cache.
683 
684     switch (Scope) {
685     case SIAtomicScope::SYSTEM:
686     case SIAtomicScope::AGENT:
687       Changed |= enableGLCBit(MI);
688       break;
689     case SIAtomicScope::WORKGROUP:
690     case SIAtomicScope::WAVEFRONT:
691     case SIAtomicScope::SINGLETHREAD:
692       // No cache to bypass.
693       break;
694     default:
695       llvm_unreachable("Unsupported synchronization scope");
696     }
697   }
698 
699   /// The scratch address space does not need the global memory caches
700   /// to be bypassed as all memory operations by the same thread are
701   /// sequentially consistent, and no other thread can access scratch
702   /// memory.
703 
704   /// Other address spaces do not have a cache.
705 
706   return Changed;
707 }
708 
709 bool SIGfx6CacheControl::enableNonTemporal(
710     const MachineBasicBlock::iterator &MI) const {
711   assert(MI->mayLoad() ^ MI->mayStore());
712   bool Changed = false;
713 
714   /// TODO: Do not enableGLCBit if rmw atomic.
715   Changed |= enableGLCBit(MI);
716   Changed |= enableSLCBit(MI);
717 
718   return Changed;
719 }
720 
721 bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
722                                                SIAtomicScope Scope,
723                                                SIAtomicAddrSpace AddrSpace,
724                                                Position Pos) const {
725   if (!InsertCacheInv)
726     return false;
727 
728   bool Changed = false;
729 
730   MachineBasicBlock &MBB = *MI->getParent();
731   DebugLoc DL = MI->getDebugLoc();
732 
733   if (Pos == Position::AFTER)
734     ++MI;
735 
736   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
737     switch (Scope) {
738     case SIAtomicScope::SYSTEM:
739     case SIAtomicScope::AGENT:
740       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
741       Changed = true;
742       break;
743     case SIAtomicScope::WORKGROUP:
744     case SIAtomicScope::WAVEFRONT:
745     case SIAtomicScope::SINGLETHREAD:
746       // No cache to invalidate.
747       break;
748     default:
749       llvm_unreachable("Unsupported synchronization scope");
750     }
751   }
752 
753   /// The scratch address space does not need the global memory cache
754   /// to be flushed as all memory operations by the same thread are
755   /// sequentially consistent, and no other thread can access scratch
756   /// memory.
757 
758   /// Other address spaces do not have a cache.
759 
760   if (Pos == Position::AFTER)
761     --MI;
762 
763   return Changed;
764 }
765 
766 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
767                                     SIAtomicScope Scope,
768                                     SIAtomicAddrSpace AddrSpace,
769                                     SIMemOp Op,
770                                     bool IsCrossAddrSpaceOrdering,
771                                     Position Pos) const {
772   bool Changed = false;
773 
774   MachineBasicBlock &MBB = *MI->getParent();
775   DebugLoc DL = MI->getDebugLoc();
776 
777   if (Pos == Position::AFTER)
778     ++MI;
779 
780   bool VMCnt = false;
781   bool LGKMCnt = false;
782 
783   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
784     switch (Scope) {
785     case SIAtomicScope::SYSTEM:
786     case SIAtomicScope::AGENT:
787       VMCnt |= true;
788       break;
789     case SIAtomicScope::WORKGROUP:
790     case SIAtomicScope::WAVEFRONT:
791     case SIAtomicScope::SINGLETHREAD:
792       // The L1 cache keeps all memory operations in order for
793       // wavefronts in the same work-group.
794       break;
795     default:
796       llvm_unreachable("Unsupported synchronization scope");
797     }
798   }
799 
800   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
801     switch (Scope) {
802     case SIAtomicScope::SYSTEM:
803     case SIAtomicScope::AGENT:
804     case SIAtomicScope::WORKGROUP:
805       // If no cross address space ordering then an LDS waitcnt is not
806       // needed as LDS operations for all waves are executed in a
807       // total global ordering as observed by all waves. Required if
808       // also synchronizing with global/GDS memory as LDS operations
809       // could be reordered with respect to later global/GDS memory
810       // operations of the same wave.
811       LGKMCnt |= IsCrossAddrSpaceOrdering;
812       break;
813     case SIAtomicScope::WAVEFRONT:
814     case SIAtomicScope::SINGLETHREAD:
815       // The LDS keeps all memory operations in order for
816       // the same wavefront.
817       break;
818     default:
819       llvm_unreachable("Unsupported synchronization scope");
820     }
821   }
822 
823   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
824     switch (Scope) {
825     case SIAtomicScope::SYSTEM:
826     case SIAtomicScope::AGENT:
827       // If no cross address space ordering then a GDS waitcnt is not
828       // needed as GDS operations for all waves are executed in a
829       // total global ordering as observed by all waves. Required if
830       // also synchronizing with global/LDS memory as GDS operations
831       // could be reordered with respect to later global/LDS memory
832       // operations of the same wave.
833       LGKMCnt |= IsCrossAddrSpaceOrdering;
834       break;
835     case SIAtomicScope::WORKGROUP:
836     case SIAtomicScope::WAVEFRONT:
837     case SIAtomicScope::SINGLETHREAD:
838       // The GDS keeps all memory operations in order for
839       // the same work-group.
840       break;
841     default:
842       llvm_unreachable("Unsupported synchronization scope");
843     }
844   }
845 
846   if (VMCnt || LGKMCnt) {
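  // Encode a single S_WAITCNT immediate: counters that must be waited on are
  // set to zero, the others keep their maximum (no-wait) value; expcnt is
  // never waited on here.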
847     unsigned WaitCntImmediate =
848       AMDGPU::encodeWaitcnt(IV,
849                             VMCnt ? 0 : getVmcntBitMask(IV),
850                             getExpcntBitMask(IV),
851                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
852     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
853     Changed = true;
854   }
855 
856   if (Pos == Position::AFTER)
857     --MI;
858 
859   return Changed;
860 }
861 
862 bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
863                                                SIAtomicScope Scope,
864                                                SIAtomicAddrSpace AddrSpace,
865                                                Position Pos) const {
866   if (!InsertCacheInv)
867     return false;
868 
869   bool Changed = false;
870 
871   MachineBasicBlock &MBB = *MI->getParent();
872   DebugLoc DL = MI->getDebugLoc();
873 
874   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
875 
876   const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
877                              ? AMDGPU::BUFFER_WBINVL1
878                              : AMDGPU::BUFFER_WBINVL1_VOL;
879 
880   if (Pos == Position::AFTER)
881     ++MI;
882 
883   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
884     switch (Scope) {
885     case SIAtomicScope::SYSTEM:
886     case SIAtomicScope::AGENT:
887       BuildMI(MBB, MI, DL, TII->get(Flush));
888       Changed = true;
889       break;
890     case SIAtomicScope::WORKGROUP:
891     case SIAtomicScope::WAVEFRONT:
892     case SIAtomicScope::SINGLETHREAD:
893       // No cache to invalidate.
894       break;
895     default:
896       llvm_unreachable("Unsupported synchronization scope");
897     }
898   }
899 
900   /// The scratch address space does not need the global memory cache
901   /// to be flushed as all memory operations by the same thread are
902   /// sequentially consistent, and no other thread can access scratch
903   /// memory.
904 
905   /// Other address spaces do not have a cache.
906 
907   if (Pos == Position::AFTER)
908     --MI;
909 
910   return Changed;
911 }
912 
913 bool SIGfx10CacheControl::enableLoadCacheBypass(
914     const MachineBasicBlock::iterator &MI,
915     SIAtomicScope Scope,
916     SIAtomicAddrSpace AddrSpace) const {
917   assert(MI->mayLoad() && !MI->mayStore());
918   bool Changed = false;
919 
920   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
921     /// TODO Do not set glc for rmw atomic operations as they
922     /// implicitly bypass the L0/L1 caches.
923 
924     switch (Scope) {
925     case SIAtomicScope::SYSTEM:
926     case SIAtomicScope::AGENT:
927       Changed |= enableGLCBit(MI);
928       Changed |= enableDLCBit(MI);
929       break;
930     case SIAtomicScope::WORKGROUP:
931       // In WGP mode the waves of a work-group can be executing on either CU of
932       // the WGP. Therefore the L0, which is per CU, needs to be bypassed. In
933       // CU mode all waves of a work-group are on the same CU, so the L0 does
934       // not need to be bypassed.
935       if (!CuMode) Changed |= enableGLCBit(MI);
936       break;
937     case SIAtomicScope::WAVEFRONT:
938     case SIAtomicScope::SINGLETHREAD:
939       // No cache to bypass.
940       break;
941     default:
942       llvm_unreachable("Unsupported synchronization scope");
943     }
944   }
945 
946   /// The scratch address space does not need the global memory caches
947   /// to be bypassed as all memory operations by the same thread are
948   /// sequentially consistent, and no other thread can access scratch
949   /// memory.
950 
951   /// Other address spaces do not have a cache.
952 
953   return Changed;
954 }
955 
956 bool SIGfx10CacheControl::enableNonTemporal(
957     const MachineBasicBlock::iterator &MI) const {
958   assert(MI->mayLoad() ^ MI->mayStore());
959   bool Changed = false;
960 
961   Changed |= enableSLCBit(MI);
962   /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)
963 
964   return Changed;
965 }
966 
967 bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
968                                                 SIAtomicScope Scope,
969                                                 SIAtomicAddrSpace AddrSpace,
970                                                 Position Pos) const {
971   if (!InsertCacheInv)
972     return false;
973 
974   bool Changed = false;
975 
976   MachineBasicBlock &MBB = *MI->getParent();
977   DebugLoc DL = MI->getDebugLoc();
978 
979   if (Pos == Position::AFTER)
980     ++MI;
981 
982   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
983     switch (Scope) {
984     case SIAtomicScope::SYSTEM:
985     case SIAtomicScope::AGENT:
986       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
987       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
988       Changed = true;
989       break;
990     case SIAtomicScope::WORKGROUP:
991       // In WGP mode the waves of a work-group can be executing on either CU of
992       // the WGP. Therefore the L0, which is per CU, needs to be invalidated.
993       // In CU mode all waves of a work-group are on the same CU, so the L0
994       // does not need to be invalidated.
995       if (!CuMode) {
996         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
997         Changed = true;
998       }
999       break;
1000     case SIAtomicScope::WAVEFRONT:
1001     case SIAtomicScope::SINGLETHREAD:
1002       // No cache to invalidate.
1003       break;
1004     default:
1005       llvm_unreachable("Unsupported synchronization scope");
1006     }
1007   }
1008 
1009   /// The scratch address space does not need the global memory cache
1010   /// to be flushed as all memory operations by the same thread are
1011   /// sequentially consistent, and no other thread can access scratch
1012   /// memory.
1013 
1014   /// Other address spaces do not have a cache.
1015 
1016   if (Pos == Position::AFTER)
1017     --MI;
1018 
1019   return Changed;
1020 }
1021 
1022 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1023                                      SIAtomicScope Scope,
1024                                      SIAtomicAddrSpace AddrSpace,
1025                                      SIMemOp Op,
1026                                      bool IsCrossAddrSpaceOrdering,
1027                                      Position Pos) const {
1028   bool Changed = false;
1029 
1030   MachineBasicBlock &MBB = *MI->getParent();
1031   DebugLoc DL = MI->getDebugLoc();
1032 
1033   if (Pos == Position::AFTER)
1034     ++MI;
1035 
1036   bool VMCnt = false;
1037   bool VSCnt = false;
1038   bool LGKMCnt = false;
1039 
1040   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1041     switch (Scope) {
1042     case SIAtomicScope::SYSTEM:
1043     case SIAtomicScope::AGENT:
1044       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1045         VMCnt |= true;
1046       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1047         VSCnt |= true;
1048       break;
1049     case SIAtomicScope::WORKGROUP:
1050       // In WGP mode the waves of a work-group can be executing on either CU of
1051       // the WGP. Therefore we need to wait for operations to complete to
1052       // ensure they are visible to waves in the other CU, as the L0 is per
1053       // CU. In CU mode all waves of a work-group are on the same CU and
1054       // share the same L0, so no wait is needed.
1055       if (!CuMode) {
1056         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1057           VMCnt |= true;
1058         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1059           VSCnt |= true;
1060       }
1061       break;
1062     case SIAtomicScope::WAVEFRONT:
1063     case SIAtomicScope::SINGLETHREAD:
1064       // The L0 cache keeps all memory operations in order for
1065       // work-items in the same wavefront.
1066       break;
1067     default:
1068       llvm_unreachable("Unsupported synchronization scope");
1069     }
1070   }
1071 
1072   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1073     switch (Scope) {
1074     case SIAtomicScope::SYSTEM:
1075     case SIAtomicScope::AGENT:
1076     case SIAtomicScope::WORKGROUP:
1077       // If no cross address space ordering then an LDS waitcnt is not
1078       // needed as LDS operations for all waves are executed in a
1079       // total global ordering as observed by all waves. Required if
1080       // also synchronizing with global/GDS memory as LDS operations
1081       // could be reordered with respect to later global/GDS memory
1082       // operations of the same wave.
1083       LGKMCnt |= IsCrossAddrSpaceOrdering;
1084       break;
1085     case SIAtomicScope::WAVEFRONT:
1086     case SIAtomicScope::SINGLETHREAD:
1087       // The LDS keeps all memory operations in order for
1088       // the same wavefront.
1089       break;
1090     default:
1091       llvm_unreachable("Unsupported synchronization scope");
1092     }
1093   }
1094 
1095   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1096     switch (Scope) {
1097     case SIAtomicScope::SYSTEM:
1098     case SIAtomicScope::AGENT:
1099       // If no cross address space ordering then a GDS waitcnt is not
1100       // needed as GDS operations for all waves are executed in a
1101       // total global ordering as observed by all waves. Required if
1102       // also synchronizing with global/LDS memory as GDS operations
1103       // could be reordered with respect to later global/LDS memory
1104       // operations of the same wave.
1105       LGKMCnt |= IsCrossAddrSpaceOrdering;
1106       break;
1107     case SIAtomicScope::WORKGROUP:
1108     case SIAtomicScope::WAVEFRONT:
1109     case SIAtomicScope::SINGLETHREAD:
1110       // The GDS keeps all memory operations in order for
1111       // the same work-group.
1112       break;
1113     default:
1114       llvm_unreachable("Unsupported synchronization scope");
1115     }
1116   }
1117 
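  // As on older targets, vmcnt/lgkmcnt are expressed in a single S_WAITCNT.
  // GFX10 counts outstanding stores separately, so when store completion must
  // be waited on, a zero-count S_WAITCNT_VSCNT with a null SGPR source is
  // emitted as well.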
1118   if (VMCnt || LGKMCnt) {
1119     unsigned WaitCntImmediate =
1120       AMDGPU::encodeWaitcnt(IV,
1121                             VMCnt ? 0 : getVmcntBitMask(IV),
1122                             getExpcntBitMask(IV),
1123                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1124     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1125     Changed = true;
1126   }
1127 
1128   if (VSCnt) {
1129     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1130       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1131       .addImm(0);
1132     Changed = true;
1133   }
1134 
1135   if (Pos == Position::AFTER)
1136     --MI;
1137 
1138   return Changed;
1139 }
1140 
1141 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1142   if (AtomicPseudoMIs.empty())
1143     return false;
1144 
1145   for (auto &MI : AtomicPseudoMIs)
1146     MI->eraseFromParent();
1147 
1148   AtomicPseudoMIs.clear();
1149   return true;
1150 }
1151 
1152 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
1153                                    MachineBasicBlock::iterator &MI) {
1154   assert(MI->mayLoad() && !MI->mayStore());
1155 
1156   bool Changed = false;
1157 
1158   if (MOI.isAtomic()) {
1159     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1160         MOI.getOrdering() == AtomicOrdering::Acquire ||
1161         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1162       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
1163                                            MOI.getOrderingAddrSpace());
1164     }
1165 
1166     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1167       Changed |= CC->insertWait(MI, MOI.getScope(),
1168                                 MOI.getOrderingAddrSpace(),
1169                                 SIMemOp::LOAD | SIMemOp::STORE,
1170                                 MOI.getIsCrossAddressSpaceOrdering(),
1171                                 Position::BEFORE);
1172 
1173     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1174         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1175       Changed |= CC->insertWait(MI, MOI.getScope(),
1176                                 MOI.getInstrAddrSpace(),
1177                                 SIMemOp::LOAD,
1178                                 MOI.getIsCrossAddressSpaceOrdering(),
1179                                 Position::AFTER);
1180       Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
1181                                            MOI.getOrderingAddrSpace(),
1182                                            Position::AFTER);
1183     }
1184 
1185     return Changed;
1186   }
1187 
1188   // Atomic instructions do not have the nontemporal attribute.
1189   if (MOI.isNonTemporal()) {
1190     Changed |= CC->enableNonTemporal(MI);
1191     return Changed;
1192   }
1193 
1194   return Changed;
1195 }
1196 
1197 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
1198                                     MachineBasicBlock::iterator &MI) {
1199   assert(!MI->mayLoad() && MI->mayStore());
1200 
1201   bool Changed = false;
1202 
1203   if (MOI.isAtomic()) {
1204     if (MOI.getOrdering() == AtomicOrdering::Release ||
1205         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1206       Changed |= CC->insertWait(MI, MOI.getScope(),
1207                                 MOI.getOrderingAddrSpace(),
1208                                 SIMemOp::LOAD | SIMemOp::STORE,
1209                                 MOI.getIsCrossAddressSpaceOrdering(),
1210                                 Position::BEFORE);
1211 
1212     return Changed;
1213   }
1214 
1215   // Atomic instructions do not have the nontemporal attribute.
1216   if (MOI.isNonTemporal()) {
1217     Changed |= CC->enableNonTemporal(MI);
1218     return Changed;
1219   }
1220 
1221   return Changed;
1222 }
1223 
1224 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
1225                                           MachineBasicBlock::iterator &MI) {
1226   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
1227 
1228   AtomicPseudoMIs.push_back(MI);
1229   bool Changed = false;
1230 
1231   if (MOI.isAtomic()) {
1232     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1233         MOI.getOrdering() == AtomicOrdering::Release ||
1234         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1235         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1236       /// TODO: This relies on a barrier always generating a waitcnt
1237       /// for LDS to ensure it is not reordered with the completion of
1238       /// the preceding LDS operations. If the barrier had a memory
1239       /// ordering and memory scope, then the library would not need to
1240       /// generate a fence. Could add support in this file for
1241       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
1242       /// adding waitcnt before a S_BARRIER.
1243       Changed |= CC->insertWait(MI, MOI.getScope(),
1244                                 MOI.getOrderingAddrSpace(),
1245                                 SIMemOp::LOAD | SIMemOp::STORE,
1246                                 MOI.getIsCrossAddressSpaceOrdering(),
1247                                 Position::BEFORE);
1248 
1249     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1250         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1251         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1252       Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
1253                                            MOI.getOrderingAddrSpace(),
1254                                            Position::BEFORE);
1255 
1256     return Changed;
1257   }
1258 
1259   return Changed;
1260 }
1261 
1262 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
1263   MachineBasicBlock::iterator &MI) {
1264   assert(MI->mayLoad() && MI->mayStore());
1265 
1266   bool Changed = false;
1267 
1268   if (MOI.isAtomic()) {
1269     if (MOI.getOrdering() == AtomicOrdering::Release ||
1270         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1271         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1272         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
1273       Changed |= CC->insertWait(MI, MOI.getScope(),
1274                                 MOI.getOrderingAddrSpace(),
1275                                 SIMemOp::LOAD | SIMemOp::STORE,
1276                                 MOI.getIsCrossAddressSpaceOrdering(),
1277                                 Position::BEFORE);
1278 
1279     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1280         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1281         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1282         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
1283         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
1284       Changed |= CC->insertWait(MI, MOI.getScope(),
1285                                 MOI.getOrderingAddrSpace(),
1286                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
1287                                                    SIMemOp::STORE,
1288                                 MOI.getIsCrossAddressSpaceOrdering(),
1289                                 Position::AFTER);
1290       Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
1291                                            MOI.getOrderingAddrSpace(),
1292                                            Position::AFTER);
1293     }
1294 
1295     return Changed;
1296   }
1297 
1298   return Changed;
1299 }
1300 
1301 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
1302   bool Changed = false;
1303 
1304   SIMemOpAccess MOA(MF);
1305   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
1306 
1307   for (auto &MBB : MF) {
1308     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
1309 
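      // Expand memory-accessing bundles so that the memory instructions inside
      // can be processed individually; the BUNDLE header itself is erased once
      // its contents are unbundled.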
1310       if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) {
1311         MachineBasicBlock::instr_iterator II(MI->getIterator());
1312         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
1313              I != E && I->isBundledWithPred(); ++I) {
1314           I->unbundleFromPred();
1315           for (MachineOperand &MO : I->operands())
1316             if (MO.isReg())
1317               MO.setIsInternalRead(false);
1318         }
1319 
1320         MI->eraseFromParent();
1321         MI = II->getIterator();
1322       }
1323 
1324       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
1325         continue;
1326 
1327       if (const auto &MOI = MOA.getLoadInfo(MI))
1328         Changed |= expandLoad(MOI.getValue(), MI);
1329       else if (const auto &MOI = MOA.getStoreInfo(MI))
1330         Changed |= expandStore(MOI.getValue(), MI);
1331       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
1332         Changed |= expandAtomicFence(MOI.getValue(), MI);
1333       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
1334         Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
1335     }
1336   }
1337 
1338   Changed |= removeAtomicPseudoMIs();
1339   return Changed;
1340 }
1341 
1342 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
1343 
1344 char SIMemoryLegalizer::ID = 0;
1345 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1346 
1347 FunctionPass *llvm::createSIMemoryLegalizerPass() {
1348   return new SIMemoryLegalizer();
1349 }
1350