xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (revision a30235a4c360c06bb57be1f10ae6866a71fb5622)
1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/IR/DiagnosticInfo.h"
23 #include "llvm/Support/AtomicOrdering.h"
24 #include "llvm/Support/TargetParser.h"
25 
26 using namespace llvm;
27 using namespace llvm::AMDGPU;
28 
29 #define DEBUG_TYPE "si-memory-legalizer"
30 #define PASS_NAME "SI Memory Legalizer"
31 
32 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
33     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
34     cl::desc("Use this to skip inserting cache invalidating instructions."));
35 
36 namespace {
37 
38 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
39 
40 /// Memory operation flags. Can be ORed together.
41 enum class SIMemOp {
42   NONE = 0u,
43   LOAD = 1u << 0,
44   STORE = 1u << 1,
45   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
46 };
47 
/// Where a new instruction is placed relative to an existing instruction:
/// immediately BEFORE it or immediately AFTER it.
enum class Position { BEFORE, AFTER };
54 
/// Atomic synchronization scopes supported by the AMDGPU target. The
/// enumerators take consecutive values in declaration order, starting with
/// NONE at zero.
enum class SIAtomicScope {
  NONE = 0,
  SINGLETHREAD = 1,
  WAVEFRONT = 2,
  WORKGROUP = 3,
  AGENT = 4,
  SYSTEM = 5
};
64 
65 /// The distinct address spaces supported by the AMDGPU target for
66 /// atomic memory operation. Can be ORed toether.
67 enum class SIAtomicAddrSpace {
68   NONE = 0u,
69   GLOBAL = 1u << 0,
70   LDS = 1u << 1,
71   SCRATCH = 1u << 2,
72   GDS = 1u << 3,
73   OTHER = 1u << 4,
74 
75   /// The address spaces that can be accessed by a FLAT instruction.
76   FLAT = GLOBAL | LDS | SCRATCH,
77 
78   /// The address spaces that support atomic instructions.
79   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
80 
81   /// All address spaces.
82   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
83 
84   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
85 };
86 
87 /// Sets named bit \p BitName to "true" if present in instruction \p MI.
88 /// \returns Returns true if \p MI is modified, false otherwise.
89 template <uint16_t BitName>
90 bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
91   int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
92   if (BitIdx == -1)
93     return false;
94 
95   MachineOperand &Bit = MI->getOperand(BitIdx);
96   if (Bit.getImm() != 0)
97     return false;
98 
99   Bit.setImm(1);
100   return true;
101 }
102 
103 class SIMemOpInfo final {
104 private:
105 
106   friend class SIMemOpAccess;
107 
108   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
109   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
110   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
111   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
112   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
113   bool IsCrossAddressSpaceOrdering = false;
114   bool IsVolatile = false;
115   bool IsNonTemporal = false;
116 
117   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
118               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
119               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
120               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
121               bool IsCrossAddressSpaceOrdering = true,
122               AtomicOrdering FailureOrdering =
123                 AtomicOrdering::SequentiallyConsistent,
124               bool IsVolatile = false,
125               bool IsNonTemporal = false)
126     : Ordering(Ordering), FailureOrdering(FailureOrdering),
127       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
128       InstrAddrSpace(InstrAddrSpace),
129       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
130       IsVolatile(IsVolatile),
131       IsNonTemporal(IsNonTemporal) {
132     // There is also no cross address space ordering if the ordering
133     // address space is the same as the instruction address space and
134     // only contains a single address space.
135     if ((OrderingAddrSpace == InstrAddrSpace) &&
136         isPowerOf2_32(uint32_t(InstrAddrSpace)))
137       this->IsCrossAddressSpaceOrdering = false;
138   }
139 
140 public:
141   /// \returns Atomic synchronization scope of the machine instruction used to
142   /// create this SIMemOpInfo.
143   SIAtomicScope getScope() const {
144     return Scope;
145   }
146 
147   /// \returns Ordering constraint of the machine instruction used to
148   /// create this SIMemOpInfo.
149   AtomicOrdering getOrdering() const {
150     return Ordering;
151   }
152 
153   /// \returns Failure ordering constraint of the machine instruction used to
154   /// create this SIMemOpInfo.
155   AtomicOrdering getFailureOrdering() const {
156     return FailureOrdering;
157   }
158 
159   /// \returns The address spaces be accessed by the machine
160   /// instruction used to create this SiMemOpInfo.
161   SIAtomicAddrSpace getInstrAddrSpace() const {
162     return InstrAddrSpace;
163   }
164 
165   /// \returns The address spaces that must be ordered by the machine
166   /// instruction used to create this SiMemOpInfo.
167   SIAtomicAddrSpace getOrderingAddrSpace() const {
168     return OrderingAddrSpace;
169   }
170 
171   /// \returns Return true iff memory ordering of operations on
172   /// different address spaces is required.
173   bool getIsCrossAddressSpaceOrdering() const {
174     return IsCrossAddressSpaceOrdering;
175   }
176 
177   /// \returns True if memory access of the machine instruction used to
178   /// create this SIMemOpInfo is volatile, false otherwise.
179   bool isVolatile() const {
180     return IsVolatile;
181   }
182 
183   /// \returns True if memory access of the machine instruction used to
184   /// create this SIMemOpInfo is nontemporal, false otherwise.
185   bool isNonTemporal() const {
186     return IsNonTemporal;
187   }
188 
189   /// \returns True if ordering constraint of the machine instruction used to
190   /// create this SIMemOpInfo is unordered or higher, false otherwise.
191   bool isAtomic() const {
192     return Ordering != AtomicOrdering::NotAtomic;
193   }
194 
195 };
196 
197 class SIMemOpAccess final {
198 private:
199   AMDGPUMachineModuleInfo *MMI = nullptr;
200 
201   /// Reports unsupported message \p Msg for \p MI to LLVM context.
202   void reportUnsupported(const MachineBasicBlock::iterator &MI,
203                          const char *Msg) const;
204 
205   /// Inspects the target synchonization scope \p SSID and determines
206   /// the SI atomic scope it corresponds to, the address spaces it
207   /// covers, and whether the memory ordering applies between address
208   /// spaces.
209   Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
210   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;
211 
212   /// \return Return a bit set of the address spaces accessed by \p AS.
213   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
214 
215   /// \returns Info constructed from \p MI, which has at least machine memory
216   /// operand.
217   Optional<SIMemOpInfo> constructFromMIWithMMO(
218       const MachineBasicBlock::iterator &MI) const;
219 
220 public:
221   /// Construct class to support accessing the machine memory operands
222   /// of instructions in the machine function \p MF.
223   SIMemOpAccess(MachineFunction &MF);
224 
225   /// \returns Load info if \p MI is a load operation, "None" otherwise.
226   Optional<SIMemOpInfo> getLoadInfo(
227       const MachineBasicBlock::iterator &MI) const;
228 
229   /// \returns Store info if \p MI is a store operation, "None" otherwise.
230   Optional<SIMemOpInfo> getStoreInfo(
231       const MachineBasicBlock::iterator &MI) const;
232 
233   /// \returns Atomic fence info if \p MI is an atomic fence operation,
234   /// "None" otherwise.
235   Optional<SIMemOpInfo> getAtomicFenceInfo(
236       const MachineBasicBlock::iterator &MI) const;
237 
238   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
239   /// rmw operation, "None" otherwise.
240   Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
241       const MachineBasicBlock::iterator &MI) const;
242 };
243 
244 class SICacheControl {
245 protected:
246 
247   /// AMDGPU subtarget info.
248   const GCNSubtarget &ST;
249 
250   /// Instruction info.
251   const SIInstrInfo *TII = nullptr;
252 
253   IsaVersion IV;
254 
255   /// Whether to insert cache invalidating instructions.
256   bool InsertCacheInv;
257 
258   SICacheControl(const GCNSubtarget &ST);
259 
260 public:
261 
262   /// Create a cache control for the subtarget \p ST.
263   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
264 
265   /// Update \p MI memory load instruction to bypass any caches up to
266   /// the \p Scope memory scope for address spaces \p
267   /// AddrSpace. Return true iff the instruction was modified.
268   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
269                                      SIAtomicScope Scope,
270                                      SIAtomicAddrSpace AddrSpace) const = 0;
271 
272   /// Update \p MI memory instruction of kind \p Op associated with address
273   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
274   /// true iff the instruction was modified.
275   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
276                                               SIAtomicAddrSpace AddrSpace,
277                                               SIMemOp Op, bool IsVolatile,
278                                               bool IsNonTemporal) const = 0;
279 
280   /// Inserts any necessary instructions at position \p Pos relative
281   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
282   /// \p Op associated with address spaces \p AddrSpace have completed. Used
283   /// between memory instructions to enforce the order they become visible as
284   /// observed by other memory instructions executing in memory scope \p Scope.
285   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
286   /// address spaces. Returns true iff any instructions inserted.
287   virtual bool insertWait(MachineBasicBlock::iterator &MI,
288                           SIAtomicScope Scope,
289                           SIAtomicAddrSpace AddrSpace,
290                           SIMemOp Op,
291                           bool IsCrossAddrSpaceOrdering,
292                           Position Pos) const = 0;
293 
294   /// Inserts any necessary instructions at position \p Pos relative to
295   /// instruction \p MI to ensure any subsequent memory instructions of this
296   /// thread with address spaces \p AddrSpace will observe the previous memory
297   /// operations by any thread for memory scopes up to memory scope \p Scope .
298   /// Returns true iff any instructions inserted.
299   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
300                              SIAtomicScope Scope,
301                              SIAtomicAddrSpace AddrSpace,
302                              Position Pos) const = 0;
303 
304   /// Inserts any necessary instructions at position \p Pos relative to
305   /// instruction \p MI to ensure previous memory instructions by this thread
306   /// with address spaces \p AddrSpace have completed and can be observed by
307   /// subsequent memory instructions by any thread executing in memory scope \p
308   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
309   /// between address spaces. Returns true iff any instructions inserted.
310   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
311                              SIAtomicScope Scope,
312                              SIAtomicAddrSpace AddrSpace,
313                              bool IsCrossAddrSpaceOrdering,
314                              Position Pos) const = 0;
315 
316   /// Virtual destructor to allow derivations to be deleted.
317   virtual ~SICacheControl() = default;
318 
319 };
320 
321 class SIGfx6CacheControl : public SICacheControl {
322 protected:
323 
324   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
325   /// is modified, false otherwise.
326   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
327     return enableNamedBit<AMDGPU::OpName::glc>(MI);
328   }
329 
330   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
331   /// is modified, false otherwise.
332   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
333     return enableNamedBit<AMDGPU::OpName::slc>(MI);
334   }
335 
336 public:
337 
338   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
339 
340   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
341                              SIAtomicScope Scope,
342                              SIAtomicAddrSpace AddrSpace) const override;
343 
344   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
345                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
346                                       bool IsVolatile,
347                                       bool IsNonTemporal) const override;
348 
349   bool insertWait(MachineBasicBlock::iterator &MI,
350                   SIAtomicScope Scope,
351                   SIAtomicAddrSpace AddrSpace,
352                   SIMemOp Op,
353                   bool IsCrossAddrSpaceOrdering,
354                   Position Pos) const override;
355 
356   bool insertAcquire(MachineBasicBlock::iterator &MI,
357                      SIAtomicScope Scope,
358                      SIAtomicAddrSpace AddrSpace,
359                      Position Pos) const override;
360 
361   bool insertRelease(MachineBasicBlock::iterator &MI,
362                      SIAtomicScope Scope,
363                      SIAtomicAddrSpace AddrSpace,
364                      bool IsCrossAddrSpaceOrdering,
365                      Position Pos) const override;
366 };
367 
368 class SIGfx7CacheControl : public SIGfx6CacheControl {
369 public:
370 
371   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
372 
373   bool insertAcquire(MachineBasicBlock::iterator &MI,
374                      SIAtomicScope Scope,
375                      SIAtomicAddrSpace AddrSpace,
376                      Position Pos) const override;
377 
378 };
379 
380 class SIGfx10CacheControl : public SIGfx7CacheControl {
381 protected:
382 
383   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
384   /// is modified, false otherwise.
385   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
386     return enableNamedBit<AMDGPU::OpName::dlc>(MI);
387   }
388 
389 public:
390 
391   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
392 
393   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
394                              SIAtomicScope Scope,
395                              SIAtomicAddrSpace AddrSpace) const override;
396 
397   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
398                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
399                                       bool IsVolatile,
400                                       bool IsNonTemporal) const override;
401 
402   bool insertWait(MachineBasicBlock::iterator &MI,
403                   SIAtomicScope Scope,
404                   SIAtomicAddrSpace AddrSpace,
405                   SIMemOp Op,
406                   bool IsCrossAddrSpaceOrdering,
407                   Position Pos) const override;
408 
409   bool insertAcquire(MachineBasicBlock::iterator &MI,
410                      SIAtomicScope Scope,
411                      SIAtomicAddrSpace AddrSpace,
412                      Position Pos) const override;
413 };
414 
415 class SIMemoryLegalizer final : public MachineFunctionPass {
416 private:
417 
418   /// Cache Control.
419   std::unique_ptr<SICacheControl> CC = nullptr;
420 
421   /// List of atomic pseudo instructions.
422   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
423 
424   /// Return true iff instruction \p MI is a atomic instruction that
425   /// returns a result.
426   bool isAtomicRet(const MachineInstr &MI) const {
427     return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
428   }
429 
430   /// Removes all processed atomic pseudo instructions from the current
431   /// function. Returns true if current function is modified, false otherwise.
432   bool removeAtomicPseudoMIs();
433 
434   /// Expands load operation \p MI. Returns true if instructions are
435   /// added/deleted or \p MI is modified, false otherwise.
436   bool expandLoad(const SIMemOpInfo &MOI,
437                   MachineBasicBlock::iterator &MI);
438   /// Expands store operation \p MI. Returns true if instructions are
439   /// added/deleted or \p MI is modified, false otherwise.
440   bool expandStore(const SIMemOpInfo &MOI,
441                    MachineBasicBlock::iterator &MI);
442   /// Expands atomic fence operation \p MI. Returns true if
443   /// instructions are added/deleted or \p MI is modified, false otherwise.
444   bool expandAtomicFence(const SIMemOpInfo &MOI,
445                          MachineBasicBlock::iterator &MI);
446   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
447   /// instructions are added/deleted or \p MI is modified, false otherwise.
448   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
449                                 MachineBasicBlock::iterator &MI);
450 
451 public:
452   static char ID;
453 
454   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
455 
456   void getAnalysisUsage(AnalysisUsage &AU) const override {
457     AU.setPreservesCFG();
458     MachineFunctionPass::getAnalysisUsage(AU);
459   }
460 
461   StringRef getPassName() const override {
462     return PASS_NAME;
463   }
464 
465   bool runOnMachineFunction(MachineFunction &MF) override;
466 };
467 
468 } // end namespace anonymous
469 
470 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
471                                       const char *Msg) const {
472   const Function &Func = MI->getParent()->getParent()->getFunction();
473   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
474   Func.getContext().diagnose(Diag);
475 }
476 
477 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
478 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
479                                SIAtomicAddrSpace InstrScope) const {
480   if (SSID == SyncScope::System)
481     return std::make_tuple(SIAtomicScope::SYSTEM,
482                            SIAtomicAddrSpace::ATOMIC,
483                            true);
484   if (SSID == MMI->getAgentSSID())
485     return std::make_tuple(SIAtomicScope::AGENT,
486                            SIAtomicAddrSpace::ATOMIC,
487                            true);
488   if (SSID == MMI->getWorkgroupSSID())
489     return std::make_tuple(SIAtomicScope::WORKGROUP,
490                            SIAtomicAddrSpace::ATOMIC,
491                            true);
492   if (SSID == MMI->getWavefrontSSID())
493     return std::make_tuple(SIAtomicScope::WAVEFRONT,
494                            SIAtomicAddrSpace::ATOMIC,
495                            true);
496   if (SSID == SyncScope::SingleThread)
497     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
498                            SIAtomicAddrSpace::ATOMIC,
499                            true);
500   if (SSID == MMI->getSystemOneAddressSpaceSSID())
501     return std::make_tuple(SIAtomicScope::SYSTEM,
502                            SIAtomicAddrSpace::ATOMIC & InstrScope,
503                            false);
504   if (SSID == MMI->getAgentOneAddressSpaceSSID())
505     return std::make_tuple(SIAtomicScope::AGENT,
506                            SIAtomicAddrSpace::ATOMIC & InstrScope,
507                            false);
508   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
509     return std::make_tuple(SIAtomicScope::WORKGROUP,
510                            SIAtomicAddrSpace::ATOMIC & InstrScope,
511                            false);
512   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
513     return std::make_tuple(SIAtomicScope::WAVEFRONT,
514                            SIAtomicAddrSpace::ATOMIC & InstrScope,
515                            false);
516   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
517     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
518                            SIAtomicAddrSpace::ATOMIC & InstrScope,
519                            false);
520   return None;
521 }
522 
523 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
524   if (AS == AMDGPUAS::FLAT_ADDRESS)
525     return SIAtomicAddrSpace::FLAT;
526   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
527     return SIAtomicAddrSpace::GLOBAL;
528   if (AS == AMDGPUAS::LOCAL_ADDRESS)
529     return SIAtomicAddrSpace::LDS;
530   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
531     return SIAtomicAddrSpace::SCRATCH;
532   if (AS == AMDGPUAS::REGION_ADDRESS)
533     return SIAtomicAddrSpace::GDS;
534 
535   return SIAtomicAddrSpace::OTHER;
536 }
537 
538 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
539   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
540 }
541 
542 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
543     const MachineBasicBlock::iterator &MI) const {
544   assert(MI->getNumMemOperands() > 0);
545 
546   SyncScope::ID SSID = SyncScope::SingleThread;
547   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
548   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
549   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
550   bool IsNonTemporal = true;
551   bool IsVolatile = false;
552 
553   // Validator should check whether or not MMOs cover the entire set of
554   // locations accessed by the memory instruction.
555   for (const auto &MMO : MI->memoperands()) {
556     IsNonTemporal &= MMO->isNonTemporal();
557     IsVolatile |= MMO->isVolatile();
558     InstrAddrSpace |=
559       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
560     AtomicOrdering OpOrdering = MMO->getOrdering();
561     if (OpOrdering != AtomicOrdering::NotAtomic) {
562       const auto &IsSyncScopeInclusion =
563           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
564       if (!IsSyncScopeInclusion) {
565         reportUnsupported(MI,
566           "Unsupported non-inclusive atomic synchronization scope");
567         return None;
568       }
569 
570       SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
571       Ordering =
572           isStrongerThan(Ordering, OpOrdering) ?
573               Ordering : MMO->getOrdering();
574       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
575              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
576       FailureOrdering =
577           isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
578               FailureOrdering : MMO->getFailureOrdering();
579     }
580   }
581 
582   SIAtomicScope Scope = SIAtomicScope::NONE;
583   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
584   bool IsCrossAddressSpaceOrdering = false;
585   if (Ordering != AtomicOrdering::NotAtomic) {
586     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
587     if (!ScopeOrNone) {
588       reportUnsupported(MI, "Unsupported atomic synchronization scope");
589       return None;
590     }
591     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
592       ScopeOrNone.getValue();
593     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
594         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
595       reportUnsupported(MI, "Unsupported atomic address space");
596       return None;
597     }
598   }
599   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
600                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
601                      IsNonTemporal);
602 }
603 
604 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
605     const MachineBasicBlock::iterator &MI) const {
606   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
607 
608   if (!(MI->mayLoad() && !MI->mayStore()))
609     return None;
610 
611   // Be conservative if there are no memory operands.
612   if (MI->getNumMemOperands() == 0)
613     return SIMemOpInfo();
614 
615   return constructFromMIWithMMO(MI);
616 }
617 
618 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
619     const MachineBasicBlock::iterator &MI) const {
620   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
621 
622   if (!(!MI->mayLoad() && MI->mayStore()))
623     return None;
624 
625   // Be conservative if there are no memory operands.
626   if (MI->getNumMemOperands() == 0)
627     return SIMemOpInfo();
628 
629   return constructFromMIWithMMO(MI);
630 }
631 
632 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
633     const MachineBasicBlock::iterator &MI) const {
634   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
635 
636   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
637     return None;
638 
639   AtomicOrdering Ordering =
640     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
641 
642   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
643   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
644   if (!ScopeOrNone) {
645     reportUnsupported(MI, "Unsupported atomic synchronization scope");
646     return None;
647   }
648 
649   SIAtomicScope Scope = SIAtomicScope::NONE;
650   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
651   bool IsCrossAddressSpaceOrdering = false;
652   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
653     ScopeOrNone.getValue();
654 
655   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
656       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
657     reportUnsupported(MI, "Unsupported atomic address space");
658     return None;
659   }
660 
661   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
662                      IsCrossAddressSpaceOrdering);
663 }
664 
665 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
666     const MachineBasicBlock::iterator &MI) const {
667   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
668 
669   if (!(MI->mayLoad() && MI->mayStore()))
670     return None;
671 
672   // Be conservative if there are no memory operands.
673   if (MI->getNumMemOperands() == 0)
674     return SIMemOpInfo();
675 
676   return constructFromMIWithMMO(MI);
677 }
678 
679 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
680   TII = ST.getInstrInfo();
681   IV = getIsaVersion(ST.getCPU());
682   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
683 }
684 
685 /* static */
686 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
687   GCNSubtarget::Generation Generation = ST.getGeneration();
688   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
689     return std::make_unique<SIGfx6CacheControl>(ST);
690   if (Generation < AMDGPUSubtarget::GFX10)
691     return std::make_unique<SIGfx7CacheControl>(ST);
692   return std::make_unique<SIGfx10CacheControl>(ST);
693 }
694 
695 bool SIGfx6CacheControl::enableLoadCacheBypass(
696     const MachineBasicBlock::iterator &MI,
697     SIAtomicScope Scope,
698     SIAtomicAddrSpace AddrSpace) const {
699   assert(MI->mayLoad() && !MI->mayStore());
700   bool Changed = false;
701 
702   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
703     switch (Scope) {
704     case SIAtomicScope::SYSTEM:
705     case SIAtomicScope::AGENT:
706       Changed |= enableGLCBit(MI);
707       break;
708     case SIAtomicScope::WORKGROUP:
709     case SIAtomicScope::WAVEFRONT:
710     case SIAtomicScope::SINGLETHREAD:
711       // No cache to bypass.
712       break;
713     default:
714       llvm_unreachable("Unsupported synchronization scope");
715     }
716   }
717 
718   /// The scratch address space does not need the global memory caches
719   /// to be bypassed as all memory operations by the same thread are
720   /// sequentially consistent, and no other thread can access scratch
721   /// memory.
722 
723   /// Other address spaces do not have a cache.
724 
725   return Changed;
726 }
727 
728 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
729     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
730     bool IsVolatile, bool IsNonTemporal) const {
731   // Only handle load and store, not atomic read-modify-write insructions. The
732   // latter use glc to indicate if the atomic returns a result and so must not
733   // be used for cache control.
734   assert(MI->mayLoad() ^ MI->mayStore());
735 
736   // Only update load and store, not LLVM IR atomic read-modify-write
737   // instructions. The latter are always marked as volatile so cannot sensibly
738   // handle it as do not want to pessimize all atomics. Also they do not support
739   // the nontemporal attribute.
740   assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
741 
742   bool Changed = false;
743 
744   if (IsVolatile) {
745     if (Op == SIMemOp::LOAD)
746       Changed |= enableGLCBit(MI);
747 
748     // Ensure operation has completed at system scope to cause all volatile
749     // operations to be visible outside the program in a global order. Do not
750     // request cross address space as only the global address space can be
751     // observable outside the program, so no need to cause a waitcnt for LDS
752     // address space operations.
753     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
754                           Position::AFTER);
755 
756     return Changed;
757   }
758 
759   if (IsNonTemporal) {
760     // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
761     Changed |= enableGLCBit(MI);
762     Changed |= enableSLCBit(MI);
763     return Changed;
764   }
765 
766   return Changed;
767 }
768 
769 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
770                                     SIAtomicScope Scope,
771                                     SIAtomicAddrSpace AddrSpace,
772                                     SIMemOp Op,
773                                     bool IsCrossAddrSpaceOrdering,
774                                     Position Pos) const {
775   bool Changed = false;
776 
777   MachineBasicBlock &MBB = *MI->getParent();
778   DebugLoc DL = MI->getDebugLoc();
779 
780   if (Pos == Position::AFTER)
781     ++MI;
782 
783   bool VMCnt = false;
784   bool LGKMCnt = false;
785 
786   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
787       SIAtomicAddrSpace::NONE) {
788     switch (Scope) {
789     case SIAtomicScope::SYSTEM:
790     case SIAtomicScope::AGENT:
791       VMCnt |= true;
792       break;
793     case SIAtomicScope::WORKGROUP:
794     case SIAtomicScope::WAVEFRONT:
795     case SIAtomicScope::SINGLETHREAD:
796       // The L1 cache keeps all memory operations in order for
797       // wavefronts in the same work-group.
798       break;
799     default:
800       llvm_unreachable("Unsupported synchronization scope");
801     }
802   }
803 
804   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
805     switch (Scope) {
806     case SIAtomicScope::SYSTEM:
807     case SIAtomicScope::AGENT:
808     case SIAtomicScope::WORKGROUP:
809       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
810       // not needed as LDS operations for all waves are executed in a total
811       // global ordering as observed by all waves. Required if also
812       // synchronizing with global/GDS memory as LDS operations could be
813       // reordered with respect to later global/GDS memory operations of the
814       // same wave.
815       LGKMCnt |= IsCrossAddrSpaceOrdering;
816       break;
817     case SIAtomicScope::WAVEFRONT:
818     case SIAtomicScope::SINGLETHREAD:
819       // The LDS keeps all memory operations in order for
820       // the same wavesfront.
821       break;
822     default:
823       llvm_unreachable("Unsupported synchronization scope");
824     }
825   }
826 
827   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
828     switch (Scope) {
829     case SIAtomicScope::SYSTEM:
830     case SIAtomicScope::AGENT:
831       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
832       // is not needed as GDS operations for all waves are executed in a total
833       // global ordering as observed by all waves. Required if also
834       // synchronizing with global/LDS memory as GDS operations could be
835       // reordered with respect to later global/LDS memory operations of the
836       // same wave.
837       LGKMCnt |= IsCrossAddrSpaceOrdering;
838       break;
839     case SIAtomicScope::WORKGROUP:
840     case SIAtomicScope::WAVEFRONT:
841     case SIAtomicScope::SINGLETHREAD:
842       // The GDS keeps all memory operations in order for
843       // the same work-group.
844       break;
845     default:
846       llvm_unreachable("Unsupported synchronization scope");
847     }
848   }
849 
850   if (VMCnt || LGKMCnt) {
851     unsigned WaitCntImmediate =
852       AMDGPU::encodeWaitcnt(IV,
853                             VMCnt ? 0 : getVmcntBitMask(IV),
854                             getExpcntBitMask(IV),
855                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
856     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
857     Changed = true;
858   }
859 
860   if (Pos == Position::AFTER)
861     --MI;
862 
863   return Changed;
864 }
865 
866 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
867                                        SIAtomicScope Scope,
868                                        SIAtomicAddrSpace AddrSpace,
869                                        Position Pos) const {
870   if (!InsertCacheInv)
871     return false;
872 
873   bool Changed = false;
874 
875   MachineBasicBlock &MBB = *MI->getParent();
876   DebugLoc DL = MI->getDebugLoc();
877 
878   if (Pos == Position::AFTER)
879     ++MI;
880 
881   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
882     switch (Scope) {
883     case SIAtomicScope::SYSTEM:
884     case SIAtomicScope::AGENT:
885       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
886       Changed = true;
887       break;
888     case SIAtomicScope::WORKGROUP:
889     case SIAtomicScope::WAVEFRONT:
890     case SIAtomicScope::SINGLETHREAD:
891       // No cache to invalidate.
892       break;
893     default:
894       llvm_unreachable("Unsupported synchronization scope");
895     }
896   }
897 
898   /// The scratch address space does not need the global memory cache
899   /// to be flushed as all memory operations by the same thread are
900   /// sequentially consistent, and no other thread can access scratch
901   /// memory.
902 
903   /// Other address spaces do not have a cache.
904 
905   if (Pos == Position::AFTER)
906     --MI;
907 
908   return Changed;
909 }
910 
911 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
912                                        SIAtomicScope Scope,
913                                        SIAtomicAddrSpace AddrSpace,
914                                        bool IsCrossAddrSpaceOrdering,
915                                        Position Pos) const {
916     return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
917                       IsCrossAddrSpaceOrdering, Pos);
918 }
919 
920 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
921                                        SIAtomicScope Scope,
922                                        SIAtomicAddrSpace AddrSpace,
923                                        Position Pos) const {
924   if (!InsertCacheInv)
925     return false;
926 
927   bool Changed = false;
928 
929   MachineBasicBlock &MBB = *MI->getParent();
930   DebugLoc DL = MI->getDebugLoc();
931 
932   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
933 
934   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
935                                     ? AMDGPU::BUFFER_WBINVL1
936                                     : AMDGPU::BUFFER_WBINVL1_VOL;
937 
938   if (Pos == Position::AFTER)
939     ++MI;
940 
941   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
942     switch (Scope) {
943     case SIAtomicScope::SYSTEM:
944     case SIAtomicScope::AGENT:
945       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
946       Changed = true;
947       break;
948     case SIAtomicScope::WORKGROUP:
949     case SIAtomicScope::WAVEFRONT:
950     case SIAtomicScope::SINGLETHREAD:
951       // No cache to invalidate.
952       break;
953     default:
954       llvm_unreachable("Unsupported synchronization scope");
955     }
956   }
957 
958   /// The scratch address space does not need the global memory cache
959   /// to be flushed as all memory operations by the same thread are
960   /// sequentially consistent, and no other thread can access scratch
961   /// memory.
962 
963   /// Other address spaces do not have a cache.
964 
965   if (Pos == Position::AFTER)
966     --MI;
967 
968   return Changed;
969 }
970 
971 bool SIGfx10CacheControl::enableLoadCacheBypass(
972     const MachineBasicBlock::iterator &MI,
973     SIAtomicScope Scope,
974     SIAtomicAddrSpace AddrSpace) const {
975   assert(MI->mayLoad() && !MI->mayStore());
976   bool Changed = false;
977 
978   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
979     /// TODO Do not set glc for rmw atomic operations as they
980     /// implicitly bypass the L0/L1 caches.
981 
982     switch (Scope) {
983     case SIAtomicScope::SYSTEM:
984     case SIAtomicScope::AGENT:
985       Changed |= enableGLCBit(MI);
986       Changed |= enableDLCBit(MI);
987       break;
988     case SIAtomicScope::WORKGROUP:
989       // In WGP mode the waves of a work-group can be executing on either CU of
990       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
991       // CU mode all waves of a work-group are on the same CU, and so the L0
992       // does not need to be bypassed.
993       if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
994       break;
995     case SIAtomicScope::WAVEFRONT:
996     case SIAtomicScope::SINGLETHREAD:
997       // No cache to bypass.
998       break;
999     default:
1000       llvm_unreachable("Unsupported synchronization scope");
1001     }
1002   }
1003 
1004   /// The scratch address space does not need the global memory caches
1005   /// to be bypassed as all memory operations by the same thread are
1006   /// sequentially consistent, and no other thread can access scratch
1007   /// memory.
1008 
1009   /// Other address spaces do not have a cache.
1010 
1011   return Changed;
1012 }
1013 
1014 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1015     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1016     bool IsVolatile, bool IsNonTemporal) const {
1017 
1018   // Only handle load and store, not atomic read-modify-write insructions. The
1019   // latter use glc to indicate if the atomic returns a result and so must not
1020   // be used for cache control.
1021   assert(MI->mayLoad() ^ MI->mayStore());
1022 
1023   // Only update load and store, not LLVM IR atomic read-modify-write
1024   // instructions. The latter are always marked as volatile so cannot sensibly
1025   // handle it as do not want to pessimize all atomics. Also they do not support
1026   // the nontemporal attribute.
1027   assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1028 
1029   bool Changed = false;
1030 
1031   if (IsVolatile) {
1032 
1033     if (Op == SIMemOp::LOAD) {
1034       Changed |= enableGLCBit(MI);
1035       Changed |= enableDLCBit(MI);
1036     }
1037 
1038     // Ensure operation has completed at system scope to cause all volatile
1039     // operations to be visible outside the program in a global order. Do not
1040     // request cross address space as only the global address space can be
1041     // observable outside the program, so no need to cause a waitcnt for LDS
1042     // address space operations.
1043     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1044                           Position::AFTER);
1045     return Changed;
1046   }
1047 
1048   if (IsNonTemporal) {
1049     // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
1050     Changed |= enableSLCBit(MI);
1051     return Changed;
1052   }
1053 
1054   return Changed;
1055 }
1056 
1057 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1058                                      SIAtomicScope Scope,
1059                                      SIAtomicAddrSpace AddrSpace,
1060                                      SIMemOp Op,
1061                                      bool IsCrossAddrSpaceOrdering,
1062                                      Position Pos) const {
1063   bool Changed = false;
1064 
1065   MachineBasicBlock &MBB = *MI->getParent();
1066   DebugLoc DL = MI->getDebugLoc();
1067 
1068   if (Pos == Position::AFTER)
1069     ++MI;
1070 
1071   bool VMCnt = false;
1072   bool VSCnt = false;
1073   bool LGKMCnt = false;
1074 
1075   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1076       SIAtomicAddrSpace::NONE) {
1077     switch (Scope) {
1078     case SIAtomicScope::SYSTEM:
1079     case SIAtomicScope::AGENT:
1080       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1081         VMCnt |= true;
1082       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1083         VSCnt |= true;
1084       break;
1085     case SIAtomicScope::WORKGROUP:
1086       // In WGP mode the waves of a work-group can be executing on either CU of
1087       // the WGP. Therefore need to wait for operations to complete to ensure
1088       // they are visible to waves in the other CU as the L0 is per CU.
1089       // Otherwise in CU mode and all waves of a work-group are on the same CU
1090       // which shares the same L0.
1091       if (!ST.isCuModeEnabled()) {
1092         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1093           VMCnt |= true;
1094         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1095           VSCnt |= true;
1096       }
1097       break;
1098     case SIAtomicScope::WAVEFRONT:
1099     case SIAtomicScope::SINGLETHREAD:
1100       // The L0 cache keeps all memory operations in order for
1101       // work-items in the same wavefront.
1102       break;
1103     default:
1104       llvm_unreachable("Unsupported synchronization scope");
1105     }
1106   }
1107 
1108   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1109     switch (Scope) {
1110     case SIAtomicScope::SYSTEM:
1111     case SIAtomicScope::AGENT:
1112     case SIAtomicScope::WORKGROUP:
1113       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1114       // not needed as LDS operations for all waves are executed in a total
1115       // global ordering as observed by all waves. Required if also
1116       // synchronizing with global/GDS memory as LDS operations could be
1117       // reordered with respect to later global/GDS memory operations of the
1118       // same wave.
1119       LGKMCnt |= IsCrossAddrSpaceOrdering;
1120       break;
1121     case SIAtomicScope::WAVEFRONT:
1122     case SIAtomicScope::SINGLETHREAD:
1123       // The LDS keeps all memory operations in order for
1124       // the same wavesfront.
1125       break;
1126     default:
1127       llvm_unreachable("Unsupported synchronization scope");
1128     }
1129   }
1130 
1131   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1132     switch (Scope) {
1133     case SIAtomicScope::SYSTEM:
1134     case SIAtomicScope::AGENT:
1135       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1136       // is not needed as GDS operations for all waves are executed in a total
1137       // global ordering as observed by all waves. Required if also
1138       // synchronizing with global/LDS memory as GDS operations could be
1139       // reordered with respect to later global/LDS memory operations of the
1140       // same wave.
1141       LGKMCnt |= IsCrossAddrSpaceOrdering;
1142       break;
1143     case SIAtomicScope::WORKGROUP:
1144     case SIAtomicScope::WAVEFRONT:
1145     case SIAtomicScope::SINGLETHREAD:
1146       // The GDS keeps all memory operations in order for
1147       // the same work-group.
1148       break;
1149     default:
1150       llvm_unreachable("Unsupported synchronization scope");
1151     }
1152   }
1153 
1154   if (VMCnt || LGKMCnt) {
1155     unsigned WaitCntImmediate =
1156       AMDGPU::encodeWaitcnt(IV,
1157                             VMCnt ? 0 : getVmcntBitMask(IV),
1158                             getExpcntBitMask(IV),
1159                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1160     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1161     Changed = true;
1162   }
1163 
1164   if (VSCnt) {
1165     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1166       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1167       .addImm(0);
1168     Changed = true;
1169   }
1170 
1171   if (Pos == Position::AFTER)
1172     --MI;
1173 
1174   return Changed;
1175 }
1176 
1177 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1178                                         SIAtomicScope Scope,
1179                                         SIAtomicAddrSpace AddrSpace,
1180                                         Position Pos) const {
1181   if (!InsertCacheInv)
1182     return false;
1183 
1184   bool Changed = false;
1185 
1186   MachineBasicBlock &MBB = *MI->getParent();
1187   DebugLoc DL = MI->getDebugLoc();
1188 
1189   if (Pos == Position::AFTER)
1190     ++MI;
1191 
1192   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1193     switch (Scope) {
1194     case SIAtomicScope::SYSTEM:
1195     case SIAtomicScope::AGENT:
1196       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1197       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1198       Changed = true;
1199       break;
1200     case SIAtomicScope::WORKGROUP:
1201       // In WGP mode the waves of a work-group can be executing on either CU of
1202       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1203       // in CU mode and all waves of a work-group are on the same CU, and so the
1204       // L0 does not need to be invalidated.
1205       if (!ST.isCuModeEnabled()) {
1206         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1207         Changed = true;
1208       }
1209       break;
1210     case SIAtomicScope::WAVEFRONT:
1211     case SIAtomicScope::SINGLETHREAD:
1212       // No cache to invalidate.
1213       break;
1214     default:
1215       llvm_unreachable("Unsupported synchronization scope");
1216     }
1217   }
1218 
1219   /// The scratch address space does not need the global memory cache
1220   /// to be flushed as all memory operations by the same thread are
1221   /// sequentially consistent, and no other thread can access scratch
1222   /// memory.
1223 
1224   /// Other address spaces do not have a cache.
1225 
1226   if (Pos == Position::AFTER)
1227     --MI;
1228 
1229   return Changed;
1230 }
1231 
1232 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1233   if (AtomicPseudoMIs.empty())
1234     return false;
1235 
1236   for (auto &MI : AtomicPseudoMIs)
1237     MI->eraseFromParent();
1238 
1239   AtomicPseudoMIs.clear();
1240   return true;
1241 }
1242 
1243 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
1244                                    MachineBasicBlock::iterator &MI) {
1245   assert(MI->mayLoad() && !MI->mayStore());
1246 
1247   bool Changed = false;
1248 
1249   if (MOI.isAtomic()) {
1250     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1251         MOI.getOrdering() == AtomicOrdering::Acquire ||
1252         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1253       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
1254                                            MOI.getOrderingAddrSpace());
1255     }
1256 
1257     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1258       Changed |= CC->insertWait(MI, MOI.getScope(),
1259                                 MOI.getOrderingAddrSpace(),
1260                                 SIMemOp::LOAD | SIMemOp::STORE,
1261                                 MOI.getIsCrossAddressSpaceOrdering(),
1262                                 Position::BEFORE);
1263 
1264     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1265         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1266       Changed |= CC->insertWait(MI, MOI.getScope(),
1267                                 MOI.getInstrAddrSpace(),
1268                                 SIMemOp::LOAD,
1269                                 MOI.getIsCrossAddressSpaceOrdering(),
1270                                 Position::AFTER);
1271       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1272                                    MOI.getOrderingAddrSpace(),
1273                                    Position::AFTER);
1274     }
1275 
1276     return Changed;
1277   }
1278 
1279   // Atomic instructions already bypass caches to the scope specified by the
1280   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1281   // need additional treatment.
1282   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
1283                                                 SIMemOp::LOAD, MOI.isVolatile(),
1284                                                 MOI.isNonTemporal());
1285   return Changed;
1286 }
1287 
1288 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
1289                                     MachineBasicBlock::iterator &MI) {
1290   assert(!MI->mayLoad() && MI->mayStore());
1291 
1292   bool Changed = false;
1293 
1294   if (MOI.isAtomic()) {
1295     if (MOI.getOrdering() == AtomicOrdering::Release ||
1296         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1297       Changed |= CC->insertRelease(MI, MOI.getScope(),
1298                                    MOI.getOrderingAddrSpace(),
1299                                    MOI.getIsCrossAddressSpaceOrdering(),
1300                                    Position::BEFORE);
1301 
1302     return Changed;
1303   }
1304 
1305   // Atomic instructions already bypass caches to the scope specified by the
1306   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1307   // need additional treatment.
1308   Changed |= CC->enableVolatileAndOrNonTemporal(
1309       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
1310       MOI.isNonTemporal());
1311   return Changed;
1312 }
1313 
1314 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
1315                                           MachineBasicBlock::iterator &MI) {
1316   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
1317 
1318   AtomicPseudoMIs.push_back(MI);
1319   bool Changed = false;
1320 
1321   if (MOI.isAtomic()) {
1322     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1323         MOI.getOrdering() == AtomicOrdering::Release ||
1324         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1325         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1326       /// TODO: This relies on a barrier always generating a waitcnt
1327       /// for LDS to ensure it is not reordered with the completion of
1328       /// the proceeding LDS operations. If barrier had a memory
1329       /// ordering and memory scope, then library does not need to
1330       /// generate a fence. Could add support in this file for
1331       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
1332       /// adding S_WAITCNT before a S_BARRIER.
1333       Changed |= CC->insertRelease(MI, MOI.getScope(),
1334                                    MOI.getOrderingAddrSpace(),
1335                                    MOI.getIsCrossAddressSpaceOrdering(),
1336                                    Position::BEFORE);
1337 
1338     // TODO: If both release and invalidate are happening they could be combined
1339     // to use the single "BUFFER_WBL2" instruction. This could be done by
1340     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
1341     // track cache invalidate and write back instructions.
1342 
1343     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1344         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1345         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1346       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1347                                    MOI.getOrderingAddrSpace(),
1348                                    Position::BEFORE);
1349 
1350     return Changed;
1351   }
1352 
1353   return Changed;
1354 }
1355 
1356 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
1357   MachineBasicBlock::iterator &MI) {
1358   assert(MI->mayLoad() && MI->mayStore());
1359 
1360   bool Changed = false;
1361 
1362   if (MOI.isAtomic()) {
1363     if (MOI.getOrdering() == AtomicOrdering::Release ||
1364         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1365         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1366         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
1367       Changed |= CC->insertRelease(MI, MOI.getScope(),
1368                                    MOI.getOrderingAddrSpace(),
1369                                    MOI.getIsCrossAddressSpaceOrdering(),
1370                                    Position::BEFORE);
1371 
1372     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1373         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1374         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1375         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
1376         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
1377       Changed |= CC->insertWait(MI, MOI.getScope(),
1378                                 MOI.getOrderingAddrSpace(),
1379                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
1380                                                    SIMemOp::STORE,
1381                                 MOI.getIsCrossAddressSpaceOrdering(),
1382                                 Position::AFTER);
1383       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1384                                    MOI.getOrderingAddrSpace(),
1385                                    Position::AFTER);
1386     }
1387 
1388     return Changed;
1389   }
1390 
1391   return Changed;
1392 }
1393 
1394 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
1395   bool Changed = false;
1396 
1397   SIMemOpAccess MOA(MF);
1398   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
1399 
1400   for (auto &MBB : MF) {
1401     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
1402 
1403       // Unbundle instructions after the post-RA scheduler.
1404       if (MI->isBundle()) {
1405         MachineBasicBlock::instr_iterator II(MI->getIterator());
1406         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
1407              I != E && I->isBundledWithPred(); ++I) {
1408           I->unbundleFromPred();
1409           for (MachineOperand &MO : I->operands())
1410             if (MO.isReg())
1411               MO.setIsInternalRead(false);
1412         }
1413 
1414         MI->eraseFromParent();
1415         MI = II->getIterator();
1416       }
1417 
1418       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
1419         continue;
1420 
1421       if (const auto &MOI = MOA.getLoadInfo(MI))
1422         Changed |= expandLoad(MOI.getValue(), MI);
1423       else if (const auto &MOI = MOA.getStoreInfo(MI))
1424         Changed |= expandStore(MOI.getValue(), MI);
1425       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
1426         Changed |= expandAtomicFence(MOI.getValue(), MI);
1427       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
1428         Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
1429     }
1430   }
1431 
1432   Changed |= removeAtomicPseudoMIs();
1433   return Changed;
1434 }
1435 
1436 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
1437 
1438 char SIMemoryLegalizer::ID = 0;
1439 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1440 
1441 FunctionPass *llvm::createSIMemoryLegalizerPass() {
1442   return new SIMemoryLegalizer();
1443 }
1444