//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements the memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
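
// Illustrative only: LLVM_MARK_AS_BITMASK_ENUM lets SIMemOp values compose
// with the usual bitwise operators, for example:
//
//   SIMemOp Op = SIMemOp::LOAD | SIMemOp::STORE;
//   if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
//     ; // Op includes a load.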

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
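
// Illustrative only: the membership test used throughout this file compares a
// masked flag against NONE rather than converting to bool, for example:
//
//   bool AccessesGlobal =
//       (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;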

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsVolatile(IsVolatile),
      IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }
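
  // Illustrative only: for an instruction whose memory operands cover just
  // LDS, a requested SYSTEM scope is clamped down to WORKGROUP by the code
  // above, since LDS is only shared within a work-group:
  //
  //   SIMemOpInfo(AtomicOrdering::SequentiallyConsistent,
  //               SIAtomicScope::SYSTEM, SIAtomicAddrSpace::LDS,
  //               SIAtomicAddrSpace::LDS).getScope()
  //       == SIAtomicScope::WORKGROUP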

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the address spaces corresponding to the target
  /// address space \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}
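
// Illustrative only: a FLAT pointer may address global, LDS or scratch
// memory, so AMDGPUAS::FLAT_ADDRESS maps to the composite
// SIAtomicAddrSpace::FLAT (GLOBAL | LDS | SCRATCH) rather than to any single
// address space.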

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
    ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
                     IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}
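
// Illustrative only: enableNamedBit ORs the requested bit into the existing
// cpol immediate, preserving bits that are already set. For instance, if the
// cpol operand currently holds SLC, enableNamedBit(MI, AMDGPU::CPol::GLC)
// leaves it as GLC|SLC and returns true; it returns false only when \p MI
// has no cpol operand at all.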

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}
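
// Illustrative only: a sketch of how the legalizer is expected to use this
// factory, constructing one cache control per machine function and then
// dispatching through the virtual interface:
//
//   std::unique_ptr<SICacheControl> CC = SICacheControl::create(ST);
//   CC->insertAcquire(MI, SIAtomicScope::AGENT, SIAtomicAddrSpace::GLOBAL,
//                     Position::AFTER);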

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result, so glc
  // must not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and honoring that
  // here would pessimize all atomics, so it is not handled. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so there is no need to cause a
    // waitcnt for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
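
// Illustrative only: in the S_WAITCNT immediate built above, a field value of
// 0 means "wait until that counter drains", while the field's full bit mask
// (e.g. getVmcntBitMask(IV)) means "do not wait on that counter". So a wait
// on vector memory operations alone would be encoded as:
//
//   unsigned Imm = AMDGPU::encodeWaitcnt(IV, /*Vmcnt=*/0,
//                                        getExpcntBitMask(IV),
//                                        getLgkmcntBitMask(IV));
//   BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Imm);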

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore we need to bypass the L1, which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result, so glc
  // must not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and honoring that
  // here would pessimize all atomics, so it is not handled. They also do not
  // support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space ordering as only the global address space
    // can be observable outside the program, so there is no need to cause a
    // waitcnt for LDS address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore we need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so there is no need to wait for global memory as all
    // waves in the work-group access the same L1, nor to wait for GDS as
    // accesses are ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting an "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore we need to invalidate the L1, which is per
      // CU. Otherwise in non-threadgroup split mode all waves of a work-group
      // are on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting an "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by the
      // same wave. An "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
      // vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode all waves of a work-group are on the same CU, and so the L0
      // does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}
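
// Illustrative only: for an agent-scope atomic load the code above sets both
// bits, yielding something like
//   global_load_dword v1, v[0:1], off glc dlc
// so the access bypasses the per-CU L0 (glc) and the L1 (dlc).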

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result, and so glc
  // must not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}
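
// Illustrative only: a volatile global load therefore becomes roughly
//   global_load_dword v1, v[0:1], off glc dlc
//   s_waitcnt vmcnt(0)
// while a nontemporal access merely gains the slc bit and needs no wait.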

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU
      // and so share the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

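  // Note: AMDGPU::encodeWaitcnt packs the vmcnt/expcnt/lgkmcnt fields into a
  // single immediate. Passing a counter's full bit mask (e.g.
  // getVmcntBitMask(IV)) leaves it at its maximum, which means "do not wait
  // on that counter"; expcnt is therefore never waited on here.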
  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

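  // GFX10 counts outstanding loads (vmcnt) and stores (vscnt) separately, so
  // store completion needs a dedicated S_WAITCNT_VSCNT; SGPR_NULL is used as
  // the source operand since only the immediate threshold of 0 matters here.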
  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
      // and so the L0 does not need to be invalidated.
      if (!ST.isCuModeEnabled()) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
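
// Illustrative only: an agent-scope acquire on GFX10 therefore ends up as
// roughly
//   s_waitcnt vmcnt(0)   ; inserted by insertWait at the call site
//   buffer_gl0_inv       ; invalidate the per-CU L0
//   buffer_gl1_inv       ; invalidate the L1
// with the waits and invalidates reduced or elided at narrower scopes.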

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
                                                SIMemOp::LOAD, MOI.isVolatile(),
                                                MOI.isNonTemporal());
  return Changed;
}
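
// Illustrative only: for an IR-level "load atomic ... seq_cst" at agent scope
// the expansion above produces a pattern along the lines of
//   s_waitcnt vmcnt(0) lgkmcnt(0)    ; insertWait, Position::BEFORE
//   global_load_dword ... glc dlc    ; the load, with cache bypass enabled
//   s_waitcnt vmcnt(0)               ; insertWait, Position::AFTER
//   buffer_gl0_inv                   ; insertAcquire, Position::AFTER
//   buffer_gl1_inv
// on GFX10; weaker orderings drop the corresponding pieces.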

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    return Changed;
  }

  // Atomic instructions already bypass caches to the scope specified by the
  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
  // need additional treatment.
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
  return Changed;
}
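
// Illustrative only: an IR-level "store atomic ... release" at agent scope
// becomes roughly
//   s_waitcnt vmcnt(0) lgkmcnt(0)    ; insertRelease, Position::BEFORE
//   s_waitcnt_vscnt null, 0x0        ; prior stores, GFX10 and later
//   global_store_dword ...           ; the store itself
// while weaker orderings drop the waits.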

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding an S_WAITCNT before an S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    // TODO: If both release and invalidate are happening they could be
    // combined to use the single "BUFFER_WBINV*" instruction. This could be
    // done by reorganizing this code, or as part of optimizing the
    // SIInsertWaitcnt pass to track cache invalidate and write back
    // instructions.

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);

    return Changed;
  }

  return Changed;
}
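
// Illustrative only: a "fence acquire" at agent scope has no associated memory
// instruction, so it lowers purely to ordering and invalidation code, e.g. on
// GFX10:
//   s_waitcnt vmcnt(0) lgkmcnt(0)    ; insertRelease, Position::BEFORE
//   s_waitcnt_vscnt null, 0x0
//   buffer_gl0_inv                   ; insertAcquire, Position::BEFORE
//   buffer_gl1_inv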

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}
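
// Illustrative only: a returning "atomicrmw ... seq_cst" at agent scope
// combines the load and store shapes above: a release wait before the
// instruction, then a wait on vmcnt (the returned value counts as a load)
// and a cache invalidate after it.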

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

      // Unbundle instructions after the post-RA scheduler.
      if (MI->isBundle() && MI->mayLoadOrStore()) {
        MachineBasicBlock::instr_iterator II(MI->getIterator());
        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
             I != E && I->isBundledWithPred(); ++I) {
          I->unbundleFromPred();
          for (MachineOperand &MO : I->operands())
            if (MO.isReg())
              MO.setIsInternalRead(false);
        }

        MI->eraseFromParent();
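        // Repoint MI at the first formerly-bundled instruction so that the
        // checks below still run on it in this loop iteration.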
        MI = II->getIterator();
      }

      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}
