//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements the AMDGPU memory model. More information
/// can be found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
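
// For example, callers that must order both loads and stores pass
// (SIMemOp::LOAD | SIMemOp::STORE), as SIGfx6CacheControl::insertRelease()
// does below.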

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
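
// For example, a FLAT instruction that may access global, LDS, and scratch
// memory maps to FLAT == (GLOBAL | LDS | SCRATCH), so a single instruction's
// address-space mask can have several bits set at once.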

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsVolatile(IsVolatile),
      IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
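    // For example, an instruction that only accesses LDS is clamped to
    // WORKGROUP scope, since LDS is only visible within a work-group.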
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to the LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the SI atomic address spaces corresponding to the
  /// LLVM target address space \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derived classes to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:

  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC,
                           true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
                           false);
  return std::nullopt;
}
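
// For example, an atomic on a global pointer using
// syncscope("workgroup-one-as") maps to (WORKGROUP, GLOBAL, false): the
// "one-as" variants intersect ATOMIC with the instruction's address spaces
// and do not request cross-address-space ordering.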

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}
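
// A single MI can carry several memory operands (e.g. after load/store
// merging); the loop above unions their address spaces, merges orderings to
// the strongest one, and widens to the most inclusive synchronization scope.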

std::optional<SIMemOpInfo>
SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return std::nullopt;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return std::nullopt;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      *ScopeOrNone;

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return std::nullopt;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}
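
// For example, enableGLCBit() ORs CPol::GLC into an existing cpol operand;
// instructions without a cpol operand (e.g. DS instructions) are left
// unchanged and the helper returns false.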

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX11)
    return std::make_unique<SIGfx10CacheControl>(ST);
  return std::make_unique<SIGfx11CacheControl>(ST);
}
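
// The feature checks precede the generation comparisons deliberately:
// GFX90A and GFX940 are GFX9-generation targets, so testing Generation first
// would incorrectly route them to the generic pre-GFX10 cache control.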

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write-through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile in LLVM IR, so
  // handling them here would pessimize all atomics; they also do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
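
// For example, an agent-scope ordering of global accesses emits
// "S_WAITCNT vmcnt(0)": only the vmcnt field is zeroed, while expcnt and
// lgkmcnt keep their maximum encodable values and so impose no wait.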

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}
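
// On GFX6 a release reduces to a wait: because the L1 is write-through (see
// enableStoreCacheBypass above), making prior loads and stores complete is
// sufficient for them to be observable at the requested scope.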

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile in LLVM IR, so
  // handling them here would pessimize all atomics; they also do not support
  // the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory operations
    // to complete to ensure they are visible to waves in the other CUs.
    // Otherwise in non-threadgroup split mode all waves of a work-group are on
    // the same CU, so no need to wait for global memory as all waves in the
    // work-group access the same L1, nor wait for GDS as accesses are ordered
    // on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}
1410 
1411 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1412                                          SIAtomicScope Scope,
1413                                          SIAtomicAddrSpace AddrSpace,
1414                                          bool IsCrossAddrSpaceOrdering,
1415                                          Position Pos) const {
1416   bool Changed = false;
1417 
1418   MachineBasicBlock &MBB = *MI->getParent();
1419   DebugLoc DL = MI->getDebugLoc();
1420 
1421   if (Pos == Position::AFTER)
1422     ++MI;
1423 
1424   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1425     switch (Scope) {
1426     case SIAtomicScope::SYSTEM:
1427       // Inserting an "S_WAITCNT vmcnt(0)" before is not required because the
1428       // hardware does not reorder memory operations by the same wave with
1429       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1430       // to initiate writeback of any dirty cache lines of earlier writes by the
1431       // same wave. An "S_WAITCNT vmcnt(0)" is needed after to ensure the
1432       // writeback has completed.
1433       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1434         // Set SC bits to indicate system scope.
1435         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1436       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1437       // vmcnt(0)" needed by the "BUFFER_WBL2".
1438       Changed = true;
1439       break;
1440     case SIAtomicScope::AGENT:
1441     case SIAtomicScope::WORKGROUP:
1442     case SIAtomicScope::WAVEFRONT:
1443     case SIAtomicScope::SINGLETHREAD:
1444       // Same as GFX7.
1445       break;
1446     default:
1447       llvm_unreachable("Unsupported synchronization scope");
1448     }
1449   }
1450 
1451   if (Pos == Position::AFTER)
1452     --MI;
1453 
1454   Changed |=
1455       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1456                                         IsCrossAddrSpaceOrdering, Pos);
1457 
1458   return Changed;
1459 }
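
// Illustrative system-scope release on GFX90A (sketch): the writeback above
// plus the wait emitted by the GFX7 call produce roughly
//   buffer_wbl2
//   s_waitcnt vmcnt(0)
// ahead of the releasing store or atomic.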
1460 
1461 bool SIGfx940CacheControl::enableLoadCacheBypass(
1462     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1463     SIAtomicAddrSpace AddrSpace) const {
1464   assert(MI->mayLoad() && !MI->mayStore());
1465   bool Changed = false;
1466 
1467   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1468     switch (Scope) {
1469     case SIAtomicScope::SYSTEM:
1470       // Set SC bits to indicate system scope.
1471       Changed |= enableSC0Bit(MI);
1472       Changed |= enableSC1Bit(MI);
1473       break;
1474     case SIAtomicScope::AGENT:
1475       // Set SC bits to indicate agent scope.
1476       Changed |= enableSC1Bit(MI);
1477       break;
1478     case SIAtomicScope::WORKGROUP:
1479       // In threadgroup split mode the waves of a work-group can be executing on
1480       // different CUs. Therefore need to bypass the L1 which is per CU.
1481       // Otherwise in non-threadgroup split mode all waves of a work-group are
1482       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1483       // bits to indicate work-group scope will do this automatically.
1484       Changed |= enableSC0Bit(MI);
1485       break;
1486     case SIAtomicScope::WAVEFRONT:
1487     case SIAtomicScope::SINGLETHREAD:
1488       // Leave SC bits unset to indicate wavefront scope.
1489       break;
1490     default:
1491       llvm_unreachable("Unsupported synchronization scope");
1492     }
1493   }
1494 
1495   /// The scratch address space does not need the global memory caches
1496   /// to be bypassed as all memory operations by the same thread are
1497   /// sequentially consistent, and no other thread can access scratch
1498   /// memory.
1499 
1500   /// Other address spaces do not have a cache.
1501 
1502   return Changed;
1503 }
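
// Summary of the SC encoding used above (derived from the cases handled in
// this function):
//   scope                   SC1 SC0
//   SYSTEM                   1   1
//   AGENT                    1   0
//   WORKGROUP                0   1
//   WAVEFRONT/SINGLETHREAD   0   0
// e.g. an agent-scope atomic load is expected to select to something like
//   global_load_dword v0, v[0:1], off sc1
// (illustrative assembly, not emitted verbatim here).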
1504 
1505 bool SIGfx940CacheControl::enableStoreCacheBypass(
1506     const MachineBasicBlock::iterator &MI,
1507     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1508   assert(!MI->mayLoad() && MI->mayStore());
1509   bool Changed = false;
1510 
1511   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1512     switch (Scope) {
1513     case SIAtomicScope::SYSTEM:
1514       // Set SC bits to indicate system scope.
1515       Changed |= enableSC0Bit(MI);
1516       Changed |= enableSC1Bit(MI);
1517       break;
1518     case SIAtomicScope::AGENT:
1519       // Set SC bits to indicate agent scope.
1520       Changed |= enableSC1Bit(MI);
1521       break;
1522     case SIAtomicScope::WORKGROUP:
1523       // Set SC bits to indicate workgroup scope.
1524       Changed |= enableSC0Bit(MI);
1525       break;
1526     case SIAtomicScope::WAVEFRONT:
1527     case SIAtomicScope::SINGLETHREAD:
1528       // Leave SC bits unset to indicate wavefront scope.
1529       break;
1530     default:
1531       llvm_unreachable("Unsupported synchronization scope");
1532     }
1533   }
1534 
1535   /// The scratch address space does not need the global memory caches
1536   /// to be bypassed as all memory operations by the same thread are
1537   /// sequentially consistent, and no other thread can access scratch
1538   /// memory.
1539 
1540   /// Other address spaces do not have a cache.
1541 
1542   return Changed;
1543 }
1544 
1545 bool SIGfx940CacheControl::enableRMWCacheBypass(
1546     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1547     SIAtomicAddrSpace AddrSpace) const {
1548   assert(MI->mayLoad() && MI->mayStore());
1549   bool Changed = false;
1550 
1551   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1552     switch (Scope) {
1553     case SIAtomicScope::SYSTEM:
1554       // Set SC1 bit to indicate system scope.
1555       Changed |= enableSC1Bit(MI);
1556       break;
1557     case SIAtomicScope::AGENT:
1558     case SIAtomicScope::WORKGROUP:
1559     case SIAtomicScope::WAVEFRONT:
1560     case SIAtomicScope::SINGLETHREAD:
1561       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1562       // to indicate system or agent scope. The SC0 bit is used to indicate if
1563       // they are return or no-return. Leave SC1 bit unset to indicate agent
1564       // scope.
1565       break;
1566     default:
1567       llvm_unreachable("Unsupported synchronization scope");
1568     }
1569   }
1570 
1571   return Changed;
1572 }
1573 
1574 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1575     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1576     bool IsVolatile, bool IsNonTemporal) const {
1577   // Only handle load and store, not atomic read-modify-write instructions. The
1578   // latter use glc to indicate if the atomic returns a result and so must not
1579   // be used for cache control.
1580   assert(MI->mayLoad() ^ MI->mayStore());
1581 
1582   // Only update load and store, not LLVM IR atomic read-modify-write
1583   // instructions. The latter are always marked as volatile, so they cannot
1584   // sensibly be handled here without pessimizing all atomics. They also do
1585   // not support the nontemporal attribute.
1586   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1587 
1588   bool Changed = false;
1589 
1590   if (IsVolatile) {
1591     // Set SC bits to indicate system scope.
1592     Changed |= enableSC0Bit(MI);
1593     Changed |= enableSC1Bit(MI);
1594 
1595     // Ensure operation has completed at system scope to cause all volatile
1596     // operations to be visible outside the program in a global order. Do not
1597     // request cross address space as only the global address space can be
1598     // observable outside the program, so no need to cause a waitcnt for LDS
1599     // address space operations.
1600     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1601                           Position::AFTER);
1602 
1603     return Changed;
1604   }
1605 
1606   if (IsNonTemporal) {
1607     Changed |= enableNTBit(MI);
1608     return Changed;
1609   }
1610 
1611   return Changed;
1612 }
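
// Illustrative GFX940 selections for the bits set above (a sketch): a
// volatile load carries both SC bits and is followed by a full VMEM wait,
//   global_load_dword v0, v[0:1], off sc0 sc1
//   s_waitcnt vmcnt(0)
// while a nontemporal access only carries the NT bit, e.g.
//   global_store_dword v[0:1], v2, off nt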
1613 
1614 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1615                                          SIAtomicScope Scope,
1616                                          SIAtomicAddrSpace AddrSpace,
1617                                          Position Pos) const {
1618   if (!InsertCacheInv)
1619     return false;
1620 
1621   bool Changed = false;
1622 
1623   MachineBasicBlock &MBB = *MI->getParent();
1624   DebugLoc DL = MI->getDebugLoc();
1625 
1626   if (Pos == Position::AFTER)
1627     ++MI;
1628 
1629   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1630     switch (Scope) {
1631     case SIAtomicScope::SYSTEM:
1632       // Ensures that following loads will not see stale remote VMEM data or
1633       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1634       // CC will never be stale due to the local memory probes.
1635       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1636           // Set SC bits to indicate system scope.
1637           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1638       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1639       // hardware does not reorder memory operations by the same wave with
1640       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1641       // remove any cache lines of earlier writes by the same wave and ensures
1642       // later reads by the same wave will refetch the cache lines.
1643       Changed = true;
1644       break;
1645     case SIAtomicScope::AGENT:
1646       // Ensures that following loads will not see stale remote data or local
1647       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1648       // due to the memory probes.
1649       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1650           // Set SC bits to indicate agent scope.
1651           .addImm(AMDGPU::CPol::SC1);
1652       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1653       // does not reorder memory operations with respect to the preceding buffer
1654       // invalidate. The invalidate is guaranteed to remove any cache lines of
1655       // earlier writes and ensures later reads will refetch the cache lines.
1656       Changed = true;
1657       break;
1658     case SIAtomicScope::WORKGROUP:
1659       // In threadgroup split mode the waves of a work-group can be executing on
1660       // different CUs. Therefore need to invalidate the L1 which is per CU.
1661       // Otherwise in non-threadgroup split mode all waves of a work-group are
1662       // on the same CU, and so the L1 does not need to be invalidated.
1663       if (ST.isTgSplitEnabled()) {
1664         // Ensures L1 is invalidated if in threadgroup split mode. In
1665         // non-threadgroup split mode it is a NOP, but there is no point
1666         // generating it when we know we are not in that mode.
1667         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1668             // Set SC bits to indicate work-group scope.
1669             .addImm(AMDGPU::CPol::SC0);
1670         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1671         // does not reorder memory operations with respect to the preceding buffer
1672         // invalidate. The invalidate is guaranteed to remove any cache lines of
1673         // earlier writes and ensures later reads will refetch the cache lines.
1674         Changed = true;
1675       }
1676       break;
1677     case SIAtomicScope::WAVEFRONT:
1678     case SIAtomicScope::SINGLETHREAD:
1679       // Could generate "BUFFER_INV" but it would do nothing as there are no
1680       // caches to invalidate.
1681       break;
1682     default:
1683       llvm_unreachable("Unsupported synchronization scope");
1684     }
1685   }
1686 
1687   /// The scratch address space does not need the global memory cache
1688   /// to be flushed as all memory operations by the same thread are
1689   /// sequentially consistent, and no other thread can access scratch
1690   /// memory.
1691 
1692   /// Other address spaces do not have a cache.
1693 
1694   if (Pos == Position::AFTER)
1695     --MI;
1696 
1697   return Changed;
1698 }
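
// Illustrative acquire invalidates built above (sketch): depending on scope,
//   buffer_inv sc0 sc1   ; system
//   buffer_inv sc1       ; agent
//   buffer_inv sc0       ; work-group, only in tgsplit mode
// with no trailing s_waitcnt needed for the invalidate itself.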
1699 
1700 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1701                                          SIAtomicScope Scope,
1702                                          SIAtomicAddrSpace AddrSpace,
1703                                          bool IsCrossAddrSpaceOrdering,
1704                                          Position Pos) const {
1705   bool Changed = false;
1706 
1707   MachineBasicBlock &MBB = *MI->getParent();
1708   DebugLoc DL = MI->getDebugLoc();
1709 
1710   if (Pos == Position::AFTER)
1711     ++MI;
1712 
1713   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1714     switch (Scope) {
1715     case SIAtomicScope::SYSTEM:
1716       // Inserting an "S_WAITCNT vmcnt(0)" before is not required because the
1717       // hardware does not reorder memory operations by the same wave with
1718       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1719       // to initiate writeback of any dirty cache lines of earlier writes by the
1720       // same wave. An "S_WAITCNT vmcnt(0)" is needed after to ensure the
1721       // writeback has completed.
1722       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1723           // Set SC bits to indicate system scope.
1724           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1725       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1726       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1727       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1728       Changed = true;
1729       break;
1730     case SIAtomicScope::AGENT:
1731       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1732           // Set SC bits to indicate agent scope.
1733           .addImm(AMDGPU::CPol::SC1);
1734 
1735       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1736       // SIAtomicScope::AGENT, the following insertWait will generate the
1737       // required "S_WAITCNT vmcnt(0)".
1738       Changed = true;
1739       break;
1740     case SIAtomicScope::WORKGROUP:
1741     case SIAtomicScope::WAVEFRONT:
1742     case SIAtomicScope::SINGLETHREAD:
1743       // Do not generate "BUFFER_WBL2" as there are no caches it would
1744       // writeback, and would require an otherwise unnecessary
1745       // "S_WAITCNT vmcnt(0)".
1746       break;
1747     default:
1748       llvm_unreachable("Unsupported synchronization scope");
1749     }
1750   }
1751 
1752   if (Pos == Position::AFTER)
1753     --MI;
1754 
1755   // Emit the S_WAITCNT needed by any "BUFFER_WBL2" built above, as well as any
1756   // other required S_WAITCNT.
1757   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1758                         IsCrossAddrSpaceOrdering, Pos);
1759 
1760   return Changed;
1761 }
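
// Illustrative agent-scope release on GFX940 (sketch): the writeback above
// plus the trailing insertWait produce roughly
//   buffer_wbl2 sc1
//   s_waitcnt vmcnt(0)
// ahead of the releasing operation.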
1762 
1763 bool SIGfx10CacheControl::enableLoadCacheBypass(
1764     const MachineBasicBlock::iterator &MI,
1765     SIAtomicScope Scope,
1766     SIAtomicAddrSpace AddrSpace) const {
1767   assert(MI->mayLoad() && !MI->mayStore());
1768   bool Changed = false;
1769 
1770   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1771     switch (Scope) {
1772     case SIAtomicScope::SYSTEM:
1773     case SIAtomicScope::AGENT:
1774       // Set the L0 and L1 cache policies to MISS_EVICT.
1775       // Note: there is no L2 cache coherent bypass control at the ISA level.
1776       Changed |= enableGLCBit(MI);
1777       Changed |= enableDLCBit(MI);
1778       break;
1779     case SIAtomicScope::WORKGROUP:
1780       // In WGP mode the waves of a work-group can be executing on either CU of
1781       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1782       // CU mode all waves of a work-group are on the same CU, and so the L0
1783       // does not need to be bypassed.
1784       if (!ST.isCuModeEnabled())
1785         Changed |= enableGLCBit(MI);
1786       break;
1787     case SIAtomicScope::WAVEFRONT:
1788     case SIAtomicScope::SINGLETHREAD:
1789       // No cache to bypass.
1790       break;
1791     default:
1792       llvm_unreachable("Unsupported synchronization scope");
1793     }
1794   }
1795 
1796   /// The scratch address space does not need the global memory caches
1797   /// to be bypassed as all memory operations by the same thread are
1798   /// sequentially consistent, and no other thread can access scratch
1799   /// memory.
1800 
1801   /// Other address spaces do not have a cache.
1802 
1803   return Changed;
1804 }
1805 
1806 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1807     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1808     bool IsVolatile, bool IsNonTemporal) const {
1809 
1810   // Only handle load and store, not atomic read-modify-write instructions. The
1811   // latter use glc to indicate if the atomic returns a result and so must not
1812   // be used for cache control.
1813   assert(MI->mayLoad() ^ MI->mayStore());
1814 
1815   // Only update load and store, not LLVM IR atomic read-modify-write
1816   // instructions. The latter are always marked as volatile, so they cannot
1817   // sensibly be handled here without pessimizing all atomics. They also do
1818   // not support the nontemporal attribute.
1819   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1820 
1821   bool Changed = false;
1822 
1823   if (IsVolatile) {
1824     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1825     // and MISS_LRU for store instructions.
1826     // Note: there is no L2 cache coherent bypass control at the ISA level.
1827     if (Op == SIMemOp::LOAD) {
1828       Changed |= enableGLCBit(MI);
1829       Changed |= enableDLCBit(MI);
1830     }
1831 
1832     // Ensure operation has completed at system scope to cause all volatile
1833     // operations to be visible outside the program in a global order. Do not
1834     // request cross address space as only the global address space can be
1835     // observable outside the program, so no need to cause a waitcnt for LDS
1836     // address space operations.
1837     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1838                           Position::AFTER);
1839     return Changed;
1840   }
1841 
1842   if (IsNonTemporal) {
1843     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1844     // and L2 cache policy to STREAM.
1845     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1846     // to MISS_EVICT and the L2 cache policy to STREAM.
1847     if (Op == SIMemOp::STORE)
1848       Changed |= enableGLCBit(MI);
1849     Changed |= enableSLCBit(MI);
1850 
1851     return Changed;
1852   }
1853 
1854   return Changed;
1855 }
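
// Illustrative GFX10 selections for the bits set above (sketch): a volatile
// load bypasses L0/L1 and is followed by a full VMEM wait,
//   global_load_dword v0, v[0:1], off glc dlc
//   s_waitcnt vmcnt(0)
// while a nontemporal store streams through L2, e.g.
//   global_store_dword v[0:1], v2, off glc slc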
1856 
1857 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1858                                      SIAtomicScope Scope,
1859                                      SIAtomicAddrSpace AddrSpace,
1860                                      SIMemOp Op,
1861                                      bool IsCrossAddrSpaceOrdering,
1862                                      Position Pos) const {
1863   bool Changed = false;
1864 
1865   MachineBasicBlock &MBB = *MI->getParent();
1866   DebugLoc DL = MI->getDebugLoc();
1867 
1868   if (Pos == Position::AFTER)
1869     ++MI;
1870 
1871   bool VMCnt = false;
1872   bool VSCnt = false;
1873   bool LGKMCnt = false;
1874 
1875   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1876       SIAtomicAddrSpace::NONE) {
1877     switch (Scope) {
1878     case SIAtomicScope::SYSTEM:
1879     case SIAtomicScope::AGENT:
1880       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1881         VMCnt |= true;
1882       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1883         VSCnt |= true;
1884       break;
1885     case SIAtomicScope::WORKGROUP:
1886       // In WGP mode the waves of a work-group can be executing on either CU of
1887       // the WGP. Therefore need to wait for operations to complete to ensure
1888       // they are visible to waves in the other CU as the L0 is per CU.
1889       // Otherwise in CU mode all waves of a work-group are on the same CU,
1890       // which shares the same L0.
1891       if (!ST.isCuModeEnabled()) {
1892         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1893           VMCnt |= true;
1894         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1895           VSCnt |= true;
1896       }
1897       break;
1898     case SIAtomicScope::WAVEFRONT:
1899     case SIAtomicScope::SINGLETHREAD:
1900       // The L0 cache keeps all memory operations in order for
1901       // work-items in the same wavefront.
1902       break;
1903     default:
1904       llvm_unreachable("Unsupported synchronization scope");
1905     }
1906   }
1907 
1908   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1909     switch (Scope) {
1910     case SIAtomicScope::SYSTEM:
1911     case SIAtomicScope::AGENT:
1912     case SIAtomicScope::WORKGROUP:
1913       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1914       // not needed as LDS operations for all waves are executed in a total
1915       // global ordering as observed by all waves. Required if also
1916       // synchronizing with global/GDS memory as LDS operations could be
1917       // reordered with respect to later global/GDS memory operations of the
1918       // same wave.
1919       LGKMCnt |= IsCrossAddrSpaceOrdering;
1920       break;
1921     case SIAtomicScope::WAVEFRONT:
1922     case SIAtomicScope::SINGLETHREAD:
1923       // The LDS keeps all memory operations in order for
1924       // the same wavefront.
1925       break;
1926     default:
1927       llvm_unreachable("Unsupported synchronization scope");
1928     }
1929   }
1930 
1931   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1932     switch (Scope) {
1933     case SIAtomicScope::SYSTEM:
1934     case SIAtomicScope::AGENT:
1935       // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1936       // is not needed as GDS operations for all waves are executed in a total
1937       // global ordering as observed by all waves. Required if also
1938       // synchronizing with global/LDS memory as GDS operations could be
1939       // reordered with respect to later global/LDS memory operations of the
1940       // same wave.
1941       LGKMCnt |= IsCrossAddrSpaceOrdering;
1942       break;
1943     case SIAtomicScope::WORKGROUP:
1944     case SIAtomicScope::WAVEFRONT:
1945     case SIAtomicScope::SINGLETHREAD:
1946       // The GDS keeps all memory operations in order for
1947       // the same work-group.
1948       break;
1949     default:
1950       llvm_unreachable("Unsupported synchronization scope");
1951     }
1952   }
1953 
1954   if (VMCnt || LGKMCnt) {
1955     unsigned WaitCntImmediate =
1956       AMDGPU::encodeWaitcnt(IV,
1957                             VMCnt ? 0 : getVmcntBitMask(IV),
1958                             getExpcntBitMask(IV),
1959                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1960     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1961     Changed = true;
1962   }
1963 
1964   if (VSCnt) {
1965     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1966       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1967       .addImm(0);
1968     Changed = true;
1969   }
1970 
1971   if (Pos == Position::AFTER)
1972     --MI;
1973 
1974   return Changed;
1975 }
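
// Illustrative output (sketch): for an agent-scope request covering both
// loads and stores on the global address space this builds
//   s_waitcnt vmcnt(0)
//   s_waitcnt_vscnt null, 0x0
// since GFX10 splits VMEM stores onto the separate VSCNT counter.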
1976 
1977 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1978                                         SIAtomicScope Scope,
1979                                         SIAtomicAddrSpace AddrSpace,
1980                                         Position Pos) const {
1981   if (!InsertCacheInv)
1982     return false;
1983 
1984   bool Changed = false;
1985 
1986   MachineBasicBlock &MBB = *MI->getParent();
1987   DebugLoc DL = MI->getDebugLoc();
1988 
1989   if (Pos == Position::AFTER)
1990     ++MI;
1991 
1992   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1993     switch (Scope) {
1994     case SIAtomicScope::SYSTEM:
1995     case SIAtomicScope::AGENT:
1996       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1997       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1998       Changed = true;
1999       break;
2000     case SIAtomicScope::WORKGROUP:
2001       // In WGP mode the waves of a work-group can be executing on either CU of
2002       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2003       // in CU mode all waves of a work-group are on the same CU, and so the
2004       // L0 does not need to be invalidated.
2005       if (!ST.isCuModeEnabled()) {
2006         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2007         Changed = true;
2008       }
2009       break;
2010     case SIAtomicScope::WAVEFRONT:
2011     case SIAtomicScope::SINGLETHREAD:
2012       // No cache to invalidate.
2013       break;
2014     default:
2015       llvm_unreachable("Unsupported synchronization scope");
2016     }
2017   }
2018 
2019   /// The scratch address space does not need the global memory cache
2020   /// to be flushed as all memory operations by the same thread are
2021   /// sequentially consistent, and no other thread can access scratch
2022   /// memory.
2023 
2024   /// Other address spaces do not have a cache.
2025 
2026   if (Pos == Position::AFTER)
2027     --MI;
2028 
2029   return Changed;
2030 }
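
// Illustrative acquire invalidates built above (sketch): agent and system
// scope produce
//   buffer_gl0_inv
//   buffer_gl1_inv
// while work-group scope in WGP mode only needs the buffer_gl0_inv.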
2031 
2032 bool SIGfx11CacheControl::enableLoadCacheBypass(
2033     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2034     SIAtomicAddrSpace AddrSpace) const {
2035   assert(MI->mayLoad() && !MI->mayStore());
2036   bool Changed = false;
2037 
2038   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2039     switch (Scope) {
2040     case SIAtomicScope::SYSTEM:
2041     case SIAtomicScope::AGENT:
2042       // Set the L0 and L1 cache policies to MISS_EVICT.
2043       // Note: there is no L2 cache coherent bypass control at the ISA level.
2044       Changed |= enableGLCBit(MI);
2045       break;
2046     case SIAtomicScope::WORKGROUP:
2047       // In WGP mode the waves of a work-group can be executing on either CU of
2048       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2049       // CU mode all waves of a work-group are on the same CU, and so the L0
2050       // does not need to be bypassed.
2051       if (!ST.isCuModeEnabled())
2052         Changed |= enableGLCBit(MI);
2053       break;
2054     case SIAtomicScope::WAVEFRONT:
2055     case SIAtomicScope::SINGLETHREAD:
2056       // No cache to bypass.
2057       break;
2058     default:
2059       llvm_unreachable("Unsupported synchronization scope");
2060     }
2061   }
2062 
2063   /// The scratch address space does not need the global memory caches
2064   /// to be bypassed as all memory operations by the same thread are
2065   /// sequentially consistent, and no other thread can access scratch
2066   /// memory.
2067 
2068   /// Other address spaces do not have a cache.
2069 
2070   return Changed;
2071 }
2072 
2073 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2074     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2075     bool IsVolatile, bool IsNonTemporal) const {
2076 
2077   // Only handle load and store, not atomic read-modify-write instructions. The
2078   // latter use glc to indicate if the atomic returns a result and so must not
2079   // be used for cache control.
2080   assert(MI->mayLoad() ^ MI->mayStore());
2081 
2082   // Only update load and store, not LLVM IR atomic read-modify-write
2083   // instructions. The latter are always marked as volatile, so they cannot
2084   // sensibly be handled here without pessimizing all atomics. They also do
2085   // not support the nontemporal attribute.
2086   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2087 
2088   bool Changed = false;
2089 
2090   if (IsVolatile) {
2091     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2092     // and MISS_LRU for store instructions.
2093     // Note: there is no L2 cache coherent bypass control at the ISA level.
2094     if (Op == SIMemOp::LOAD)
2095       Changed |= enableGLCBit(MI);
2096 
2097     // Set MALL NOALLOC for load and store instructions.
2098     Changed |= enableDLCBit(MI);
2099 
2100     // Ensure operation has completed at system scope to cause all volatile
2101     // operations to be visible outside the program in a global order. Do not
2102     // request cross address space as only the global address space can be
2103     // observable outside the program, so no need to cause a waitcnt for LDS
2104     // address space operations.
2105     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2106                           Position::AFTER);
2107     return Changed;
2108   }
2109 
2110   if (IsNonTemporal) {
2111     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2112     // and L2 cache policy to STREAM.
2113     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2114     // to MISS_EVICT and the L2 cache policy to STREAM.
2115     if (Op == SIMemOp::STORE)
2116       Changed |= enableGLCBit(MI);
2117     Changed |= enableSLCBit(MI);
2118 
2119     // Set MALL NOALLOC for load and store instructions.
2120     Changed |= enableDLCBit(MI);
2121     return Changed;
2122   }
2123 
2124   return Changed;
2125 }
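
// Illustrative GFX11 selection for a volatile load with the bits set above
// (sketch):
//   global_load_b32 v0, v[0:1], off glc dlc
//   s_waitcnt vmcnt(0)
// where dlc now requests MALL NOALLOC rather than L1 bypass.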
2126 
2127 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2128   if (AtomicPseudoMIs.empty())
2129     return false;
2130 
2131   for (auto &MI : AtomicPseudoMIs)
2132     MI->eraseFromParent();
2133 
2134   AtomicPseudoMIs.clear();
2135   return true;
2136 }
2137 
2138 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2139                                    MachineBasicBlock::iterator &MI) {
2140   assert(MI->mayLoad() && !MI->mayStore());
2141 
2142   bool Changed = false;
2143 
2144   if (MOI.isAtomic()) {
2145     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2146         MOI.getOrdering() == AtomicOrdering::Acquire ||
2147         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2148       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2149                                            MOI.getOrderingAddrSpace());
2150     }
2151 
2152     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2153       Changed |= CC->insertWait(MI, MOI.getScope(),
2154                                 MOI.getOrderingAddrSpace(),
2155                                 SIMemOp::LOAD | SIMemOp::STORE,
2156                                 MOI.getIsCrossAddressSpaceOrdering(),
2157                                 Position::BEFORE);
2158 
2159     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2160         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2161       Changed |= CC->insertWait(MI, MOI.getScope(),
2162                                 MOI.getInstrAddrSpace(),
2163                                 SIMemOp::LOAD,
2164                                 MOI.getIsCrossAddressSpaceOrdering(),
2165                                 Position::AFTER);
2166       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2167                                    MOI.getOrderingAddrSpace(),
2168                                    Position::AFTER);
2169     }
2170 
2171     return Changed;
2172   }
2173 
2174   // Atomic instructions already bypass caches to the scope specified by the
2175   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2176   // need additional treatment.
2177   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2178                                                 SIMemOp::LOAD, MOI.isVolatile(),
2179                                                 MOI.isNonTemporal());
2180   return Changed;
2181 }
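
// End-to-end illustration of expandLoad (a sketch, assuming GFX10 and agent
// scope): an IR load such as
//   %v = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire, align 4
// is expected to be legalized to roughly
//   global_load_dword v0, v[0:1], off glc dlc
//   s_waitcnt vmcnt(0)
//   buffer_gl0_inv
//   buffer_gl1_inv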
2182 
2183 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2184                                     MachineBasicBlock::iterator &MI) {
2185   assert(!MI->mayLoad() && MI->mayStore());
2186 
2187   bool Changed = false;
2188 
2189   if (MOI.isAtomic()) {
2190     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2191         MOI.getOrdering() == AtomicOrdering::Release ||
2192         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2193       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2194                                             MOI.getOrderingAddrSpace());
2195     }
2196 
2197     if (MOI.getOrdering() == AtomicOrdering::Release ||
2198         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2199       Changed |= CC->insertRelease(MI, MOI.getScope(),
2200                                    MOI.getOrderingAddrSpace(),
2201                                    MOI.getIsCrossAddressSpaceOrdering(),
2202                                    Position::BEFORE);
2203 
2204     return Changed;
2205   }
2206 
2207   // Atomic instructions already bypass caches to the scope specified by the
2208   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2209   // need additional treatment.
2210   Changed |= CC->enableVolatileAndOrNonTemporal(
2211       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2212       MOI.isNonTemporal());
2213   return Changed;
2214 }
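
// End-to-end illustration of expandStore (a sketch, assuming GFX10 and agent
// scope): an IR store such as
//   store atomic i32 %v, ptr addrspace(1) %p syncscope("agent") release, align 4
// gets its release barrier inserted before it, roughly
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0x0
//   global_store_dword v[0:1], v2, off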
2215 
2216 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2217                                           MachineBasicBlock::iterator &MI) {
2218   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2219 
2220   AtomicPseudoMIs.push_back(MI);
2221   bool Changed = false;
2222 
2223   if (MOI.isAtomic()) {
2224     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2225         MOI.getOrdering() == AtomicOrdering::Release ||
2226         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2227         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2228       /// TODO: This relies on a barrier always generating a waitcnt
2229       /// for LDS to ensure it is not reordered with the completion of
2230       /// the preceding LDS operations. If barrier had a memory
2231       /// ordering and memory scope, then the library would not need to
2232       /// generate a fence. Could add support in this file for
2233       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2234       /// adding S_WAITCNT before a S_BARRIER.
2235       Changed |= CC->insertRelease(MI, MOI.getScope(),
2236                                    MOI.getOrderingAddrSpace(),
2237                                    MOI.getIsCrossAddressSpaceOrdering(),
2238                                    Position::BEFORE);
2239 
2240     // TODO: If both release and invalidate are happening they could be combined
2241     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2242     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2243     // track cache invalidate and write back instructions.
2244 
2245     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2246         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2247         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2248       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2249                                    MOI.getOrderingAddrSpace(),
2250                                    Position::BEFORE);
2251 
2252     return Changed;
2253   }
2254 
2255   return Changed;
2256 }
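
// End-to-end illustration of expandAtomicFence (a sketch, assuming GFX10):
//   fence syncscope("agent") seq_cst
// becomes roughly
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0x0
//   buffer_gl0_inv
//   buffer_gl1_inv
// and the ATOMIC_FENCE pseudo itself is deleted by removeAtomicPseudoMIs().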
2257 
2258 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2259   MachineBasicBlock::iterator &MI) {
2260   assert(MI->mayLoad() && MI->mayStore());
2261 
2262   bool Changed = false;
2263 
2264   if (MOI.isAtomic()) {
2265     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2266         MOI.getOrdering() == AtomicOrdering::Acquire ||
2267         MOI.getOrdering() == AtomicOrdering::Release ||
2268         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2269         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2270       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2271                                           MOI.getInstrAddrSpace());
2272     }
2273 
2274     if (MOI.getOrdering() == AtomicOrdering::Release ||
2275         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2276         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2277         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2278       Changed |= CC->insertRelease(MI, MOI.getScope(),
2279                                    MOI.getOrderingAddrSpace(),
2280                                    MOI.getIsCrossAddressSpaceOrdering(),
2281                                    Position::BEFORE);
2282 
2283     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2284         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2285         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2286         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2287         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2288       Changed |= CC->insertWait(MI, MOI.getScope(),
2289                                 MOI.getInstrAddrSpace(),
2290                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
2291                                                    SIMemOp::STORE,
2292                                 MOI.getIsCrossAddressSpaceOrdering(),
2293                                 Position::AFTER);
2294       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2295                                    MOI.getOrderingAddrSpace(),
2296                                    Position::AFTER);
2297     }
2298 
2299     return Changed;
2300   }
2301 
2302   return Changed;
2303 }
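
// End-to-end illustration (a sketch, assuming GFX90A and an agent-scope
// acq_rel atomicrmw that returns its result, hence the glc from selection):
//   s_waitcnt vmcnt(0) lgkmcnt(0)              ; release, inserted before
//   global_atomic_add v0, v[0:1], v2, off glc
//   s_waitcnt vmcnt(0)                         ; acquire, inserted after
//   buffer_wbinvl1_vol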
2304 
2305 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2306   bool Changed = false;
2307 
2308   SIMemOpAccess MOA(MF);
2309   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2310 
2311   for (auto &MBB : MF) {
2312     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2313 
2314       // Unbundle instructions after the post-RA scheduler.
2315       if (MI->isBundle() && MI->mayLoadOrStore()) {
2316         MachineBasicBlock::instr_iterator II(MI->getIterator());
2317         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2318              I != E && I->isBundledWithPred(); ++I) {
2319           I->unbundleFromPred();
2320           for (MachineOperand &MO : I->operands())
2321             if (MO.isReg())
2322               MO.setIsInternalRead(false);
2323         }
2324 
2325         MI->eraseFromParent();
2326         MI = II->getIterator();
2327       }
2328 
2329       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2330         continue;
2331 
2332       if (const auto &MOI = MOA.getLoadInfo(MI))
2333         Changed |= expandLoad(MOI.value(), MI);
2334       else if (const auto &MOI = MOA.getStoreInfo(MI))
2335         Changed |= expandStore(MOI.value(), MI);
2336       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2337         Changed |= expandAtomicFence(MOI.value(), MI);
2338       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2339         Changed |= expandAtomicCmpxchgOrRmw(MOI.value(), MI);
2340     }
2341   }
2342 
2343   Changed |= removeAtomicPseudoMIs();
2344   return Changed;
2345 }
2346 
2347 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2348 
2349 char SIMemoryLegalizer::ID = 0;
2350 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2351 
2352 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2353   return new SIMemoryLegalizer();
2354 }
2355