//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
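
// Illustrative note: SIMemOp values combine and test as ordinary bitmasks.
// For example, a release that must order both loads and stores passes
//   SIMemOp::LOAD | SIMemOp::STORE
// to insertWait(), as SIGfx6CacheControl::insertRelease() below does.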

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
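
// Illustrative note: membership in this mask is tested by intersecting with a
// flag and comparing against NONE, e.g.
//   (AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE
// as done throughout the cache controls below. A FLAT access covers GLOBAL,
// LDS and SCRATCH at once.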

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsVolatile(IsVolatile),
      IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }
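
  // Worked example (illustrative): an atomic whose instruction address space
  // is only LDS and/or SCRATCH cannot be observed beyond its work-group, so a
  // requested SIAtomicScope::AGENT is clamped to WORKGROUP by the chain
  // above; a SCRATCH-only access is likewise clamped to SINGLETHREAD.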

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }
};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns A bit set of the SI atomic address spaces covered by the target
  /// address space \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns True if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
    return false;
  }

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions are inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions are inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions are inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derived classes to be deleted.
  virtual ~SICacheControl() = default;

  virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) const {
    return false;
  }
};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:

  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                           MachineBasicBlock::iterator &MI) const override {
    bool Changed = false;
    if (ST.hasForceStoreSC0SC1() &&
        (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
                                    SIAtomicAddrSpace::GLOBAL |
                                    SIAtomicAddrSpace::OTHER)) !=
         SIAtomicAddrSpace::NONE) {
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
    }
    return Changed;
  }
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;
};

class SIGfx12CacheControl : public SIGfx11CacheControl {
protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
  // \returns True if \p MI is modified, false otherwise.
  bool setTH(const MachineBasicBlock::iterator MI,
             AMDGPU::CPol::CPol Value) const;
  // Sets Scope policy to \p Value if CPol operand is present in instruction \p
  // MI. \returns True if \p MI is modified, false otherwise.
  bool setScope(const MachineBasicBlock::iterator MI,
                AMDGPU::CPol::CPol Value) const;

  // Stores with system scope (SCOPE_SYS) need to wait for:
  // - loads or atomics (returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
  // - non-returning atomics        - wait for STORECNT==0
  //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
  //   since it does not distinguish atomics-with-return from regular stores.
  // There is no need to wait if memory is cached (mtype != UC).
  bool
  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;

public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile,
                                      bool IsNonTemporal) const override;

  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if the current function is modified, false
  /// otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getAgentSSID())
    return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getWavefrontSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == SyncScope::SingleThread)
    return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SYSTEM,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::AGENT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WORKGROUP,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SINGLETHREAD,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  return std::nullopt;
}
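
// Illustrative example: for an LDS-only instruction, the "one address space"
// workgroup scope maps to
//   (SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
//    /*IsCrossAddressSpaceOrdering=*/false)
// so the ordering is restricted to the LDS address space and no cross address
// space ordering is requested.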

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}
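
// Illustrative example: a flat pointer may address global, LDS or scratch
// memory, so toSIAtomicAddrSpace(AMDGPUAS::FLAT_ADDRESS) yields
// SIAtomicAddrSpace::FLAT, i.e. GLOBAL | LDS | SCRATCH, while any unlisted
// address space conservatively maps to OTHER.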

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
         OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) ==
         SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return std::nullopt;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return std::nullopt;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      *ScopeOrNone;

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return std::nullopt;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}
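
// Illustrative example: enabling GLC on an instruction whose cpol immediate
// is currently 0 rewrites that operand to AMDGPU::CPol::GLC; an instruction
// with no cpol operand (no cache policy field) is left unchanged and the
// helper reports no modification.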

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX11)
    return std::make_unique<SIGfx10CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX12)
    return std::make_unique<SIGfx11CacheControl>(ST);
  return std::make_unique<SIGfx12CacheControl>(ST);
}
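
// Illustrative walk-through of the dispatch order above: gfx90a and gfx940
// are still generation GFX9, so the feature checks must precede the
// generation comparisons; e.g. for gfx90a this returns a SIGfx90ACacheControl
// rather than the SIGfx7CacheControl the GFX10 comparison would otherwise
// select.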

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is
  /// no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result and so must
  // not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, which cannot
  // sensibly be honored here without pessimizing all atomics; they also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}
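
// Illustrative net effect on GFX6: a volatile global load becomes roughly
//   buffer_load_dword ... glc
//   s_waitcnt vmcnt(0)
// while a nontemporal access instead sets both glc and slc and requires no
// trailing wait.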

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
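
// Illustrative example: an agent-scope release over the global and LDS
// address spaces with cross address space ordering sets both VMCnt and
// LGKMCnt here and emits a single
//   s_waitcnt vmcnt(0) lgkmcnt(0)
// (as S_WAITCNT_soft, so later passes may still relax or merge it).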

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set the L1 cache policy to MISS_LRU.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing on
      // different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result and so must
  // not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, which cannot
  // sensibly be honored here without pessimizing all atomics; they also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
1374   if (ST.isTgSplitEnabled()) {
1375     // In threadgroup split mode the waves of a work-group can be executing on
1376     // different CUs. Therefore need to wait for global or GDS memory operations
1377     // to complete to ensure they are visible to waves in the other CUs.
1378     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1379     // the same CU, so no need to wait for global memory as all waves in the
1380     // work-group access the same L1, nor wait for GDS as accesses are ordered
1381     // on a CU.
1382     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1383                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1384         (Scope == SIAtomicScope::WORKGROUP)) {
1385       // Same as GFX7 using agent scope.
1386       Scope = SIAtomicScope::AGENT;
1387     }
1388     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1389     // LDS memory operations.
1390     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1391   }
1392   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1393                                         IsCrossAddrSpaceOrdering, Pos);
1394 }
1395 
1396 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1397                                          SIAtomicScope Scope,
1398                                          SIAtomicAddrSpace AddrSpace,
1399                                          Position Pos) const {
1400   if (!InsertCacheInv)
1401     return false;
1402 
1403   bool Changed = false;
1404 
1405   MachineBasicBlock &MBB = *MI->getParent();
1406   DebugLoc DL = MI->getDebugLoc();
1407 
1408   if (Pos == Position::AFTER)
1409     ++MI;
1410 
1411   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1412     switch (Scope) {
1413     case SIAtomicScope::SYSTEM:
1414       // Ensures that following loads will not see stale remote VMEM data or
1415       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1416       // CC will never be stale due to the local memory probes.
1417       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1418       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1419       // hardware does not reorder memory operations by the same wave with
1420       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1421       // remove any cache lines of earlier writes by the same wave and ensures
1422       // later reads by the same wave will refetch the cache lines.
1423       Changed = true;
1424       break;
1425     case SIAtomicScope::AGENT:
1426       // Same as GFX7.
1427       break;
1428     case SIAtomicScope::WORKGROUP:
1429       // In threadgroup split mode the waves of a work-group can be executing on
1430       // different CUs. Therefore need to invalidate the L1 which is per CU.
1431       // Otherwise in non-threadgroup split mode all waves of a work-group are
1432       // on the same CU, and so the L1 does not need to be invalidated.
1433       if (ST.isTgSplitEnabled()) {
1434         // Same as GFX7 using agent scope.
1435         Scope = SIAtomicScope::AGENT;
1436       }
1437       break;
1438     case SIAtomicScope::WAVEFRONT:
1439     case SIAtomicScope::SINGLETHREAD:
1440       // Same as GFX7.
1441       break;
1442     default:
1443       llvm_unreachable("Unsupported synchronization scope");
1444     }
1445   }
1446 
1447   /// The scratch address space does not need the global memory cache
1448   /// to be flushed as all memory operations by the same thread are
1449   /// sequentially consistent, and no other thread can access scratch
1450   /// memory.
1451 
1452   /// Other address spaces do not have a cache.
1453 
1454   if (Pos == Position::AFTER)
1455     --MI;
1456 
1457   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1458 
1459   return Changed;
1460 }
1461 
1462 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1463                                          SIAtomicScope Scope,
1464                                          SIAtomicAddrSpace AddrSpace,
1465                                          bool IsCrossAddrSpaceOrdering,
1466                                          Position Pos) const {
1467   bool Changed = false;
1468 
1469   MachineBasicBlock &MBB = *MI->getParent();
1470   const DebugLoc &DL = MI->getDebugLoc();
1471 
1472   if (Pos == Position::AFTER)
1473     ++MI;
1474 
1475   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1476     switch (Scope) {
1477     case SIAtomicScope::SYSTEM:
1478       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1479       // hardware does not reorder memory operations by the same wave with
1480       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1481       // to initiate writeback of any dirty cache lines of earlier writes by the
1482       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1483       // writeback has completed.
1484       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1485         // Set SC bits to indicate system scope.
1486         .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1487       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1488       // vmcnt(0)" needed by the "BUFFER_WBL2".
1489       Changed = true;
1490       break;
1491     case SIAtomicScope::AGENT:
1492     case SIAtomicScope::WORKGROUP:
1493     case SIAtomicScope::WAVEFRONT:
1494     case SIAtomicScope::SINGLETHREAD:
1495       // Same as GFX7.
1496       break;
1497     default:
1498       llvm_unreachable("Unsupported synchronization scope");
1499     }
1500   }
1501 
1502   if (Pos == Position::AFTER)
1503     --MI;
1504 
1505   Changed |=
1506       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1507                                         IsCrossAddrSpaceOrdering, Pos);
1508 
1509   return Changed;
1510 }
1511 
1512 bool SIGfx940CacheControl::enableLoadCacheBypass(
1513     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1514     SIAtomicAddrSpace AddrSpace) const {
1515   assert(MI->mayLoad() && !MI->mayStore());
1516   bool Changed = false;
1517 
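  // The SC[1:0] cache-policy bits encode the required coherence scope for
  // loads: unset = wavefront, SC0 = work-group, SC1 = agent, SC0|SC1 = system.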
1518   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1519     switch (Scope) {
1520     case SIAtomicScope::SYSTEM:
1521       // Set SC bits to indicate system scope.
1522       Changed |= enableSC0Bit(MI);
1523       Changed |= enableSC1Bit(MI);
1524       break;
1525     case SIAtomicScope::AGENT:
1526       // Set SC bits to indicate agent scope.
1527       Changed |= enableSC1Bit(MI);
1528       break;
1529     case SIAtomicScope::WORKGROUP:
1530       // In threadgroup split mode the waves of a work-group can be executing on
1531       // different CUs. Therefore need to bypass the L1 which is per CU.
1532       // Otherwise in non-threadgroup split mode all waves of a work-group are
1533       // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1534       // bits to indicate work-group scope will do this automatically.
1535       Changed |= enableSC0Bit(MI);
1536       break;
1537     case SIAtomicScope::WAVEFRONT:
1538     case SIAtomicScope::SINGLETHREAD:
1539       // Leave SC bits unset to indicate wavefront scope.
1540       break;
1541     default:
1542       llvm_unreachable("Unsupported synchronization scope");
1543     }
1544   }
1545 
1546   /// The scratch address space does not need the global memory caches
1547   /// to be bypassed as all memory operations by the same thread are
1548   /// sequentially consistent, and no other thread can access scratch
1549   /// memory.
1550 
1551   /// Other address spaces do not have a cache.
1552 
1553   return Changed;
1554 }
1555 
1556 bool SIGfx940CacheControl::enableStoreCacheBypass(
1557     const MachineBasicBlock::iterator &MI,
1558     SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1559   assert(!MI->mayLoad() && MI->mayStore());
1560   bool Changed = false;
1561 
1562   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1563     switch (Scope) {
1564     case SIAtomicScope::SYSTEM:
1565       // Set SC bits to indicate system scope.
1566       Changed |= enableSC0Bit(MI);
1567       Changed |= enableSC1Bit(MI);
1568       break;
1569     case SIAtomicScope::AGENT:
1570       // Set SC bits to indicate agent scope.
1571       Changed |= enableSC1Bit(MI);
1572       break;
1573     case SIAtomicScope::WORKGROUP:
1574       // Set SC bits to indicate workgroup scope.
1575       Changed |= enableSC0Bit(MI);
1576       break;
1577     case SIAtomicScope::WAVEFRONT:
1578     case SIAtomicScope::SINGLETHREAD:
1579       // Leave SC bits unset to indicate wavefront scope.
1580       break;
1581     default:
1582       llvm_unreachable("Unsupported synchronization scope");
1583     }
1584   }
1585 
1586   /// The scratch address space does not need the global memory caches
1587   /// to be bypassed as all memory operations by the same thread are
1588   /// sequentially consistent, and no other thread can access scratch
1589   /// memory.
1590 
1591   /// Other address spaces do not have a cache.
1592 
1593   return Changed;
1594 }
1595 
1596 bool SIGfx940CacheControl::enableRMWCacheBypass(
1597     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1598     SIAtomicAddrSpace AddrSpace) const {
1599   assert(MI->mayLoad() && MI->mayStore());
1600   bool Changed = false;
1601 
1602   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1603     switch (Scope) {
1604     case SIAtomicScope::SYSTEM:
1605       // Set SC1 bit to indicate system scope.
1606       Changed |= enableSC1Bit(MI);
1607       break;
1608     case SIAtomicScope::AGENT:
1609     case SIAtomicScope::WORKGROUP:
1610     case SIAtomicScope::WAVEFRONT:
1611     case SIAtomicScope::SINGLETHREAD:
1612       // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1613       // to indicate system or agent scope. The SC0 bit is used to indicate if
1614       // they are return or no-return. Leave SC1 bit unset to indicate agent
1615       // scope.
1616       break;
1617     default:
1618       llvm_unreachable("Unsupported synchronization scope");
1619     }
1620   }
1621 
1622   return Changed;
1623 }
1624 
1625 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1626     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1627     bool IsVolatile, bool IsNonTemporal) const {
1628   // Only handle load and store, not atomic read-modify-write instructions. The
1629   // latter use glc to indicate if the atomic returns a result, so glc must not
1630   // be used for cache control.
1631   assert(MI->mayLoad() ^ MI->mayStore());
1632 
1633   // Only update load and store, not LLVM IR atomic read-modify-write
1634   // instructions. The latter are always marked as volatile, so they cannot
1635   // sensibly be handled here without pessimizing all atomics. They also do not
1636   // support the nontemporal attribute.
1637   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1638 
1639   bool Changed = false;
1640 
1641   if (IsVolatile) {
1642     // Set SC bits to indicate system scope.
1643     Changed |= enableSC0Bit(MI);
1644     Changed |= enableSC1Bit(MI);
1645 
1646     // Ensure operation has completed at system scope to cause all volatile
1647     // operations to be visible outside the program in a global order. Do not
1648     // request cross address space as only the global address space can be
1649     // observable outside the program, so no need to cause a waitcnt for LDS
1650     // address space operations.
1651     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1652                           Position::AFTER);
1653 
1654     return Changed;
1655   }
1656 
1657   if (IsNonTemporal) {
1658     Changed |= enableNTBit(MI);
1659     return Changed;
1660   }
1661 
1662   return Changed;
1663 }
1664 
1665 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1666                                          SIAtomicScope Scope,
1667                                          SIAtomicAddrSpace AddrSpace,
1668                                          Position Pos) const {
1669   if (!InsertCacheInv)
1670     return false;
1671 
1672   bool Changed = false;
1673 
1674   MachineBasicBlock &MBB = *MI->getParent();
1675   DebugLoc DL = MI->getDebugLoc();
1676 
1677   if (Pos == Position::AFTER)
1678     ++MI;
1679 
1680   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1681     switch (Scope) {
1682     case SIAtomicScope::SYSTEM:
1683       // Ensures that following loads will not see stale remote VMEM data or
1684       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1685       // CC will never be stale due to the local memory probes.
1686       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1687           // Set SC bits to indicate system scope.
1688           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1689       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1690       // hardware does not reorder memory operations by the same wave with
1691       // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1692       // remove any cache lines of earlier writes by the same wave and ensures
1693       // later reads by the same wave will refetch the cache lines.
1694       Changed = true;
1695       break;
1696     case SIAtomicScope::AGENT:
1697       // Ensures that following loads will not see stale remote data or local
1698       // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1699       // due to the memory probes.
1700       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1701           // Set SC bits to indicate agent scope.
1702           .addImm(AMDGPU::CPol::SC1);
1703       // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1704       // does not reorder memory operations with respect to a preceding buffer
1705       // invalidate. The invalidate is guaranteed to remove any cache lines of
1706       // earlier writes and ensures later reads will refetch the cache lines.
1707       Changed = true;
1708       break;
1709     case SIAtomicScope::WORKGROUP:
1710       // In threadgroup split mode the waves of a work-group can be executing on
1711       // different CUs. Therefore need to invalidate the L1 which is per CU.
1712       // Otherwise in non-threadgroup split mode all waves of a work-group are
1713       // on the same CU, and so the L1 does not need to be invalidated.
1714       if (ST.isTgSplitEnabled()) {
1715         // Ensures L1 is invalidated if in threadgroup split mode. In
1716         // non-threadgroup split mode it is a NOP, but there is no point
1717         // generating it when we know we are not in that mode.
1718         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1719             // Set SC bits to indicate work-group scope.
1720             .addImm(AMDGPU::CPol::SC0);
1721         // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1722         // does not reorder memory operations with respect to a preceding buffer
1723         // invalidate. The invalidate is guaranteed to remove any cache lines of
1724         // earlier writes and ensures later reads will refetch the cache lines.
1725         Changed = true;
1726       }
1727       break;
1728     case SIAtomicScope::WAVEFRONT:
1729     case SIAtomicScope::SINGLETHREAD:
1730       // Could generate "BUFFER_INV" but it would do nothing as there are no
1731       // caches to invalidate.
1732       break;
1733     default:
1734       llvm_unreachable("Unsupported synchronization scope");
1735     }
1736   }
1737 
1738   /// The scratch address space does not need the global memory cache
1739   /// to be flushed as all memory operations by the same thread are
1740   /// sequentially consistent, and no other thread can access scratch
1741   /// memory.
1742 
1743   /// Other address spaces do not have a cache.
1744 
1745   if (Pos == Position::AFTER)
1746     --MI;
1747 
1748   return Changed;
1749 }
1750 
1751 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1752                                          SIAtomicScope Scope,
1753                                          SIAtomicAddrSpace AddrSpace,
1754                                          bool IsCrossAddrSpaceOrdering,
1755                                          Position Pos) const {
1756   bool Changed = false;
1757 
1758   MachineBasicBlock &MBB = *MI->getParent();
1759   DebugLoc DL = MI->getDebugLoc();
1760 
1761   if (Pos == Position::AFTER)
1762     ++MI;
1763 
1764   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1765     switch (Scope) {
1766     case SIAtomicScope::SYSTEM:
1767       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1768       // hardware does not reorder memory operations by the same wave with
1769       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1770       // to initiate writeback of any dirty cache lines of earlier writes by the
1771       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1772       // writeback has completed.
1773       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1774           // Set SC bits to indicate system scope.
1775           .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1776       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1777       // SIAtomicScope::SYSTEM, the following insertWait will generate the
1778       // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1779       Changed = true;
1780       break;
1781     case SIAtomicScope::AGENT:
1782       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1783           // Set SC bits to indicate agent scope.
1784           .addImm(AMDGPU::CPol::SC1);
1785 
1786       // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1787       // SIAtomicScope::AGENT, the following insertWait will generate the
1788       // required "S_WAITCNT vmcnt(0)".
1789       Changed = true;
1790       break;
1791     case SIAtomicScope::WORKGROUP:
1792     case SIAtomicScope::WAVEFRONT:
1793     case SIAtomicScope::SINGLETHREAD:
1794       // Do not generate "BUFFER_WBL2" as there are no caches it would
1795       // writeback, and would require an otherwise unnecessary
1796       // "S_WAITCNT vmcnt(0)".
1797       break;
1798     default:
1799       llvm_unreachable("Unsupported synchronization scope");
1800     }
1801   }
1802 
1803   if (Pos == Position::AFTER)
1804     --MI;
1805 
1806   // Insert the S_WAITCNT needed by any "BUFFER_WBL2", as well as any other
1807   // required S_WAITCNT.
1808   Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1809                         IsCrossAddrSpaceOrdering, Pos);
1810 
1811   return Changed;
1812 }
1813 
1814 bool SIGfx10CacheControl::enableLoadCacheBypass(
1815     const MachineBasicBlock::iterator &MI,
1816     SIAtomicScope Scope,
1817     SIAtomicAddrSpace AddrSpace) const {
1818   assert(MI->mayLoad() && !MI->mayStore());
1819   bool Changed = false;
1820 
1821   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1822     switch (Scope) {
1823     case SIAtomicScope::SYSTEM:
1824     case SIAtomicScope::AGENT:
1825       // Set the L0 and L1 cache policies to MISS_EVICT.
1826       // Note: there is no L2 cache coherent bypass control at the ISA level.
1827       Changed |= enableGLCBit(MI);
1828       Changed |= enableDLCBit(MI);
1829       break;
1830     case SIAtomicScope::WORKGROUP:
1831       // In WGP mode the waves of a work-group can be executing on either CU of
1832       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1833       // CU mode all waves of a work-group are on the same CU, and so the L0
1834       // does not need to be bypassed.
1835       if (!ST.isCuModeEnabled())
1836         Changed |= enableGLCBit(MI);
1837       break;
1838     case SIAtomicScope::WAVEFRONT:
1839     case SIAtomicScope::SINGLETHREAD:
1840       // No cache to bypass.
1841       break;
1842     default:
1843       llvm_unreachable("Unsupported synchronization scope");
1844     }
1845   }
1846 
1847   /// The scratch address space does not need the global memory caches
1848   /// to be bypassed as all memory operations by the same thread are
1849   /// sequentially consistent, and no other thread can access scratch
1850   /// memory.
1851 
1852   /// Other address spaces do not have a cache.
1853 
1854   return Changed;
1855 }
1856 
1857 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1858     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1859     bool IsVolatile, bool IsNonTemporal) const {
1860 
1861   // Only handle load and store, not atomic read-modify-write instructions. The
1862   // latter use glc to indicate if the atomic returns a result, so glc must not
1863   // be used for cache control.
1864   assert(MI->mayLoad() ^ MI->mayStore());
1865 
1866   // Only update load and store, not LLVM IR atomic read-modify-write
1867   // instructions. The latter are always marked as volatile, so they cannot
1868   // sensibly be handled here without pessimizing all atomics. They also do not
1869   // support the nontemporal attribute.
1870   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1871 
1872   bool Changed = false;
1873 
1874   if (IsVolatile) {
1875     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1876     // and MISS_LRU for store instructions.
1877     // Note: there is no L2 cache coherent bypass control at the ISA level.
1878     if (Op == SIMemOp::LOAD) {
1879       Changed |= enableGLCBit(MI);
1880       Changed |= enableDLCBit(MI);
1881     }
1882 
1883     // Ensure operation has completed at system scope to cause all volatile
1884     // operations to be visible outside the program in a global order. Do not
1885     // request cross address space as only the global address space can be
1886     // observable outside the program, so no need to cause a waitcnt for LDS
1887     // address space operations.
1888     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1889                           Position::AFTER);
1890     return Changed;
1891   }
1892 
1893   if (IsNonTemporal) {
1894     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1895     // and L2 cache policy to STREAM.
1896     // For stores setting both GLC and SLC configures L0 and L1 cache policy
1897     // to MISS_EVICT and the L2 cache policy to STREAM.
1898     if (Op == SIMemOp::STORE)
1899       Changed |= enableGLCBit(MI);
1900     Changed |= enableSLCBit(MI);
1901 
1902     return Changed;
1903   }
1904 
1905   return Changed;
1906 }
1907 
1908 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1909                                      SIAtomicScope Scope,
1910                                      SIAtomicAddrSpace AddrSpace,
1911                                      SIMemOp Op,
1912                                      bool IsCrossAddrSpaceOrdering,
1913                                      Position Pos) const {
1914   bool Changed = false;
1915 
1916   MachineBasicBlock &MBB = *MI->getParent();
1917   DebugLoc DL = MI->getDebugLoc();
1918 
1919   if (Pos == Position::AFTER)
1920     ++MI;
1921 
1922   bool VMCnt = false;
1923   bool VSCnt = false;
1924   bool LGKMCnt = false;
1925 
1926   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1927       SIAtomicAddrSpace::NONE) {
1928     switch (Scope) {
1929     case SIAtomicScope::SYSTEM:
1930     case SIAtomicScope::AGENT:
1931       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1932         VMCnt |= true;
1933       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1934         VSCnt |= true;
1935       break;
1936     case SIAtomicScope::WORKGROUP:
1937       // In WGP mode the waves of a work-group can be executing on either CU of
1938       // the WGP. Therefore need to wait for operations to complete to ensure
1939       // they are visible to waves in the other CU as the L0 is per CU.
1940       // Otherwise in CU mode all waves of a work-group are on the same CU,
1941       // which shares the same L0.
1942       if (!ST.isCuModeEnabled()) {
1943         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1944           VMCnt |= true;
1945         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1946           VSCnt |= true;
1947       }
1948       break;
1949     case SIAtomicScope::WAVEFRONT:
1950     case SIAtomicScope::SINGLETHREAD:
1951       // The L0 cache keeps all memory operations in order for
1952       // work-items in the same wavefront.
1953       break;
1954     default:
1955       llvm_unreachable("Unsupported synchronization scope");
1956     }
1957   }
1958 
1959   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1960     switch (Scope) {
1961     case SIAtomicScope::SYSTEM:
1962     case SIAtomicScope::AGENT:
1963     case SIAtomicScope::WORKGROUP:
1964       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1965       // not needed as LDS operations for all waves are executed in a total
1966       // global ordering as observed by all waves. Required if also
1967       // synchronizing with global/GDS memory as LDS operations could be
1968       // reordered with respect to later global/GDS memory operations of the
1969       // same wave.
1970       LGKMCnt |= IsCrossAddrSpaceOrdering;
1971       break;
1972     case SIAtomicScope::WAVEFRONT:
1973     case SIAtomicScope::SINGLETHREAD:
1974       // The LDS keeps all memory operations in order for
1975       // the same wavefront.
1976       break;
1977     default:
1978       llvm_unreachable("Unsupported synchronization scope");
1979     }
1980   }
1981 
1982   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1983     switch (Scope) {
1984     case SIAtomicScope::SYSTEM:
1985     case SIAtomicScope::AGENT:
1986       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1987       // is not needed as GDS operations for all waves are executed in a total
1988       // global ordering as observed by all waves. Required if also
1989       // synchronizing with global/LDS memory as GDS operations could be
1990       // reordered with respect to later global/LDS memory operations of the
1991       // same wave.
1992       LGKMCnt |= IsCrossAddrSpaceOrdering;
1993       break;
1994     case SIAtomicScope::WORKGROUP:
1995     case SIAtomicScope::WAVEFRONT:
1996     case SIAtomicScope::SINGLETHREAD:
1997       // The GDS keeps all memory operations in order for
1998       // the same work-group.
1999       break;
2000     default:
2001       llvm_unreachable("Unsupported synchronization scope");
2002     }
2003   }
2004 
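  // Wait on a counter by encoding 0 for it; passing a counter's full bitmask
  // (its maximum encodable value) encodes no wait on that counter.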
2005   if (VMCnt || LGKMCnt) {
2006     unsigned WaitCntImmediate =
2007       AMDGPU::encodeWaitcnt(IV,
2008                             VMCnt ? 0 : getVmcntBitMask(IV),
2009                             getExpcntBitMask(IV),
2010                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2011     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2012         .addImm(WaitCntImmediate);
2013     Changed = true;
2014   }
2015 
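  // vscnt (vector store count) has no field in "S_WAITCNT" and is waited on
  // with a separate "S_WAITCNT_VSCNT" instruction; a null SGPR plus an
  // immediate of 0 waits for all prior vector stores to complete.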
2016   if (VSCnt) {
2017     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2018         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2019         .addImm(0);
2020     Changed = true;
2021   }
2022 
2023   if (Pos == Position::AFTER)
2024     --MI;
2025 
2026   return Changed;
2027 }
2028 
2029 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2030                                         SIAtomicScope Scope,
2031                                         SIAtomicAddrSpace AddrSpace,
2032                                         Position Pos) const {
2033   if (!InsertCacheInv)
2034     return false;
2035 
2036   bool Changed = false;
2037 
2038   MachineBasicBlock &MBB = *MI->getParent();
2039   DebugLoc DL = MI->getDebugLoc();
2040 
2041   if (Pos == Position::AFTER)
2042     ++MI;
2043 
2044   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2045     switch (Scope) {
2046     case SIAtomicScope::SYSTEM:
2047     case SIAtomicScope::AGENT:
2048       // The order of invalidates matters here. We must invalidate "outer in"
2049       // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2050       // invalidated.
2051       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2052       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2053       Changed = true;
2054       break;
2055     case SIAtomicScope::WORKGROUP:
2056       // In WGP mode the waves of a work-group can be executing on either CU of
2057       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2058       // in CU mode all waves of a work-group are on the same CU, and so the
2059       // L0 does not need to be invalidated.
2060       if (!ST.isCuModeEnabled()) {
2061         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2062         Changed = true;
2063       }
2064       break;
2065     case SIAtomicScope::WAVEFRONT:
2066     case SIAtomicScope::SINGLETHREAD:
2067       // No cache to invalidate.
2068       break;
2069     default:
2070       llvm_unreachable("Unsupported synchronization scope");
2071     }
2072   }
2073 
2074   /// The scratch address space does not need the global memory cache
2075   /// to be flushed as all memory operations by the same thread are
2076   /// sequentially consistent, and no other thread can access scratch
2077   /// memory.
2078 
2079   /// Other address spaces do not have a cache.
2080 
2081   if (Pos == Position::AFTER)
2082     --MI;
2083 
2084   return Changed;
2085 }
2086 
2087 bool SIGfx11CacheControl::enableLoadCacheBypass(
2088     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2089     SIAtomicAddrSpace AddrSpace) const {
2090   assert(MI->mayLoad() && !MI->mayStore());
2091   bool Changed = false;
2092 
2093   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2094     switch (Scope) {
2095     case SIAtomicScope::SYSTEM:
2096     case SIAtomicScope::AGENT:
2097       // Set the L0 and L1 cache policies to MISS_EVICT.
2098       // Note: there is no L2 cache coherent bypass control at the ISA level.
2099       Changed |= enableGLCBit(MI);
2100       break;
2101     case SIAtomicScope::WORKGROUP:
2102       // In WGP mode the waves of a work-group can be executing on either CU of
2103       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2104       // CU mode all waves of a work-group are on the same CU, and so the L0
2105       // does not need to be bypassed.
2106       if (!ST.isCuModeEnabled())
2107         Changed |= enableGLCBit(MI);
2108       break;
2109     case SIAtomicScope::WAVEFRONT:
2110     case SIAtomicScope::SINGLETHREAD:
2111       // No cache to bypass.
2112       break;
2113     default:
2114       llvm_unreachable("Unsupported synchronization scope");
2115     }
2116   }
2117 
2118   /// The scratch address space does not need the global memory caches
2119   /// to be bypassed as all memory operations by the same thread are
2120   /// sequentially consistent, and no other thread can access scratch
2121   /// memory.
2122 
2123   /// Other address spaces do not have a cache.
2124 
2125   return Changed;
2126 }
2127 
2128 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2129     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2130     bool IsVolatile, bool IsNonTemporal) const {
2131 
2132   // Only handle load and store, not atomic read-modify-write instructions. The
2133   // latter use glc to indicate if the atomic returns a result, so glc must not
2134   // be used for cache control.
2135   assert(MI->mayLoad() ^ MI->mayStore());
2136 
2137   // Only update load and store, not LLVM IR atomic read-modify-write
2138   // instructions. The latter are always marked as volatile, so they cannot
2139   // sensibly be handled here without pessimizing all atomics. They also do not
2140   // support the nontemporal attribute.
2141   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2142 
2143   bool Changed = false;
2144 
2145   if (IsVolatile) {
2146     // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2147     // and MISS_LRU for store instructions.
2148     // Note: there is no L2 cache coherent bypass control at the ISA level.
2149     if (Op == SIMemOp::LOAD)
2150       Changed |= enableGLCBit(MI);
2151 
2152     // Set MALL NOALLOC for load and store instructions.
2153     Changed |= enableDLCBit(MI);
2154 
2155     // Ensure operation has completed at system scope to cause all volatile
2156     // operations to be visible outside the program in a global order. Do not
2157     // request cross address space as only the global address space can be
2158     // observable outside the program, so no need to cause a waitcnt for LDS
2159     // address space operations.
2160     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2161                           Position::AFTER);
2162     return Changed;
2163   }
2164 
2165   if (IsNonTemporal) {
2166     // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2167     // and L2 cache policy to STREAM.
2168     // For stores setting both GLC and SLC configures L0 and L1 cache policy
2169     // to MISS_EVICT and the L2 cache policy to STREAM.
2170     if (Op == SIMemOp::STORE)
2171       Changed |= enableGLCBit(MI);
2172     Changed |= enableSLCBit(MI);
2173 
2174     // Set MALL NOALLOC for load and store instructions.
2175     Changed |= enableDLCBit(MI);
2176     return Changed;
2177   }
2178 
2179   return Changed;
2180 }
2181 
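// Set the temporal hint (TH) field of MI's cache-policy (cpol) operand to
// Value, returning true if this changed the instruction.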
2182 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2183                                 AMDGPU::CPol::CPol Value) const {
2184   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2185   if (!CPol)
2186     return false;
2187 
2188   uint64_t NewTH = Value & AMDGPU::CPol::TH;
2189   if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2190     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2191     return true;
2192   }
2193 
2194   return false;
2195 }
2196 
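// Set the scope (SCOPE) field of MI's cache-policy (cpol) operand to Value,
// returning true if this changed the instruction.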
2197 bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2198                                    AMDGPU::CPol::CPol Value) const {
2199   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2200   if (!CPol)
2201     return false;
2202 
2203   uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2204   if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2205     CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2206     return true;
2207   }
2208 
2209   return false;
2210 }
2211 
2212 bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2213     const MachineBasicBlock::iterator MI) const {
2214   // TODO: implement flag for frontend to give us a hint not to insert waits.
2215 
2216   MachineBasicBlock &MBB = *MI->getParent();
2217   const DebugLoc &DL = MI->getDebugLoc();
2218 
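  // Use the "soft" wait variants so that SIInsertWaitcnts can later relax or
  // drop any of these waits that prove unnecessary.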
2219   BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2220   BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2221   BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2222   BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2223   BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2224 
2225   return true;
2226 }
2227 
2228 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2229                                      SIAtomicScope Scope,
2230                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2231                                      bool IsCrossAddrSpaceOrdering,
2232                                      Position Pos) const {
2233   bool Changed = false;
2234 
2235   MachineBasicBlock &MBB = *MI->getParent();
2236   DebugLoc DL = MI->getDebugLoc();
2237 
2238   bool LOADCnt = false;
2239   bool DSCnt = false;
2240   bool STORECnt = false;
2241 
2242   if (Pos == Position::AFTER)
2243     ++MI;
2244 
2245   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2246       SIAtomicAddrSpace::NONE) {
2247     switch (Scope) {
2248     case SIAtomicScope::SYSTEM:
2249     case SIAtomicScope::AGENT:
2250       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2251         LOADCnt |= true;
2252       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2253         STORECnt |= true;
2254       break;
2255     case SIAtomicScope::WORKGROUP:
2256       // In WGP mode the waves of a work-group can be executing on either CU of
2257       // the WGP. Therefore need to wait for operations to complete to ensure
2258       // they are visible to waves in the other CU as the L0 is per CU.
2259       // Otherwise in CU mode all waves of a work-group are on the same CU,
2260       // which shares the same L0.
2261       if (!ST.isCuModeEnabled()) {
2262         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2263           LOADCnt |= true;
2264         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2265           STORECnt |= true;
2266       }
2267       break;
2268     case SIAtomicScope::WAVEFRONT:
2269     case SIAtomicScope::SINGLETHREAD:
2270       // The L0 cache keeps all memory operations in order for
2271       // work-items in the same wavefront.
2272       break;
2273     default:
2274       llvm_unreachable("Unsupported synchronization scope");
2275     }
2276   }
2277 
2278   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2279     switch (Scope) {
2280     case SIAtomicScope::SYSTEM:
2281     case SIAtomicScope::AGENT:
2282     case SIAtomicScope::WORKGROUP:
2283       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2284       // not needed as LDS operations for all waves are executed in a total
2285       // global ordering as observed by all waves. Required if also
2286       // synchronizing with global/GDS memory as LDS operations could be
2287       // reordered with respect to later global/GDS memory operations of the
2288       // same wave.
2289       DSCnt |= IsCrossAddrSpaceOrdering;
2290       break;
2291     case SIAtomicScope::WAVEFRONT:
2292     case SIAtomicScope::SINGLETHREAD:
2293       // The LDS keeps all memory operations in order for
2294       // the same wavefront.
2295       break;
2296     default:
2297       llvm_unreachable("Unsupported synchronization scope");
2298     }
2299   }
2300 
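  // GFX12 splits the GFX10/GFX11 vmcnt counter into separate load, sample and
  // BVH counters, so waiting for loads requires waiting on all three.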
2301   if (LOADCnt) {
2302     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2303     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2304     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2305     Changed = true;
2306   }
2307 
2308   if (STORECnt) {
2309     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2310     Changed = true;
2311   }
2312 
2313   if (DSCnt) {
2314     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2315     Changed = true;
2316   }
2317 
2318   if (Pos == Position::AFTER)
2319     --MI;
2320 
2321   return Changed;
2322 }
2323 
2324 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2325                                         SIAtomicScope Scope,
2326                                         SIAtomicAddrSpace AddrSpace,
2327                                         Position Pos) const {
2328   if (!InsertCacheInv)
2329     return false;
2330 
2331   MachineBasicBlock &MBB = *MI->getParent();
2332   DebugLoc DL = MI->getDebugLoc();
2333 
2334   /// The scratch address space does not need the global memory cache
2335   /// to be flushed as all memory operations by the same thread are
2336   /// sequentially consistent, and no other thread can access scratch
2337   /// memory.
2338 
2339   /// Other address spaces do not have a cache.
2340   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2341     return false;
2342 
2343   AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2344   switch (Scope) {
2345   case SIAtomicScope::SYSTEM:
2346     ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2347     break;
2348   case SIAtomicScope::AGENT:
2349     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2350     break;
2351   case SIAtomicScope::WORKGROUP:
2352     // In WGP mode the waves of a work-group can be executing on either CU of
2353     // the WGP. Therefore we need to invalidate the L0 which is per CU.
2354     // Otherwise in CU mode all waves of a work-group are on the same CU, and so
2355     // the L0 does not need to be invalidated.
2356     if (ST.isCuModeEnabled())
2357       return false;
2358 
2359     ScopeImm = AMDGPU::CPol::SCOPE_SE;
2360     break;
2361   case SIAtomicScope::WAVEFRONT:
2362   case SIAtomicScope::SINGLETHREAD:
2363     // No cache to invalidate.
2364     return false;
2365   default:
2366     llvm_unreachable("Unsupported synchronization scope");
2367   }
2368 
2369   if (Pos == Position::AFTER)
2370     ++MI;
2371 
2372   BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2373 
2374   if (Pos == Position::AFTER)
2375     --MI;
2376 
2377   return true;
2378 }
2379 
2380 bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2381     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2382     bool IsVolatile, bool IsNonTemporal) const {
2383 
2384   // Only handle load and store, not atomic read-modify-write instructions.
2385   assert(MI->mayLoad() ^ MI->mayStore());
2386 
2387   // Only update load and store, not LLVM IR atomic read-modify-write
2388   // instructions. The latter are always marked as volatile, so they cannot
2389   // sensibly be handled here without pessimizing all atomics. They also do not
2390   // support the nontemporal attribute.
2391   assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2392 
2393   bool Changed = false;
2394 
2395   if (IsNonTemporal) {
2396     // Set non-temporal hint for all cache levels.
2397     Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2398   }
2399 
2400   if (IsVolatile) {
2401     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2402 
2403     if (Op == SIMemOp::STORE)
2404       Changed |= insertWaitsBeforeSystemScopeStore(MI);
2405 
2406     // Ensure operation has completed at system scope to cause all volatile
2407     // operations to be visible outside the program in a global order. Do not
2408     // request cross address space as only the global address space can be
2409     // observable outside the program, so no need to cause a waitcnt for LDS
2410     // address space operations.
2411     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2412                           Position::AFTER);
2413   }
2414 
2415   return Changed;
2416 }
2417 
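// If the store's cache-policy operand already requests SCOPE_SYS, insert the
// waits required before a system-scope store.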
2418 bool SIGfx12CacheControl::expandSystemScopeStore(
2419     MachineBasicBlock::iterator &MI) const {
2420   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2421   if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2422     return insertWaitsBeforeSystemScopeStore(MI);
2423 
2424   return false;
2425 }
2426 
2427 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2428   if (AtomicPseudoMIs.empty())
2429     return false;
2430 
2431   for (auto &MI : AtomicPseudoMIs)
2432     MI->eraseFromParent();
2433 
2434   AtomicPseudoMIs.clear();
2435   return true;
2436 }
2437 
2438 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2439                                    MachineBasicBlock::iterator &MI) {
2440   assert(MI->mayLoad() && !MI->mayStore());
2441 
2442   bool Changed = false;
2443 
2444   if (MOI.isAtomic()) {
2445     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2446         MOI.getOrdering() == AtomicOrdering::Acquire ||
2447         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2448       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2449                                            MOI.getOrderingAddrSpace());
2450     }
2451 
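    // A sequentially consistent load first waits for all earlier memory
    // operations at the ordering scope to complete, acting as a leading fence.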
2452     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2453       Changed |= CC->insertWait(MI, MOI.getScope(),
2454                                 MOI.getOrderingAddrSpace(),
2455                                 SIMemOp::LOAD | SIMemOp::STORE,
2456                                 MOI.getIsCrossAddressSpaceOrdering(),
2457                                 Position::BEFORE);
2458 
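    // For acquire and stronger orderings, wait for the load itself to complete
    // and then invalidate caches so later accesses do not see stale data.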
2459     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2460         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2461       Changed |= CC->insertWait(MI, MOI.getScope(),
2462                                 MOI.getInstrAddrSpace(),
2463                                 SIMemOp::LOAD,
2464                                 MOI.getIsCrossAddressSpaceOrdering(),
2465                                 Position::AFTER);
2466       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2467                                    MOI.getOrderingAddrSpace(),
2468                                    Position::AFTER);
2469     }
2470 
2471     return Changed;
2472   }
2473 
2474   // Atomic instructions already bypass caches to the scope specified by the
2475   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2476   // need additional treatment.
2477   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2478                                                 SIMemOp::LOAD, MOI.isVolatile(),
2479                                                 MOI.isNonTemporal());
2480   return Changed;
2481 }
2482 
2483 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2484                                     MachineBasicBlock::iterator &MI) {
2485   assert(!MI->mayLoad() && MI->mayStore());
2486 
2487   bool Changed = false;
2488 
2489   if (MOI.isAtomic()) {
2490     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2491         MOI.getOrdering() == AtomicOrdering::Release ||
2492         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2493       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2494                                             MOI.getOrderingAddrSpace());
2495     }
2496 
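    // For release and stronger orderings, make all earlier memory operations
    // at the ordering scope visible before the store is performed.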
2497     if (MOI.getOrdering() == AtomicOrdering::Release ||
2498         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2499       Changed |= CC->insertRelease(MI, MOI.getScope(),
2500                                    MOI.getOrderingAddrSpace(),
2501                                    MOI.getIsCrossAddressSpaceOrdering(),
2502                                    Position::BEFORE);
2503 
2504     return Changed;
2505   }
2506 
2507   // Atomic instructions already bypass caches to the scope specified by the
2508   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2509   // need additional treatment.
2510   Changed |= CC->enableVolatileAndOrNonTemporal(
2511       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2512       MOI.isNonTemporal());
2513 
2514   // GFX12 specific: scope (the desired coherence domain in the cache
2515   // hierarchy) is an instruction field; do not confuse it with atomic scope.
2516   Changed |= CC->expandSystemScopeStore(MI);
2517   return Changed;
2518 }
2519 
2520 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2521                                           MachineBasicBlock::iterator &MI) {
2522   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2523 
2524   AtomicPseudoMIs.push_back(MI);
2525   bool Changed = false;
2526 
2527   if (MOI.isAtomic()) {
2528     if (MOI.getOrdering() == AtomicOrdering::Acquire)
2529       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2530                                 SIMemOp::LOAD | SIMemOp::STORE,
2531                                 MOI.getIsCrossAddressSpaceOrdering(),
2532                                 Position::BEFORE);
2533 
2534     if (MOI.getOrdering() == AtomicOrdering::Release ||
2535         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2536         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2537       /// TODO: This relies on a barrier always generating a waitcnt
2538       /// for LDS to ensure it is not reordered with the completion of
2539       /// the preceding LDS operations. If the barrier had a memory
2540       /// ordering and memory scope, then the library would not need to
2541       /// generate a fence. Could add support in this file for
2542       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2543       /// adding S_WAITCNT before a S_BARRIER.
2544       Changed |= CC->insertRelease(MI, MOI.getScope(),
2545                                    MOI.getOrderingAddrSpace(),
2546                                    MOI.getIsCrossAddressSpaceOrdering(),
2547                                    Position::BEFORE);
2548 
2549     // TODO: If both release and invalidate are happening they could be combined
2550     // to use the single "BUFFER_WBINV*" instruction. This could be done by
2551     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2552     // track cache invalidate and write back instructions.
2553 
2554     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2555         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2556         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2557       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2558                                    MOI.getOrderingAddrSpace(),
2559                                    Position::BEFORE);
2560 
2561     return Changed;
2562   }
2563 
2564   return Changed;
2565 }
2566 
2567 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2568   MachineBasicBlock::iterator &MI) {
2569   assert(MI->mayLoad() && MI->mayStore());
2570 
2571   bool Changed = false;
2572 
2573   if (MOI.isAtomic()) {
2574     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2575         MOI.getOrdering() == AtomicOrdering::Acquire ||
2576         MOI.getOrdering() == AtomicOrdering::Release ||
2577         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2578         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2579       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2580                                           MOI.getInstrAddrSpace());
2581     }
2582 
2583     if (MOI.getOrdering() == AtomicOrdering::Release ||
2584         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2585         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2586         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2587       Changed |= CC->insertRelease(MI, MOI.getScope(),
2588                                    MOI.getOrderingAddrSpace(),
2589                                    MOI.getIsCrossAddressSpaceOrdering(),
2590                                    Position::BEFORE);
2591 
2592     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2593         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2594         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2595         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2596         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
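      // A returning atomic is waited on via its load; a no-return atomic via
      // its store. Then invalidate so following loads do not see stale data.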
2597       Changed |= CC->insertWait(MI, MOI.getScope(),
2598                                 MOI.getInstrAddrSpace(),
2599                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
2600                                                    SIMemOp::STORE,
2601                                 MOI.getIsCrossAddressSpaceOrdering(),
2602                                 Position::AFTER);
2603       Changed |= CC->insertAcquire(MI, MOI.getScope(),
2604                                    MOI.getOrderingAddrSpace(),
2605                                    Position::AFTER);
2606     }
2607 
2608     return Changed;
2609   }
2610 
2611   return Changed;
2612 }
2613 
2614 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2615   bool Changed = false;
2616 
2617   SIMemOpAccess MOA(MF);
2618   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2619 
2620   for (auto &MBB : MF) {
2621     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2622 
2623       // Unbundle instructions after the post-RA scheduler.
2624       if (MI->isBundle() && MI->mayLoadOrStore()) {
2625         MachineBasicBlock::instr_iterator II(MI->getIterator());
2626         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2627              I != E && I->isBundledWithPred(); ++I) {
2628           I->unbundleFromPred();
2629           for (MachineOperand &MO : I->operands())
2630             if (MO.isReg())
2631               MO.setIsInternalRead(false);
2632         }
2633 
2634         MI->eraseFromParent();
2635         MI = II->getIterator();
2636       }
2637 
2638       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2639         continue;
2640 
2641       if (const auto &MOI = MOA.getLoadInfo(MI))
2642         Changed |= expandLoad(*MOI, MI);
2643       else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2644         Changed |= expandStore(*MOI, MI);
2645         Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
2646       } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2647         Changed |= expandAtomicFence(*MOI, MI);
2648       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2649         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2650     }
2651   }
2652 
2653   Changed |= removeAtomicPseudoMIs();
2654   return Changed;
2655 }
2656 
2657 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2658 
2659 char SIMemoryLegalizer::ID = 0;
2660 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2661 
2662 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2663   return new SIMemoryLegalizer();
2664 }
2665